
Commit f953f4a

[chore]: Added scrapper module
1 parent 19495cd commit f953f4a

File tree

1 file changed: +137 -0 lines changed


Google-Image-Scrapper/scrapper.py

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
import os
import time
import urllib.request
import requests
from urllib.parse import quote

IMAGE_EXTENSIONS = ('.jpg', '.png', '.ico', '.gif', '.jpeg')


class simple_image_download:
    def __init__(self):
        pass

    def urls(self, keywords, limit):
        # Collect up to `limit` image URLs for each comma-separated keyword.
        keyword_to_search = [str(item).strip() for item in keywords.split(',')]
        links = []
        for keyword in keyword_to_search:
            raw_html = self._download_page(self._search_url(keyword))
            end_object = -1
            for _ in range(limit):
                object_raw, end_object = self._next_image_url(raw_html, end_object)
                if object_raw is None:
                    break
                links.append(object_raw)
        return links

    def download(self, keywords, limit):
        # Download up to `limit` images per keyword into
        # simple_images/<keyword>/<keyword>_<n>.jpg.
        keyword_to_search = [str(item).strip() for item in keywords.split(',')]
        main_directory = "simple_images/"

        for keyword in keyword_to_search:
            self._create_directories(main_directory, keyword)
            raw_html = self._download_page(self._search_url(keyword))
            end_object = -1
            path = os.path.join(main_directory, keyword)

            j = 0
            while j < limit:
                object_raw, end_object = self._next_image_url(raw_html, end_object)
                if object_raw is None:
                    break
                filename = str(keyword) + "_" + str(j + 1) + ".jpg"
                try:
                    r = requests.get(object_raw, allow_redirects=True)
                    with open(os.path.join(path, filename), 'wb') as f:
                        f.write(r.content)
                    j += 1
                except Exception as e:
                    # Skip URLs that fail to download and try the next match.
                    print(e)

    def _search_url(self, keyword):
        # Build a Google Images search URL for one keyword.
        return ('https://www.google.com/search?q=' + quote(keyword.encode('utf-8'))
                + '&biw=1536&bih=674&tbm=isch&sxsrf=ACYBGNSXXpS6YmAKUiLKKBs6xWb4uUY5gA:1581168823770'
                  '&source=lnms&sa=X&ved=0ahUKEwioj8jwiMLnAhW9AhAIHbXTBMMQ_AUI3QUoAQ')

    def _next_image_url(self, raw_html, end_object):
        # Scan forward from `end_object` for the next quoted https:// link
        # that looks like an image file; return (url, new scan position).
        while True:
            new_line = raw_html.find('"https://', end_object + 1)
            if new_line == -1:
                # No more candidate links in the page source.
                return None, end_object
            end_object = raw_html.find('"', new_line + 1)

            # A backslash inside the quotes marks an escape sequence; cut there.
            buffor = raw_html.find('\\', new_line + 1, end_object)
            if buffor != -1:
                object_raw = raw_html[new_line + 1:buffor]
            else:
                object_raw = raw_html[new_line + 1:end_object]

            if any(ext in object_raw for ext in IMAGE_EXTENSIONS):
                return object_raw, end_object

    def _create_directories(self, main_directory, name):
        # Create the main directory and a per-keyword sub-directory if missing.
        try:
            if not os.path.exists(main_directory):
                os.makedirs(main_directory)
                time.sleep(0.2)
            sub_directory = os.path.join(main_directory, name)
            if not os.path.exists(sub_directory):
                os.makedirs(sub_directory)
        except OSError as e:
            if e.errno != 17:  # 17 == EEXIST: directory already exists
                raise
        return

    def _download_page(self, url):
        # Fetch the results page with a browser User-Agent so Google serves
        # the full HTML rather than a stripped-down version.
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                                     '(KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'}
            req = urllib.request.Request(url, headers=headers)
            resp = urllib.request.urlopen(req)
            return str(resp.read())
        except Exception as e:
            print(e)
            exit(1)  # bail out on network failure
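
For context, a minimal usage sketch of the new module (assuming scrapper.py is on the import path; the class name and method signatures are taken from the diff above):

# Minimal usage sketch, assuming scrapper.py is importable.
from scrapper import simple_image_download

response = simple_image_download()

# Fetch up to 5 image URLs per comma-separated keyword.
print(response.urls('cats, dogs', 5))

# Save up to 5 images per keyword under simple_images/<keyword>/.
response.download('cats, dogs', 5)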

