
Commit f953f4a

[chore]: Added scrapper module
1 parent 19495cd commit f953f4a

File tree

1 file changed: +137 -0 lines changed


Google-Image-Scrapper/scrapper.py

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
import os
import time
import urllib.request
import requests
from urllib.parse import quote

IMAGE_EXTENSIONS = ('.jpg', '.png', '.ico', '.gif', '.jpeg')


class simple_image_download:
    def __init__(self):
        pass

    def urls(self, keywords, limit):
        # Collect up to `limit` image URLs for each comma-separated keyword.
        keyword_to_search = [str(item).strip() for item in keywords.split(',')]
        links = []
        for keyword in keyword_to_search:
            raw_html = self._download_page(self._search_url(keyword))
            end_object = -1
            for _ in range(limit):
                object_raw, end_object = self._next_image_url(raw_html, end_object)
                if object_raw is None:
                    break
                links.append(object_raw)
        return links

    def download(self, keywords, limit):
        # Download up to `limit` images per keyword into
        # simple_images/<keyword>/<keyword>_<n>.jpg.
        keyword_to_search = [str(item).strip() for item in keywords.split(',')]
        main_directory = "simple_images/"

        for keyword in keyword_to_search:
            self._create_directories(main_directory, keyword)
            raw_html = self._download_page(self._search_url(keyword))
            end_object = -1
            path = os.path.join(main_directory, keyword)

            j = 0
            while j < limit:
                object_raw, end_object = self._next_image_url(raw_html, end_object)
                if object_raw is None:
                    break
                filename = str(keyword) + "_" + str(j + 1) + ".jpg"
                try:
                    r = requests.get(object_raw, allow_redirects=True)
                    with open(os.path.join(path, filename), 'wb') as f:
                        f.write(r.content)
                    j += 1
                except Exception as e:
                    # Skip URLs that fail to download and try the next match.
                    print(e)

    def _search_url(self, keyword):
        # Build a Google Images search URL for one keyword.
        return ('https://www.google.com/search?q=' + quote(keyword.encode('utf-8'))
                + '&biw=1536&bih=674&tbm=isch&sxsrf=ACYBGNSXXpS6YmAKUiLKKBs6xWb4uUY5gA:1581168823770'
                  '&source=lnms&sa=X&ved=0ahUKEwioj8jwiMLnAhW9AhAIHbXTBMMQ_AUI3QUoAQ')

    def _next_image_url(self, raw_html, end_object):
        # Scan forward from `end_object` for the next quoted https:// link
        # that looks like an image file; return (url, new scan position).
        while True:
            new_line = raw_html.find('"https://', end_object + 1)
            if new_line == -1:
                # No more candidate links in the page source.
                return None, end_object
            end_object = raw_html.find('"', new_line + 1)

            # A backslash inside the quotes marks an escape sequence; cut there.
            buffor = raw_html.find('\\', new_line + 1, end_object)
            if buffor != -1:
                object_raw = raw_html[new_line + 1:buffor]
            else:
                object_raw = raw_html[new_line + 1:end_object]

            if any(ext in object_raw for ext in IMAGE_EXTENSIONS):
                return object_raw, end_object

    def _create_directories(self, main_directory, name):
        # Create the main directory and a per-keyword sub-directory if missing.
        try:
            if not os.path.exists(main_directory):
                os.makedirs(main_directory)
                time.sleep(0.2)
            sub_directory = os.path.join(main_directory, name)
            if not os.path.exists(sub_directory):
                os.makedirs(sub_directory)
        except OSError as e:
            if e.errno != 17:  # 17 == EEXIST: directory already exists
                raise
        return

    def _download_page(self, url):
        # Fetch the results page with a browser User-Agent so Google serves
        # the full HTML rather than a stripped-down version.
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                                     '(KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'}
            req = urllib.request.Request(url, headers=headers)
            resp = urllib.request.urlopen(req)
            return str(resp.read())
        except Exception as e:
            print(e)
            exit(1)  # bail out on network failure
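
For context, a minimal usage sketch of the new module (assuming scrapper.py is on the import path; the class name and method signatures are taken from the diff above):

# Minimal usage sketch, assuming scrapper.py is importable.
from scrapper import simple_image_download

response = simple_image_download()

# Fetch up to 5 image URLs per comma-separated keyword.
print(response.urls('cats, dogs', 5))

# Save up to 5 images per keyword under simple_images/<keyword>/.
response.download('cats, dogs', 5)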

