Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 3d4e7ff

Browse files
Add the solve_captcha method.
1 parent 178e0a1 commit 3d4e7ff

File tree

1 file changed

+37
-0
lines changed

1 file changed

+37
-0
lines changed

‎zipru_scraper/middlewares.py‎

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,40 @@ def wait_for_redirect(self, url = None, wait = 0.1, timeout=10):
5757
return self.dryscrape_session.url()
5858
logger.error(f'Maybe {self.dryscrape_session.url()} isn\'t a redirect URL?')
5959
raise Exception('Timed out on the zipru redirect page.')
60+
61+
def solve_captcha(self, img, width=1280, height=800):
    """Solve the captcha on the current page and return the session cookies.

    Renders a screenshot of the dryscrape session, crops it to the bounding
    box of the first ``<img>`` whose ``src`` contains ``captcha``, runs the
    crop through pytesseract, and submits the recognised text via the
    ``solve_string`` input and ``button_submit`` button.

    Args:
        img: Not used by this method; kept so existing callers keep working.
        width: Viewport/screenshot width in pixels.
        height: Viewport/screenshot height in pixels.

    Returns:
        Whatever ``self.bypass_threat_defense()`` returns if the page we land
        on after submitting is still a threat defense URL; otherwise a dict
        mapping cookie names to values for every session cookie whose string
        contains ``domain=zipru.to``.

    Raises:
        IndexError: If the captcha input or submit button is not on the page.
    """
    session = self.dryscrape_session

    # take a screenshot of the page
    session.set_viewport_size(width, height)
    # mkstemp creates the file atomically; the deprecated tempfile.mktemp
    # only returns a name, leaving a window for another process to claim it.
    fd, filename = tempfile.mkstemp('.png')
    os.close(fd)
    try:
        session.render(filename, width, height)

        # inject javascript to find the bounds of the captcha
        js = 'document.querySelector("img[src *= captcha]").getBoundingClientRect()'
        rect = session.eval_script(js)
        box = (int(rect['left']), int(rect['top']), int(rect['right']), int(rect['bottom']))

        # solve the captcha in the screenshot; Image.open is lazy, so the
        # file must stay on disk until the pixel data has been consumed
        with Image.open(filename) as image:
            captcha_image = image.crop(box)
            captcha = pytesseract.image_to_string(captcha_image)
    finally:
        # always remove the screenshot, even if rendering or OCR failed
        os.unlink(filename)
    logger.debug(f'Solved the Zipru captcha: "{captcha}"')

    # submit the captcha
    captcha_input = session.xpath('//input[@id = "solve_string"]')[0]
    captcha_input.set(captcha)
    button = session.xpath('//button[@id = "button_submit"]')[0]
    # remember the current URL so wait_for_redirect can tell when we leave it
    url = session.url()
    button.click()

    # try again if we are redirected to a threat defense URL
    if self.is_threat_defense_url(self.wait_for_redirect(url)):
        return self.bypass_threat_defense()

    # otherwise return the cookies as a dict
    cookies = {}
    for cookie_string in session.cookies():
        if 'domain=zipru.to' not in cookie_string:
            continue
        # split on the first '=' only: cookie values may themselves contain '='
        key, separator, value = cookie_string.split(';')[0].partition('=')
        if separator:
            cookies[key] = value
    return cookies

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /