Web scraper running extremely slow

Question 1

I am making my first web scraper in Python. It works great but it runs extremely slow. The website loads in about 10ms but it only does like 1 every couple of seconds. There are about 4-6 million records I need to scrape through. Any ideas?

from bs4 import BeautifulSoup
import requests
import json
import re
import urllib
import threading
prox = {"http" : "127.0.0.1:8888", "https" : "127.0.0.1:8888"}
def GetVS(Soup):
 return Soup.find('input', {'name' : '__VIEWSTATE'})['value']
def GetEV(Soup):
 return Soup.find('input', {'name' : '__EVENTVALIDATION'})['value']
def GetSearch(Viewstate, Eventvalidation):
 return requests.post('website', 
 data="__EVENTTARGET=ctl00%24cpMain%24ctl01%24rblSearchType%241&__EVENTARGUMENT=&__LASTFOCUS=&__VIEWSTATE="+urllib.quote(Viewstate, '')+"&__EVENTVALIDATION="+urllib.quote(Eventvalidation, '')+"&ctl00%24txtsearch=&ctl00%24rdoSearch=rdoSite&ctl00%24cpMain%24ctl01%24rblSearchType=PropertyID&ctl00%24cpMain%24ctl01%24txtOwner=",
 verify=False,
 headers={"User-Agent" : "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36", "Referer" : "https://nevadatreasurer.gov/UPSearch/", "Content-Type" : "application/x-www-form-urlencoded"})
def PropertySearch(PropertyID, Viewstate, Eventvalidation):
 return requests.post('website', 
 data="__EVENTTARGET=&__EVENTARGUMENT=&__LASTFOCUS=&__VIEWSTATE="+urllib.quote(Viewstate, '')+"&__EVENTVALIDATION="+urllib.quote(Eventvalidation, '')+"&ctl00%24txtsearch=&ctl00%24rdoSearch=rdoSite&ctl00%24cpMain%24ctl01%24rblSearchType=PropertyID&ctl00%24cpMain%24ctl01%24txtPropertyID="+urllib.quote(PropertyID, '')+"&ctl00%24cpMain%24ctl01%24btnSearch=Click+Here+to+Search+for+Property", 
 verify=False,
 headers={"User-Agent" : "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36", "Referer" : "https://nevadatreasurer.gov/UPSearch/", "Content-Type" : "application/x-www-form-urlencoded"})
def getAmmount(Selid, PropertyID, Viewstate, Eventvalidation):
 return requests.post('website', 
 data="__LASTFOCUS=&__EVENTTARGET=ctl00%24cpMain%24ctl01%24grdResults&__EVENTARGUMENT=Select%24"+urllib.quote(Selid, '')+"&__VIEWSTATE="+urllib.quote(Viewstate, '')+"&__EVENTVALIDATION="+urllib.quote(Eventvalidation, '')+"&ctl00%24txtsearch=&ctl00%24rdoSearch=rdoSite&ctl00%24cpMain%24ctl01%24rblSearchType=PropertyID&ctl00%24cpMain%24ctl01%24txtPropertyID="+urllib.quote(PropertyID, ''), 
 verify=False,
 headers={"User-Agent" : "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36", "Referer" : "https://nevadatreasurer.gov/UPSearch/", "Content-Type" : "application/x-www-form-urlencoded"})
def GetData(Html):
 Soup = BeautifulSoup(Html)
 return Soup.find('span', {"id" : "cpMain_ctl01_lblAmountValue"}).text
def GetPropertyID(Html):
 Soup = BeautifulSoup(Html)
 return Soup.find('span', {"id" : "cpMain_ctl01_lblPropertyIDValue"}).text
def GetReportYear(Html):
 Soup = BeautifulSoup(Html)
 return Soup.find('span', {"id" : "cpMain_ctl01_lblReportYearValue"}).text
def GetName(Html):
 Soup = BeautifulSoup(Html)
 return Soup.find('span', {"id" : "cpMain_ctl01_lblNameValue"}).text
def GetAddress1(Html):
 Soup = BeautifulSoup(Html)
 return Soup.find('span', {"id" : "cpMain_ctl01_lblAddress1Value"}).text
def GetAddress2(Html):
 Soup = BeautifulSoup(Html)
 return Soup.find('span', {"id" : "cpMain_ctl01_lblAddress2Value"}).text
def GetCity(Html):
 Soup = BeautifulSoup(Html)
 return Soup.find('span', {"id" : "cpMain_ctl01_lblCityValue"}).text
def GetState(Html):
 Soup = BeautifulSoup(Html)
 return Soup.find('span', {"id" : "cpMain_ctl01_lblStateValue"}).text
def GetZip(Html):
 Soup = BeautifulSoup(Html)
 return Soup.find('span', {"id" : "cpMain_ctl01_lblZipValue"}).text
def GetInst(Html):
 Soup = BeautifulSoup(Html)
 return Soup.find('span', {"id" : "cpMain_ctl01_lblInstitutionValue"}).text
def GetDesc(Html):
 Soup = BeautifulSoup(Html)
 return Soup.find('span', {"id" : "cpMain_ctl01_lblDescriptionValue"}).text
Request_1 = requests.get('website', verify=False)
Soup_1 = BeautifulSoup(Request_1.text)
Viewstate_1 = GetVS(Soup_1)
Eventvalidation_1 = GetEV(Soup_1)
 Request_2 = GetSearch(Viewstate_1, Eventvalidation_1)
 Soup_2 = BeautifulSoup(Request_2.text)
 Viewstate_2 = GetVS(Soup_2)
 Eventvalidation_2 = GetEV(Soup_2)
 def dowork(start):
 while start < 4000000: 
 start = start + 1
 Request = PropertySearch(str(start), Viewstate_2, Eventvalidation_2)
 Soup = BeautifulSoup(Request.text)
 Viewstate = GetVS(Soup)
 Eventvalidation = GetEV(Soup)
 regex = re.compile("Over \100ドル.*__doPostBack.*Select\$(.*)\&")
 r = regex.findall(Request.text)
 for i in r:
 print GetData(getAmmount(i, str(start), Viewstate, Eventvalidation).text)
 threads = []
 for i in range(50):
 t = threading.Thread(target=dowork, args=(i*100000+1000000,))
 threads.append(t)
 t.start()

Question 2

There are a couple of strange things about this piece of code:

def dowork(start):
 while start < 4000000: 
 start = start + 1
 # ...
 regex = re.compile("Over \100ドル.*__doPostBack.*Select\$(.*)\&")
 r = regex.findall(Request.text)
 for i in r:
 # ...
threads = []
for i in range(50):
 t = threading.Thread(target=dowork, args=(i*100000+1000000,))

First of all, you don't need to compile the regex in every iteration, and not even in every thread. It seems this can be a global constant, compiled only once.

The threads run dowork with a different start parameter: 1m, 1.1m, 1.2m, ..., 5.8m, 5.9m. The smaller problem is that dowork only runs until 4m, so threads 30~49 will do nothing. The big problem is that they all run until 4m. I think you really meant this instead:

def dowork(start0, maxcnt):
 counter = 0
 while counter < maxcnt: 
 counter += 1
 start = str(start0 + counter)
 # ...

This has some other improvements as well:

counter += 1 simpler than counter = counter + 1
Convert start to string once, reuse multiple times within the function
maxcnt is a parameter instead of hardcoded 10**5, because the caller controls the start0 parameter, and the two are closely related

Coding style

Please follow PEP8, the official Python coding style guide. Especially, snake_case is preferred for method names, instead of CamelCase.

janos janos 113k15 gold badges154 silver badges396 bronze badges · Accepted Answer · 2014-09-06 07:13:12Z

There are a couple of strange things about this piece of code:

def dowork(start):
 while start < 4000000: 
 start = start + 1
 # ...
 regex = re.compile("Over \100ドル.*__doPostBack.*Select\$(.*)\&")
 r = regex.findall(Request.text)
 for i in r:
 # ...
threads = []
for i in range(50):
 t = threading.Thread(target=dowork, args=(i*100000+1000000,))

First of all, you don't need to compile the regex in every iteration, and not even in every thread. It seems this can be a global constant, compiled only once.

The threads run dowork with a different start parameter: 1m, 1.1m, 1.2m, ..., 5.8m, 5.9m. The smaller problem is that dowork only runs until 4m, so threads 30~49 will do nothing. The big problem is that they all run until 4m. I think you really meant this instead:

def dowork(start0, maxcnt):
 counter = 0
 while counter < maxcnt: 
 counter += 1
 start = str(start0 + counter)
 # ...

This has some other improvements as well:

counter += 1 simpler than counter = counter + 1
Convert start to string once, reuse multiple times within the function
maxcnt is a parameter instead of hardcoded 10**5, because the caller controls the start0 parameter, and the two are closely related

Coding style

Please follow PEP8, the official Python coding style guide. Especially, snake_case is preferred for method names, instead of CamelCase.

Stack Exchange Network

Web scraper running extremely slow

1 Answer 1

Coding style

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Web scraper running extremely slow

1 Answer 1

Coding style

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions