I am making my first web scraper in Python. It works great but it runs extremely slow. The website loads in about 10ms but it only does like 1 every couple of seconds. There are about 4-6 million records I need to scrape through. Any ideas?
from bs4 import BeautifulSoup
import requests
import json
import re
import urllib
import threading
prox = {"http" : "127.0.0.1:8888", "https" : "127.0.0.1:8888"}
def GetVS(Soup):
return Soup.find('input', {'name' : '__VIEWSTATE'})['value']
def GetEV(Soup):
return Soup.find('input', {'name' : '__EVENTVALIDATION'})['value']
def GetSearch(Viewstate, Eventvalidation):
return requests.post('website',
data="__EVENTTARGET=ctl00%24cpMain%24ctl01%24rblSearchType%241&__EVENTARGUMENT=&__LASTFOCUS=&__VIEWSTATE="+urllib.quote(Viewstate, '')+"&__EVENTVALIDATION="+urllib.quote(Eventvalidation, '')+"&ctl00%24txtsearch=&ctl00%24rdoSearch=rdoSite&ctl00%24cpMain%24ctl01%24rblSearchType=PropertyID&ctl00%24cpMain%24ctl01%24txtOwner=",
verify=False,
headers={"User-Agent" : "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36", "Referer" : "https://nevadatreasurer.gov/UPSearch/", "Content-Type" : "application/x-www-form-urlencoded"})
def PropertySearch(PropertyID, Viewstate, Eventvalidation):
return requests.post('website',
data="__EVENTTARGET=&__EVENTARGUMENT=&__LASTFOCUS=&__VIEWSTATE="+urllib.quote(Viewstate, '')+"&__EVENTVALIDATION="+urllib.quote(Eventvalidation, '')+"&ctl00%24txtsearch=&ctl00%24rdoSearch=rdoSite&ctl00%24cpMain%24ctl01%24rblSearchType=PropertyID&ctl00%24cpMain%24ctl01%24txtPropertyID="+urllib.quote(PropertyID, '')+"&ctl00%24cpMain%24ctl01%24btnSearch=Click+Here+to+Search+for+Property",
verify=False,
headers={"User-Agent" : "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36", "Referer" : "https://nevadatreasurer.gov/UPSearch/", "Content-Type" : "application/x-www-form-urlencoded"})
def getAmmount(Selid, PropertyID, Viewstate, Eventvalidation):
return requests.post('website',
data="__LASTFOCUS=&__EVENTTARGET=ctl00%24cpMain%24ctl01%24grdResults&__EVENTARGUMENT=Select%24"+urllib.quote(Selid, '')+"&__VIEWSTATE="+urllib.quote(Viewstate, '')+"&__EVENTVALIDATION="+urllib.quote(Eventvalidation, '')+"&ctl00%24txtsearch=&ctl00%24rdoSearch=rdoSite&ctl00%24cpMain%24ctl01%24rblSearchType=PropertyID&ctl00%24cpMain%24ctl01%24txtPropertyID="+urllib.quote(PropertyID, ''),
verify=False,
headers={"User-Agent" : "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36", "Referer" : "https://nevadatreasurer.gov/UPSearch/", "Content-Type" : "application/x-www-form-urlencoded"})
def GetData(Html):
Soup = BeautifulSoup(Html)
return Soup.find('span', {"id" : "cpMain_ctl01_lblAmountValue"}).text
def GetPropertyID(Html):
Soup = BeautifulSoup(Html)
return Soup.find('span', {"id" : "cpMain_ctl01_lblPropertyIDValue"}).text
def GetReportYear(Html):
Soup = BeautifulSoup(Html)
return Soup.find('span', {"id" : "cpMain_ctl01_lblReportYearValue"}).text
def GetName(Html):
Soup = BeautifulSoup(Html)
return Soup.find('span', {"id" : "cpMain_ctl01_lblNameValue"}).text
def GetAddress1(Html):
Soup = BeautifulSoup(Html)
return Soup.find('span', {"id" : "cpMain_ctl01_lblAddress1Value"}).text
def GetAddress2(Html):
Soup = BeautifulSoup(Html)
return Soup.find('span', {"id" : "cpMain_ctl01_lblAddress2Value"}).text
def GetCity(Html):
Soup = BeautifulSoup(Html)
return Soup.find('span', {"id" : "cpMain_ctl01_lblCityValue"}).text
def GetState(Html):
Soup = BeautifulSoup(Html)
return Soup.find('span', {"id" : "cpMain_ctl01_lblStateValue"}).text
def GetZip(Html):
Soup = BeautifulSoup(Html)
return Soup.find('span', {"id" : "cpMain_ctl01_lblZipValue"}).text
def GetInst(Html):
Soup = BeautifulSoup(Html)
return Soup.find('span', {"id" : "cpMain_ctl01_lblInstitutionValue"}).text
def GetDesc(Html):
Soup = BeautifulSoup(Html)
return Soup.find('span', {"id" : "cpMain_ctl01_lblDescriptionValue"}).text
Request_1 = requests.get('website', verify=False)
Soup_1 = BeautifulSoup(Request_1.text)
Viewstate_1 = GetVS(Soup_1)
Eventvalidation_1 = GetEV(Soup_1)
Request_2 = GetSearch(Viewstate_1, Eventvalidation_1)
Soup_2 = BeautifulSoup(Request_2.text)
Viewstate_2 = GetVS(Soup_2)
Eventvalidation_2 = GetEV(Soup_2)
def dowork(start):
while start < 4000000:
start = start + 1
Request = PropertySearch(str(start), Viewstate_2, Eventvalidation_2)
Soup = BeautifulSoup(Request.text)
Viewstate = GetVS(Soup)
Eventvalidation = GetEV(Soup)
regex = re.compile("Over \100ドル.*__doPostBack.*Select\$(.*)\&")
r = regex.findall(Request.text)
for i in r:
print GetData(getAmmount(i, str(start), Viewstate, Eventvalidation).text)
threads = []
for i in range(50):
t = threading.Thread(target=dowork, args=(i*100000+1000000,))
threads.append(t)
t.start()
1 Answer 1
There are a couple of strange things about this piece of code:
def dowork(start): while start < 4000000: start = start + 1 # ... regex = re.compile("Over \100ドル.*__doPostBack.*Select\$(.*)\&") r = regex.findall(Request.text) for i in r: # ... threads = [] for i in range(50): t = threading.Thread(target=dowork, args=(i*100000+1000000,))
First of all, you don't need to compile the regex in every iteration, and not even in every thread. It seems this can be a global constant, compiled only once.
The threads run dowork
with a different start
parameter: 1m, 1.1m, 1.2m, ..., 5.8m, 5.9m. The smaller problem is that dowork
only runs until 4m, so threads 30~49 will do nothing. The big problem is that they all run until 4m. I think you really meant this instead:
def dowork(start0, maxcnt): counter = 0 while counter < maxcnt: counter += 1 start = str(start0 + counter) # ...
This has some other improvements as well:
counter += 1
simpler thancounter = counter + 1
- Convert
start
to string once, reuse multiple times within the function maxcnt
is a parameter instead of hardcoded 10**5, because the caller controls thestart0
parameter, and the two are closely related
Coding style
Please follow PEP8, the official Python coding style guide. Especially, snake_case
is preferred for method names, instead of CamelCase
.