I'm currently scraping the Bureau of Labor Statistics website and would like some feedback on my approach. I'm mainly focusing on the occupation groups table on the left of the page and the content behind those links. It would be great if you could tell me how my code could be improved.
The code below is my table scraper.
class TableScraper(object):
    def __init__(self, html, classIdentifier, idName, linkFileName=None, dataFileName=None):
        # run BeautifulSoup on the html
        try:
            self.soup = BeautifulSoup(html, 'html.parser')
        except:
            self.soup = html
        self.requests_objects = []
        # class or id of the table
        self.classIdentifier = classIdentifier
        # class or id name of the table
        self.idName = idName
        # file names
        self.linkFileName = linkFileName
        self.dataFileName = dataFileName

    def scrapeHeader(self, classIdentifier, idName):
        # scrape the header of table
        # find table
        table = self.soup.find('table', attrs={''+classIdentifier+'' : ''+idName+''})
        if table is None:
            table = self.soup
        else:
            pass
        header = ''
        try:
            # find header
            header = table.find('thead').find('tr')
        except:
            header = table.find('tbody').find('tr')
        # loop through each row
        count = 0
        # list to hold headers
        header_list = []
        # get the headers
        for head in header:
            try:
                title = ''
                try:
                    # get title of header
                    title = head.text.encode('utf-8')
                except:
                    # move on to the next loop if we can't find the title
                    continue
                # set length of array
                colspan = 0
                # try to find length
                try:
                    colspan = int(head.get('colspan'))
                except:
                    # if we can find it set it to one
                    colspan = 1
                # set an array
                array = []
                for header in header_list:
                    array.append(header.requests_objects)
                header = TableHeader(colspan, title, count, array)
                self.requests_objects.append(header.findIndexes())
                count += 1
                print header.requests_objects
                header_list.append(header)
            except Exception as e:
                print e
        return header_list

    def init_list_of_objects(self, size):
        list_of_objects = list()
        for i in range(0,size):
            list_of_objects.append( list() ) # different object reference each time
        return list_of_objects

    def scrapeContent(self, header_list, classIdentifier, idName):
        # find table
        table = self.soup.find('table', attrs={''+classIdentifier+'' : ''+idName+''})
        if table is None:
            table = self.soup
        else:
            pass
        # scrape the contents of the header
        contents = table.find('tbody')
        rows = contents.find_all('tr')
        num_rows = 0
        # return array
        return_array = self.init_list_of_objects(len(self.requests_objects))
        for row in rows:
            num_rows += 1
            # array to store all of our data
            children = []
            # for item in row:
            items = row.find_all('td')
            print items[0].nextSibling
            # recursive loop to find a element with text
            for item in items:
                print str(len(items)) + "this"
                nextNode = item
                while True:
                    # get he next
                    nextNode = nextNode.findNext()
                    try:
                        # try getting a text attribute
                        nextNode.text
                    except AttributeError:
                        # if there is a error
                        pass
                    else:
                        # if we found the text
                        children.append(nextNode)
                        break
            print len(children)
            # set count s
            print str(children) + 'sparta'
            print str(num_rows) + "rows"
            count = 0
            # after appending them to children we add to return array
            for num_array in self.requests_objects:
                append_array = []
                for num in num_array:
                    print str(num) + 'num'
                    try:
                        append_array.append(str(children[num].text.encode('utf-8')).strip())
                    except Exception as e:
                        print e
                        print 'we could not fit it in header_list'
                return_array[count].append(append_array)
                count += 1
        print return_array
        return return_array

    def combineArrays(self, arrays):
        print "incombinearrays"
        # create a variable of the length of the arrays
        length_of_all_arrays = 0
        for array in arrays:
            for array2 in arrays:
                length_of_all_arrays = len(array.children)
                len(array.children) == len(array2.children)
                print("okay")
        # set an empty array of slots for future functions
        occupations = self.init_list_of_objects(length_of_all_arrays) #[None] * len(header_list[0].children) # Create list of 100 'None's
        print str(len(occupations)) + " length"
        # check if we have the same amount
        print str(len(arrays)) + ' len of arrays'
        for array in arrays:
            count = 0
            print len(array.children)
            for child in array.children:
                print str(count) + "count"
                child_index = array.children.index(child)
                print str(child_index) + 'index'
                occupations[count].append(child)
                count += 1
        for array in arrays:
            print str(array.children) + array.title
            print len(array.children)
        print len(occupations)
        for occupation in occupations:
            print occupation
        # print str(occupations[9]) + 'hi'
        return arrays, occupations

    def getLinks(self, classIdentifier, idName):
        # find table
        table = self.soup.find('table', attrs={''+classIdentifier+'' : ''+idName+''})
        if table is None:
            table = self.soup
        else:
            pass
        # scrape the contents of the header
        contents = table.find('tbody')
        link_header = contents.find_all('h4')
        # list of all the occupations
        occupation_links = []
        for header in link_header:
            # get a element which contains the link
            atag = header.find('a')
            link = atag['href']
            # get the title
            title = atag.text
            # create the blslink object
            blslink = BLSLink(self.url, link)
            # add title
            blslink.addChild(title)
            # append object to the array
            occupation_links.append(blslink)
        return occupation_links

    def jsonData(self, header_list=None, occupations=None):
        json_occupations_data = []
        json_links_data = []
        # write it to a json file
        for occupation in occupations:
            json_array = []
            for header in header_list:
                json_data = {
                    header.title : occupation[header_list.index(header)]
                }
                json_array.append(json_data)
            json_occupations_data.append(json_array)
        # write links to a json file
        links = self.getLinks(self.classIdentifier, self.idName)
        for link in links:
            json_links_data.append(link.createjson())
        print json_occupations_data
        return json_occupations_data, json_links_data

    def writeToJSON(self, array):
        for data in array:
            filename = ''+data.file+''
            f = open(filename, "w")
            jsonstuff = json.dumps(data.data, indent=4)
            f.write(jsonstuff)
        # # write it in json file
        # filename = ''+dataFileName+''
        # f = open(filename, "w")
        # jsonstuff = json.dumps(json_occupations_data, indent=4)
        # f.write(jsonstuff)
        # filename = ''+linkFileName+''
        # f = open(filename, "w")
        # json_data = json.dumps(json_links_data, indent=4)
        # f.write(json_data)

    def scrape(self):
        headers = self.scrapeHeader(self.classIdentifier, self.idName)
        contents = self.scrapeContent(headers, self.classIdentifier, self.idName)
        count = 0
        for content in contents:
            headers[count].addChild(content)
            count += 1
        header_list, occupations = self.combineArrays(headers)
        json_occupations_data, json_links_data = self.jsonData(header_list, occupations)
        BLSData = namedtuple('BLSData', 'data file')
        content1 = BLSData(json_occupations_data, self.dataFileName)
        print str(json_occupations_data) + "hi and stuff"
        content2 = BLSData(json_links_data, self.linkFileName)
        return [content1, content2]
The code below calls the class. The links file contains a set of links that lead to the careers in each occupation group.
# make program look like a browser, user_agent
user_agent = 'Mozilla/5 (Solaris 10) Gecko'
headers = { 'User-Agent' : user_agent }
# search keys
search_urls = []
# get json file name
jsonfilename = "links.json"
# open json file as var json_data
with open(jsonfilename) as json_data:
    # store it in variable d
    d = json.load(json_data)
    # get second object
    for link in d:
        for child in link:
            title = link[child]
            for url in title:
                blslink = title[url]
                search_urls.append(blslink)
# set the url we want to scrape
search_url = search_urls[9]
# get webdriver and call phantomjs
driver = webdriver.PhantomJS()
driver.get(''+search_url+'')
# waiting for the page to load
wait = WebDriverWait(driver, 10)
# find an element
wait.until(EC.visibility_of_element_located((By.ID, "wrapper-outer")))
link = driver.find_element_by_css_selector(".sorting")
# simulate a click on the button
link.click()
# get the page source of the website
html = driver.page_source
I have another piece of code that scrapes the final page you reach when you click an occupation in the table of careers.
# scrape occupations
# make program look like a browser, user_agent
user_agent = 'Mozilla/5 (Solaris 10) Gecko'
headers = { 'User-Agent' : user_agent }
# search keys
search_urls = []
# get json file name
jsonfilename = "occupationlinks.json"
# open json file as var json_data
with open(jsonfilename) as json_data:
    # store it in variable d
    d = json.load(json_data)
    # get second object
    for link in d:
        for child in link:
            title = link[child]
            for url in title:
                blslink = title[url]
                search_urls.append(blslink)
search_url = search_urls[1]
page = urllib.urlopen(search_url)
soup = BeautifulSoup(page.read(), 'html.parser')
contents = soup.find('div', attrs={'id' : 'panes'})
occupation_info = []
for content in contents:
    article = content.find("article")
    try:
        for items in article:
            continue
    except:
        continue
    for items in article:
        # to keep track if we added a class or not
        addedContainer = False
        try:
            # print items
            # loop all types of headers
            for i in range(1,7):
                # if node is a h tag
                if items.name == 'h'+str(i):
                    # make a new blscontainer object
                    title = items.text
                    # give it a new title
                    new_container = BLSContent(title)
                    # add the new container to the temp container
                    occupation_info.append(new_container)
                    # append the text of the title and break the loop
                    addedContainer = True
                    break
            # if its content add it
            if addedContainer == False:
                # if the element is a table
                if items.name == 'table':
                    print items.get('class')[0]
                    table_scraper = TableScraper(search_url, items, 'class', items.get('class')[0], linkFileName=None, dataFileName=None)
                    scraped_data = table_scraper.scrape()
                    print str(scraped_data[0].data) + "asdf"
                    occupation_info[-1].addChild(scraped_data[0].data)
                    break
                # get last appended container and add to it
                occupation_info[-1].addChild(items.text)
        except Exception as e:
            print str(e) + "error is"
print occupation_info[7].children
jsonstuff = []
for info in occupation_info:
    json_data = {
        info.title : info.children
    }
    jsonstuff.append(json_data)
json_data = json.dumps(jsonstuff, indent=4)
filename = "info.json"
f = open(filename, 'w')
f.write(json_data)
1 Answer
Error handling?
This doesn't make a lot of sense to me:
try:
    self.soup = BeautifulSoup(html, 'html.parser')
except:
    self.soup = html
This looks like the code expects the html parameter to be either of two things:
- An HTML document that can be parsed by BeautifulSoup
- A BeautifulSoup instance
I find this a confusing API. It may seem like nice "magic" that the code works with whatever you throw at it, but it would be cleaner and better to have a separate API for each kind of input. The API that handles raw HTML can internally call the one that handles BeautifulSoup instances.
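For example, you could keep __init__ for BeautifulSoup instances only and add an alternate constructor for raw HTML. This is just a sketch; from_html is an illustrative name, not something that already exists in your code:
from bs4 import BeautifulSoup

class TableScraper(object):
    def __init__(self, soup, classIdentifier, idName, linkFileName=None, dataFileName=None):
        # __init__ only ever receives an already-parsed BeautifulSoup instance
        self.soup = soup
        self.requests_objects = []
        self.classIdentifier = classIdentifier
        self.idName = idName
        self.linkFileName = linkFileName
        self.dataFileName = dataFileName

    @classmethod
    def from_html(cls, html, classIdentifier, idName, **kwargs):
        # the alternate constructor does the parsing, then delegates to __init__
        return cls(BeautifulSoup(html, 'html.parser'), classIdentifier, idName, **kwargs)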
String conversion
I don't understand the purpose of the ''+...+'' here:
table = self.soup.find('table', attrs={''+classIdentifier+'' : ''+idName+''})
if table is None:
    table = self.soup
else:
    pass
If you want to convert foo to a string, use str(foo), for example str(classIdentifier). If classIdentifier is already a string, then prepending and appending an empty string is completely pointless.
Also, the else: pass is pointless; it's better to omit it.
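With both of those removed, the lookup becomes simply:
table = self.soup.find('table', attrs={classIdentifier: idName})
if table is None:
    table = self.soup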
Use list comprehensions
This is a perfect candidate for using list comprehensions:
# set an array
array = []
for header in header_list:
    array.append(header.requests_objects)
Like this:
headers = [header.requests_objects for header in header_list]
I also renamed the variable, because there are no arrays in Python; the built-in sequence type is a list.
Another example:
list_of_objects = list()
for i in range(0,size):
    list_of_objects.append( list() ) # different object reference each time
return list_of_objects
Using a list comprehension:
# different object reference each time
return [list() for _ in range(size)]
Strange code
This looks strange, and I doubt it works as intended:
length_of_all_arrays = 0
for array in arrays:
    for array2 in arrays:
        length_of_all_arrays = len(array.children)
        len(array.children) == len(array2.children)
        print("okay")
For one thing, the statement len(array.children) == len(array2.children) is pointless: the comparison is evaluated and its result is thrown away. For another, length_of_all_arrays is overwritten on every iteration, so it ends up holding only the length of the last element of arrays.
This would make more sense, but it's hard to tell what your real intention was here:
length_of_all_arrays = [len(a.children) for a in arrays]
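If the real goal was to verify that every column has the same number of rows and then keep that common length, a sketch along these lines (only a guess at your intent) would say so explicitly:
lengths = {len(a.children) for a in arrays}
if len(lengths) != 1:
    raise ValueError("columns have differing row counts: %s" % sorted(lengths))
length_of_all_arrays = lengths.pop()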