
I'm currently scraping the Bureau of Labor Statistics website and would like some feedback on my approach. I'm mainly focusing on the occupation groups table on the left of the page and the content inside those links. It would be great if you could tell me how my code could be improved.

The code below is my table scraper, which scrapes the tables on each page.

class TableScraper(object):
    def __init__(self, html, classIdentifier, idName, linkFileName=None, dataFileName=None):
        # run BeautifulSoup on the html
        try:
            self.soup = BeautifulSoup(html, 'html.parser')
        except:
            self.soup = html
        self.requests_objects = []
        # class or id of the table
        self.classIdentifier = classIdentifier
        # class or id name of the table
        self.idName = idName
        # file names
        self.linkFileName = linkFileName
        self.dataFileName = dataFileName

    def scrapeHeader(self, classIdentifier, idName):
        # scrape the header of the table
        # find table
        table = self.soup.find('table', attrs={''+classIdentifier+'' : ''+idName+''})
        if table is None:
            table = self.soup
        else:
            pass
        header = ''
        try:
            # find header
            header = table.find('thead').find('tr')
        except:
            header = table.find('tbody').find('tr')
        # loop through each row
        count = 0
        # list to hold headers
        header_list = []
        # get the headers
        for head in header:
            try:
                title = ''
                try:
                    # get title of header
                    title = head.text.encode('utf-8')
                except:
                    # move on to the next loop if we can't find the title
                    continue
                # set length of array
                colspan = 0
                # try to find length
                try:
                    colspan = int(head.get('colspan'))
                except:
                    # if we can't find it, set it to one
                    colspan = 1
                # set an array
                array = []
                for header in header_list:
                    array.append(header.requests_objects)
                header = TableHeader(colspan, title, count, array)
                self.requests_objects.append(header.findIndexes())
                count += 1
                print header.requests_objects
                header_list.append(header)
            except Exception as e:
                print e
        return header_list

    def init_list_of_objects(self, size):
        list_of_objects = list()
        for i in range(0, size):
            list_of_objects.append(list())  # different object reference each time
        return list_of_objects

    def scrapeContent(self, header_list, classIdentifier, idName):
        # find table
        table = self.soup.find('table', attrs={''+classIdentifier+'' : ''+idName+''})
        if table is None:
            table = self.soup
        else:
            pass
        # scrape the contents of the header
        contents = table.find('tbody')
        rows = contents.find_all('tr')
        num_rows = 0
        # return array
        return_array = self.init_list_of_objects(len(self.requests_objects))
        for row in rows:
            num_rows += 1
            # array to store all of our data
            children = []
            # for item in row:
            items = row.find_all('td')
            print items[0].nextSibling
            # loop until we find an element with text
            for item in items:
                print str(len(items)) + "this"
                nextNode = item
                while True:
                    # get the next node
                    nextNode = nextNode.findNext()
                    try:
                        # try getting a text attribute
                        nextNode.text
                    except AttributeError:
                        # if there is an error
                        pass
                    else:
                        # if we found the text
                        children.append(nextNode)
                        break
            print len(children)
            # set counts
            print str(children) + 'sparta'
            print str(num_rows) + "rows"
            count = 0
            # after appending them to children we add to return array
            for num_array in self.requests_objects:
                append_array = []
                for num in num_array:
                    print str(num) + 'num'
                    try:
                        append_array.append(str(children[num].text.encode('utf-8')).strip())
                    except Exception as e:
                        print e
                        print 'we could not fit it in header_list'
                return_array[count].append(append_array)
                count += 1
        print return_array
        return return_array

    def combineArrays(self, arrays):
        print "incombinearrays"
        # create a variable of the length of the arrays
        length_of_all_arrays = 0
        for array in arrays:
            for array2 in arrays:
                length_of_all_arrays = len(array.children)
                len(array.children) == len(array2.children)
                print("okay")
        # set an empty array of slots for future functions
        occupations = self.init_list_of_objects(length_of_all_arrays)  # [None] * len(header_list[0].children) # Create list of 100 'None's
        print str(len(occupations)) + " length"
        # check if we have the same amount
        print str(len(arrays)) + ' len of arrays'
        for array in arrays:
            count = 0
            print len(array.children)
            for child in array.children:
                print str(count) + "count"
                child_index = array.children.index(child)
                print str(child_index) + 'index'
                occupations[count].append(child)
                count += 1
        for array in arrays:
            print str(array.children) + array.title
            print len(array.children)
        print len(occupations)
        for occupation in occupations:
            print occupation
        # print str(occupations[9]) + 'hi'
        return arrays, occupations

    def getLinks(self, classIdentifier, idName):
        # find table
        table = self.soup.find('table', attrs={''+classIdentifier+'' : ''+idName+''})
        if table is None:
            table = self.soup
        else:
            pass
        # scrape the contents of the header
        contents = table.find('tbody')
        link_header = contents.find_all('h4')
        # list of all the occupations
        occupation_links = []
        for header in link_header:
            # get the a element which contains the link
            atag = header.find('a')
            link = atag['href']
            # get the title
            title = atag.text
            # create the blslink object
            blslink = BLSLink(self.url, link)
            # add title
            blslink.addChild(title)
            # append object to the array
            occupation_links.append(blslink)
        return occupation_links

    def jsonData(self, header_list=None, occupations=None):
        json_occupations_data = []
        json_links_data = []
        # write it to a json file
        for occupation in occupations:
            json_array = []
            for header in header_list:
                json_data = {
                    header.title : occupation[header_list.index(header)]
                }
                json_array.append(json_data)
            json_occupations_data.append(json_array)
        # write links to a json file
        links = self.getLinks(self.classIdentifier, self.idName)
        for link in links:
            json_links_data.append(link.createjson())
        print json_occupations_data
        return json_occupations_data, json_links_data

    def writeToJSON(self, array):
        for data in array:
            filename = ''+data.file+''
            f = open(filename, "w")
            jsonstuff = json.dumps(data.data, indent=4)
            f.write(jsonstuff)
        # # write it in json file
        # filename = ''+dataFileName+''
        # f = open(filename, "w")
        # jsonstuff = json.dumps(json_occupations_data, indent=4)
        # f.write(jsonstuff)
        # filename = ''+linkFileName+''
        # f = open(filename, "w")
        # json_data = json.dumps(json_links_data, indent=4)
        # f.write(json_data)

    def scrape(self):
        headers = self.scrapeHeader(self.classIdentifier, self.idName)
        contents = self.scrapeContent(headers, self.classIdentifier, self.idName)
        count = 0
        for content in contents:
            headers[count].addChild(content)
            count += 1
        header_list, occupations = self.combineArrays(headers)
        json_occupations_data, json_links_data = self.jsonData(header_list, occupations)
        BLSData = namedtuple('BLSData', 'data file')
        content1 = BLSData(json_occupations_data, self.dataFileName)
        print str(json_occupations_data) + "hi and stuff"
        content2 = BLSData(json_links_data, self.linkFileName)
        return [content1, content2]

The code below calls the class. The links file contains the links that lead to the careers in each occupation group.

# make program look like a browser, user_agent
user_agent = 'Mozilla/5 (Solaris 10) Gecko'
headers = { 'User-Agent' : user_agent }
# search keys
search_urls = []
# get json file name 
jsonfilename = "links.json"
# open json file as var json_data
with open(jsonfilename) as json_data:
    # store it in variable d
    d = json.load(json_data)
    # get second object
    for link in d:
        for child in link:
            title = link[child]
            for url in title:
                blslink = title[url]
                search_urls.append(blslink)
# set the url we want to scrape
search_url = search_urls[9]
# get webdriver and call phantomjs
driver = webdriver.PhantomJS()
driver.get(''+search_url+'')
# waiting for the page to load
wait = WebDriverWait(driver, 10)
# find an element
wait.until(EC.visibility_of_element_located((By.ID, "wrapper-outer")))
link = driver.find_element_by_css_selector(".sorting")
# simulate a click on the button
link.click()
# get the page source of the website
html = driver.page_source

I have another piece of code which scrapes the final page you reach when you click an occupation in the table of careers.

# scrape occupations
# make program look like a browser, user_agent
user_agent = 'Mozilla/5 (Solaris 10) Gecko'
headers = { 'User-Agent' : user_agent }
# search keys
search_urls = []
# get json file name 
jsonfilename = "occupationlinks.json"
# open json file as var json_data
with open(jsonfilename) as json_data:
    # store it in variable d
    d = json.load(json_data)
    # get second object
    for link in d:
        for child in link:
            title = link[child]
            for url in title:
                blslink = title[url]
                search_urls.append(blslink)
search_url = search_urls[1]
page = urllib.urlopen(search_url)
soup = BeautifulSoup(page.read(), 'html.parser')
contents = soup.find('div', attrs={'id' : 'panes'})
occupation_info = []
for content in contents:
    article = content.find("article")
    try:
        for items in article:
            continue
    except:
        continue
    for items in article:
        # to keep track if we added a class or not
        addedContainer = False
        try:
            # print items
            # loop all types of headers
            for i in range(1, 7):
                # if node is a h tag
                if items.name == 'h'+str(i):
                    # make a new blscontainer object
                    title = items.text
                    # give it a new title
                    new_container = BLSContent(title)
                    # add the new container to the temp container
                    occupation_info.append(new_container)
                    # append the text of the title and break the loop
                    addedContainer = True
                    break
            # if it's content, add it
            if addedContainer == False:
                # if the element is a table
                if items.name == 'table':
                    print items.get('class')[0]
                    table_scraper = TableScraper(search_url, items, 'class', items.get('class')[0], linkFileName=None, dataFileName=None)
                    scraped_data = table_scraper.scrape()
                    print str(scraped_data[0].data) + "asdf"
                    occupation_info[-1].addChild(scraped_data[0].data)
                    break
                # get last appended container and add to it
                occupation_info[-1].addChild(items.text)
        except Exception as e:
            print str(e) + "error is"
print occupation_info[7].children
jsonstuff = []
for info in occupation_info:
    json_data = {
        info.title : info.children
    }
    jsonstuff.append(json_data)
json_data = json.dumps(jsonstuff, indent=4)
filename = "info.json"
f = open(filename, 'w')
f.write(json_data)

1 Answer

Error handling?

This doesn't make a lot of sense to me:

try:
    self.soup = BeautifulSoup(html, 'html.parser')
except:
    self.soup = html

This looks like the code expects the html parameter to be either of two things:

  • An HTML document that can be parsed by BeautifulSoup
  • A BeautifulSoup instance

I find this a confusing API. It may seem like nice "magic" that the code will work with whatever you throw at it, but it would be cleaner and better to have a different API to handle the different kinds of inputs. The API that handles html can internally call the other one that handles BeautifulSoup instances.
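For instance (just a sketch, keeping your attribute names and assuming the rest of the class stays unchanged), the constructor could always receive a BeautifulSoup object, and a classmethod could take care of raw HTML:

class TableScraper(object):
    def __init__(self, soup, classIdentifier, idName, linkFileName=None, dataFileName=None):
        # the constructor always receives an already parsed tree
        self.soup = soup
        self.requests_objects = []
        self.classIdentifier = classIdentifier
        self.idName = idName
        self.linkFileName = linkFileName
        self.dataFileName = dataFileName

    @classmethod
    def from_html(cls, html, classIdentifier, idName, **kwargs):
        # callers holding a raw HTML string go through this entry point
        return cls(BeautifulSoup(html, 'html.parser'), classIdentifier, idName, **kwargs)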

String conversion

I don't understand the purpose of the ''+...+'' here:

table = self.soup.find('table', attrs={''+classIdentifier+'' : ''+idName+''})
if table is None:
    table = self.soup
else:
    pass

If you want to convert foo to a string, use str(foo), for example str(classIdentifier).

If classIdentifier is already a string, then prepending an empty string and appending an empty string is completely pointless.

Also, the else: pass is pointless; it's better to omit it.
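
With both of those fixes applied, the lookup would shrink to something like:

table = self.soup.find('table', attrs={classIdentifier: idName})
if table is None:
    table = self.soup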

Use list comprehensions

This is a perfect candidate for using list comprehensions:

# set an array
array = []
for header in header_list:
    array.append(header.requests_objects)

Like this:

headers = [header.requests_objects for header in header_list]

I also renamed the variables, because there are no arrays in Python.

Another example:

list_of_objects = list()
for i in range(0,size):
    list_of_objects.append( list() ) # different object reference each time
return list_of_objects

Using a list comprehension:

# different object reference each time
return [list() for _ in range(size)]

Strange code

This looks strange, and I doubt it works as intended:

length_of_all_arrays = 0
for array in arrays:
    for array2 in arrays:
        length_of_all_arrays = len(array.children)
        len(array.children) == len(array2.children)
        print("okay")

For one thing, the statement len(array.children) == len(array2.children) is pointless. For another, length_of_all_arrays is overwritten for each value in arrays. This would make more sense, but it's hard to tell what your real intention was here:

length_of_all_arrays = [len(a.children) for a in arrays]
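
If the intent was to verify that every column has the same number of rows before combining them, an explicit check (a guess at the intent, not a drop-in replacement) might look like:

lengths = [len(a.children) for a in arrays]
if len(set(lengths)) > 1:
    raise ValueError('columns have differing lengths: {}'.format(lengths))
length_of_all_arrays = lengths[0] if lengths else 0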