I've created a crawler using a class. It scrapes a particular website whose data, 249 records in total, is spread across several pages. I tried to make sure it collects all of them accurately. Here is what I did.
import requests
from lxml import html
class wiseowl:
    """Crawl paginated listing pages and collect ``(title, url)`` pairs.

    Starting from ``start_url``, each queued page is fetched once. Titles
    found on a page are appended to ``storage``; links found in its paging
    bar are appended to ``links`` so they get crawled as well.
    """

    # Site root that hrefs found in the pages are resolved against.
    BASE_URL = "http://www.wiseowl.co.uk"
    # Seconds to wait for the server; without a timeout a request to an
    # unresponsive host can block forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, start_url):
        """Set up the crawl queue with ``start_url`` as its only entry."""
        self.start_url = start_url
        self.links = [self.start_url]  # pages to crawl, in discovery order
        self.storage = []  # (title, absolute URL) tuples

    def crawl(self):
        """Fetch every queued page, including pages discovered on the way."""
        # get_link() appends to self.links while it is being walked, so
        # advance by index and re-check the length on every step.
        position = 0
        while position < len(self.links):
            self.get_link(self.links[position])
            position += 1

    def get_link(self, link):
        """Fetch ``link``, record its titles and queue its paging links.

        Title paragraphs that contain no link text or no href are skipped.
        Paging links already present in ``self.links`` are not queued again.
        """
        print('Crawling: ' + link)
        response = requests.get(link, timeout=self.REQUEST_TIMEOUT)
        tree = html.fromstring(response.text)

        for title in tree.xpath("//p[@class='woVideoListDefaultSeriesTitle']"):
            names = title.xpath(".//a/text()")
            hrefs = title.xpath(".//a/@href")
            if not names or not hrefs:
                # No usable anchor in this paragraph; nothing to record.
                continue
            self.storage.append((names[0], self._absolute(hrefs[0])))

        # Hrefs of the numbered page links and the "next" link in the pager.
        pager_hrefs = tree.xpath(
            "//div[contains(concat(' ', @class, ' '), ' woPaging ')]"
            "//*[@class='woPagingItem' or @class='woPagingNext']/@href"
        )
        for href in pager_hrefs:
            if not href:
                continue
            page_url = self._absolute(href)
            if page_url not in self.links:  # avoid crawling a page twice
                self.links.append(page_url)

    def _absolute(self, href):
        """Resolve ``href`` against ``BASE_URL``.

        Root-relative hrefs give the same result as plain concatenation;
        hrefs that are already absolute are returned unchanged instead of
        being glued onto the site root.
        """
        from urllib.parse import urljoin

        return urljoin(self.BASE_URL, href)

    def __str__(self):
        """Return the string form of the collected ``storage`` list."""
        return str(self.storage)
# Crawl the listing from its first page; crawl() follows the paging links.
crawler = wiseowl("http://www.wiseowl.co.uk/videos/")
crawler.crawl()

# Print one collected record per line.
for record in crawler.storage:
    print(record)
1 Answer 1
General
Imports should be grouped, and groups should be separated by a single blank line.[1]
Class names should use `CamelCase`.[2] There shouldn't be a blank line following the class signature.
Top-level function and class definitions should be separated by two blank lines.[3] You already correctly separate method definitions by a single blank line. :)
Assignment operators should be separated by whitespace.[4]
In `wiseowl`, no method ever needs access to `self.start_url` (the only exception being `__init__()`, of course). You might as well get rid of it.
If you want to cast an object to `str`, just pass the object to the `str` constructor: `str_obj = str(obj)`
Rewrite
I've removed redundant code and improved code style.
from lxml import html
import requests
class WiseOwl:
    """Crawl paginated listing pages and collect ``(title, url)`` pairs.

    Starting from the URL given to the constructor, each queued page is
    fetched once. Titles found on a page are appended to ``storage``; links
    found in its paging bar are appended to ``links`` so they get crawled
    as well.
    """

    # Site root that hrefs found in the pages are resolved against.
    BASE_URL = "http://www.wiseowl.co.uk"
    # Seconds to wait for the server; without a timeout a request to an
    # unresponsive host can block forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, start_url):
        """Set up the crawl queue with ``start_url`` as its only entry."""
        self.links = [start_url]  # pages to crawl, in discovery order
        self.storage = []  # (title, absolute URL) tuples

    def crawl(self):
        """Fetch every queued page, including pages discovered on the way."""
        # get_link() appends to self.links while it is being walked, so
        # advance by index and re-check the length on every step.
        position = 0
        while position < len(self.links):
            self.get_link(self.links[position])
            position += 1

    def get_link(self, link):
        """Fetch ``link``, record its titles and queue its paging links.

        Title paragraphs that contain no link text or no href are skipped.
        Paging links already present in ``self.links`` are not queued again.
        """
        print('Crawling: ' + link)
        response = requests.get(link, timeout=self.REQUEST_TIMEOUT)
        tree = html.fromstring(response.text)

        for title in tree.xpath("//p[@class='woVideoListDefaultSeriesTitle']"):
            names = title.xpath(".//a/text()")
            hrefs = title.xpath(".//a/@href")
            if not names or not hrefs:
                # No usable anchor in this paragraph; nothing to record.
                continue
            self.storage.append((names[0], self._absolute(hrefs[0])))

        # Hrefs of the numbered page links and the "next" link in the pager.
        # The literal is split with implicit concatenation so that no line
        # break ends up inside the XPath expression itself.
        pager_hrefs = tree.xpath(
            "//div[contains(concat(' ', @class, ' '), ' woPaging ')]"
            "//*[@class='woPagingItem' or @class='woPagingNext']/@href"
        )
        for href in pager_hrefs:
            if not href:
                continue
            page_url = self._absolute(href)
            if page_url not in self.links:  # avoid crawling a page twice
                self.links.append(page_url)

    def _absolute(self, href):
        """Resolve ``href`` against ``BASE_URL``.

        Root-relative hrefs give the same result as plain concatenation;
        hrefs that are already absolute are returned unchanged instead of
        being glued onto the site root.
        """
        from urllib.parse import urljoin

        return urljoin(self.BASE_URL, href)

    def __str__(self):
        """Return the string form of the collected ``storage`` list."""
        return str(self.storage)
# Crawl the listing from its first page; crawl() follows the paging links.
crawler = WiseOwl("http://www.wiseowl.co.uk/videos/")
crawler.crawl()

# Print one collected record per line.
for record in crawler.storage:
    print(record)