I've written a scraper in Python, trying to follow OOP design, that collects the app name, price, and developer name from the iTunes site. It follows links, going deeper and deeper until it hits a limit (which can be toggled within my script), since the site has millions of links to follow. It is working impeccably at the moment, and I've tried my best to make this scraper Pythonic. However, suggestions for any change that would make this crawler more robust would be highly appreciated. Thanks in advance.
from lxml import html
import requests

class app_scraper:
    start_url = "https://itunes.apple.com/us/app/candy-crush-saga/id553834731?mt=8"

    def __init__(self):
        self.links = [self.start_url]
        self.vault = []

    def crawler(self):
        for url in self.links:
            self.get_app(url)

    def get_app(self, url):
        print("Scraping now " + url)
        page = requests.get(url)
        tree = html.fromstring(page.text)
        name = tree.xpath('//h1[@itemprop="name"]/text()')[0]
        developer = tree.xpath('//div[@class="left"]/h2/text()')[0]
        price = tree.xpath('//div[@itemprop="price"]/text()')[0]
        item = processor(name, developer, price)
        self.vault.append(item)
        p_links = tree.xpath('//div[@class="lockup-info"]//li/a[@class="name"]/@href')
        for p_link in p_links:
            if not len(self.links) >= 5:  # stop queueing new links once the limit (5) is reached
                self.links += [p_link]

class processor:
    def __init__(self, name, developer, price):
        self.name = name
        self.developer = developer
        self.price = price

    def __str__(self):
        return (self.name + " " + self.price + " " + self.developer)

crawl = app_scraper()
crawl.crawler()

for info in crawl.vault:
    print(info)
1 Answer
Good job improving the modularity of your code over successive reviews.
We can improve the following:
- follow Python's naming conventions and use CamelCase names for your classes
- reuse a single `requests.Session` instance, as usual, so that the underlying TCP connection is reused across requests
- use `findtext()` to get element texts
- use a `namedtuple` for your "processor"s, or use `__slots__` (a minimal `namedtuple` sketch follows this list)
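If you'd rather go the `namedtuple` route, here is a minimal sketch (the `App` name and the field values are illustrative, not taken from your code); you would construct these in `get_app()` in place of the `Processor(...)` call:

from collections import namedtuple

# A namedtuple gives you an immutable record with a free __repr__,
# access by field name, and tuple behaviour - no boilerplate class needed.
App = namedtuple('App', ['name', 'developer', 'price'])

item = App(name='Candy Crush Saga', developer='King', price='Free')
print(item.name, item.price, item.developer)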
Improved code:
from lxml import html
import requests

class AppScraper:
    start_url = "https://itunes.apple.com/us/app/candy-crush-saga/id553834731?mt=8"

    def __init__(self):
        self.session = requests.Session()
        self.links = [self.start_url]
        self.vault = []

    def crawler(self):
        for url in self.links:
            self.get_app(url)

    def get_app(self, url):
        print("Scraping now " + url)
        page = self.session.get(url)
        tree = html.fromstring(page.text)
        name = tree.findtext('.//h1[@itemprop="name"]')
        developer = tree.findtext('.//div[@class="left"]/h2')
        price = tree.findtext('.//div[@itemprop="price"]')
        item = Processor(name, developer, price)
        self.vault.append(item)
        p_links = tree.xpath('//div[@class="lockup-info"]//li/a[@class="name"]/@href')
        for p_link in p_links:
            if len(self.links) < 5:  # stop queueing new links once the limit (5) is reached
                self.links += [p_link]

class Processor:
    __slots__ = ['name', 'developer', 'price']

    def __init__(self, name, developer, price):
        self.name = name
        self.developer = developer
        self.price = price

    def __str__(self):
        return self.name + " " + self.price + " " + self.developer

if __name__ == '__main__':
    crawl = AppScraper()
    crawl.crawler()

    for info in crawl.vault:
        print(info)
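One further tweak, not covered above but hinted at in the question ("a limit which can be toggled within my script"): rather than hard-coding 5, the limit could be passed to the constructor. A minimal sketch, assuming the rest of AppScraper stays as above; `has_room()` is a hypothetical helper that `get_app()` would call instead of the inline `len(self.links) < 5` check:

import requests

class AppScraper:
    start_url = "https://itunes.apple.com/us/app/candy-crush-saga/id553834731?mt=8"

    def __init__(self, limit=5):
        # The crawl limit becomes a parameter instead of a magic number,
        # so callers can toggle the depth without editing the class.
        self.session = requests.Session()
        self.links = [self.start_url]
        self.vault = []
        self.limit = limit

    def has_room(self):
        # True while we may still queue more links
        return len(self.links) < self.limit

crawler = AppScraper(limit=50)
print(crawler.has_room())  # True: only the start URL is queued so far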
- A little question on this slots stuff. What is it doing here? As a newbie I'm not familiar with this. I hope you will let me know. Thanks sir. (SIM, Jul 19, 2017 at 19:38)
- @SMth80 ah, yes, good findings - on a new machine, have to adjust to the new environment :) `__slots__` should provide a boost in performance if you are going to instantiate lots of instances. Thanks! (alecxe, Jul 19, 2017 at 19:55)
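To make the comment above concrete, here is a small self-contained demonstration (class names are illustrative) of what `__slots__` actually changes: instances no longer carry a per-instance `__dict__`, which is where the memory saving (and slightly faster attribute access) comes from when you create many instances:

import sys

class WithDict:
    def __init__(self):
        self.name = "app"

class WithSlots:
    __slots__ = ('name',)
    def __init__(self):
        self.name = "app"

d, s = WithDict(), WithSlots()
print(hasattr(d, '__dict__'))     # True: each instance carries its own dict
print(hasattr(s, '__dict__'))     # False: attributes live in fixed slots
print(sys.getsizeof(d.__dict__))  # per-instance dict overhead, in bytes

# The trade-off: you can no longer attach arbitrary new attributes;
# s.price = "Free" would raise AttributeError.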