I've written some Python code to scrape the name, address, and phone number of businesses from Yellow Pages. The scraper takes input parameters; if they are filled in properly and the resulting URL exists on Yellow Pages, it will parse the three fields I mentioned. I tried to follow OOP guidelines, and it is working flawlessly at the moment. I hope there is something I can do to improve this crawler:
import requests
from lxml import html

class YellowPage:
    @classmethod
    def Crawl(self, name, state):
        page = requests.get("https://www.yellowpages.com/search?search_terms=" + name + "&geo_location_terms=" + state).text
        tree = html.fromstring(page)
        for items in tree.xpath("//div[@class='info']"):
            name = items.findtext(".//span[@itemprop='name']")
            address = items.findtext(".//span[@class='street-address']")
            phone = items.findtext(".//div[@itemprop='telephone']")
            print(name, address, phone)

YellowPage.Crawl("pizza", "florida")
2 Answers
If you've got a single method, there is probably not much sense in having a class; use a standalone function instead. More info at Stop Writing Classes.
- If you do need a classmethod, the first argument to such a method is conventionally named cls, not self.
- Crawl should be named crawl according to the PEP 8 naming conventions.
- Use string formatting instead of string concatenation to build the final URL. Alternatively, you can use params to pass the GET parameters.
- Use if __name__ == '__main__'.
- I would return from the crawl() method instead of printing.
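To see what params buys you, here is a small sketch (with made-up search terms) that builds the request without sending it. Unlike naive string concatenation, requests URL-encodes the parameters for you:

```python
import requests

# Prepare the request without sending it, just to inspect the final URL.
# requests encodes the query string (note the space becoming '+').
prepared = requests.Request(
    "GET",
    "https://www.yellowpages.com/search",
    params={"search_terms": "pizza hut", "geo_location_terms": "new york"},
).prepare()

print(prepared.url)
```

With concatenation, the space in "pizza hut" would end up in the URL verbatim.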
Modified code:

import requests
from lxml import html

class YellowPage:
    URL_TEMPLATE = "https://www.yellowpages.com/search?search_terms={name}&geo_location_terms={state}"

    @classmethod
    def crawl(cls, name, state):
        page = requests.get(cls.URL_TEMPLATE.format(name=name, state=state)).text
        tree = html.fromstring(page)
        for items in tree.xpath("//div[@class='info']"):
            name = items.findtext(".//span[@itemprop='name']")
            address = items.findtext(".//span[@class='street-address']")
            phone = items.findtext(".//div[@itemprop='telephone']")
            yield (name, address, phone)

if __name__ == '__main__':
    for result in YellowPage.crawl("pizza", "florida"):
        print(result)
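Since crawl() now yields results instead of printing them, the caller decides how to consume them. A quick sketch with a stand-in generator (no network access; the sample data is invented):

```python
def crawl_stub():
    # Stand-in mirroring crawl()'s interface: yields (name, address, phone).
    yield ("Joe's Pizza", "123 Main St", "(555) 123-4567")
    yield ("Sal's Pizza", "456 Oak Ave", "(555) 765-4321")

# Materialize every result at once...
results = list(crawl_stub())
print(len(results))

# ...or pull them lazily, one at a time.
first = next(crawl_stub())
print(first[0])
```

Printing is just one of the things a caller can now choose to do.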
Another version (with params):

import requests
from lxml import html

class YellowPage:
    SEARCH_URL = "https://www.yellowpages.com/search"

    @classmethod
    def crawl(cls, name, state):
        page = requests.get(cls.SEARCH_URL, params={'search_terms': name, 'geo_location_terms': state}).text
        tree = html.fromstring(page)
        for items in tree.xpath("//div[@class='info']"):
            name = items.findtext(".//span[@itemprop='name']")
            address = items.findtext(".//span[@class='street-address']")
            phone = items.findtext(".//div[@itemprop='telephone']")
            yield (name, address, phone)

if __name__ == '__main__':
    for result in YellowPage.crawl("pizza", "florida"):
        print(result)
Version without a class:

import requests
from lxml import html

SEARCH_URL = "https://www.yellowpages.com/search"

def crawl(name, state):
    page = requests.get(SEARCH_URL, params={'search_terms': name, 'geo_location_terms': state}).text
    tree = html.fromstring(page)
    for items in tree.xpath("//div[@class='info']"):
        name = items.findtext(".//span[@itemprop='name']")
        address = items.findtext(".//span[@class='street-address']")
        phone = items.findtext(".//div[@itemprop='telephone']")
        yield (name, address, phone)

if __name__ == '__main__':
    for result in crawl("pizza", "florida"):
        print(result)
Alternatively, since Yellow Pages follows the Schema.org item definitions, you can simplify things considerably by using the microdata library:

import microdata
import requests

def crawl(name, state):
    page = requests.get("https://www.yellowpages.com/search", params={'search_terms': name, 'geo_location_terms': state}).text
    for item in microdata.get_items(page):
        if item.itemtype[0].string == '//schema.org/Restaurant':
            yield (item.name, item.address, item.telephone)

if __name__ == '__main__':
    for result in crawl("pizza", "florida"):
        print(result)