I've made a program that scrapes a well-known Spanish housing-ads website. It is built around two main classes: a Manager, which walks through the list of ads and collects their links, and a class that parses each individual ad and stores some of its values (such as the price or the number of bedrooms) in a database.

I also have an auxiliary class, called Annon_Browser, that uses the requests module to download the different URLs. It rotates between different proxies and user-agents to anonymize the crawl.

I am especially unsure about the way I have handled parallelism. While I have protected the database behind a lock to prevent race conditions, I am currently using exceptions to kill my threads, which I suspect is not best practice. I'm also afraid there is a memory leak somewhere because of the abrupt termination of the threads.
import logging
import random
import re
import requests
import threading
import time
import web2 as web_tools
from bs4 import BeautifulSoup
from sqlobject import *
DB_LOCK = threading.Lock()
logger = ''
class ThreadEnd(Exception):
pass
class WebCrawlerManager(object):
def __init__(self, create_table=False):
"""
Connects to a SQL database. If specified, table is created from scratch
:param create_table: Sets if a new SQL table is to be created.
"""
logger.info("[+] Crawler up. Table creation:{}".format(create_table))
sqlhub.processConnection = connectionForURI("mysql://root:root@localhost/fotocasa")
logger.info("[+] Connected to MySQL server")
if create_table:
WebCrawlerFotocasa.CasaMadrid.createTable()
logger.info("[*] Table successfully created")
self.main_loop()
@staticmethod
def get_number_pages(web):
"""
Parses the number of pages to be read.
:param web: [str] html to be parsed.
:return: [int] number of pages to be read.
"""
num_pages = web.findAll('a', {"class": "sui-Pagination-link"})[-2].get_text()
num_pages = int(num_pages.replace('.', ''))
return num_pages
def main_loop(self):
processed = 0
# Import and parse main web.
url_fotocasa = "http://www.fotocasa.es/es/comprar/casas/madrid-capital/todas-las-zonas/l/"
web = web_tools.Annon_Browser(url_fotocasa)
web = BeautifulSoup(web, 'html5lib')
number_of_pages = self.get_number_pages(web)
index = 1
while index < number_of_pages:
urls = []
logger.info("[+] Entering main loop. Page:{}/{} Processed until now: {}".format(index, number_of_pages, processed))
try:
web = requests.get(url_fotocasa).content
except Exception:
logger.critical("Impossible to retrieve URLs. Going to sleep and trying later.")
time.sleep(random.randrange(20, 30))
continue
web = BeautifulSoup(web, 'html5lib')
web = web.findAll('a', {"class": "re-Card-title"})
logger.info("[+] Urls retrieved: {}".format(len(web)))
processed += len(web)
for x in web:
x = 'http:' + x['href']
urls.append(x)
for count, x in enumerate(urls):
threading.Thread(target=WebCrawlerFotocasa, args=(x, count)).start()
dormir = random.randint(5, 20)
            logger.info('[*] Page finished. Sleeping for {} seconds'.format(dormir))
time.sleep(dormir)
if len(web) == 0:
index -= 1
index += 1
url_fotocasa = "http://www.fotocasa.es/es/comprar/casas/espana/todas-las-zonas/l/{}".format(index)
class WebCrawlerFotocasa(object):
"""Descarga un html correspondiente a un enlace de fotocasa. Lo parsea y lo escribe en la base de datos"""
LABELS = ('ad_title', 'city', 'county', 'city_zone', 'mts2', 'neighbourhood', 'postal_code', 'price',
'price_max', 'price_min', 'property', 'bathrooms', 'create_date', 'transaction', 'zone1',
'propertyfeature', 'oasenergeticcert', 'oasantiquity', 'oasheating', 'conservation', 'lat', 'lng',
'rooms', 'ad_id')
class CasaMadrid(SQLObject):
"""
Definición de la base de datos-
"""
ad_id = IntCol()
ad_title = StringCol(length=300)
city = StringCol(length=50, default=None)
county = StringCol(length=50, default=None)
city_zone = StringCol(length=100, default=None)
mts2 = IntCol()
bathrooms = IntCol()
rooms = IntCol()
price = StringCol(length=1000)
lat = FloatCol(default=None)
lng = FloatCol(default=None)
postal_code = IntCol(default=None)
features = StringCol(length=200, default=None)
transaction = StringCol(length=30, default=None)
create_date = DateCol(default=None)
zone = StringCol(length=50, default=None)
neighbourhood = StringCol(length=50, default=None)
price_min = IntCol(default=None)
price_max = IntCol(default=None)
conservacion = StringCol(length=50, default=None)
property1 = StringCol(length=50, default=None)
last_date = DateCol()
def __init__(self, url, count):
""" Downloads html and creates a Beautiful soup object """
self.DONE = False
self.url = url
self.count = count
self.data = dict()
logger.info("[{}] Into thread".format(count))
web = False
while not web:
web = web_tools.Annon_Browser(url)
logger.info("[{}] Attempting to get url.".format(count))
logger.info("[{}] Web reached. Parsing data".format(count))
web = BeautifulSoup(web, 'html5lib')
web.prettify()
self.get_data(web)
self.write_data()
def get_data(self, web):
"""Parses labels into dict"""
        # Ad title
self.data['ad_title'] = web.findAll(
'h1', {"class": "property-title"})[0].get_text().strip()
self.data['last_date'] = time.strftime('%Y-%m-%d')
re1 = re.compile(r'(\w+)\s*?:\s*?("[\w\d\s,-\\]*")')
re2 = re.compile(r'("\w*")\s*?:\s*?("[w,-|]*")')
residual_data = web.findAll("script", {"type": "text/javascript"})
text_residual_data = ''
for x in residual_data:
text_residual_data += str(x)
residual_data = re.findall(
re1, text_residual_data) + re.findall(re2, text_residual_data)
for x, y in residual_data:
x = x.strip('"')
if x == 'price':
self.data[x] = '{},{};'.format(
str(y), time.strftime('%Y-%m-%d'))
elif x == 'create_date':
y = y.strip('"')
dia, mes, año = y.split('/')
self.data[x] = '{}-{}-{}'.format(año[:4], mes, dia)
elif x in WebCrawlerFotocasa.LABELS:
y = y.strip('"')
y = y.strip(' ')
try:
y = int(y)
except:
pass
if x == 'property':
x = 'property1'
if y:
self.data[x] = y
logger.info("[{}] Data parsed. Labels parsed".format(self.count, len(self.data)))
return
def write_data(self):
try:
WebCrawlerFotocasa.CasaMadrid(**self.data)
except TypeError:
logger.warning("[!{}]Lacking critical information. URL = {}".format(self.count, self.url))
raise Exception
if DB_LOCK.acquire(timeout=100):
DB_LOCK.release()
logger.info("[{}] Data saved into database".format(self.count))
logger.info("[{}] Web data retrieved. Killing thread".format(self.count))
raise ThreadEnd
def __main__():
global logger
FORMAT = '%(asctime)-15s %(message)s'
logging.basicConfig(format=FORMAT, filename="logger2.txt")
logger = logging.getLogger('web_crawler')
logger.setLevel(10)
WebCrawlerManager(create_table=True)
__main__()
1 Answer
Let me comment on the web-scraping part specifically without touching the "threading" and "memory" related issues:
- switch from html5lib to lxml for faster HTML parsing
- instead of using requests.get(url_fotocasa) for all the requests in the main loop, initialize a Session and reuse it; under the hood, requests will reuse the same TCP connection, which gives a performance boost
- use the SoupStrainer class to parse only the desired parts of the document (all three points are combined in the sketch below)
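A minimal sketch of how those three suggestions fit together, reusing the listing URL and the re-Card-title class from your code (I'm assuming the class name still matches the live markup, and that lxml is installed):

import requests
from bs4 import BeautifulSoup, SoupStrainer

# One Session for the whole crawl: requests keeps the TCP connection alive between calls.
session = requests.Session()

url = "http://www.fotocasa.es/es/comprar/casas/madrid-capital/todas-las-zonas/l/"
html = session.get(url).content

# Only build a tree for the ad-link anchors instead of the whole page.
ad_links_only = SoupStrainer('a', {'class': 're-Card-title'})
soup = BeautifulSoup(html, 'lxml', parse_only=ad_links_only)

urls = ['http:' + a['href'] for a in soup.find_all('a')]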
Overall, I am afraid you are reinventing what Scrapy already offers out of the box. Switching to it would remove a lot of the boilerplate threading and MySQL-locking code and make the project more modular, with the data processing and pipelining defined separately in a clean way.
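For illustration, a stripped-down Scrapy spider for the listing pages could look roughly like this; the CSS classes are the ones from your code and the yielded fields are only an example, so treat it as a sketch rather than a drop-in replacement:

import scrapy

class FotocasaSpider(scrapy.Spider):
    name = 'fotocasa'
    start_urls = [
        'http://www.fotocasa.es/es/comprar/casas/madrid-capital/todas-las-zonas/l/',
    ]

    def parse(self, response):
        # Follow every ad on the listing page.
        for href in response.css('a.re-Card-title::attr(href)').extract():
            yield response.follow(href, callback=self.parse_ad)

        # Follow the pagination links and parse them the same way.
        for href in response.css('a.sui-Pagination-link::attr(href)').extract():
            yield response.follow(href, callback=self.parse)

    def parse_ad(self, response):
        # Yielded items go through item pipelines, e.g. one that writes to MySQL.
        yield {
            'ad_title': response.css('h1.property-title::text').extract_first(),
            'url': response.url,
        }

Proxy and user-agent rotation would then live in downloader middlewares instead of Annon_Browser, and the MySQL writes in an item pipeline, so the spider itself needs no locking or thread management.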