I've made a program that scrapes a well-known Spanish housing-ads website. It is built around two main classes: a Manager, which walks through the list of ads and collects their links, and a class that parses each individual ad and stores some of its values (such as the price or the number of bedrooms) in a database.

I also have an auxiliary class, called Annon_Browser, that uses the requests module to download the different URLs. It rotates between different proxies and user-agents to anonymize the crawl.

I am especially unsure about the way I have handled parallelism. While I have protected the database behind a lock to prevent race conditions, I am currently using exceptions to kill my threads, which I suspect is not best practice. I'm also afraid there is a memory leak somewhere because of the abrupt termination of the threads.
import logging
import random
import re
import requests
import threading
import time
import web2 as web_tools
from bs4 import BeautifulSoup
from sqlobject import *
DB_LOCK = threading.Lock()
logger = ''
class ThreadEnd(Exception):
pass
class WebCrawlerManager(object):
def __init__(self, create_table=False):
"""
Connects to a SQL database. If specified, table is created from scratch
:param create_table: Sets if a new SQL table is to be created.
"""
logger.info("[+] Crawler up. Table creation:{}".format(create_table))
sqlhub.processConnection = connectionForURI("mysql://root:root@localhost/fotocasa")
logger.info("[+] Connected to MySQL server")
if create_table:
WebCrawlerFotocasa.CasaMadrid.createTable()
logger.info("[*] Table successfully created")
self.main_loop()
@staticmethod
def get_number_pages(web):
"""
Parses the number of pages to be read.
:param web: [str] html to be parsed.
:return: [int] number of pages to be read.
"""
num_pages = web.findAll('a', {"class": "sui-Pagination-link"})[-2].get_text()
num_pages = int(num_pages.replace('.', ''))
return num_pages
def main_loop(self):
processed = 0
# Import and parse main web.
url_fotocasa = "http://www.fotocasa.es/es/comprar/casas/madrid-capital/todas-las-zonas/l/"
web = web_tools.Annon_Browser(url_fotocasa)
web = BeautifulSoup(web, 'html5lib')
number_of_pages = self.get_number_pages(web)
index = 1
while index < number_of_pages:
urls = []
logger.info("[+] Entering main loop. Page:{}/{} Processed until now: {}".format(index, number_of_pages, processed))
try:
web = requests.get(url_fotocasa).content
except Exception:
logger.critical("Impossible to retrieve URLs. Going to sleep and trying later.")
time.sleep(random.randrange(20, 30))
continue
web = BeautifulSoup(web, 'html5lib')
web = web.findAll('a', {"class": "re-Card-title"})
logger.info("[+] Urls retrieved: {}".format(len(web)))
processed += len(web)
for x in web:
x = 'http:' + x['href']
urls.append(x)
for count, x in enumerate(urls):
threading.Thread(target=WebCrawlerFotocasa, args=(x, count)).start()
dormir = random.randint(5, 20)
            logger.info('[*] Page finished. Sleeping for {} seconds'.format(dormir))
time.sleep(dormir)
if len(web) == 0:
index -= 1
index += 1
url_fotocasa = "http://www.fotocasa.es/es/comprar/casas/espana/todas-las-zonas/l/{}".format(index)
class WebCrawlerFotocasa(object):
"""Descarga un html correspondiente a un enlace de fotocasa. Lo parsea y lo escribe en la base de datos"""
LABELS = ('ad_title', 'city', 'county', 'city_zone', 'mts2', 'neighbourhood', 'postal_code', 'price',
'price_max', 'price_min', 'property', 'bathrooms', 'create_date', 'transaction', 'zone1',
'propertyfeature', 'oasenergeticcert', 'oasantiquity', 'oasheating', 'conservation', 'lat', 'lng',
'rooms', 'ad_id')
class CasaMadrid(SQLObject):
"""
Definición de la base de datos-
"""
ad_id = IntCol()
ad_title = StringCol(length=300)
city = StringCol(length=50, default=None)
county = StringCol(length=50, default=None)
city_zone = StringCol(length=100, default=None)
mts2 = IntCol()
bathrooms = IntCol()
rooms = IntCol()
price = StringCol(length=1000)
lat = FloatCol(default=None)
lng = FloatCol(default=None)
postal_code = IntCol(default=None)
features = StringCol(length=200, default=None)
transaction = StringCol(length=30, default=None)
create_date = DateCol(default=None)
zone = StringCol(length=50, default=None)
neighbourhood = StringCol(length=50, default=None)
price_min = IntCol(default=None)
price_max = IntCol(default=None)
conservacion = StringCol(length=50, default=None)
property1 = StringCol(length=50, default=None)
last_date = DateCol()
def __init__(self, url, count):
""" Downloads html and creates a Beautiful soup object """
self.DONE = False
self.url = url
self.count = count
self.data = dict()
logger.info("[{}] Into thread".format(count))
web = False
while not web:
web = web_tools.Annon_Browser(url)
logger.info("[{}] Attempting to get url.".format(count))
logger.info("[{}] Web reached. Parsing data".format(count))
web = BeautifulSoup(web, 'html5lib')
web.prettify()
self.get_data(web)
self.write_data()
def get_data(self, web):
"""Parses labels into dict"""
        # Ad title
self.data['ad_title'] = web.findAll(
'h1', {"class": "property-title"})[0].get_text().strip()
self.data['last_date'] = time.strftime('%Y-%m-%d')
re1 = re.compile(r'(\w+)\s*?:\s*?("[\w\d\s,-\\]*")')
re2 = re.compile(r'("\w*")\s*?:\s*?("[w,-|]*")')
residual_data = web.findAll("script", {"type": "text/javascript"})
text_residual_data = ''
for x in residual_data:
text_residual_data += str(x)
residual_data = re.findall(
re1, text_residual_data) + re.findall(re2, text_residual_data)
for x, y in residual_data:
x = x.strip('"')
if x == 'price':
self.data[x] = '{},{};'.format(
str(y), time.strftime('%Y-%m-%d'))
elif x == 'create_date':
y = y.strip('"')
dia, mes, año = y.split('/')
self.data[x] = '{}-{}-{}'.format(año[:4], mes, dia)
elif x in WebCrawlerFotocasa.LABELS:
y = y.strip('"')
y = y.strip(' ')
try:
y = int(y)
except:
pass
if x == 'property':
x = 'property1'
if y:
self.data[x] = y
logger.info("[{}] Data parsed. Labels parsed".format(self.count, len(self.data)))
return
def write_data(self):
try:
WebCrawlerFotocasa.CasaMadrid(**self.data)
except TypeError:
logger.warning("[!{}]Lacking critical information. URL = {}".format(self.count, self.url))
raise Exception
if DB_LOCK.acquire(timeout=100):
DB_LOCK.release()
logger.info("[{}] Data saved into database".format(self.count))
logger.info("[{}] Web data retrieved. Killing thread".format(self.count))
raise ThreadEnd
def __main__():
global logger
FORMAT = '%(asctime)-15s %(message)s'
logging.basicConfig(format=FORMAT, filename="logger2.txt")
logger = logging.getLogger('web_crawler')
logger.setLevel(10)
WebCrawlerManager(create_table=True)
__main__()
1 Answer
Let me comment on the web-scraping part specifically without touching the "threading" and "memory" related issues:
- switch from html5lib to lxml for faster HTML parsing
- instead of using requests.get(url_fotocasa) for all the requests in the main loop, initialize a Session and reuse it; under the hood, requests will reuse the same TCP connection, which gives a performance boost
- use the SoupStrainer class to parse only the desired parts of the document (all three points are combined in the sketch below)
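A minimal sketch of how those three suggestions fit together, reusing the listing URL and the re-Card-title class from your code (I'm assuming the class name still matches the live markup, and that lxml is installed):

import requests
from bs4 import BeautifulSoup, SoupStrainer

# One Session for the whole crawl: requests keeps the TCP connection alive between calls.
session = requests.Session()

url = "http://www.fotocasa.es/es/comprar/casas/madrid-capital/todas-las-zonas/l/"
html = session.get(url).content

# Only build a tree for the ad-link anchors instead of the whole page.
ad_links_only = SoupStrainer('a', {'class': 're-Card-title'})
soup = BeautifulSoup(html, 'lxml', parse_only=ad_links_only)

urls = ['http:' + a['href'] for a in soup.find_all('a')]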
Overall, I am afraid you are reinventing what Scrapy already offers out of the box. Switching to it would remove a lot of the boilerplate threading and MySQL-locking code and make the project more modular, with the data processing and pipelining defined separately in a clean way.
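For illustration, a stripped-down Scrapy spider for the listing pages could look roughly like this; the CSS classes are the ones from your code and the yielded fields are only an example, so treat it as a sketch rather than a drop-in replacement:

import scrapy

class FotocasaSpider(scrapy.Spider):
    name = 'fotocasa'
    start_urls = [
        'http://www.fotocasa.es/es/comprar/casas/madrid-capital/todas-las-zonas/l/',
    ]

    def parse(self, response):
        # Follow every ad on the listing page.
        for href in response.css('a.re-Card-title::attr(href)').extract():
            yield response.follow(href, callback=self.parse_ad)

        # Follow the pagination links and parse them the same way.
        for href in response.css('a.sui-Pagination-link::attr(href)').extract():
            yield response.follow(href, callback=self.parse)

    def parse_ad(self, response):
        # Yielded items go through item pipelines, e.g. one that writes to MySQL.
        yield {
            'ad_title': response.css('h1.property-title::text').extract_first(),
            'url': response.url,
        }

Proxy and user-agent rotation would then live in downloader middlewares instead of Annon_Browser, and the MySQL writes in an item pipeline, so the spider itself needs no locking or thread management.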