
I've made a program that scrapes a well-known Spanish housing-ads website. The program consists of two main classes: a manager, which walks through the list of ads and collects their links, and a class that parses each ad and stores some values (such as the price or the number of bedrooms) in a database.

I have another auxiliary class, called Annon_Browser, that uses the requests module to download URLs. It rotates between different proxies and user agents to anonymize the scraping.
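
The web2 module is not included here; roughly, Annon_Browser does something like the following (the proxy and user-agent pools below are illustrative placeholders, not the real lists):

    import random
    import requests

    # Placeholder pools; the real module maintains fresh proxies and user agents.
    PROXIES = ['http://10.0.0.1:8080', 'http://10.0.0.2:8080']
    USER_AGENTS = ['Mozilla/5.0 (X11; Linux x86_64)',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64)']

    def Annon_Browser(url, timeout=10):
        """Fetch url through a random proxy with a random User-Agent; return the body."""
        proxy = random.choice(PROXIES)
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        response = requests.get(url, headers=headers,
                                proxies={'http': proxy, 'https': proxy},
                                timeout=timeout)
        response.raise_for_status()
        return response.text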

I am especially unsure about the way I have handled parallelism. While I have protected the database behind a lock to prevent race conditions, right now I am using exceptions to kill my threads, which I suspect is not best practice. I'm also afraid of a memory leak somewhere because of the abrupt termination of the threads.
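
As I understand it, the cleaner pattern is to let the thread's target function simply return, along these lines (a sketch for comparison, not what my code below does):

    import threading

    def worker(url, count):
        try:
            # WebCrawlerFotocasa parses the ad and saves it (defined below).
            WebCrawlerFotocasa(url, count)
        except Exception:
            logger.exception("[%s] Worker failed", count)
        # Returning here ends the thread cleanly; no exception needed.

    thread = threading.Thread(target=worker, args=('http://example.com/ad', 0))
    thread.start()
    thread.join()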

import logging
import random
import re
import threading
import time

import requests
from bs4 import BeautifulSoup
from sqlobject import *

import web2 as web_tools

DB_LOCK = threading.Lock()
logger = None  # configured in __main__()


class ThreadEnd(Exception):
    """Raised by worker threads to signal that they are done."""
    pass

class WebCrawlerManager(object):
    def __init__(self, create_table=False):
        """
        Connects to the SQL database. If requested, the table is created from scratch.
        :param create_table: whether a new SQL table should be created.
        """
        logger.info("[+] Crawler up. Table creation: {}".format(create_table))
        sqlhub.processConnection = connectionForURI("mysql://root:root@localhost/fotocasa")
        logger.info("[+] Connected to MySQL server")
        if create_table:
            WebCrawlerFotocasa.CasaMadrid.createTable()
            logger.info("[*] Table successfully created")
        self.main_loop()

    @staticmethod
    def get_number_pages(web):
        """
        Parses the number of pages to be read.
        :param web: [BeautifulSoup] parsed listing page.
        :return: [int] number of pages to be read.
        """
        num_pages = web.findAll('a', {"class": "sui-Pagination-link"})[-2].get_text()
        num_pages = int(num_pages.replace('.', ''))
        return num_pages
    def main_loop(self):
        processed = 0
        # Fetch and parse the main listing page.
        url_fotocasa = "http://www.fotocasa.es/es/comprar/casas/madrid-capital/todas-las-zonas/l/"
        web = web_tools.Annon_Browser(url_fotocasa)
        web = BeautifulSoup(web, 'html5lib')
        number_of_pages = self.get_number_pages(web)
        index = 1
        while index < number_of_pages:
            urls = []
            logger.info("[+] Entering main loop. Page: {}/{} Processed until now: {}".format(
                index, number_of_pages, processed))
            try:
                web = requests.get(url_fotocasa).content
            except Exception:
                logger.critical("Impossible to retrieve URLs. Going to sleep and trying later.")
                time.sleep(random.randrange(20, 30))
                continue
            web = BeautifulSoup(web, 'html5lib')
            web = web.findAll('a', {"class": "re-Card-title"})
            logger.info("[+] Urls retrieved: {}".format(len(web)))
            processed += len(web)
            for x in web:
                urls.append('http:' + x['href'])
            # One thread per ad; the WebCrawlerFotocasa constructor does all the work.
            for count, x in enumerate(urls):
                threading.Thread(target=WebCrawlerFotocasa, args=(x, count)).start()
            dormir = random.randint(5, 20)
            logger.info('[*] Page finished. Sleeping for {} seconds'.format(dormir))
            time.sleep(dormir)
            if len(web) == 0:
                # No ads found: retry the same page instead of advancing.
                index -= 1
            index += 1
            url_fotocasa = "http://www.fotocasa.es/es/comprar/casas/espana/todas-las-zonas/l/{}".format(index)

class WebCrawlerFotocasa(object):
    """Downloads the HTML for one fotocasa ad, parses it and writes it to the database."""

    LABELS = ('ad_title', 'city', 'county', 'city_zone', 'mts2', 'neighbourhood', 'postal_code', 'price',
              'price_max', 'price_min', 'property', 'bathrooms', 'create_date', 'transaction', 'zone1',
              'propertyfeature', 'oasenergeticcert', 'oasantiquity', 'oasheating', 'conservation', 'lat', 'lng',
              'rooms', 'ad_id')

    class CasaMadrid(SQLObject):
        """
        Database table definition.
        """
        ad_id = IntCol()
        ad_title = StringCol(length=300)
        city = StringCol(length=50, default=None)
        county = StringCol(length=50, default=None)
        city_zone = StringCol(length=100, default=None)
        mts2 = IntCol()
        bathrooms = IntCol()
        rooms = IntCol()
        price = StringCol(length=1000)
        lat = FloatCol(default=None)
        lng = FloatCol(default=None)
        postal_code = IntCol(default=None)
        features = StringCol(length=200, default=None)
        transaction = StringCol(length=30, default=None)
        create_date = DateCol(default=None)
        zone = StringCol(length=50, default=None)
        neighbourhood = StringCol(length=50, default=None)
        price_min = IntCol(default=None)
        price_max = IntCol(default=None)
        conservacion = StringCol(length=50, default=None)
        property1 = StringCol(length=50, default=None)
        last_date = DateCol()
    def __init__(self, url, count):
        """Downloads the html and creates a BeautifulSoup object."""
        self.DONE = False
        self.url = url
        self.count = count
        self.data = dict()
        logger.info("[{}] Into thread".format(count))
        web = False
        while not web:
            logger.info("[{}] Attempting to get url.".format(count))
            web = web_tools.Annon_Browser(url)
        logger.info("[{}] Web reached. Parsing data".format(count))
        web = BeautifulSoup(web, 'html5lib')
        self.get_data(web)
        self.write_data()
    def get_data(self, web):
        """Parses labels into dict."""
        # Ad title.
        self.data['ad_title'] = web.findAll(
            'h1', {"class": "property-title"})[0].get_text().strip()
        self.data['last_date'] = time.strftime('%Y-%m-%d')
        # Key/value pairs embedded in the page's inline JavaScript.
        re1 = re.compile(r'(\w+)\s*?:\s*?("[\w\d\s,-\\]*")')
        re2 = re.compile(r'("\w*")\s*?:\s*?("[w,-|]*")')
        residual_data = web.findAll("script", {"type": "text/javascript"})
        text_residual_data = ''
        for x in residual_data:
            text_residual_data += str(x)
        residual_data = re.findall(
            re1, text_residual_data) + re.findall(re2, text_residual_data)
        for x, y in residual_data:
            x = x.strip('"')
            if x == 'price':
                self.data[x] = '{},{};'.format(
                    str(y), time.strftime('%Y-%m-%d'))
            elif x == 'create_date':
                y = y.strip('"')
                dia, mes, año = y.split('/')
                self.data[x] = '{}-{}-{}'.format(año[:4], mes, dia)
            elif x in WebCrawlerFotocasa.LABELS:
                y = y.strip('"')
                y = y.strip(' ')
                try:
                    y = int(y)
                except ValueError:
                    pass
                if x == 'property':
                    # 'property' clashes with the builtin, so the column is property1.
                    x = 'property1'
                if y:
                    self.data[x] = y
        logger.info("[{}] Data parsed. Labels parsed: {}".format(self.count, len(self.data)))
    def write_data(self):
        try:
            WebCrawlerFotocasa.CasaMadrid(**self.data)
        except TypeError:
            logger.warning("[!{}] Lacking critical information. URL = {}".format(self.count, self.url))
            raise Exception
        if DB_LOCK.acquire(timeout=100):
            DB_LOCK.release()
            logger.info("[{}] Data saved into database".format(self.count))
        logger.info("[{}] Web data retrieved. Killing thread".format(self.count))
        raise ThreadEnd

def __main__():
    global logger
    FORMAT = '%(asctime)-15s %(message)s'
    logging.basicConfig(format=FORMAT, filename="logger2.txt")
    logger = logging.getLogger('web_crawler')
    logger.setLevel(logging.DEBUG)
    WebCrawlerManager(create_table=True)


__main__()
asked Jun 22, 2017 at 17:27

1 Answer


Let me comment on the web-scraping part specifically, without touching the threading- and memory-related issues:

Overall, I am afraid you are reinventing what Scrapy offers out of the box. Switching to it would remove a lot of the boilerplate threading and MySQL-lock code and make the project more modular, with the data processing and pipelining defined separately in clear, reusable pieces.
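
For scale, a minimal Scrapy spider covering the same listing flow could look roughly like this (the card and title selectors come from your code; the pagination selector and field names are illustrative and untested against the live site):

    import scrapy

    class FotocasaSpider(scrapy.Spider):
        name = 'fotocasa'
        start_urls = ['http://www.fotocasa.es/es/comprar/casas/madrid-capital/todas-las-zonas/l/']

        def parse(self, response):
            # Follow every ad card on the listing page.
            for href in response.css('a.re-Card-title::attr(href)').extract():
                yield response.follow(href, callback=self.parse_ad)
            # Pagination: selector is a guess, adjust to the real markup.
            next_page = response.css('a.sui-Pagination-link--next::attr(href)').extract_first()
            if next_page:
                yield response.follow(next_page, callback=self.parse)

        def parse_ad(self, response):
            # Items yielded here go through the item pipeline (e.g. a MySQL writer).
            yield {
                'url': response.url,
                'ad_title': response.css('h1.property-title::text').extract_first(default='').strip(),
            }

Concurrency, politeness and retries are then a matter of settings (e.g. DOWNLOAD_DELAY, AUTOTHROTTLE_ENABLED), proxy and user-agent rotation goes into downloader middlewares, and the database write into an item pipeline — no manual threads or locks needed.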

answered Jun 22, 2017 at 17:45
