
The crawler searches pages for a set of keywords and saves the counts in a database:

import re
import time
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
import os
import httplib2
#import Links
#import Keywords
import MySQLdb
import peewee
from peewee import *
from datetime import datetime
import argparse
import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler('crawler.log')
fh.setLevel(logging.DEBUG)
#ch = logging.StreamHandler()
#ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
#ch.setFormatter(formatter)
#logger.addHandler(ch)
fh.setFormatter(formatter)
logger.addHandler(fh)
parser = argparse.ArgumentParser()
parser.add_argument('-l', '--url', help="The base link to be crawled", required=True)
parser.add_argument('-k', '--keywords', help="Keywords to search", required=True)
args = parser.parse_args()
keywords = (args.keywords).split(',')
mapping = dict()
mapping[args.url] = keywords
logger.info(mapping)
db = MySQLDatabase('WebSpider', user='ruut', passwd='ruut')
parsed = set()
class DATA(peewee.Model):
    # one row per (parent link, sublink, keyword) with the number of hits
    parent_link = peewee.CharField()
    sub_link = peewee.CharField()
    keyword = peewee.CharField()
    count = peewee.IntegerField()
    class Meta:
        database = db
        db_table = 'DATA'
def make_soup(s):
    # fetch the page at s and return it parsed, or None on any failure
    match = re.compile('https://|http://')
    if re.search(match, s):
        try:
            http = httplib2.Http()
            status, response = http.request(s)
            page = BeautifulSoup(response, 'lxml')
            return page
        except:
            return None
    else:
        return None
def get_list_of_urls(url):
    # collect the links on the page that point back under url
    match = re.compile('(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})')
    soup = make_soup(url)
    l = set()
    try:
        for a in soup.find_all('a'):
            try:
                if '?' not in a['href'] and re.search(match, a['href']) and re.search(re.compile(url), a['href']) and a['href'] != url:
                    l.add(str(a['href']))
            except Exception as e:
                logger.info('Exception ' + str(a) + ' has no href')
                logger.info(e)
                continue
    except Exception as e:
        logger.info('Exception ' + url + ' has no links')
        logger.info(e)
        pass
    return l
def get_all_the_urls(base, list_of_urls, depth):
    # recursively walk the links up to a fixed depth, counting keywords on each page
    logger.info(depth)
    if depth == 10:
        return
    else:
        depth = depth + 1
    for i in list_of_urls:  # scan the list of urls
        s = get_list_of_urls(i)
        get_all_the_urls(base, s, depth)
        for j in s:  # scan the sublinks
            try:
                if j in parsed:
                    continue
                soup = make_soup(j)
                logger.info('url is ' + j)
                for k in mapping[base]:  # look for keys on the webpage
                    key_count = len(soup(text=re.compile(k, re.IGNORECASE)))
                    logger.info('Key count is ' + str(key_count))
                    if key_count > 0:
                        record = DATA(parent_link=base, sub_link=j, keyword=k, count=key_count)  # i,j,k,key_count
                        record.save()
                parsed.add(j)
                logger.info('saved data successfully ' + str(key_count))
            except Exception as e:
                logger.info('Exception ' + str(e) + ' in keywords searching')
                continue
def populate_db():
    # seed the crawl with the base url and time the whole run
    k = set()
    k.add(args.url)
    temp = time.time()
    logger.info(str(datetime.now()))
    get_all_the_urls(args.url, k, 0)
    logger.info('time taken ' + str(time.time() - temp))
populate_db()
asked Dec 18, 2017 at 22:08

2 Answers

Some of the general things I would work on:

  • Split the code into separate modules logically. Currently all the code is mixed up in a single file: argument parsing, database interactions, and web-scraping code blocks all sit in one place.
  • Use consistent indentation: 4 spaces per indentation level.
  • Use descriptive variable names. Names like l, i or j are not meaningful and raise questions when reading the code (see the sketch after this list).
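For example, the scraping logic could live in its own module with descriptive names. A minimal sketch of that idea (the module name scraper.py and all the renamed variables are suggestions of mine, not from the original code):

# scraper.py - hypothetical module holding only the scraping logic
import re
import httplib2
from bs4 import BeautifulSoup

URL_SCHEME = re.compile('https?://')

def make_soup(url):
    """Fetch url and return the parsed page, or None on failure."""
    if not URL_SCHEME.match(url):
        return None
    try:
        _, response = httplib2.Http().request(url)
        return BeautifulSoup(response, 'lxml')
    except (httplib2.HttpLib2Error, OSError):
        return None

def get_sublinks(base_url):
    """Return the set of links on base_url that point below it."""
    soup = make_soup(base_url)
    sublinks = set()  # was `l`
    if soup is None:
        return sublinks
    for anchor in soup.find_all('a', href=True):  # href=True skips tags without an href
        href = anchor['href']
        if href.startswith(base_url) and href != base_url and '?' not in href:
            sublinks.add(href)
    return sublinks

With that split, the crawler module only imports get_sublinks and never has to know how pages are fetched or parsed.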

Code Style

Performance

answered Dec 18, 2017 at 22:24
  1. Follow the PEP 8 style guide for Python code; it greatly improves readability.

  2. Leave a blank line after all the imports, and organize them: standard library first, then third-party libraries like BeautifulSoup.

  3. Do not catch a general Exception; catch only the specific exception you expect (see the sketch after this list).

  4. The continue at the end of the except block is not needed, since it is already the last statement in the loop body.
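Applied to the href-extraction loop, points 3 and 4 could look like the following sketch. KeyError is what BeautifulSoup raises when a tag lacks the requested attribute; match, url, logger and the result set (renamed to links) are assumed to exist as in the original code:

for a in soup.find_all('a'):
    try:
        href = a['href']
    except KeyError:  # the only failure expected here: the tag has no href attribute
        logger.info('%s has no href', a)
        # no `continue` needed - this is already the end of the loop body
    else:
        if '?' not in href and re.search(match, href) and url in href and href != url:
            links.add(href)

The substring test url in href stands in for the original re.search(re.compile(url), a['href']), which treated the URL as a regular expression rather than a literal prefix.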

alecxe
answered Dec 19, 2017 at 1:30
