
The crawler searches pages for a set of keywords and saves the counts in a database:

import re
import time
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
import os
import httplib2
#import Links
#import Keywords
import MySQLdb
import peewee
from peewee import *
from datetime import datetime
import argparse
import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler('crawler.log')
fh.setLevel(logging.DEBUG)
#ch = logging.StreamHandler()
#ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
#ch.setFormatter(formatter)
#logger.addHandler(ch)
fh.setFormatter(formatter)
logger.addHandler(fh)
parser = argparse.ArgumentParser()
parser.add_argument('-l', '--url', help="The base link to be crawled", required=True)
parser.add_argument('-k', '--keywords', help="Keywords to search", required=True)
args = parser.parse_args()
keywords = (args.keywords).split(',')
mapping = dict()
mapping[args.url] = keywords
logger.info(mapping)
db = MySQLDatabase('WebSpider', user='ruut', passwd='ruut')
parsed = set()
class DATA(peewee.Model):
    # one row per (parent link, sublink, keyword) with the number of hits
    parent_link = peewee.CharField()
    sub_link = peewee.CharField()
    keyword = peewee.CharField()
    count = peewee.IntegerField()
    class Meta:
        database = db
        db_table = 'DATA'
def make_soup(s):
    # fetch the page at s and return it parsed, or None on any failure
    match = re.compile('https://|http://')
    if re.search(match, s):
        try:
            http = httplib2.Http()
            status, response = http.request(s)
            page = BeautifulSoup(response, 'lxml')
            return page
        except:
            return None
    else:
        return None
def get_list_of_urls(url):
    # collect the links on the page that point back under url
    match = re.compile('(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})')
    soup = make_soup(url)
    l = set()
    try:
        for a in soup.find_all('a'):
            try:
                if '?' not in a['href'] and re.search(match, a['href']) and re.search(re.compile(url), a['href']) and a['href'] != url:
                    l.add(str(a['href']))
            except Exception as e:
                logger.info('Exception ' + str(a) + ' has no href')
                logger.info(e)
                continue
    except Exception as e:
        logger.info('Exception ' + url + ' has no links')
        logger.info(e)
        pass
    return l
def get_all_the_urls(base, list_of_urls, depth):
    # recursively walk the links up to a fixed depth, counting keywords on each page
    logger.info(depth)
    if depth == 10:
        return
    else:
        depth = depth + 1
    for i in list_of_urls:  # scan the list of urls
        s = get_list_of_urls(i)
        get_all_the_urls(base, s, depth)
        for j in s:  # scan the sublinks
            try:
                if j in parsed:
                    continue
                soup = make_soup(j)
                logger.info('url is ' + j)
                for k in mapping[base]:  # look for keys on the webpage
                    key_count = len(soup(text=re.compile(k, re.IGNORECASE)))
                    logger.info('Key count is ' + str(key_count))
                    if key_count > 0:
                        record = DATA(parent_link=base, sub_link=j, keyword=k, count=key_count)  # i,j,k,key_count
                        record.save()
                parsed.add(j)
                logger.info('saved data successfully ' + str(key_count))
            except Exception as e:
                logger.info('Exception ' + str(e) + ' in keywords searching')
                continue
def populate_db():
    # seed the crawl with the base url and time the whole run
    k = set()
    k.add(args.url)
    temp = time.time()
    logger.info(str(datetime.now()))
    get_all_the_urls(args.url, k, 0)
    logger.info('time taken ' + str(time.time() - temp))
populate_db()
asked Dec 18, 2017 at 22:08

2 Answers

Some of the general things I would work on:

  • Split the code into separate modules logically. Currently all the code is mixed up in a single file: argument parsing, database interactions, and web-scraping code blocks all sit in one place.
  • Use consistent indentation: 4 spaces per indentation level.
  • Use descriptive variable names. Names like l, i or j are not meaningful and raise questions when reading the code (see the sketch after this list).
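For example, the scraping logic could live in its own module with descriptive names. A minimal sketch of that idea (the module name scraper.py and all the renamed variables are suggestions of mine, not from the original code):

# scraper.py - hypothetical module holding only the scraping logic
import re
import httplib2
from bs4 import BeautifulSoup

URL_SCHEME = re.compile('https?://')

def make_soup(url):
    """Fetch url and return the parsed page, or None on failure."""
    if not URL_SCHEME.match(url):
        return None
    try:
        _, response = httplib2.Http().request(url)
        return BeautifulSoup(response, 'lxml')
    except (httplib2.HttpLib2Error, OSError):
        return None

def get_sublinks(base_url):
    """Return the set of links on base_url that point below it."""
    soup = make_soup(base_url)
    sublinks = set()  # was `l`
    if soup is None:
        return sublinks
    for anchor in soup.find_all('a', href=True):  # href=True skips tags without an href
        href = anchor['href']
        if href.startswith(base_url) and href != base_url and '?' not in href:
            sublinks.add(href)
    return sublinks

With that split, the crawler module only imports get_sublinks and never has to know how pages are fetched or parsed.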

Code Style

Performance

answered Dec 18, 2017 at 22:24
  1. Follow the PEP 8 style guide for Python code; it greatly improves readability.

  2. Leave a blank line after all the imports, and organize them: standard library first, then third-party libraries like BeautifulSoup.

  3. Do not catch a general Exception; catch only the specific exception you expect (see the sketch after this list).

  4. The continue at the end of the except block is not needed, since it is already the last statement in the loop body.
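Applied to the href-extraction loop, points 3 and 4 could look like the following sketch. KeyError is what BeautifulSoup raises when a tag lacks the requested attribute; match, url, logger and the result set (renamed to links) are assumed to exist as in the original code:

for a in soup.find_all('a'):
    try:
        href = a['href']
    except KeyError:  # the only failure expected here: the tag has no href attribute
        logger.info('%s has no href', a)
        # no `continue` needed - this is already the end of the loop body
    else:
        if '?' not in href and re.search(match, href) and url in href and href != url:
            links.add(href)

The substring test url in href stands in for the original re.search(re.compile(url), a['href']), which treated the URL as a regular expression rather than a literal prefix.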

alecxe
answered Dec 19, 2017 at 1:30
