I made a simple web crawler. I know there are many better ones out there, but I thought rolling my own would be a valuable learning experience.
The problem is that I think there are some things I could improve here. I commented the code as best I could to explain what it's doing:
import re, random, requests, threading, collections, queue


class Crawler():

    def __init__(self):
        self.data = set()  # this will store our crawled urls, avoiding duplicates
        self.terminate = False  # flag to end the program
        self.lock = threading.Lock()
        self.print_queue = queue.Queue()  # this is for our prints
        self.work = collections.defaultdict(int)  # store some data about the work done (number of urls stored) by each worker

    def run(self, threads=15, limit=1000, urls=()):
        if(not urls): print('[-] Provide start urls'); return  # one of the ways ternary operator in python, ';' if continue in same line
        self.urls_max = limit  # each thread is killed when we have 1000 urls stored
        for i, url in enumerate(urls):  # placing the threads, 15 for each url by default
            for j in range(threads):
                tName = '{}.{}'.format(i+1, j+1)
                t = threading.Thread(target=self.producer, args=(url,), name=tName)
                t.start()
        t = threading.Thread(target=self.print_manager)
        t.daemon = True
        t.start()
        del t

    def wait_kill_threads(self):  # waits until all the threads are killed
        while(threading.active_count() > 2):  # main thread (this one) counts as 1 and the daemon too
            continue

    def print_manager(self):  # our daemon to print
        while True:
            msg = self.print_queue.get()
            print(msg)
            self.print_queue.task_done()

    def renew_url(self, list_urls):  # choose another url to work with
        return random.choice(list_urls)

    def worker(self):  # get the thread details
        return threading.currentThread()

    def get_work_done(self):
        self.wait_kill_threads()
        return self.work

    def get_data(self):  # our final data
        self.wait_kill_threads()
        return self.data

    def get_links(self, url):
        req = self.get_req(url)
        if(req is not None):
            html = req.text  # the html of the page crawled
            urls = re.findall('(?<=href=["\'])https?://.+?(?=["\'])', html)  # extracting a list with the urls
            return urls
        return []

    def get_req(self, url):
        if(self.terminate): return None
        with requests.Session() as s:
            try:
                headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:7.0.1) Gecko/20100101 Firefox/7.0.1'}
                return s.get(url, headers=headers, timeout=2)
            except Exception as e:
                self.print_queue.put('[-] Thread {} [-] Error getting page: {} [-] ERROR: {}'.format(self.worker().getName(), url, e))
                return self.get_req(self.renew_url(list(self.data)))

    def verify_and_add(self, url):
        with self.lock:
            len_data = len(self.data)
            self.terminate = len_data >= self.urls_max  # true if we reach the self.urls_max, 1000 by default
            if(not self.terminate):
                if(url not in self.data):
                    self.data.add(url)  # add to urls crawled
                    self.work[self.worker().getName()] += 1  # plus 1 on work done by this thread
                    response = '[+] Total stored: {: <4} [+] Thread {: >5} [+] {: <4} urls stored [+] putting: {}'
                    self.print_queue.put(response.format(len_data+1, self.worker().getName(), self.work[self.worker().getName()], url))
                    return True
            return False

    def inject_links(self, url):
        if(self.terminate): return
        urls = self.get_links(url)  # get all links from a page
        for url in urls:
            if(not self.verify_and_add(url)):  # check if url already exists on our set of urls, if false we will crawl another random one from our set
                if(self.terminate): return
                change_urls = list(set(urls) - self.data)  # elements that are in urls but not in data
                if change_urls:  # check if we have some new urls to scrape
                    return self.inject_links(self.renew_url(change_urls))  # repeat with the new url
                return self.inject_links(self.renew_url(list(self.data)))  # if we don't have any new url, get a random one from our data
        return

    def producer(self, url):
        while(not self.terminate):  # this is the trigger to stop all threads
            self.inject_links(url)
            url = self.renew_url(list(self.data))  # start with another random url from our data
            continue
        with self.lock:  # LOCK THIS BLOCK
            response = '[+] killing thread {: >5} [+] {: <4} urls stored [+] waiting for {: >3} threads to finish'
            self.print_queue.put(response.format(self.worker().getName(), self.work[self.worker().getName()], threading.active_count()-3))  # -3 because of the main thread and the daemon thread


crawler = Crawler()
target_urls = ('http://stackoverflow.com/', 'http://edition.cnn.com/', 'https://www.reddit.com/')
crawler.run(threads=15, limit=1000, urls=target_urls)
data = crawler.get_data()
print('\nyap got', len(data))
You can also add this at the end of the code to check in detail the best/worst work done:
workers = crawler.get_work_done()
work_items = workers.items()
print('Work done, total urls stored by thread: ', dict(workers))
print('yap got {} of work done (sum of all workers stored urls)'.format(sum(workers[i] for i in workers)))
best_worker, max_urls_stored = max((i for i in work_items), key=lambda x: x[1])
MVPS = [i for i in workers if workers[i] == max_urls_stored]
print('And the best workers are: {} with {} urls stored'.format(MVPS, max_urls_stored))
worst_worker, min_urls_stored = min((i for i in work_items), key=lambda x: x[1])
LVPS = [i for i in workers if workers[i] == min_urls_stored]
print('And the worst workers are: {} with {} urls stored\n'.format(LVPS, min_urls_stored))
2 Answers
1.- Yes, you can use a semicolon to put more than one statement on the same line, but:

"Compound statements (multiple statements on the same line) are generally discouraged."

    if(not urls): print('[-] Provide start urls'); return  # one of the ways ternary operator in python, ';' if continue in same line

Source: PEP-8
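For example, the same guard can be written across separate lines, which is the style PEP-8 prefers:

    if not urls:
        print('[-] Provide start urls')
        return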
2.- Wait a few milliseconds in all those while True: style loops (i.e. wait_kill_threads, print_manager, and maybe in producer too); your processor will thank you.
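For instance, wait_kill_threads could sleep between checks instead of spinning; a minimal sketch (the 0.1 s interval is an arbitrary choice, and time would be imported at the top of the file):

    import time  # next to the other imports

    def wait_kill_threads(self):  # waits until all worker threads have finished
        while threading.active_count() > 2:  # main thread and print daemon still count
            time.sleep(0.1)  # yield the CPU instead of busy-waiting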
3.- Avoid magic numbers or hardcoded numbers. On the last line of producer:

    .....threading.active_count()-3))  # -3 because of the main thread and the daemon thread

a better practice is to define a constant at the beginning of the file:

    QTY_THREAD_OFFSET = 3
    ...
    .....threading.active_count() - QTY_THREAD_OFFSET))

(maybe you can think of a better constant name, but you get the idea, right?)
The same goes for:

    while(threading.active_count() > 2):  # main thread (this one) counts as 1 and the daemon too

Using the previous constant it becomes:

    while(threading.active_count() >= QTY_THREAD_OFFSET):
4.- Regex: if you are going to use a regex multiple times, a good practice is to compile it beforehand. Instead of:

    re.findall('(?<=href=["\'])https?://.+?(?=["\'])', html)  # extracting a list with the urls

you can do this:

    URL_RE_PATTERN = '(?<=href=["\'])https?://.+?(?=["\'])'
    ...

    class Crawler():
        regex_urls = re.compile(URL_RE_PATTERN)

        def __init__(self):
            ...

        def get_links(self, url):
            ...
            urls = self.regex_urls.findall(html)

You can also use BeautifulSoup to extract the urls.
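If it helps, here is a rough sketch of get_links with the compiled pattern, plus a BeautifulSoup variant; the second one assumes the beautifulsoup4 package is installed and reuses the existing get_req, so treat it as an illustration rather than a drop-in replacement:

    class Crawler():
        regex_urls = re.compile('(?<=href=["\'])https?://.+?(?=["\'])')  # compiled once, shared by all instances

        def get_links(self, url):
            req = self.get_req(url)
            if req is None:
                return []
            return self.regex_urls.findall(req.text)  # reuse the compiled pattern

        # alternative using BeautifulSoup (from bs4 import BeautifulSoup at the top of the file):
        def get_links_bs(self, url):
            req = self.get_req(url)
            if req is None:
                return []
            soup = BeautifulSoup(req.text, 'html.parser')
            return [a['href'] for a in soup.find_all('a', href=True)
                    if a['href'].startswith(('http://', 'https://'))]  # keep only absolute http(s) links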
5.- In verify_and_add you can try to avoid nested ifs:

    if(not self.terminate):
        if(url not in self.data):
            self.data.add(url)  # ...

It could be replaced by:

    if self.terminate or url in self.data:
        return False
    self.data.add(url)
    ...
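Applied to the whole method, the early-return version could look roughly like this (a sketch that keeps the rest of the original logic unchanged):

    def verify_and_add(self, url):
        with self.lock:
            len_data = len(self.data)
            self.terminate = len_data >= self.urls_max  # true once we reach the limit
            if self.terminate or url in self.data:
                return False
            self.data.add(url)  # add to urls crawled
            name = self.worker().getName()
            self.work[name] += 1  # plus 1 on work done by this thread
            response = '[+] Total stored: {: <4} [+] Thread {: >5} [+] {: <4} urls stored [+] putting: {}'
            self.print_queue.put(response.format(len_data + 1, name, self.work[name], url))
            return True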
6.- A little detail:

    if(req is not None):

could be:

    if req:

-- Edit: as Guimo pointed out, the two conditions are not equivalent. So I would suggest changing

    def get_links(self, url):
        req = self.get_req(url)
        if(req is not None):
            html = req.text  # the html of the page crawled
            urls = re.findall('(?<=href=["\'])https?://.+?(?=["\'])', html)  # extracting a list with the urls
            return urls
        return []

to

    def get_links(self, url):
        req = self.get_req(url)
        if req is None:
            return []
        html = req.text  # the html of the page crawled
        urls = re.findall('(?<=href=["\'])https?://.+?(?=["\'])', html)  # extracting a list with the urls
        return urls

i.e. drop the parentheses and use an early return.
7.- Are you removing the visited urls from list_urls after they are visited? In renew_url you are getting a random url, but I can't see you removing it from that list.
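One way to make that explicit is to keep a separate frontier of not-yet-visited urls and remove each url as it is handed out; a rough sketch (the frontier attribute and this no-argument renew_url are hypothetical, they are not in the original code):

    # in __init__ (hypothetical attribute):
    self.frontier = set()  # urls discovered but not crawled yet

    def renew_url(self):  # hypothetical replacement: pick an unvisited url and drop it from the frontier
        with self.lock:
            if not self.frontier:
                return None  # nothing left to crawl
            return self.frontier.pop()  # pop() removes the url, so it cannot be picked twice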
8.- Wait some random seconds between requests to the same server; you are crawling websites, and most of them aren't happy with being hit that fast.
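A simple (if blunt) way to be polite is to sleep a random amount before every request; a sketch on top of the existing get_req (the 1-3 second range is just an example, and this does not track delays per server):

    import time  # at the top of the file, next to the other imports

    def get_req(self, url):
        if(self.terminate): return None
        time.sleep(random.uniform(1, 3))  # randomized delay so we don't hammer the servers
        with requests.Session() as s:
            try:
                headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:7.0.1) Gecko/20100101 Firefox/7.0.1'}
                return s.get(url, headers=headers, timeout=2)
            except Exception as e:
                self.print_queue.put('[-] Thread {} [-] Error getting page: {} [-] ERROR: {}'.format(self.worker().getName(), url, e))
                return self.get_req(self.renew_url(list(self.data)))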
That is what I see on a first pass! Hope it helps!
- About number 6, note that they are not equivalent: suppose req is an empty list (or anything that evaluates to False apart from None). The first one will evaluate to "if True" and the second one will evaluate to "if False". Related to this, I recommend looking up the __nonzero__() function. Also, no need to write parentheses; you could go with "if req is not None:". – Guimo, Feb 9, 2017 at 8:17
- @Guimo: Thanks for the feedback! You are right! – mayo, Feb 9, 2017 at 8:28
- Thanks for all the advice, nice points, and good call on the regex. – Miguel, Feb 10, 2017 at 1:40
About number 7, all urls will be distinct because of: "self.data = set() # this will store our crawled urls, avoiding duplicates"
- Is this in response to the answer by mayo? If so, it should not be posted as an answer to the original question... it would be appropriate to comment on the answer by mayo, but you will need to earn 50 reputation points first... – Mar 26, 2018 at 14:59