I made a simple web crawler. I know there are many better ones out there, but I thought rolling my own would be a valuable learning experience.
The problem is that I think there are some things I could improve here. I commented the code as best I could to explain what it's doing:
import re, random, requests, threading, collections, queue


class Crawler():

    def __init__(self):
        self.data = set()  # this will store our crawled urls, avoiding duplicates
        self.terminate = False  # flag to end the program
        self.lock = threading.Lock()
        self.print_queue = queue.Queue()  # this is for our prints
        self.work = collections.defaultdict(int)  # store some data about the work done (number of urls stored) by each worker

    def run(self, threads=15, limit=1000, urls=()):
        if(not urls): print('[-] Provide start urls'); return  # one of the ways ternary operator in python, ';' if continue in same line
        self.urls_max = limit  # each thread is killed when we have 1000 urls stored
        for i, url in enumerate(urls):  # placing the threads, 15 for each url by default
            for j in range(threads):
                tName = '{}.{}'.format(i+1, j+1)
                t = threading.Thread(target=self.producer, args=(url,), name=tName)
                t.start()
        t = threading.Thread(target=self.print_manager)
        t.daemon = True
        t.start()
        del t

    def wait_kill_threads(self):  # waits until all the threads are killed
        while(threading.active_count() > 2):  # main thread (this one) counts as 1 and the daemon too
            continue

    def print_manager(self):  # our daemon to print
        while True:
            msg = self.print_queue.get()
            print(msg)
            self.print_queue.task_done()

    def renew_url(self, list_urls):  # choose another url to work with
        return random.choice(list_urls)

    def worker(self):  # get the thread details
        return threading.currentThread()

    def get_work_done(self):
        self.wait_kill_threads()
        return self.work

    def get_data(self):  # our final data
        self.wait_kill_threads()
        return self.data

    def get_links(self, url):
        req = self.get_req(url)
        if(req is not None):
            html = req.text  # the html of the page crawled
            urls = re.findall('(?<=href=["\'])https?://.+?(?=["\'])', html)  # extracting a list with the urls
            return urls
        return []

    def get_req(self, url):
        if(self.terminate): return None
        with requests.Session() as s:
            try:
                headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:7.0.1) Gecko/20100101 Firefox/7.0.1'}
                return s.get(url, headers=headers, timeout=2)
            except Exception as e:
                self.print_queue.put('[-] Thread {} [-] Error getting page: {} [-] ERROR: {}'.format(self.worker().getName(), url, e))
                return self.get_req(self.renew_url(list(self.data)))

    def verify_and_add(self, url):
        with self.lock:
            len_data = len(self.data)
            self.terminate = len_data >= self.urls_max  # true if we reach the self.urls_max, 1000 by default
            if(not self.terminate):
                if(url not in self.data):
                    self.data.add(url)  # add to urls crawled
                    self.work[self.worker().getName()] += 1  # plus 1 on work done by this thread
                    response = '[+] Total stored: {: <4} [+] Thread {: >5} [+] {: <4} urls stored [+] putting: {}'
                    self.print_queue.put(response.format(len_data+1, self.worker().getName(), self.work[self.worker().getName()], url))
                    return True
            return False

    def inject_links(self, url):
        if(self.terminate): return
        urls = self.get_links(url)  # get all links from a page
        for url in urls:
            if(not self.verify_and_add(url)):  # check if url already exists on our set of urls, if false we will crawl another random one from our set
                if(self.terminate): return
                change_urls = list(set(urls) - self.data)  # elements that are in urls but not in data
                if change_urls:  # check if we have some new urls to scrape
                    return self.inject_links(self.renew_url(change_urls))  # repeat with the new url
                return self.inject_links(self.renew_url(list(self.data)))  # if we don't have any new url, get a random one from our data
        return

    def producer(self, url):
        while(not self.terminate):  # this is the trigger to stop all threads
            self.inject_links(url)
            url = self.renew_url(list(self.data))  # start with another random url from our data
            continue
        with self.lock:  # LOCK THIS BLOCK
            response = '[+] killing thread {: >5} [+] {: <4} urls stored [+] waiting for {: >3} threads to finish'
            self.print_queue.put(response.format(self.worker().getName(), self.work[self.worker().getName()], threading.active_count()-3))  # -3 because of the main thread and the daemon thread


crawler = Crawler()
target_urls = ('http://stackoverflow.com/', 'http://edition.cnn.com/', 'https://www.reddit.com/')
crawler.run(threads=15, limit=1000, urls=target_urls)
data = crawler.get_data()
print('\nyap got', len(data))
You can also add this at the end of the code to check in detail the best/worst work done:
workers = crawler.get_work_done()
work_items = workers.items()
print('Work done, total urls stored by thread: ', dict(workers))
print('yap got {} of work done (sum of all workers stored urls)'.format(sum(workers[i] for i in workers)))
best_worker, max_urls_stored = max((i for i in work_items), key=lambda x: x[1])
MVPS = [i for i in workers if workers[i] == max_urls_stored]
print('And the best workers are: {} with {} urls stored'.format(MVPS, max_urls_stored))
worst_worker, min_urls_stored = min((i for i in work_items), key=lambda x: x[1])
LVPS = [i for i in workers if workers[i] == min_urls_stored]
print('And the worst workers are: {} with {} urls stored\n'.format(LVPS, min_urls_stored))
2 Answers
1.- Yes, you can use a semicolon to put more than one statement on the same line, but:

"Compound statements (multiple statements on the same line) are generally discouraged."

    if(not urls): print('[-] Provide start urls'); return  # one of the ways ternary operator in python, ';' if continue in same line

Source: PEP-8
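For example, the same guard can be written across separate lines, which is the style PEP-8 prefers:

    if not urls:
        print('[-] Provide start urls')
        return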
2.- Wait a few milliseconds in all those while True: style loops (i.e. wait_kill_threads, print_manager, and maybe in producer too); your processor will thank you.
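For instance, wait_kill_threads could sleep between checks instead of spinning; a minimal sketch (the 0.1 s interval is an arbitrary choice, and time would be imported at the top of the file):

    import time  # next to the other imports

    def wait_kill_threads(self):  # waits until all worker threads have finished
        while threading.active_count() > 2:  # main thread and print daemon still count
            time.sleep(0.1)  # yield the CPU instead of busy-waiting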
3.- Avoid magic numbers or hardcoded numbers. On the last line of producer:

    .....threading.active_count()-3))  # -3 because of the main thread and the daemon thread

a better practice is to define a constant at the beginning of the file:

    QTY_THREAD_OFFSET = 3
    ...
    .....threading.active_count() - QTY_THREAD_OFFSET))

(maybe you can think of a better constant name, but you get the idea, right?)
The same goes for:

    while(threading.active_count() > 2):  # main thread (this one) counts as 1 and the daemon too

Using the previous constant it becomes:

    while(threading.active_count() >= QTY_THREAD_OFFSET):
4.- Regex: if you are going to use a regex multiple times, a good practice is to compile it beforehand. Instead of:

    re.findall('(?<=href=["\'])https?://.+?(?=["\'])', html)  # extracting a list with the urls

you can do this:

    URL_RE_PATTERN = '(?<=href=["\'])https?://.+?(?=["\'])'
    ...

    class Crawler():
        regex_urls = re.compile(URL_RE_PATTERN)

        def __init__(self):
            ...

        def get_links(self, url):
            ...
            urls = self.regex_urls.findall(html)

You can also use BeautifulSoup to extract the urls.
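If it helps, here is a rough sketch of get_links with the compiled pattern, plus a BeautifulSoup variant; the second one assumes the beautifulsoup4 package is installed and reuses the existing get_req, so treat it as an illustration rather than a drop-in replacement:

    class Crawler():
        regex_urls = re.compile('(?<=href=["\'])https?://.+?(?=["\'])')  # compiled once, shared by all instances

        def get_links(self, url):
            req = self.get_req(url)
            if req is None:
                return []
            return self.regex_urls.findall(req.text)  # reuse the compiled pattern

        # alternative using BeautifulSoup (from bs4 import BeautifulSoup at the top of the file):
        def get_links_bs(self, url):
            req = self.get_req(url)
            if req is None:
                return []
            soup = BeautifulSoup(req.text, 'html.parser')
            return [a['href'] for a in soup.find_all('a', href=True)
                    if a['href'].startswith(('http://', 'https://'))]  # keep only absolute http(s) links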
5.- In verify_and_add you can try to avoid nested ifs:

    if(not self.terminate):
        if(url not in self.data):
            self.data.add(url)  # ...

It could be replaced by:

    if self.terminate or url in self.data:
        return False
    self.data.add(url)
    ...
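Applied to the whole method, the early-return version could look roughly like this (a sketch that keeps the rest of the original logic unchanged):

    def verify_and_add(self, url):
        with self.lock:
            len_data = len(self.data)
            self.terminate = len_data >= self.urls_max  # true once we reach the limit
            if self.terminate or url in self.data:
                return False
            self.data.add(url)  # add to urls crawled
            name = self.worker().getName()
            self.work[name] += 1  # plus 1 on work done by this thread
            response = '[+] Total stored: {: <4} [+] Thread {: >5} [+] {: <4} urls stored [+] putting: {}'
            self.print_queue.put(response.format(len_data + 1, name, self.work[name], url))
            return True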
6.- A little detail:

    if(req is not None):

could be:

    if req:

-- Edit: as Guimo pointed out, the two conditions are not equivalent. So I would suggest changing

    def get_links(self, url):
        req = self.get_req(url)
        if(req is not None):
            html = req.text  # the html of the page crawled
            urls = re.findall('(?<=href=["\'])https?://.+?(?=["\'])', html)  # extracting a list with the urls
            return urls
        return []

to

    def get_links(self, url):
        req = self.get_req(url)
        if req is None:
            return []
        html = req.text  # the html of the page crawled
        urls = re.findall('(?<=href=["\'])https?://.+?(?=["\'])', html)  # extracting a list with the urls
        return urls

i.e. drop the parentheses and use an early return.
7.- Are you removing the visited urls from list_urls after they are visited? In renew_url you are getting a random url, but I can't see you removing it from that list.
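One way to make that explicit is to keep a separate frontier of not-yet-visited urls and remove each url as it is handed out; a rough sketch (the frontier attribute and this no-argument renew_url are hypothetical, they are not in the original code):

    # in __init__ (hypothetical attribute):
    self.frontier = set()  # urls discovered but not crawled yet

    def renew_url(self):  # hypothetical replacement: pick an unvisited url and drop it from the frontier
        with self.lock:
            if not self.frontier:
                return None  # nothing left to crawl
            return self.frontier.pop()  # pop() removes the url, so it cannot be picked twice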
8.- Wait some random seconds between requests to the same server; you are crawling websites, and most of them aren't happy with being hit that fast.
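A simple (if blunt) way to be polite is to sleep a random amount before every request; a sketch on top of the existing get_req (the 1-3 second range is just an example, and this does not track delays per server):

    import time  # at the top of the file, next to the other imports

    def get_req(self, url):
        if(self.terminate): return None
        time.sleep(random.uniform(1, 3))  # randomized delay so we don't hammer the servers
        with requests.Session() as s:
            try:
                headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:7.0.1) Gecko/20100101 Firefox/7.0.1'}
                return s.get(url, headers=headers, timeout=2)
            except Exception as e:
                self.print_queue.put('[-] Thread {} [-] Error getting page: {} [-] ERROR: {}'.format(self.worker().getName(), url, e))
                return self.get_req(self.renew_url(list(self.data)))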
That is what I see on a first pass! Hope it helps!
- About number 6, note that they are not equivalent: suppose req is an empty list (or anything that evaluates to False apart from None). The first one will evaluate to "if True" and the second one will evaluate to "if False". Related to this, I recommend looking up the __nonzero__() function. Also, no need to write parentheses; you could go with "if req is not None:". – Guimo, Feb 9, 2017 at 8:17
- @Guimo: Thanks for the feedback! You are right! – mayo, Feb 9, 2017 at 8:28
- Thanks for all the advice, nice points, and good call on the regex. – Miguel, Feb 10, 2017 at 1:40
About number 7, all urls will be distinct because of: "self.data = set() # this will store our crawled urls, avoiding duplicates"
- Is this in response to the answer by mayo? If so, it should not be posted as an answer to the original question... it would be appropriate to comment on the answer by mayo, but you will need to earn 50 reputation points first... – Mar 26, 2018 at 14:59