
I'm writing a crawler (gevent + requests + redis-py) and I've run into some problems. Does anyone have a good solution?

penkzhou · Posted on June 5, 2014

The rough idea of my crawler is this: I want to collect all of the listing URLs from certain list pages. There are many pages, so I iterate over them and scrape their content. Whenever a request for a page fails, I save that URL to a database; the next time around I pull the failed URLs back out of that database and process them again, looping like this until everything has been handled. Enough talk, here is the code (a more detailed description of the problem is in the code comments). The code has gotten a bit messy; gist: https://gist.github.com/penkzhou/a657720be302f72269ca :

# _*_ coding: utf-8 _*_
import sys
reload(sys)
sys.setdefaultencoding("utf8")
from gevent import monkey
monkey.patch_all()
import requests
import redis
import gevent
from gevent.pool import Pool
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient, ReadPreference
import json
import redis.connection
# ConnectionError used below is requests' exception class (see the traceback
# further down); Python 2 has no builtin of that name, so it must be imported
from requests.exceptions import ConnectionError
# make redis-py use gevent's cooperative socket
redis.connection.socket = gevent.socket
# JobProjectConfiguration is the project's settings object
# (its definition/import is not shown in this snippet)
mongo_connection = MongoClient(
    '%s:%d' % (
        JobProjectConfiguration.save_mongo_host,
        JobProjectConfiguration.save_mongo_port),
    read_preference=ReadPreference.SECONDARY,
    max_pool_size=10, use_greenlets=True)
mongo_db = mongo_connection.jobdigg
redis_connection = redis.ConnectionPool(
    host=JobProjectConfiguration.url_queue_redis_host,
    port=JobProjectConfiguration.url_queue_redis_port,
    db=JobProjectConfiguration.url_queue_redis_db
)
redis_proxy_pool = redis.ConnectionPool(
    host=JobProjectConfiguration.proxy_queue_redis_host,
    port=JobProjectConfiguration.proxy_queue_redis_port,
    db=JobProjectConfiguration.proxy_queue_redis_db
)
# proxy_pool is initialized as a plain list here, but getProxy() is called on it
# below; the real proxy-pool object is not shown in this snippet
proxy_pool = []
pool_num = 100
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip,deflate,sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36"
}
def WYUrlGenerator():
    print '51 Dig start : the url...'
    start = time.time()
    redis_db = redis.Redis(connection_pool=redis_connection)
    urllist = WYJobUrlYield()
    gpool = Pool(pool_num)
    for uargs in urllist:
        gpool.spawn(GenerateUrl, uargs)
    gpool.join()
    # From here on, keep pulling URLs out of the error-URL set in a loop
    # until it is empty
    length = redis_db.scard("error_url_list")
    while length > 0:
        errorlist = ErrorUrlGenerator()
        epool = Pool(pool_num)
        for url in errorlist:
            epool.spawn(GenerateUrl, url)
        epool.join()
        length = redis_db.scard("error_url_list")
    end = time.time()
    print 'dig end : the url...all spend time is %0.2f' % (end - start)
def WYJobUrlYield():
    for page in xrange(3000):
        page += 1
        url = "http://some.crawl.url with page num %s" % page
        jobitem = {
            "url": url,
            "type": "jobtype"
        }
        jobvalue = json.dumps(jobitem)
        yield jobvalue
# Pull URLs back out of the error-URL set so they can be processed again
def ErrorUrlGenerator():
    redis_db = redis.Redis(connection_pool=redis_connection)
    urllist = redis_db.smembers("error_url_list")
    for url in urllist:
        yield url
def GenerateUrl(sourcejob):
    redis_db = redis.StrictRedis(connection_pool=redis_connection)
    pipe = redis_db.pipeline()
    newitem = json.loads(sourcejob)
    url = newitem["url"]
    urltype = newitem["type"]
    try:
        ip = proxy_pool.getProxy()
        proxy = {"http": "http://" + ip["proxy"]}
        # Set a timeout here so that no single request drags on and blocks the
        # requests behind it; after 5 seconds a ConnectionError is thrown
        timeout = gevent.Timeout(5, ConnectionError)
        timeout.start()
        r = requests.get(url, headers=header, proxies=proxy)
        jobs = BeautifulSoup(r.text)
        if urltype == "urltype":
            # Grab every URL on the page and save them into a redis set
            results = jobs.findAll("a", {"class": "classname"})
            for result in results:
                url = result["href"]
                urlitem = {
                    "url": url,
                    "type": "urltype"
                }
                urlvalue = json.dumps(urlitem)
                # Save the extracted URL into the "url_list" redis set
                # (the JSON string urlvalue, not the urlitem dict)
                pipe.sadd("url_list", urlvalue)
        # Reaching this point means that if the current URL came from the error
        # set, it has now been handled, so remove it from that set here
        pipe.srem("error_url_list", sourcejob)
        pipe.execute()
    except Exception as e:
        error_name = e.__class__.__name__
        if error_name == "ConnectionError" or error_name == "ProxyError":
            # For connection or proxy errors I simply throw the URL back into the
            # dedicated error-URL set, to be pulled out and processed again later
            redis_db.sadd('error_url_list', sourcejob)
            # And this is the problem that frustrates me the most: everything else
            # works fairly well, but occasionally, while the program is running,
            # this sadd itself throws an exception. Since this branch only runs
            # after an exception (to save the URL into error_url_list for later
            # reprocessing), a failure here means a large share of the error URLs
            # never make it into the database, and in the end far too little data
            # gets crawled. The exception looks roughly like this:
# ConnectionError
# <timer at 0x36c8c80 callback=<bound method Greenlet.throw of <Greenlet at 0xc844050>> args=(<class 'requests.exceptions.ConnectionError'>,)> failed with ConnectionError
# Traceback (most recent call last):
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/greenlet.py", line 327, in run
# result = self._run(*self.args, **self.kwargs)
# File "61.py", line 147, in GenerateUrl
# redis_db.sadd('error_url_list', sourcejob)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 1248, in sadd
# return self.execute_command('SADD', name, *values)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 461, in execute_command
# return self.parse_response(connection, command_name, **options)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 471, in parse_response
# response = connection.read_response()
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 339, in read_response
# response = self._parser.read_response()
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 110, in read_response
# response = self.read()
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 103, in read
# return self._fp.readline()[:-2]
# File "/usr/local/lib/python2.7/socket.py", line 447, in readline
# data = self._sock.recv(self._rbufsize)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/socket.py", line 392, in recv
# self._wait(self._read_event)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/socket.py", line 298, in _wait
# self.hub.wait(watcher)
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 341, in wait
# result = waiter.get()
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 568, in get
# return self.hub.switch()
# File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 331, in switch
# return greenlet.switch(self)
if __name__ == '__main__':
    st = time.time()
    time.sleep(5)
    WYUrlGenerator()
    et = time.time()
    print "**************end****************,the spend time is %0.2f" % (et - st)

I'd like to hear what you think of this code, and feel free to pick it apart. I've dug through some related material myself, but without much success.

2 replies
#1 ·
路人甲 replied on June 5, 2014

Python has a crawler framework, Scrapy; you could take a look at it.
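
For example, a bare-bones spider looks roughly like this (the spider name, start URLs and CSS selector are placeholders you would adapt to the target site):

import scrapy

class JobListSpider(scrapy.Spider):
    # placeholder name and URLs; point start_urls at the real listing pages
    name = "joblist"
    start_urls = ["http://some.crawl.url?page=%d" % page for page in range(1, 3001)]

    def parse(self, response):
        # yield one item per link on the listing page; Scrapy takes care of
        # scheduling, retries, timeouts and concurrency for you
        for href in response.css("a.classname::attr(href)").extract():
            yield {"url": href, "type": "urltype"}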

#2 ·
penkzhou replied on June 5, 2014

#1 @路人甲 Yeah, but I'd like to get this problem solved first.
