#!/usr/bin/env python
# -*- coding:utf-8 -*-
# https://github.com/forthxu/WechatSearchProjects also contains the same functionality reimplemented with Scrapy
import sys
import re
import urllib, urllib2
import requests
import pymongo
import datetime
from bs4 import BeautifulSoup
import multiprocessing as mp
class MongoDBIO:
    # declare the connection attributes
    def __init__(self, host, port, name, password, database, collection):
        self.host = host
        self.port = port
        self.name = name
        self.password = password
        self.database = database
        self.collection = collection
    # connect to the database; db and posts are handles to the database and the collection
    def Connection(self):
        # connection = pymongo.MongoClient() # connect to the local MongoDB
        connection = pymongo.MongoClient(host=self.host, port=self.port) # MongoClient replaces the Connection class removed in pymongo 3
        # db = connection.datas
        db = connection[self.database]
        if self.name or self.password:
            db.authenticate(name=self.name, password=self.password) # authenticate with username and password
        # print "Database:", db.name
        # posts = db.cn_live_news
        posts = db[self.collection]
        # print "Collection:", posts.name
        return posts
# # save operation (batch variant)
# def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_contents):
# posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection()
#
# for save_content in save_contents:
# posts.save(save_content)
# save operation
def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content):
posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection()
posts.save(save_content)
def GetTitleUrl(url, data):
    content = requests.get(url=url, params=data).content # send the GET request
    soup = BeautifulSoup(content, "html.parser") # explicit parser avoids bs4's default-parser warning
    tags = soup.findAll("h4")
    titleurl = []
    for tag in tags:
        item = {"title":tag.text.strip(), "link":tag.find("a").get("href"), "content":""}
        titleurl.append(item)
    return titleurl
def GetContent(url):
    soup = BeautifulSoup(requests.get(url=url).content, "html.parser")
    tag = soup.find("div", attrs={"class":"rich_media_content", "id":"js_content"}) # grab the first matching tag
    content_list = [tag_i.text for tag_i in tag.findAll("p")]
    content = "".join(content_list)
    return content
def ContentSave(item):
    # storage settings
save_host = "localhost"
save_port = 27017
save_name = ""
save_password = ""
save_database = "testwechat"
save_collection = "result"
save_content = {
"title":item["title"],
"link":item["link"],
"content":item["content"]
}
ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content)
def func(tuple):
querystring, type, page = tuple[0], tuple[1], tuple[2]
url = "http://weixin.sogou.com/weixin"
    # GET parameters
data = {
"query":querystring,
"type":type,
"page":page
}
titleurl = GetTitleUrl(url, data)
for item in titleurl:
url = item["link"]
print "url:", url
content = GetContent(url)
item["content"] = content
ContentSave(item)
if __name__ == '__main__':
start = datetime.datetime.now()
querystring = u"清华"
    type = 2 # 2 = articles, 1 = WeChat official accounts
    # crawl with multiple processes
p = mp.Pool()
p.map_async(func, [(querystring, type, page) for page in range(1, 50, 1)])
p.close()
p.join()
    # # crawl with a single process
# for page in range(1, 50, 1):
# tuple = (querystring, type, page)
# func(tuple)
end = datetime.datetime.now()
print "last time: ", end-start
June 2016
Collecting articles of a specified WeChat official account from Chuansong (chuansong.me)
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import time
import csv
import sys,getopt,os
import pymysql
# get the directory of the current script
def get_cur_file_dir():
path = sys.path[0]
if os.path.isdir(path):
return path
elif os.path.isfile(path):
return os.path.dirname(path)
# page-fetching helper
def open_url(url):
    req = urllib2.Request(url)
    req.add_header('User-agent', 'Mozilla 5.10')
    # try up to three times
    for i in range(0, 3):
        try:
            xhtml = urllib2.urlopen(req)
            return xhtml
        except urllib2.HTTPError,e: # HTTPError must be caught before URLError
            print "The server couldn't fulfill the request"
            print "Error code:",e.code
            if e.code!=503:
                return False
            time.sleep(5)
            print "try again"
        except urllib2.URLError,e:
            print "Failed to reach the server"
            print "The reason:",e.reason
            # URLError has no code attribute, so just wait and retry
            time.sleep(5)
            print "try again"
    return False
# process an article (content) page
def down_content(content_url,path_url):
xhtml=open_url(content_url)
    # fetching the page failed
if False == xhtml :
return False
    # parse the page
soup = BeautifulSoup(xhtml, "html5lib")
titleH2 = soup.find("h2", id="activity-name")
if None == titleH2:
return False
title = titleH2.string.encode('utf-8')
string_time = soup.find("em", id="post-date").string.encode('utf-8')
num_time = int(time.mktime(time.strptime(string_time,'%Y-%m-%d')))
keywords = str(soup.find(attrs={"name":"keywords"})['content'].encode('utf8','ignore'))
description = str(soup.find(attrs={"name":"description"})['content'].encode('utf8','ignore'))
content = soup.find_all("div", class_="rich_media_content")
if len(content) < 1 :
print(" "+"no contet")
return False
    # save the article locally as an HTML file
html = """
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>"""+title+"""</title>
<meta name="keywords" content=\""""+keywords+"""\">
<meta name="description" content=\""""+description+"""\">
</head>
<body>
<div id="body">
<h1>"""+title+"""</h1>
<div id="string_time">"""+string_time+""" </div><div id="num_time">"""+str(num_time)+"""</div>
<div id="content">
"""+str(content[0])+"""
</div>
</div>
</body>
<script type="text/javascript" src="js/reimg.js"></script>
</html>
"""
f=file(path_url,"w+")
f.write(html)
f.close()
    # write to the database
cur.execute("INSERT INTO archive (category,category_parents,title,summary,addtime,uptime) VALUES (27,\"0,12,27,\",%s,%s,%s,%s)",(title.strip(),description.strip(),num_time,num_time))
#print cur.description
#print "ID of last record is ", int(cur.lastrowid) #最后插入行的主键ID
#print "ID of inserted record is ", int(conn.insert_id()) #最新插入行的主键ID,conn.insert_id()一定要在conn.commit()之前,否则会返回0
lastid = int(cur.lastrowid)
cur.execute("INSERT INTO archive_article (archive,intro,content) VALUE (%s,'',%s)",(lastid, str(content[0])))
cur.connection.commit()
return True
# process a list page
def down_list(list_url):
    # fetch the list page
xhtml=open_url(list_url)
if False == xhtml :
return False
    # extract the article links
soup = BeautifulSoup(xhtml, "html5lib")
title = soup.title.string.encode('utf-8')
li_a = soup.find_all("a", class_="question_link")
next_list = soup.find_all("a", text="下一页")
    # CSV log
writer = csv.writer(file(datapath+'list.csv', 'a+b'))
x = 0
y = 0
    # loop over and fetch the article pages
print(list_url+" start")
for i in range(0, len(li_a)):
content_id = li_a[i]['href'].encode('utf-8')[3:]
content_title = li_a[i].string.encode('utf-8')
content_url = "http://chuansong.me"+li_a[i]['href'].encode('utf-8')
path_url = datapath+content_id+".html"
if not os.path.exists(path_url):
            # if fetching the article fails, skip it
if False == down_content(content_url,path_url) :
print(" "+str(x)+content_url+" down fail")
continue
#return False
print(" "+str(x)+content_url+" down end")
            # write a log entry
writer.writerow([content_id, content_title, content_url])
            # pause periodically
x=x+1
if x%2 == 1 :
time.sleep(3)
time.sleep(1)
else:
print(" "+content_url+" exist")
y=y+1
            # stop after hitting three already-downloaded articles
if y>2 :
return False
print(list_url+" end")
    # no next list page
if len(next_list) < 1 :
return False
# print("next "+next_list[0]['href'].encode('utf-8')+"\n")
return True
# crawl the list pages
def get_list(wechart):
start=0
    # loop over the list pages
while True:
if start==0:
url = 'http://chuansong.me/account/'+wechart
else:
url = 'http://chuansong.me/account/'+wechart+'?start='+str(start)
        # stop when finished or after more than 2000 entries
start+=12
if False == down_list(url) or start>2000:
break
time.sleep(1)
print("get_list end")
# help text
def usage():
help = """
-d temp dir,default: """+get_cur_file_dir()+"""
-w wechart,default: xingdongpai77
-u mysql user,default: root
-p mysql pwd,default:
-h,--help for help
"""
print help
if __name__ == "__main__":
opts, args = getopt.getopt(sys.argv[1:], "d:w:u:p:h", ["help"])
arg_dir = get_cur_file_dir()
arg_wechart = 'xingdongpai77'
arg_user = 'root'
arg_pwd = ''
for op, value in opts:
if op == "-d":
arg_dir = value
elif op == "-w":
arg_wechart = value
elif op == "-u":
arg_user = value
elif op == "-p":
arg_pwd = value
elif op == "-h" or op == "--help":
usage()
sys.exit()
print time.strftime("%Y-%m-%d %H:%M:%S")
    # initialize the temporary data directory
datapath = arg_dir+'/data/'
if not os.path.exists(datapath):
os.makedirs(datapath)
    # initialize the database connection
try:
conn = pymysql.connect(host='127.0.0.1', port=3306, user=arg_user, passwd=arg_pwd, db='mysql')
cur = conn.cursor()
cur.execute("SET NAMES utf8")
cur.execute("USE x")
except pymysql.Error, e:
print __file__, e
usage()
sys.exit()
    # start crawling
get_list(arg_wechart)
    # close the database connection
cur.close()
conn.close()
# xtime = time.strftime("%Y-%m-%d %H:%M:%S")
# xday = time.strftime("%Y-%m-%d")
# f=file(datapath+xtime+".html","w+")
# f.write(body)
# f.close()
Using PhantomJS to screenshot and scrape web pages
Taking webpage screenshots with PhantomJS
[root@vps3 work]# wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2
[root@vps3 work]# tar jxvf phantomjs-2.1.1-linux-x86_64.tar.bz2
[root@vps3 work]# vim screenshots.js
var page = require('webpage').create();
var args = require('system').args;
var url = args[1];
var filename = args[2];
page.open(url, function(status) {
console.log("Status: " + status);
if(status === "success") {
        // execute JS inside the page
        var title = page.evaluate(function(){
            // scroll down to trigger lazily loaded images
            window.scrollTo(0,10000);
            // return the page title
            return document.title;
        });
        // debug output
        console.log('Page title is ' + title);
        // delay so that images can finish loading and scripts can run
        window.setTimeout(function ()
        {
            // render the screenshot
            page.render(filename);
            // and exit
            phantom.exit();
        }, 5000);
}else{
phantom.exit();
}
});
Install the Microsoft YaHei font (if screenshots contain no text)
[root@vps3 work]#yum -y install bitmap-fonts bitmap-fonts-cjk mkfontscale fontconfig
[root@vps3 work]#mkdir /usr/share/fonts/win/
[root@vps3 work]#wget https://nipao.googlecode.com/files/msyh.ttf -O /usr/share/fonts/win/msyh.ttf
[root@vps3 work]#mkfontscale
[root@vps3 work]#mkfontdir
[root@vps3 work]#fc-cache
Run the screenshot script
[root@vps3 work]#rm -rf /home/wwwroot/default/joke.png && phantomjs-2.1.1-linux-x86_64/bin/phantomjs screenshots.js http://joke.4399pk.com /home/wwwroot/default/joke.png
Note: in my tests, one of my VPS machines produced transparent, empty images while other machines worked fine; the cause is unknown.
Collecting information with Selenium driving PhantomJS
Selenium started out as an automated-testing tool that can drive all kinds of browsers; between the official project and third parties, many drivers are supported, including iPhone and Android drivers in addition to desktop browsers. Driving a browser with Selenium is also useful for data collection, because it lets you obtain the DOM after JavaScript has executed (Ajax content), lazily loaded images, and similar content.
Selenium also supports the pseudo-browser PhantomJS. PhantomJS is not a real browser and has no GUI, but it is a browser-like program that can parse HTML and execute JavaScript; it does not render the page for display, yet it supports looking up page elements, executing JS, and so on.
The PhantomJS browser driver is installed separately and is a completely standalone program.
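To make the idea concrete, here is a minimal sketch of fetching the JS-rendered DOM (and optionally a screenshot) through Selenium's PhantomJS driver. It assumes an older Selenium release (2.x/3.x, which still ships that driver) and a phantomjs binary on PATH; the URL is only an example. The installation step and the collect.py script actually used in this post follow below.
# -*- coding: utf-8 -*-
# Minimal sketch: fetch the JS-rendered DOM with Selenium + PhantomJS.
# Assumes Selenium 2.x/3.x (which still includes the PhantomJS driver)
# and a phantomjs binary on PATH; the URL is only an example.
from selenium import webdriver
def rendered_html(url):
    driver = webdriver.PhantomJS() # headless PhantomJS driver
    try:
        driver.get(url) # load the page and run its JavaScript
        driver.save_screenshot("page.png") # optionally render a screenshot
        return driver.page_source # the DOM after JS execution
    finally:
        driver.quit()
if __name__ == "__main__":
    print(rendered_html("http://joke.4399pk.com/")[:200])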
[root@vps3 work]#pip install selenium
[root@vps3 work]#vim collect.py
# -*- coding: utf-8 -*-
from selenium import webdriver
import time
def capture(url, save_fn="capture.png"):
    browser = webdriver.PhantomJS(executable_path=r'/workspace/work/phantomjs-2.1.1-linux-x86_64/bin/phantomjs')
    browser.get(url)
    browser.save_screenshot(save_fn) # save the rendered page; save_fn was previously unused
    ele = browser.find_element_by_id('weixin-account-btn')
    print ele.get_attribute('style')
    browser.quit()
if __name__ == "__main__":
    capture("http://joke.4399pk.com/")
[root@vps3 work]#python collect.py
Note: if you use PhantomJS as the Selenium browser, Selenium's PhantomJS support only works with Python 2.7.
The Memcache protocol (Chinese translation)
Foreword
I happened to come across a bilingual Chinese/English version of this document and found it awkward to read, so I spent half an hour carefully putting together a standalone Chinese version and am recording it here.
Protocol
Clients of memcached communicate with the server over TCP connections. (A UDP interface is also available; see "UDP protocol" later in this document.) A running memcached server listens on a number of (configurable) ports. Clients connect to these ports, send commands to the server, read the responses, and eventually close the connection.
There is no need to send any command to end a session; a client may close the connection whenever it no longer needs memcached. Note, however, that clients are encouraged to cache their connections rather than reopen them every time they need to access data, because memcached is deliberately designed to work efficiently even with many open connections (hundreds, or thousands if necessary). Caching connections eliminates the overhead of establishing them (by comparison, the setup work the server does for a new connection is negligible).
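As an illustration, the whole exchange can be driven over a bare TCP socket. The sketch below is only an example and assumes a memcached server listening on localhost:11211; the key name and value are placeholders.
# -*- coding: utf-8 -*-
# Minimal sketch of the memcached text protocol over a raw TCP socket.
# Assumes a memcached server on localhost:11211; key and value are placeholders.
import socket
def memcached_demo(host="127.0.0.1", port=11211):
    s = socket.create_connection((host, port))
    try:
        # set <key> <flags> <exptime> <bytes>\r\n<data>\r\n  ->  STORED\r\n
        s.sendall(b"set greeting 0 60 5\r\nhello\r\n")
        print(s.recv(1024))
        # get <key>\r\n  ->  VALUE <key> <flags> <bytes>\r\n<data>\r\nEND\r\n
        s.sendall(b"get greeting\r\n")
        print(s.recv(1024))
    finally:
        s.close() # in practice, keep the connection open and reuse it
if __name__ == "__main__":
    memcached_demo()
Reusing the same socket for many commands is exactly the connection caching the protocol description above recommends.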