2016年6月

采集搜狗公众号文章

作者: forthxu
时间: June 30, 2016
分类: 默认分类
评论

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# https://github.com/forthxu/WechatSearchProjects 还包含改用Scrapy实现同样功能的采集版本
import sys
import re
import urllib, urllib2
import requests
import pymongo
import datetime
from bs4 import BeautifulSoup
import multiprocessing as mp
class MongoDBIO:
    """Hold MongoDB connection settings and open the configured collection on demand."""

    def __init__(self, host, port, name, password, database, collection):
        """Store the connection settings; nothing is connected here.

        Args:
            host: server host name passed to the pymongo client.
            port: server port passed to the pymongo client.
            name: user name; authentication is skipped when both name and
                password are empty.
            password: password used together with ``name``.
            database: name of the database to open.
            collection: name of the collection to return from Connection().
        """
        self.host = host
        self.port = port
        self.name = name
        self.password = password
        self.database = database
        self.collection = collection

    def Connection(self):
        """Connect to the server and return the configured collection object.

        Returns:
            The pymongo collection ``connection[database][collection]``.
        """
        # pymongo.Connection was removed in PyMongo 3.0 in favour of
        # MongoClient (available since 2.4); fall back to Connection only on
        # installs that are too old to have MongoClient.
        client_factory = getattr(pymongo, "MongoClient", None) or pymongo.Connection
        connection = client_factory(host=self.host, port=self.port)
        db = connection[self.database]
        if self.name or self.password:
            # Verify user name and password against the selected database.
            db.authenticate(name=self.name, password=self.password)
        return db[self.collection]
# # 保存操作
# def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_contents):
# posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection()
#
# for save_content in save_contents:
# posts.save(save_content)
# 保存操作
def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content):
    """Open the target collection and store one document in it.

    A fresh MongoDBIO connection is opened on every call.

    Args:
        save_host, save_port: MongoDB server address.
        save_name, save_password: credentials forwarded to MongoDBIO.
        save_database, save_collection: where the document is stored.
        save_content: the document passed to the collection's save().
    """
    mongo_io = MongoDBIO(
        save_host,
        save_port,
        save_name,
        save_password,
        save_database,
        save_collection,
    )
    target_collection = mongo_io.Connection()
    target_collection.save(save_content)
def GetTitleUrl(url, data):
    """Fetch one search-result page and return the article titles and links on it.

    Args:
        url: search endpoint to request.
        data: dict sent as the GET query string.

    Returns:
        A list of dicts with the keys "title", "link" and "content";
        "content" is an empty-string placeholder for the caller to fill in.
    """
    content = requests.get(url=url, params=data).content  # send the GET request
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(content, "html.parser")
    titleurl = []
    for tag in soup.findAll("h4"):
        anchor = tag.find("a")
        # An <h4> without a usable link cannot be crawled; skipping it avoids
        # an AttributeError on None that would abort the whole page.
        if anchor is None or not anchor.get("href"):
            continue
        titleurl.append({"title": tag.text.strip(), "link": anchor.get("href"), "content": ""})
    return titleurl
def GetContent(url):
    """Download an article page and return the joined text of its body paragraphs.

    Args:
        url: address of the article page.

    Returns:
        The text of every <p> inside div#js_content.rich_media_content,
        concatenated without separators, or "" when that container is not
        present in the response.
    """
    soup = BeautifulSoup(requests.get(url=url).content, "html.parser")
    # Only the first matching container is used.
    container = soup.find("div", attrs={"class": "rich_media_content", "id": "js_content"})
    if container is None:
        # Without this guard a page lacking the container raised
        # AttributeError and stopped the crawl of the current result page.
        return ""
    return "".join(paragraph.text for paragraph in container.findAll("p"))
def ContentSave(item):
    """Store the title, link and content of one crawled article.

    Args:
        item: mapping that provides the keys "title", "link" and "content".
    """
    # Copy only the three stored fields, in the same order the keys are read.
    document = {field: item[field] for field in ("title", "link", "content")}
    # Storage settings: local MongoDB, no credentials.
    ResultSave(
        save_host="localhost",
        save_port=27017,
        save_name="",
        save_password="",
        save_database="testwechat",
        save_collection="result",
        save_content=document,
    )
def func(tuple):
    """Crawl one search-result page: list its articles, download each one and save it.

    Args:
        tuple: a (querystring, type, page) triple; a single argument is used
            so the function can be mapped over a job list.
    """
    querystring, search_type, page = tuple[0], tuple[1], tuple[2]
    # GET parameters of the search request.
    params = {"query": querystring, "type": search_type, "page": page}
    for item in GetTitleUrl("http://weixin.sogou.com/weixin", params):
        link = item["link"]
        print("url: %s" % link)
        item["content"] = GetContent(link)
        ContentSave(item)
if __name__ == '__main__':
    # Crawl result pages 1..49 for the query in parallel and report the elapsed time.
    start = datetime.datetime.now()
    querystring = u"清华"
    search_type = 2  # 2 = articles, 1 = official accounts
    jobs = [(querystring, search_type, page) for page in range(1, 50)]

    # Multi-process crawl. For single-process debugging call func(job) for
    # each job instead.
    pool = mp.Pool()
    result = pool.map_async(func, jobs)
    pool.close()
    pool.join()
    # map_async keeps a worker's exception on the AsyncResult; if it is never
    # inspected, a crawl that failed looks exactly like one that succeeded.
    try:
        result.get()
    except Exception as exc:
        print("crawl failed: %r" % (exc,))

    end = datetime.datetime.now()
    print("last time:  %s" % (end - start))

采集传送门(chuansong.me)指定公众号文章

作者: forthxu
时间: June 28, 2016
分类: 默认分类
评论

#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import time
import csv
import sys,getopt,os
import pymysql
# 获取当前目录
def get_cur_file_dir():
    """Return the directory the script was started from.

    Uses sys.path[0]: if it is a directory it is returned as is; if it is a
    file, its containing directory is returned. When it is neither (for
    example the empty string), the current working directory is returned
    instead of None, so callers can always concatenate the result.

    Returns:
        A directory path string.
    """
    path = sys.path[0] if sys.path else ""
    if os.path.isdir(path):
        return path
    if os.path.isfile(path):
        return os.path.dirname(path)
    return os.getcwd()
# 抓取内容函数
def open_url(url):
    """Open *url* with a custom User-agent header, trying up to three times.

    An HTTP 503 response or a failure to reach the server is retried after a
    5 second pause; any other HTTP error gives up immediately.

    Args:
        url: address to fetch.

    Returns:
        The response object from urllib2.urlopen on success, otherwise False.
    """
    req = urllib2.Request(url)
    req.add_header('User-agent', 'Mozilla 5.10')
    # Try three times.
    for _attempt in range(0, 3):
        try:
            return urllib2.urlopen(req)
        except urllib2.HTTPError as e:  # HTTPError must come before URLError
            print("The server couldn't fulfill the request")
            print("Error code: %s" % (e.code,))
            if e.code != 503:
                return False
        except urllib2.URLError as e:
            # A plain URLError has .reason but no .code; reading e.code here
            # raised AttributeError instead of retrying.
            print("Failed to reach the server")
            print("The reason: %s" % (e.reason,))
        time.sleep(5)
        print("try again")

    # All attempts failed (previously the misspelled name `Fasle` raised NameError).
    return False
# 处理内容页
def down_content(content_url,path_url):
    """Download one article, archive it as a local HTML file and insert it into MySQL.

    Uses the module-level cursor ``cur`` that the script entry point creates.

    Args:
        content_url: address of the article page.
        path_url: path of the local HTML file to write.

    Returns:
        True when the article was written and committed; False when the page
        could not be fetched or lacks the title, the date or the content
        container.
    """
    xhtml = open_url(content_url)
    # Fetching the page failed.
    if False == xhtml:
        return False

    # Parse the page.
    soup = BeautifulSoup(xhtml, "html5lib")
    titleH2 = soup.find("h2", id="activity-name")
    date_em = soup.find("em", id="post-date")
    # Check .string too: the code below calls .encode() on it, and a tag
    # without a single text child would otherwise raise AttributeError.
    if titleH2 is None or titleH2.string is None:
        return False
    if date_em is None or date_em.string is None:
        return False
    title = titleH2.string.encode('utf-8')
    string_time = date_em.string.encode('utf-8')
    try:
        num_time = int(time.mktime(time.strptime(string_time, '%Y-%m-%d')))
    except ValueError:
        # The date is not in YYYY-MM-DD form; treat the page as not downloadable.
        return False
    keywords = _meta_content(soup, "keywords")
    description = _meta_content(soup, "description")
    content = soup.find_all("div", class_="rich_media_content")

    if len(content) < 1:
        print(" "+"no contet")
        return False
    body_html = str(content[0])

    # Local HTML copy of the article.
    html = """
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>"""+title+"""</title>
<meta name="keywords" content=\""""+keywords+"""\">
<meta name="description" content=\""""+description+"""\">
</head>
<body>
 <div id="body">
 <h1>"""+title+"""</h1>
 <div id="string_time">"""+string_time+""" </div><div id="num_time">"""+str(num_time)+"""</div>
 <div id="content">
 """+body_html+"""
 </div>
 </div>
</body>
<script type="text/javascript" src="js/reimg.js"></script>
</html>
 """

    # `with` closes the file even when the write fails.
    with open(path_url, "w") as f:
        f.write(html)

    # Write to the database.
    cur.execute("INSERT INTO archive (category,category_parents,title,summary,addtime,uptime) VALUES (27,\"0,12,27,\",%s,%s,%s,%s)",(title.strip(),description.strip(),num_time,num_time))
    # Primary key of the row just inserted; read it before commit.
    lastid = int(cur.lastrowid)

    cur.execute("INSERT INTO archive_article (archive,intro,content) VALUE (%s,'',%s)",(lastid, body_html))

    cur.connection.commit()

    return True


def _meta_content(soup, name):
    """Return the utf-8 encoded ``content`` of the element with the given name attribute, or '' if absent."""
    tag = soup.find(attrs={"name": name})
    if tag is None or not tag.get('content'):
        return ''
    return str(tag['content'].encode('utf8', 'ignore'))
# 处理列表页
def down_list(list_url):
    """Process one listing page: download every article on it that is not archived yet.

    Uses the module-level ``datapath`` directory (set by the script entry
    point) for the archived HTML files and the ``list.csv`` log.

    Args:
        list_url: address of the listing page.

    Returns:
        True when the caller should continue with the next listing page.
        False when the page could not be fetched, when more than two articles
        on it were already archived, or when it has no "下一页" link.
    """
    # Listing page.
    xhtml = open_url(list_url)
    if False == xhtml:
        return False
    # Article links and the next-page link.
    soup = BeautifulSoup(xhtml, "html5lib")
    li_a = soup.find_all("a", class_="question_link")
    next_list = soup.find_all("a", text="下一页")

    x = 0  # articles downloaded from this page
    y = 0  # articles that were already archived
    print(list_url+" start")
    # `with` closes and flushes the CSV log on every exit path, including the
    # early return below.
    with open(datapath+'list.csv', 'a+b') as log_file:
        writer = csv.writer(log_file)
        # Fetch each article page in turn.
        for link in li_a:
            href = link['href'].encode('utf-8')
            # Drops the first three characters of the href; presumably a
            # "/n/" style prefix -- confirm against the site.
            content_id = href[3:]
            # .string is None when the anchor has nested tags; fall back to
            # the concatenated text instead of raising AttributeError.
            content_title = (link.string or link.get_text()).encode('utf-8')
            content_url = "http://chuansong.me"+href
            path_url = datapath+content_id+".html"

            if os.path.exists(path_url):
                print(" "+content_url+" exist")
                y = y+1
                # Stop crawling after three already-archived articles.
                if y > 2:
                    return False
                continue

            # Download failed: move on to the next article.
            if False == down_content(content_url, path_url):
                print(" "+str(x)+content_url+" down fail")
                continue

            print(" "+str(x)+content_url+" down end")
            # Log the archived article.
            writer.writerow([content_id, content_title, content_url])
            # Pause between downloads; every other one waits 3 seconds longer.
            x = x+1
            if x % 2 == 1:
                time.sleep(3)
            time.sleep(1)
    print(list_url+" end")

    # There is a further listing page only when a next-page link exists.
    return len(next_list) >= 1
 
# 抓取列表页
def get_list(wechart):
    """Walk the listing pages of one account, 12 entries per page, until done.

    Stops as soon as down_list() returns False or once the offset has passed
    2000; otherwise pauses one second between pages.

    Args:
        wechart: account name appended to the chuansong.me account URL.
    """
    account_url = 'http://chuansong.me/account/'+wechart
    offset = 0
    while True:
        # The first page has no ?start= parameter.
        url = account_url if offset == 0 else account_url+'?start='+str(offset)

        # Advance before the check so the limit test sees the next offset.
        offset += 12
        finished = (False == down_list(url))
        if finished or offset > 2000:
            break
        time.sleep(1)

    print("get_list end")
# 帮助
def usage():
    """Print the command-line help, showing the current default temp directory."""
    option_lines = [
        "-d temp dir,default: "+get_cur_file_dir(),
        "-w wechart,default: xingdongpai77",
        "-u mysql user,default: root",
        "-p mysql pwd,default: ",
        "-h,--help for help",
    ]
    # The text starts and ends with a newline, matching a triple-quoted block.
    print("\n"+"\n".join(option_lines)+"\n")
 
if __name__ == "__main__":
    # Script entry point. `datapath` and `cur` are deliberately module-level
    # names: down_list() and down_content() read them as globals.
    try:
        opts, args = getopt.getopt(sys.argv[1:], "d:w:u:p:h", ["help"])
    except getopt.GetoptError as e:
        # An unknown option used to end in a traceback; show the help instead.
        print("%s %s" % (__file__, e))
        usage()
        sys.exit(2)
    arg_dir = get_cur_file_dir()
    arg_wechart = 'xingdongpai77'
    arg_user = 'root'
    arg_pwd = ''
    for op, value in opts:
        if op == "-d":
            arg_dir = value
        elif op == "-w":
            arg_wechart = value
        elif op == "-u":
            arg_user = value
        elif op == "-p":
            arg_pwd = value
        elif op == "-h" or op == "--help":
            usage()
            sys.exit()
    print(time.strftime("%Y-%m-%d %H:%M:%S"))
    # Prepare the temp directory.
    datapath = arg_dir+'/data/'
    if not os.path.exists(datapath):
        os.makedirs(datapath)
    # Prepare the database.
    try:
        conn = pymysql.connect(host='127.0.0.1', port=3306, user=arg_user, passwd=arg_pwd, db='mysql')
        cur = conn.cursor()
        cur.execute("SET NAMES utf8")
        cur.execute("USE x")
    except pymysql.Error as e:
        print("%s %s" % (__file__, e))
        usage()
        # Non-zero status so callers can tell the run failed.
        sys.exit(1)
    # Start crawling; release the database handles even if the crawl raises.
    try:
        get_list(arg_wechart)
    finally:
        cur.close()
        conn.close()
 
 # xtime = time.strftime("%Y-%m-%d %H:%M:%S")
 # xday = time.strftime("%Y-%m-%d")
 # f=file(datapath+xtime+".html","w+")
 # f.write(body)
 # f.close()

使用phantomjs对网页进行截图和采集

作者: forthxu
时间: June 28, 2016
分类: 默认分类
评论

phantomjs对网页进行截图

[root@vps3 work]# wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2
[root@vps3 work]# tar jxvf phantomjs-2.1.1-linux-x86_64.tar.bz2
[root@vps3 work]# vim screenshots.js

// Usage: phantomjs screenshots.js <url> <output-file>
// Opens <url>, scrolls down, waits five seconds and renders the page to <output-file>.
var page = require('webpage').create();
var args = require('system').args;
var url = args[1];
var filename = args[2];
page.open(url, function(status) {
    console.log("Status: " + status);
    if (status === "success") {
        // Run JavaScript inside the page.
        var title = page.evaluate(function() {
            // Scroll down so lazily loaded images start loading.
            window.scrollTo(0, 10000);
            // Hand the page title back to the script.
            return document.title;
        });
        // Debug output.
        console.log('Page title is ' + title);

        // Delay rendering so images can load and page scripts can run.
        window.setTimeout(function() {
            // Render the screenshot.
            page.render(filename);
            // Quit PhantomJS.
            phantom.exit();
        }, 5000);
    } else {
        phantom.exit();
    }
});

安装微软雅黑字体(截图无文字时)
[root@vps3 work]#yum -y install bitmap-fonts bitmap-fonts-cjk mkfontscale fontconfig
[root@vps3 work]#mkdir /usr/share/fonts/win/
[root@vps3 work]#wget https://nipao.googlecode.com/files/msyh.ttf -O /usr/share/fonts/win/msyh.ttf
[root@vps3 work]#mkfontscale
[root@vps3 work]#mkfontdir
[root@vps3 work]#fc-cache

执行截图功能
[root@vps3 work]#rm -rf /home/wwwroot/default/joke.png && phantomjs-2.1.1-linux-x86_64/bin/phantomjs screenshots.js http://joke.4399pk.com /home/wwwroot/default/joke.png

注意:经过测试我有台vps保存的图片是透明无内容的图片,其他机器正常,原因不明。

Selenium加载phantomjs采集信息

Selenium最早是作为自动化测试工具出现的,可以加载各种浏览器作为驱动来进行测试,官方加上第三方宣布支持的驱动有很多种,除了PC端的浏览器之外,还支持iphone、Android的driver。同时selenium加载浏览器驱动也可以用来采集信息,以便获取需要js执行后的dom文档(Ajax)或者惰性加载的图片等内容。

selenium支持伪浏览器PhantomJS。PhantomJS不是真正的浏览器,没有GUI,而是具备html、js等解析能力的类浏览器程序;它不会把网页的内容显示出来,但是支持页面元素的查找、JS的执行等。

PhantomJS浏览器驱动需要单独安装,它是一个完全独立的程序。

[root@vps3 work]#pip install selenium
[root@vps3 work]#vim collect.py

# -*- coding: utf-8 -*-
from selenium import webdriver
import time
 
def capture(url, save_fn="capture.png"):
    """Load *url* in PhantomJS and print the inline style of the 'weixin-account-btn' element.

    Args:
        url: address of the page to load.
        save_fn: accepted for compatibility; this function does not use it.
    """
    browser = webdriver.PhantomJS(executable_path=r'/workspace/work/phantomjs-2.1.1-linux-x86_64/bin/phantomjs')
    try:
        browser.get(url)
        ele = browser.find_element_by_id('weixin-account-btn')
        print(ele.get_attribute('style'))
    finally:
        # Always stop the PhantomJS process; a failed page load or element
        # lookup used to skip quit() and leave the process running.
        browser.quit()
if __name__ == "__main__":
    # Demo run against the sample page.
    target_url = "http://joke.4399pk.com/"
    capture(target_url)

[root@vps3 work]#python collect.py

注意:如果你用phantomjs作为selenium浏览器,selenium的phantomjs只支持python2.7

*nodejs最新截图方法
*golang调动无头浏览器

Memcache协议中文版

作者: forthxu
时间: June 7, 2016
分类: 默认分类
评论

写在前头
偶然之间看到本文的中英文对照版本,感觉看起来不是很方便,于是花费了半个小时的时间,仔细整理出了独立的中文版本,并记录下来。

协议
memcached 的客户端使用TCP链接与服务器通讯。(UDP接口也同样有效,参考后文的 "UDP协议" )一个运行中的memcached服务器监视一些(可设置)端口。客户端连接这些端口,发送命令到服务器,读取回应,最后关闭连接。

结束会话不需要发送任何命令。当不再需要memcached服务时,客户端可以在任何时候关闭连接。需要注意的是,鼓励客户端缓存这些连接,而不是每次需要存取数据时都重新打开连接。这是因为memcached 被特意设计成即使开启很多连接也能够高效地工作(数百个,上千个如果需要的话)。缓存这些连接,可以消除建立连接所带来的开销(/*/相对而言,在服务器端建立一个新连接的准备工作所带来的开销,可以忽略不计。)。

- 阅读剩余部分 -

正向代理和反向代理的区别和使用

作者: forthxu
时间: June 6, 2016
分类: 默认分类
评论

正向代理和反向代理的区别和使用

请求中扮演的角色 | 客户端 | 代理服务端 | 内容服务端

正向代理 | 需设置代理,通过代理服务端请求内容服务端的资源 | 代替客户端请求内容服务端 | 只能获取代理服务端的请求内容并返回

反向代理 | 无需设置,请求代理服务端的资源 | 本身并无资源,获取内容服务端资源返回给客户端 | 只能获取代理服务端的请求内容并返回

用途

正向代理:正向代理的典型用途是为在防火墙内的局域网客户端提供访问Internet的途径。

正向代理还可以使用缓冲特性减少网络使用率。

反向代理:反向代理的典型用途是将防火墙后面的服务器提供给Internet用户访问。

反向代理还可以为后端的多台服务器提供负载平衡,或为后端较慢的服务器提供缓冲服务。

反向代理还可以启用高级URL策略和管理技术,从而使处于不同web服务器系统的web页面同时存在于同一个URL空间下。

安全性

正向代理:正向代理允许客户端通过它访问任意网站并且隐藏客户端自身,因此你必须采取安全措施以确保仅为经过授权的客户端提供服务。

反向代理:反向代理对外都是透明的,访问者并不知道自己访问的是一个代理。

2016年6月

采集搜狗公众号文章

采集传送门(chuansong.me)指定公众号文章

使用phantomjs对网页进行截图和采集

Memcache协议中文版

正向代理和反向代理的区别和使用

京东外快

最新文章

最近回复

分类

归档

其它

友情链接