Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 5c4914d

Browse files
committed
百度网页爬虫
1 parent 32b31df commit 5c4914d

File tree

1 file changed

+147
-0
lines changed

1 file changed

+147
-0
lines changed

‎baidu_news/news.py‎

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
# -*- coding: utf-8 -*-
2+
# 作者: inspurer(月小水长)
3+
# 创建时间: 2020年11月27日 22:10
4+
# 运行环境 Python3.6+
5+
# github https://github.com/inspurer
6+
# qq邮箱 2391527690@qq.com
7+
# 微信公众号 月小水长(ID: inspurer)
8+
# 文件备注信息 todo
9+
10+
import requests
11+
12+
from datetime import datetime, timedelta
13+
14+
from lxml import etree
15+
16+
import csv
17+
18+
import os
19+
20+
from time import sleep
21+
from random import randint
22+
23+
24+
def parseTime(unformatedTime):
    '''
    Convert a relative timestamp such as "5分钟前" or "2小时前" into an
    absolute ``YYYY-mm-dd HH:MM`` string based on the current local time.

    :param unformatedTime: time text as scraped from the result page
    :return: the absolute time string when the text contains a minute
             ("分钟") or hour ("小时") marker preceded by an integer;
             otherwise the input text, unchanged
    '''
    # The minute marker is tested before the hour marker, as in the
    # original if/elif chain.
    for marker, unit in (('分钟', 'minutes'), ('小时', 'hours')):
        if marker not in unformatedTime:
            continue

        # Everything in front of the marker is expected to be the amount.
        amount = unformatedTime[:unformatedTime.find(marker)]
        try:
            moment = datetime.now() - timedelta(**{unit: int(amount)})
        except (ValueError, OverflowError):
            # Non-numeric prefix (e.g. "半小时前") or an offset too large to
            # represent: keep the raw text rather than abort the crawl.
            return unformatedTime
        return moment.strftime('%Y-%m-%d %H:%M')

    # No relative marker: the text is returned as-is.
    return unformatedTime
37+
38+
39+
def _firstMatch(node, path):
    '''
    Return the first item selected by ``path`` relative to ``node``.

    :param node: object exposing an ``xpath`` method
    :param path: XPath expression to evaluate
    :return: the first match, or None when nothing matches
    '''
    matches = node.xpath(path)
    return matches[0] if matches else None


def dealHtml(html):
    '''
    Extract the news entries from one parsed result page, print them and
    append them to the CSV file named by the module-level ``fileName``
    (assigned in ``doSpider``).

    Entries without a title link are skipped; a missing summary, source or
    time is stored as an empty string instead of raising ``IndexError``.

    :param html: parsed result page (the tree returned by ``etree.HTML``)
    :return: None
    '''
    results = html.xpath('//div[@class="result-op c-container xpath-log new-pmd"]')

    saveData = []

    for result in results:
        titleNode = _firstMatch(result, './/h3/a')
        if titleNode is None:
            # Without a title link there is nothing useful to record.
            continue
        title = titleNode.xpath('string(.)').strip()

        summaryNode = _firstMatch(result, './/span[@class="c-font-normal c-color-text"]')
        summary = '' if summaryNode is None else summaryNode.xpath('string(.)').strip()

        # "./" selects direct children only; ".//" selects descendants at any depth.
        source = dateTime = ''
        infos = _firstMatch(result, './/div[@class="news-source"]')
        if infos is not None:
            # The second-to-last span is read as the source, the last as the time.
            source = _firstMatch(infos, './/span[last()-1]/text()') or ''
            dateTime = _firstMatch(infos, './/span[last()]/text()') or ''

        dateTime = parseTime(dateTime)

        print('标题', title)
        print('来源', source)
        print('时间', dateTime)
        print('概要', summary)
        print('\n')

        saveData.append([title, source, dateTime, summary])

    # Append mode: the header row is expected to have been written already.
    with open(fileName, 'a+', encoding='utf-8-sig', newline='') as f:
        csv.writer(f).writerows(saveData)
74+
75+
76+
# Default HTTP headers sent with every request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/75.0.3770.142 Safari/537.36'
    ),
    'Referer': (
        'https://www.baidu.com/s'
        '?rtt=1&bsst=1&cl=2&tn=news'
        '&word=%B0%D9%B6%C8%D0%C2%CE%C5&fr=zhidao'
    ),
}

# Search endpoint.
url = 'https://www.baidu.com/s'

# Base query-string parameters of a news search.
params = dict(
    ie='utf-8',
    medium=0,
    # rtt=4 sorts by time, rtt=1 sorts by focus
    rtt=1,
    bsst=1,
    rsv_dl='news_t_sk',
    cl=2,
    tn='news',
    rsv_bp=1,
    oq='',
    rsv_btype='t',
    f=8,
)
97+
98+
99+
def _parseTotal(text):
    '''
    Extract the result count from the header text of a result page.

    Only ASCII digits are kept, so thousands separators and the surrounding
    wording (whatever its length) are ignored.

    :param text: header text containing the count
    :return: the count as an int, or 0 when the text contains no digit
    '''
    digits = ''.join(ch for ch in text if ch in '0123456789')
    return int(digits) if digits else 0


def doSpider(keyword, sortBy = 'focus'):
    '''
    Crawl the news search results for ``keyword`` and store them in
    ``<keyword>.csv`` in the current working directory.

    The module-level ``params`` and ``headers`` are copied rather than
    modified, so one call does not affect the next.

    :param keyword: search keyword
    :param sortBy: sort order, either "focus" (by focus) or "time" (by time);
                   defaults to "focus"
    :return: None
    '''
    # dealHtml reads the output path from this module-level name.
    global fileName
    fileName = '{}.csv'.format(keyword)

    # Write the header row only when the file does not exist yet.
    if not os.path.exists(fileName):
        with open(fileName, 'w', encoding='utf-8-sig', newline='') as f:
            csv.writer(f).writerow(['title', 'source', 'time', 'summary'])

    # Seconds to wait for the server before giving up; without a timeout
    # requests may block forever on a stalled connection.
    requestTimeout = 10

    # Work on private copies so "wd", "rtt", "pn" and "Referer" do not leak
    # into later calls through the shared module-level dicts.
    query = dict(params, wd=keyword)
    query['rtt'] = 4 if sortBy == 'time' else 1
    requestHeaders = dict(headers)

    response = requests.get(url=url, params=query, headers=requestHeaders,
                            timeout=requestTimeout)
    html = etree.HTML(response.text)
    dealHtml(html)

    totalText = html.xpath('//div[@id="header_top_bar"]/span/text()')
    if not totalText:
        # No result-count header on the page: only the first page can be read.
        return

    total = _parseTotal(totalText[0])

    # Ten results per page; round up so a final partial page is fetched too.
    pageNum = (total + 9) // 10

    # Page 0 was fetched above.
    for page in range(1, pageNum):
        # Random pause before each follow-up request to throttle the crawl.
        sleep(randint(2, 4))

        print('第 {} 页\n\n'.format(page))
        requestHeaders['Referer'] = response.url
        query['pn'] = page * 10

        response = requests.get(url=url, headers=requestHeaders, params=query,
                                timeout=requestTimeout)
        dealHtml(etree.HTML(response.text))
144+
145+
146+
if __name__ == "__main__":
    # Script entry point: crawl the sample keyword, sorted by focus.
    doSpider('马保国', sortBy='focus')

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /