Commit 5c4914d

committed

百度网页爬虫

1 parent 32b31df commit 5c4914d (Copy full SHA for 5c4914d)

File tree

1 file changed

+147

-0

lines changed

baidu_news
- news.py

1 file changed

+147

-0

lines changed

`‎baidu_news/news.py‎`

Lines changed: 147 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,147 @@`
	`1`	`+# -- coding: utf-8 --`
	`2`	`+# 作者: inspurer(月小水长)`
	`3`	`+# 创建时间: 2020年11月27日 22:10`
	`4`	`+# 运行环境 Python3.6+`
	`5`	`+# github https://github.com/inspurer`
	`6`	`+# qq邮箱 2391527690@qq.com`
	`7`	`+# 微信公众号月小水长(ID: inspurer)`
	`8`	`+# 文件备注信息 todo`
	`9`	`+`
	`10`	`+import requests`
	`11`	`+`
	`12`	`+from datetime import datetime, timedelta`
	`13`	`+`
	`14`	`+from lxml import etree`
	`15`	`+`
	`16`	`+import csv`
	`17`	`+`
	`18`	`+import os`
	`19`	`+`
	`20`	`+from time import sleep`
	`21`	`+from random import randint`
	`22`	`+`
	`23`	`+`
	`24`	`+def parseTime(unformatedTime):`
	`25`	`+ if '分钟' in unformatedTime:`
	`26`	`+ minute = unformatedTime[:unformatedTime.find('分钟')]`
	`27`	`+ minute = timedelta(minutes=int(minute))`
	`28`	`+ return (datetime.now() -`
	`29`	`+ minute).strftime('%Y-%m-%d %H:%M')`
	`30`	`+ elif '小时' in unformatedTime:`
	`31`	`+ hour = unformatedTime[:unformatedTime.find('小时')]`
	`32`	`+ hour = timedelta(hours=int(hour))`
	`33`	`+ return (datetime.now() -`
	`34`	`+ hour).strftime('%Y-%m-%d %H:%M')`
	`35`	`+ else:`
	`36`	`+ return unformatedTime`
	`37`	`+`
	`38`	`+`
	`39`	`+def dealHtml(html):`
	`40`	`+ results = html.xpath('//div[@class="result-op c-container xpath-log new-pmd"]')`
	`41`	`+`
	`42`	`+ saveData = []`
	`43`	`+`
	`44`	`+ for result in results:`
	`45`	`+ title = result.xpath('.//h3/a')[0]`
	`46`	`+ title = title.xpath('string(.)').strip()`
	`47`	`+`
	`48`	`+ summary = result.xpath('.//span[@class="c-font-normal c-color-text"]')[0]`
	`49`	`+ summary = summary.xpath('string(.)').strip()`
	`50`	`+`
	`51`	`+ # ./ 是直接下级,.// 是直接/间接下级`
	`52`	`+ infos = result.xpath('.//div[@class="news-source"]')[0]`
	`53`	`+ source, dateTime = infos.xpath(".//span[last()-1]/text()")[0], \`
	`54`	`+ infos.xpath(".//span[last()]/text()")[0]`
	`55`	`+`
	`56`	`+ dateTime = parseTime(dateTime)`
	`57`	`+`
	`58`	`+ print('标题', title)`
	`59`	`+ print('来源', source)`
	`60`	`+ print('时间', dateTime)`
	`61`	`+ print('概要', summary)`
	`62`	`+ print('\n')`
	`63`	`+`
	`64`	`+ saveData.append({`
	`65`	`+ 'title': title,`
	`66`	`+ 'source': source,`
	`67`	`+ 'time': dateTime,`
	`68`	`+ 'summary': summary`
	`69`	`+ })`
	`70`	`+ with open(fileName, 'a+', encoding='utf-8-sig', newline='') as f:`
	`71`	`+ writer = csv.writer(f)`
	`72`	`+ for row in saveData:`
	`73`	`+ writer.writerow([row['title'], row['source'], row['time'], row['summary']])`
	`74`	`+`
	`75`	`+`
	`76`	`+headers = {`
	`77`	`+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',`
	`78`	`+ 'Referer': 'https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&word=%B0%D9%B6%C8%D0%C2%CE%C5&fr=zhidao'`
	`79`	`+}`
	`80`	`+`
	`81`	`+url = 'https://www.baidu.com/s'`
	`82`	`+`
	`83`	`+params = {`
	`84`	`+ 'ie': 'utf-8',`
	`85`	`+ 'medium': 0,`
	`86`	`+ # rtt=4 按时间排序 rtt=1 按焦点排序`
	`87`	`+ 'rtt': 1,`
	`88`	`+ 'bsst': 1,`
	`89`	`+ 'rsv_dl': 'news_t_sk',`
	`90`	`+ 'cl': 2,`
	`91`	`+ 'tn': 'news',`
	`92`	`+ 'rsv_bp': 1,`
	`93`	`+ 'oq': '',`
	`94`	`+ 'rsv_btype': 't',`
	`95`	`+ 'f': 8,`
	`96`	`+}`
	`97`	`+`
	`98`	`+`
	`99`	`+def doSpider(keyword, sortBy = 'focus'):`
	`100`	`+ '''`
	`101`	`+ :param keyword: 搜索关键词`
	`102`	`+ :param sortBy: 排序规则,可选:focus(按焦点排序),time(按时间排序),默认 focus`
	`103`	`+ :return:`
	`104`	`+ '''`
	`105`	`+ global fileName`
	`106`	`+ fileName = '{}.csv'.format(keyword)`
	`107`	`+`
	`108`	`+ if not os.path.exists(fileName):`
	`109`	`+ with open(fileName, 'w+', encoding='utf-8-sig', newline='') as f:`
	`110`	`+ writer = csv.writer(f)`
	`111`	`+ writer.writerow(['title', 'source', 'time', 'summary'])`
	`112`	`+`
	`113`	`+ params['wd'] = keyword`
	`114`	`+ if sortBy == 'time':`
	`115`	`+ params['rtt'] = 4`
	`116`	`+`
	`117`	`+ response = requests.get(url=url, params=params, headers=headers)`
	`118`	`+`
	`119`	`+ html = etree.HTML(response.text)`
	`120`	`+`
	`121`	`+ dealHtml(html)`
	`122`	`+`
	`123`	`+ total = html.xpath('//div[@id="header_top_bar"]/span/text()')[0]`
	`124`	`+`
	`125`	`+ total = total.replace(',', '')`
	`126`	`+`
	`127`	`+ total = int(total[7:-1])`
	`128`	`+`
	`129`	`+ pageNum = total // 10`
	`130`	`+`
	`131`	`+ for page in range(1, pageNum):`
	`132`	`+ print('第 {} 页\n\n'.format(page))`
	`133`	`+ headers['Referer'] = response.url`
	`134`	`+ params['pn'] = page * 10`
	`135`	`+`
	`136`	`+ response = requests.get(url=url, headers=headers, params=params)`
	`137`	`+`
	`138`	`+ html = etree.HTML(response.text)`
	`139`	`+`
	`140`	`+ dealHtml(html)`
	`141`	`+`
	`142`	`+ sleep(randint(2, 4))`
	`143`	`+ ...`
	`144`	`+`
	`145`	`+`
	`146`	`+if __name__ == "__main__":`
	`147`	`+ doSpider(keyword = '马保国', sortBy='focus')`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 5c4914d

File tree

1 file changed

1 file changed

`‎baidu_news/news.py‎`

0 commit comments