|
| 1 | +""" |
| 2 | + |
| 3 | +@file : 014-爬童话故事.py |
| 4 | + |
| 5 | +@author : xiaolu |
| 6 | + |
| 7 | +@time : 2019年10月29日 |
| 8 | + |
| 9 | +""" |
| 10 | +import requests |
| 11 | +import random |
| 12 | +import time |
| 13 | +from lxml import etree |
| 14 | +from bs4 import BeautifulSoup |
| 15 | +import re |
| 16 | +import os |
| 17 | + |
| 18 | + |
def sava_data(title, content, i):
    """Save one story to ./睡前故事/{i}page/{title}.txt.

    :param title: story title, used as the output file name
    :param content: story text to write
    :param i: list-page index, used as the per-page directory name
    :return: None
    """
    # BUG FIX: os.mkdir raised FileNotFoundError when the parent
    # ./睡前故事 directory did not exist yet (i.e. on the very first run).
    # makedirs creates the whole chain; exist_ok replaces the racy
    # os.path.exists() pre-check the original used.
    os.makedirs('./睡前故事/{}page'.format(i), exist_ok=True)
    # Explicit utf-8 so output does not depend on the platform default
    # encoding; errors='ignore' is kept so odd characters never abort
    # a long crawl (deliberate best-effort, as in the original).
    with open('./睡前故事/{}page/{}.txt'.format(i, title), 'w',
              encoding='utf-8', errors='ignore') as f:
        f.write(content)
| 32 | + |
| 33 | + |
def crawl_content(links, i):
    """Download each story page in *links* and save its title and body.

    :param links: iterable of absolute story-page URLs
    :param i: list-page index, forwarded to sava_data for the output dir
    :return: None
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}

    for temp in links:
        # Random pause between requests so the crawl does not hammer the site.
        time.sleep(random.randint(8, 20))
        # BUG FIX: the second positional argument of requests.get is
        # `params`, not `headers` — the original never actually sent the
        # User-Agent. Pass it by keyword; add a timeout so one dead
        # connection cannot hang the whole crawl.
        response = requests.get(temp, headers=headers, timeout=30)
        # The site serves GBK-encoded pages.
        response.encoding = 'gbk'

        selector = etree.HTML(response.text)

        try:
            # [0] raises IndexError when the page lacks the expected <h1>.
            title = selector.xpath('//*[@id="left"]/h1/text()')[0]
            content = selector.xpath('string(//*[@id="ny"])')
        except IndexError:
            # Malformed / unexpected page: skip it, but only this narrow
            # case stays silent (the original bare `except: pass` hid
            # every error, including network failures and typos).
            continue
        print("当前页:{}, 文章:{}".format(i, title))
        sava_data(title, content, i)
| 56 | + |
| 57 | + |
def crawl_link():
    """Walk list pages 1..95 of the 睡前故事 section and crawl every story.

    For each list page, collects the story links, de-duplicates them and
    hands them to crawl_content.

    :return: None
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    for i in range(1, 96):
        print("当前页: {}".format(i))
        # Other sections of the same site, kept for reference:
        #   经典童话 (classic tales):   http://www.etgushi.com/jdth/list_1_{}.html
        #   儿童故事 (children's):      http://www.etgushi.com/etgs/list_4_{}.html
        #   睡前故事 (bedtime, used):   http://www.etgushi.com/sqgs/list_7_{}.html
        url = 'http://www.etgushi.com/sqgs/list_7_{}.html'.format(i)

        # BUG FIX: headers must be passed by keyword — positionally the
        # second argument of requests.get is `params`, so the original
        # never sent the User-Agent. Timeout guards against hangs.
        response = requests.get(url, headers=headers, timeout=30)
        html = response.text

        soup = BeautifulSoup(html, features='lxml')

        # Story links look like href="/sqgs/6813.html".
        # BUG FIX: the dot in '.html' was an unescaped regex wildcard;
        # raw string + '\.' matches a literal dot only.
        links = soup.find_all('a', {'target': "_blank",
                                    'href': re.compile(r'/sqgs/.*\.html')})
        # Set comprehension de-duplicates in one pass.
        total_links = list({'http://www.etgushi.com' + a['href'] for a in links})
        print("总共链接数:{}".format(len(total_links)))
        print(total_links)
        crawl_content(total_links, i)
| 84 | + |
| 85 | + |
# Entry point: crawl all 95 list pages of the bedtime-story section
# when the file is run as a script (no work happens on import).
if __name__ == '__main__':
    crawl_link()
0 commit comments