Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 9705c05

Browse files
author
XL
committed
在指定标签下提取链接
1 parent 64cbcde commit 9705c05

File tree

1 file changed

+87
-0
lines changed

1 file changed

+87
-0
lines changed

‎014-爬童话故事.py‎

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
"""
2+
3+
@file : 014-爬童话故事.py
4+
5+
@author : xiaolu
6+
7+
@time : 2019年10月29日
8+
9+
"""
10+
import requests
11+
import random
12+
import time
13+
from lxml import etree
14+
from bs4 import BeautifulSoup
15+
import re
16+
import os
17+
18+
19+
def sava_data(title, content, i):
    '''
    Save one story to ./睡前故事/{i}page/{title}.txt.

    :param title: story title, used as the file name
    :param content: story body text to write
    :param i: listing-page index, used as the sub-directory name
    :return: None
    '''
    folder = './睡前故事/{}page'.format(i)
    # BUG FIX: the original os.mkdir raised FileNotFoundError when the
    # parent './睡前故事' directory did not exist; makedirs creates the
    # whole chain and exist_ok replaces the old if/else-pass dance.
    os.makedirs(folder, exist_ok=True)
    # Write UTF-8 explicitly: relying on the platform default encoding
    # corrupts Chinese text on systems whose default is not UTF-8.
    with open('{}/{}.txt'.format(folder, title), 'w', encoding='utf-8', errors='ignore') as f:
        f.write(content)
32+
33+
34+
def crawl_content(links, i):
    '''
    Download every article in `links` and save each one via sava_data.

    :param links: iterable of absolute article URLs
    :param i: listing-page index, forwarded to sava_data for the folder name
    :return: None
    '''
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}

    for link in links:
        # Throttle politely so we do not hammer the site.
        time.sleep(random.randint(8, 20))
        # BUG FIX: the original called requests.get(link, headers) —
        # positionally the dict becomes the `params` argument, so the
        # User-Agent header was never actually sent.
        response = requests.get(link, headers=headers)
        # The site serves GBK-encoded pages.
        response.encoding = 'gbk'

        selector = etree.HTML(response.text)

        try:
            title = selector.xpath('//*[@id="left"]/h1/text()')[0]
            content = selector.xpath('string(//*[@id="ny"])')
            print("当前页:{}, 文章:{}".format(i, title))
            sava_data(title, content, i)
        except IndexError:
            # Page without the expected layout (empty xpath result):
            # skip it instead of aborting the whole crawl. The original
            # bare `except:` also swallowed KeyboardInterrupt etc.
            pass
56+
57+
58+
def crawl_link():
    '''
    Walk pages 1..95 of the bedtime-story listing, collect the article
    links on each page, and hand them to crawl_content for download.

    :return: None
    '''
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    for i in range(1, 96):
        print("当前页: {}".format(i))
        # Other sections of the same site, kept for reference:
        # 经典童话: http://www.etgushi.com/jdth/list_1_{}.html
        # 儿童故事: http://www.etgushi.com/etgs/list_4_{}.html
        # 睡前故事: http://www.etgushi.com/sqgs/list_7_{}.html
        url = 'http://www.etgushi.com/sqgs/list_7_{}.html'.format(i)

        # BUG FIX: headers must be passed by keyword; positionally the
        # dict becomes the `params` argument and no User-Agent is sent.
        response = requests.get(url, headers=headers)
        html = response.text

        soup = BeautifulSoup(html, features='lxml')

        # Article hrefs look like href="/sqgs/6813.html".
        # BUG FIX: raw string with an escaped dot — the original
        # '/sqgs/.*.html' let '.' match any character before 'html'.
        links = soup.find_all('a', {'target': "_blank", 'href': re.compile(r'/sqgs/.*\.html')})
        total_links = ['http://www.etgushi.com' + a['href'] for a in links]
        # Deduplicate; ordering is irrelevant for downloading.
        total_links = list(set(total_links))
        print("总共链接数:{}".format(len(total_links)))
        print(total_links)
        crawl_content(total_links, i)
84+
85+
86+
# Entry point: crawl every listing page and save each story to disk.
if __name__ == '__main__':
    crawl_link()

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /