|
| 1 | +""" |
| 2 | + |
| 3 | +@file : 014-爬童话故事.py |
| 4 | + |
| 5 | +@author : xiaolu |
| 6 | + |
| 7 | +@time : 2019年10月29日 |
| 8 | + |
| 9 | +""" |
| 10 | +import requests |
| 11 | +import random |
| 12 | +import time |
| 13 | +from lxml import etree |
| 14 | +from bs4 import BeautifulSoup |
| 15 | +import re |
| 16 | +import os |
| 17 | + |
| 18 | + |
def sava_data(title, content, i):
    """Save one story to ./睡前故事/{i}page/{title}.txt.

    :param title: story title, used as the output file name
    :param content: story text to write
    :param i: list-page index, used as the per-page directory name
    :return: None
    """
    # BUG FIX: os.mkdir raised FileNotFoundError when the parent
    # ./睡前故事 directory did not exist yet (i.e. on the very first run).
    # makedirs creates the whole chain; exist_ok replaces the racy
    # os.path.exists() pre-check the original used.
    os.makedirs('./睡前故事/{}page'.format(i), exist_ok=True)
    # Explicit utf-8 so output does not depend on the platform default
    # encoding; errors='ignore' is kept so odd characters never abort
    # a long crawl (deliberate best-effort, as in the original).
    with open('./睡前故事/{}page/{}.txt'.format(i, title), 'w',
              encoding='utf-8', errors='ignore') as f:
        f.write(content)
| 32 | + |
| 33 | + |
def crawl_content(links, i):
    """Download each story page in *links* and save its title and body.

    :param links: iterable of absolute story-page URLs
    :param i: list-page index, forwarded to sava_data for the output dir
    :return: None
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}

    for temp in links:
        # Random pause between requests so the crawl does not hammer the site.
        time.sleep(random.randint(8, 20))
        # BUG FIX: the second positional argument of requests.get is
        # `params`, not `headers` — the original never actually sent the
        # User-Agent. Pass it by keyword; add a timeout so one dead
        # connection cannot hang the whole crawl.
        response = requests.get(temp, headers=headers, timeout=30)
        # The site serves GBK-encoded pages.
        response.encoding = 'gbk'

        selector = etree.HTML(response.text)

        try:
            # [0] raises IndexError when the page lacks the expected <h1>.
            title = selector.xpath('//*[@id="left"]/h1/text()')[0]
            content = selector.xpath('string(//*[@id="ny"])')
        except IndexError:
            # Malformed / unexpected page: skip it, but only this narrow
            # case stays silent (the original bare `except: pass` hid
            # every error, including network failures and typos).
            continue
        print("当前页:{}, 文章:{}".format(i, title))
        sava_data(title, content, i)
| 56 | + |
| 57 | + |
def crawl_link():
    """Walk list pages 1..95 of the 睡前故事 section and crawl every story.

    For each list page, collects the story links, de-duplicates them and
    hands them to crawl_content.

    :return: None
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    for i in range(1, 96):
        print("当前页: {}".format(i))
        # Other sections of the same site, kept for reference:
        #   经典童话 (classic tales):   http://www.etgushi.com/jdth/list_1_{}.html
        #   儿童故事 (children's):      http://www.etgushi.com/etgs/list_4_{}.html
        #   睡前故事 (bedtime, used):   http://www.etgushi.com/sqgs/list_7_{}.html
        url = 'http://www.etgushi.com/sqgs/list_7_{}.html'.format(i)

        # BUG FIX: headers must be passed by keyword — positionally the
        # second argument of requests.get is `params`, so the original
        # never sent the User-Agent. Timeout guards against hangs.
        response = requests.get(url, headers=headers, timeout=30)
        html = response.text

        soup = BeautifulSoup(html, features='lxml')

        # Story links look like href="/sqgs/6813.html".
        # BUG FIX: the dot in '.html' was an unescaped regex wildcard;
        # raw string + '\.' matches a literal dot only.
        links = soup.find_all('a', {'target': "_blank",
                                    'href': re.compile(r'/sqgs/.*\.html')})
        # Set comprehension de-duplicates in one pass.
        total_links = list({'http://www.etgushi.com' + a['href'] for a in links})
        print("总共链接数:{}".format(len(total_links)))
        print(total_links)
        crawl_content(total_links, i)
| 84 | + |
| 85 | + |
# Entry point: crawl all 95 list pages of the bedtime-story section
# when the file is run as a script (no work happens on import).
if __name__ == '__main__':
    crawl_link()
0 commit comments