Commit edd24dc

Crawl and generate corpus
1 parent 5e4a03a commit edd24dc

File tree

1 file changed: +62, -0 lines changed


012-bs+selenium爬数据.py

Lines changed: 62 additions & 0 deletions
"""
@file   : 012-bs+selenium爬数据.py
@author : xiaolu
@time   : 2019-10-23
"""
from selenium import webdriver
import time
import random
import requests
from bs4 import BeautifulSoup
import re


def save_data(content, i):
    # Write one article's text to its own corpus file.
    with open('{}corpus.data'.format(i), 'w', encoding='utf-8', errors='ignore') as f:
        f.write(content)


def spider(driver, links):
    # 1. Visit each article link, pause for a random few seconds,
    #    then pull the article body out of the rendered page.
    i = 0
    for link in links:
        driver.get(link)
        i += 1
        time.sleep(random.randint(5, 10))
        # The article title sits at /html/body/div[7]/div[1]/div[1]/h1
        # data = driver.find_element_by_xpath('/html/body/div[7]/div[1]/div[1]/h1').text
        data = driver.find_element_by_xpath('/html/body/div[7]/div[1]/div[1]/div[2]').text
        save_data(data, i)


def crawl_link():
    '''
    Crawl the article links from the listing page.
    :return: list of absolute article URLs
    '''
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    url = 'https://www.duanwenxue.com/sanwen/youmei/list_8.html'
    response = requests.get(url, headers=headers)
    html = response.text

    soup = BeautifulSoup(html, features='lxml')

    # Matches anchors like <a target="_blank" href="/article/4881118.html">小城记忆</a>
    links = soup.find_all('a', {'href': re.compile('/article/.*')})
    total_links = ['https://www.duanwenxue.com' + _['href'] for _ in links]
    print(total_links)

    return total_links


if __name__ == '__main__':
    # Collect the article links, then instantiate the webdriver and scrape each page.
    lists = crawl_link()
    driver = webdriver.Chrome(r'D:\learn-install\python3.6\chromedriver.exe')
    spider(driver, lists)
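
Note: the script uses the Selenium 3 API that was current in 2019. On Selenium 4+, find_element_by_xpath and the positional chromedriver path were removed, so a minimal sketch of the equivalent calls would look like the following; it reuses the same XPath and driver path as the script above, and the article URL is just the example taken from the comment in crawl_link.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 style: pass the chromedriver path through a Service object.
service = Service(r'D:\learn-install\python3.6\chromedriver.exe')
driver = webdriver.Chrome(service=service)

# Assumes the same page layout as spider(): the article body sits at this XPath.
driver.get('https://www.duanwenxue.com/article/4881118.html')
data = driver.find_element(By.XPATH, '/html/body/div[7]/div[1]/div[1]/div[2]').text
driver.quit()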

