Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 5d43463

Browse files
selenium简单使用
1 parent 4b534f5 commit 5d43463

File tree

1 file changed

+63
-0
lines changed

1 file changed

+63
-0
lines changed

‎007-使用selenium爬新浪微博.py‎

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
from lxml import etree
2+
import requests
3+
from selenium import webdriver
4+
import time
5+
import random
6+
import csv
7+
8+
9+
10+
def login(driver):
    """Log in to weibo.com by filling the login form and clicking submit.

    Args:
        driver: a selenium WebDriver instance (e.g. webdriver.Chrome),
            not yet logged in.
    """
    # Selenium 4 removed the find_element_by_xpath shortcuts; the generic
    # find_element(By.XPATH, ...) form works on both Selenium 3 and 4.
    from selenium.webdriver.common.by import By

    driver.get('https://weibo.com')
    time.sleep(3)
    # Enlarge the window so elements are not hidden/collapsed by a small
    # viewport, which would break the XPath extraction below.
    driver.set_window_size(1920, 1080)

    # Locate the username input box and type the account name.
    username = driver.find_element(By.XPATH, '//*[@id="loginname"]')
    username.send_keys('*******')
    # Password field and submit button (placeholders — fill in real values).
    password = driver.find_element(By.XPATH, '//*[@id="pl_login_form"]/div/div[3]/div[2]/div/input')
    password.send_keys('*****')
    submit = driver.find_element(By.XPATH, '//*[@id="pl_login_form"]/div/div[3]/div[6]/a')
    print("一切输入准备好了,点击登录")
    submit.click()
    # Random pause after submitting to look less like a bot.
    time.sleep(random.randint(2, 5))
25+
26+
27+
def spider(driver):
    """Scrape the Weibo home feed once and append each post to the CSV.

    Args:
        driver: a logged-in selenium WebDriver instance.
    """
    # Selenium 4 removed the find_elements_by_xpath shortcuts; the generic
    # find_elements(By.XPATH, ...) form works on both Selenium 3 and 4.
    from selenium.webdriver.common.by import By

    # 1. Reload the home page.
    driver.get('https://weibo.com')
    # Random pause so requests are not perfectly periodic.
    time.sleep(random.randint(2, 6))
    # Grab the container element of every post currently on the page.
    all_weibo = driver.find_elements(By.XPATH, '//div[@class="WB_cardwrap WB_feed_type S_bg2 WB_feed_like"]')

    for weibo in all_weibo:
        # NOTE: these XPaths are relative — evaluated against each post card.
        # Hoist the author-link lookup: the original queried the same XPath
        # twice (once for .text, once for the href).
        author_links = weibo.find_elements(By.XPATH, 'div[1]/div[3]/div[1]/a[1]')
        contents = weibo.find_elements(By.XPATH, 'div[1]/div[3]/div[3]')
        if not author_links or not contents:
            # Ad card or layout change — skip instead of crashing with
            # IndexError on the bare [0] the original used.
            continue
        pubid = author_links[0].text                        # author display name
        pubid_url = author_links[0].get_attribute('href')   # link on the author element
        pub_content = contents[0].text                      # post body text
        data_csv([pubid, pubid_url, pub_content])
46+
47+
def data_csv(item):
    """Append one row to the output CSV (gbk-encoded).

    Args:
        item: sequence of column values, e.g. [author, url, content].
    """
    with open('新浪微博爬取.csv', 'a', encoding='gbk', newline='') as csvfile:
        writer = csv.writer(csvfile)
        try:
            # BUG FIX: the original called writer.writerrow (typo), which
            # raised AttributeError on every call — and the bare `except`
            # swallowed it, so no row was ever written.
            # Some scraped characters (emoji etc.) cannot be encoded as gbk;
            # skip such rows instead of crashing.
            writer.writerow(item)
        except UnicodeEncodeError:
            print("写入失败")
55+
if __name__ == '__main__':
    # Location of the local chromedriver executable on this machine.
    CHROMEDRIVER_PATH = r'D:\learnsofeware\python3.5\chromedriver.exe'

    # Create the browser driver.
    driver = webdriver.Chrome(CHROMEDRIVER_PATH)
    driver.implicitly_wait(10)  # element lookups retry for up to 10 s before erroring

    # Log in once, then re-scrape the home feed every 10 minutes, forever.
    login(driver)
    while True:
        spider(driver)
        time.sleep(600)

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /