import csv
import random
import time

import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
def login(driver):
    """Log in to weibo.com through the web login form.

    The credentials below are hard-coded placeholders ('*******'/'*****')
    and must be replaced with a real account for the login to succeed.

    :param driver: a selenium WebDriver instance created by the caller.
    """
    driver.get('https://weibo.com')
    time.sleep(3)
    # Enlarge the window so the login form is fully rendered and clickable.
    driver.set_window_size(1920, 1080)

    # Locate the username input box.
    # find_element(By.XPATH, ...) works on both Selenium 3 and 4; the old
    # find_element_by_xpath helper methods were removed in Selenium 4.
    username = driver.find_element(By.XPATH, '//*[@id="loginname"]')
    username.send_keys('*******')
    password = driver.find_element(
        By.XPATH, '//*[@id="pl_login_form"]/div/div[3]/div[2]/div/input')
    password.send_keys('*****')
    submit = driver.find_element(
        By.XPATH, '//*[@id="pl_login_form"]/div/div[3]/div[6]/a')
    print("一切输入准备好了,点击登录")
    submit.click()
    # Random pause while the login completes, to look less like a bot.
    time.sleep(random.randint(2, 5))
def spider(driver):
    """Scrape the weibo.com home timeline once.

    Refreshes the home page, then for every feed card extracts the poster
    id, the post URL and the post text, and appends each as a CSV row via
    ``data_csv``.

    :param driver: a logged-in selenium WebDriver instance.
    """
    # 1. Refresh the home page.
    driver.get('https://weibo.com')
    # Random pause so requests do not arrive at a fixed cadence.
    time.sleep(random.randint(2, 6))
    # Collect every weibo feed card on the page.
    # find_elements(By.XPATH, ...) works on both Selenium 3 and 4; the old
    # find_elements_by_xpath helper methods were removed in Selenium 4.
    all_weibo = driver.find_elements(
        By.XPATH, '//div[@class="WB_cardwrap WB_feed_type S_bg2 WB_feed_like"]')

    for weibo in all_weibo:
        # Relative XPaths below are evaluated against each card element.

        # Poster id: text of the first author link.
        pubid = weibo.find_elements(
            By.XPATH, 'div[1]/div[3]/div[1]/a[1]')[0].text
        # Post URL: href attribute of the same anchor.
        pubid_url = weibo.find_elements(
            By.XPATH, 'div[1]/div[3]/div[1]/a[1]')[0].get_attribute('href')
        # Post body text.
        pub_content = weibo.find_elements(
            By.XPATH, 'div[1]/div[3]/div[3]')[0].text
        item = [pubid, pubid_url, pub_content]
        data_csv(item)
def data_csv(item):
    """Append one row (e.g. ``[pubid, pubid_url, pub_content]``) to the CSV.

    The file is opened with GBK encoding (the original author targets a
    Chinese-locale Excel); characters GBK cannot encode raise
    ``UnicodeEncodeError``, which is reported and the row is skipped.

    :param item: list of values forming one CSV row.
    """
    with open('新浪微博爬取.csv', 'a', encoding='gbk', newline='') as csvfile:
        writer = csv.writer(csvfile)
        try:
            # BUG FIX: the original called writer.writerrow (typo), which
            # raised AttributeError on every call — and the bare except then
            # swallowed it, so no row was ever written. Use writerow and
            # catch only the encoding error we actually intend to ignore.
            writer.writerow(item)
        except UnicodeEncodeError:
            # Scraped text (emoji etc.) may not be representable in GBK.
            print("写入失败")
if __name__ == '__main__':
    # Instantiate the Chrome webdriver.
    # NOTE(review): passing the driver path positionally is Selenium 3 style;
    # Selenium 4 requires Service(executable_path=...) — confirm the installed
    # selenium version before upgrading this call.
    driver = webdriver.Chrome(r'D:\learnsofeware\python3.5\chromedriver.exe')

    # Implicit wait: element lookups retry for up to 10 s before raising.
    driver.implicitly_wait(10)
    try:
        login(driver)  # perform the login
        while True:
            spider(driver)  # scrape the home timeline
            time.sleep(600)  # re-scrape every 10 minutes
    finally:
        # FIX: the original never released the browser; make sure the Chrome
        # process is cleaned up even on Ctrl-C or an unhandled error.
        driver.quit()