import csv
import random
import time

import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
def login(driver):
    """Log in to weibo.com through the web login form.

    The credentials below are hard-coded placeholders ('*******'/'*****')
    and must be replaced with a real account for the login to succeed.

    :param driver: a selenium WebDriver instance created by the caller.
    """
    driver.get('https://weibo.com')
    time.sleep(3)
    # Enlarge the window so the login form is fully rendered and clickable.
    driver.set_window_size(1920, 1080)

    # Locate the username input box.
    # find_element(By.XPATH, ...) works on both Selenium 3 and 4; the old
    # find_element_by_xpath helper methods were removed in Selenium 4.
    username = driver.find_element(By.XPATH, '//*[@id="loginname"]')
    username.send_keys('*******')
    password = driver.find_element(
        By.XPATH, '//*[@id="pl_login_form"]/div/div[3]/div[2]/div/input')
    password.send_keys('*****')
    submit = driver.find_element(
        By.XPATH, '//*[@id="pl_login_form"]/div/div[3]/div[6]/a')
    print("一切输入准备好了,点击登录")
    submit.click()
    # Random pause while the login completes, to look less like a bot.
    time.sleep(random.randint(2, 5))
def spider(driver):
    """Scrape the weibo.com home timeline once.

    Refreshes the home page, then for every feed card extracts the poster
    id, the post URL and the post text, and appends each as a CSV row via
    ``data_csv``.

    :param driver: a logged-in selenium WebDriver instance.
    """
    # 1. Refresh the home page.
    driver.get('https://weibo.com')
    # Random pause so requests do not arrive at a fixed cadence.
    time.sleep(random.randint(2, 6))
    # Collect every weibo feed card on the page.
    # find_elements(By.XPATH, ...) works on both Selenium 3 and 4; the old
    # find_elements_by_xpath helper methods were removed in Selenium 4.
    all_weibo = driver.find_elements(
        By.XPATH, '//div[@class="WB_cardwrap WB_feed_type S_bg2 WB_feed_like"]')

    for weibo in all_weibo:
        # Relative XPaths below are evaluated against each card element.

        # Poster id: text of the first author link.
        pubid = weibo.find_elements(
            By.XPATH, 'div[1]/div[3]/div[1]/a[1]')[0].text
        # Post URL: href attribute of the same anchor.
        pubid_url = weibo.find_elements(
            By.XPATH, 'div[1]/div[3]/div[1]/a[1]')[0].get_attribute('href')
        # Post body text.
        pub_content = weibo.find_elements(
            By.XPATH, 'div[1]/div[3]/div[3]')[0].text
        item = [pubid, pubid_url, pub_content]
        data_csv(item)
def data_csv(item):
    """Append one row (e.g. ``[pubid, pubid_url, pub_content]``) to the CSV.

    The file is opened with GBK encoding (the original author targets a
    Chinese-locale Excel); characters GBK cannot encode raise
    ``UnicodeEncodeError``, which is reported and the row is skipped.

    :param item: list of values forming one CSV row.
    """
    with open('新浪微博爬取.csv', 'a', encoding='gbk', newline='') as csvfile:
        writer = csv.writer(csvfile)
        try:
            # BUG FIX: the original called writer.writerrow (typo), which
            # raised AttributeError on every call — and the bare except then
            # swallowed it, so no row was ever written. Use writerow and
            # catch only the encoding error we actually intend to ignore.
            writer.writerow(item)
        except UnicodeEncodeError:
            # Scraped text (emoji etc.) may not be representable in GBK.
            print("写入失败")
if __name__ == '__main__':
    # Instantiate the Chrome webdriver.
    # NOTE(review): passing the driver path positionally is Selenium 3 style;
    # Selenium 4 requires Service(executable_path=...) — confirm the installed
    # selenium version before upgrading this call.
    driver = webdriver.Chrome(r'D:\learnsofeware\python3.5\chromedriver.exe')

    # Implicit wait: element lookups retry for up to 10 s before raising.
    driver.implicitly_wait(10)
    try:
        login(driver)  # perform the login
        while True:
            spider(driver)  # scrape the home timeline
            time.sleep(600)  # re-scrape every 10 minutes
    finally:
        # FIX: the original never released the browser; make sure the Chrome
        # process is cleaned up even on Ctrl-C or an unhandled error.
        driver.quit()