From 602bb329c108261d21b739f9aaa544f747338f4f Mon Sep 17 00:00:00 2001 From: HangfengYang Date: 2018年5月19日 19:31:12 +0800 Subject: [PATCH 01/50] update README.md --- README.md | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 98b807f..96ae3b6 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,28 @@ -# PythonCrawler: 用python编写的爬虫项目集合 ``` - ( - )\ ) ) ) ( ( -(()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( - /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( -(_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ -| _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) -| _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| -|_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| - |__/ - —————— by yanghangfeng + ( + )\ ) ) ) ( ( + (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( + /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( + (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ + | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) + | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| + |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| + |__/ + —————— by yanghangfeng ``` +#

PythonCrawler: 用 Python 编写的爬虫项目集合

+

+ + + + + + + + + +

+ # spiderFile模块简介 From d4b0cb1aa3ce85ac7be301a297c878c04cd0998c Mon Sep 17 00:00:00 2001 From: HangfengYang Date: 2018年5月19日 19:32:47 +0800 Subject: [PATCH 02/50] update README.md --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 96ae3b6..bafbfe5 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ ``` - ( - )\ ) ) ) ( ( - (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( - /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( - (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ - | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) - | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| - |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| - |__/ - —————— by yanghangfeng + ( + )\ ) ) ) ( ( + (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( + /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( + (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ + | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) + | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| + |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| + |__/ + —————— by yanghangfeng ``` #

PythonCrawler: 用 Python 编写的爬虫项目集合

From 3fb5c3ae29d685d7dc41f3357602b99a9b0b0537 Mon Sep 17 00:00:00 2001 From: HangfengYang Date: 2018年5月19日 19:34:58 +0800 Subject: [PATCH 03/50] update README.md --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index bafbfe5..3a391c3 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ ``` - ( - )\ ) ) ) ( ( - (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( - /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( - (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ - | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) - | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| - |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| - |__/ - —————— by yanghangfeng + ( + )\ ) ) ) ( ( + (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( + /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( + (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ + | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) + | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| + |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| + |__/ + —————— by yanghangfeng ``` #

PythonCrawler: 用 Python 编写的爬虫项目集合

From aa412528725d09ec9bd7ae79e530cc168b1046f2 Mon Sep 17 00:00:00 2001 From: HangfengYang Date: 2018年5月19日 19:36:22 +0800 Subject: [PATCH 04/50] update README.md --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 3a391c3..c5d9826 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ ``` - ( - )\ ) ) ) ( ( - (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( - /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( - (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ - | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) - | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| - |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| - |__/ - —————— by yanghangfeng + ( + )\ ) ) ) ( ( + (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( + /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( + (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ + | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) + | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| + |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| + |__/ + —————— by yanghangfeng ``` #

PythonCrawler: 用 Python 编写的爬虫项目集合

From 9f0a1afb8bbfb891a6af02b5695bba3c33c0d056 Mon Sep 17 00:00:00 2001 From: HangfengYang Date: 2018年5月22日 12:16:06 +0800 Subject: [PATCH 05/50] update readme.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c5d9826..c698629 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ restaurantlist=restaurantList('http://www.dianping.com/search/category/2/10/p2') ``` ##### 2.获取代理IP -爬取http://proxy.ipcn.org,获取可用代理 +爬取[代理IP](http://proxy.ipcn.org) ```python from spiderAPI.proxyip import get_enableips From 1f4ac1645d6f793eff4bcac47f66c8d8b835a213 Mon Sep 17 00:00:00 2001 From: HangfengYang Date: 2018年5月24日 13:02:57 +0800 Subject: [PATCH 06/50] update README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index c698629..bf4462e 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,12 @@ + + + + + +

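A minimal usage sketch for the `get_enableips()` helper documented in the spiderAPI section above. The `"ip:port"` return format, the probe URL, and the `pick_working_proxy` wrapper are illustrative assumptions, not part of the repository; checking candidates against a probe URL before real crawling is the usual pattern, since free proxy lists go stale quickly.

```python
# Illustrative only: consume the get_enableips() helper documented above.
# The "ip:port" return format and the probe URL are assumptions.
import requests

def pick_working_proxy(candidates, probe_url="http://httpbin.org/ip", timeout=5):
    """Return the first proxy able to fetch probe_url, or None."""
    for proxy in candidates:
        proxies = {"http": f"http://{proxy}", "https": f"http://{proxy}"}
        try:
            if requests.get(probe_url, proxies=proxies, timeout=timeout).ok:
                return proxy
        except requests.RequestException:
            continue  # dead or slow proxy, try the next candidate
    return None

if __name__ == "__main__":
    # from spiderAPI.proxyip import get_enableips
    # print(pick_working_proxy(get_enableips()))
    print(pick_working_proxy(["127.0.0.1:8888"]))  # placeholder candidate
```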
From 2024a8e9083df330c55dddb5147fc172bb7dcd3f Mon Sep 17 00:00:00 2001 From: Hangfeng Yang Date: Fri, 3 Aug 2018 12:57:19 +0800 Subject: [PATCH 07/50] Update README.md --- README.md | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index bf4462e..e37e684 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ ``` #

PythonCrawler: 用 Python 编写的爬虫项目集合

- + @@ -21,11 +21,11 @@ - - + + - - + +

@@ -33,33 +33,33 @@ # spiderFile模块简介 -##### 1. [baidu_sy_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): 抓取百度的‘高清摄影’图片。 +##### 1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): 抓取百度的‘高清摄影’图片。 -##### 2. [baidu_wm_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): 抓取百度图片‘唯美意境’模块。 +##### 2. [baidu_wm_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): 抓取百度图片‘唯美意境’模块。 -##### 3. [get_photos.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/get_photos.py): 抓取百度贴吧某话题下的所有图片。 +##### 3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): 抓取百度贴吧某话题下的所有图片。 -##### 4. [get_web_all_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): 抓取整个网站的图片。 +##### 4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): 抓取整个网站的图片。 -##### 5. [lagou_position_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): 任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。 +##### 5. [lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): 任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。 -##### 6. [student_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/student_img.py): 基于本学校官网的url漏洞,获取所有注册学生学籍证件照。 +##### 6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): 基于本学校官网的url漏洞,获取所有注册学生学籍证件照。 -##### 7. [JD_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/JD_spider.py): 大批量抓取京东商品id和标签。 +##### 7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): 大批量抓取京东商品id和标签。 -##### 8. [ECUT_pos_html.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): 抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。 +##### 8. [ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): 抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。 -##### 9. [ECUT_get_grade.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): 模拟登陆学校官网,抓取成绩并计算平均学分绩。 +##### 9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): 模拟登陆学校官网,抓取成绩并计算平均学分绩。 -##### 10. [github_hot.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/github_hot.py): 抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。 +##### 10. 
[github_hot.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/github_hot.py): 抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。 -##### 11.[xz_picture_spider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/xz_picture_spider.py): 应一位知友的请求,抓取某网站上面所有的写真图片。 +##### 11.[xz_picture_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/xz_picture_spider.py): 应一位知友的请求,抓取某网站上面所有的写真图片。 -##### 12.[one_img.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/one_img.py): 抓取one文艺网站的图片。 +##### 12.[one_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_img.py): 抓取one文艺网站的图片。 -##### 13.[get_baike.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/get_baike.py): 任意输入一个关键词抓取百度百科的介绍。 +##### 13.[get_baike.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_baike.py): 任意输入一个关键词抓取百度百科的介绍。 -##### 14.[kantuSpider.py](https://github.com/Fenghuapiao/PythonCrawler/blob/master/spiderFile/kantuSpider.py): 抓取看图网站上的所有图片。 +##### 14.[kantuSpider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/kantuSpider.py): 抓取看图网站上的所有图片。 --- # spiderAPI模块简介 From d4ff21c6661613194e5a17cfb71863003e608190 Mon Sep 17 00:00:00 2001 From: HangfengYang Date: 2018年8月20日 10:59:10 +0800 Subject: [PATCH 08/50] =?UTF-8?q?=E6=B7=BB=E5=8A=A0fuckCTF.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 + spiderFile/fuckCTF.py | 123 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 spiderFile/fuckCTF.py diff --git a/README.md b/README.md index e37e684..e332982 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,8 @@ ##### 14.[kantuSpider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/kantuSpider.py): 抓取看图网站上的所有图片。 +##### 15.[fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): 通过selenium模拟登入合天网站,自动修改原始密码。 + --- # spiderAPI模块简介 #### 本模块提供一些网站的API爬虫接口,功能可能不是很全因此可塑性很大智慧的你如果有兴趣可以继续改进。 diff --git a/spiderFile/fuckCTF.py b/spiderFile/fuckCTF.py new file mode 100644 index 0000000..8e7f3a5 --- /dev/null +++ b/spiderFile/fuckCTF.py @@ -0,0 +1,123 @@ +import os +import random +from PIL import Image +from selenium import webdriver + + +class fuckCTF: + + def __init__(self, username, old_password): + self.url = "http://hetianlab.com/" + self.login_url = "http://hetianlab.com/loginLab.do" + self.username = username + self.old_password = old_password + self.new_password = (self.yield_new_password(), "111111")[0] + self.options = webdriver.FirefoxOptions() + self.options.add_argument("-headless") + self.browser = webdriver.Firefox(options=self.options) + print("init ok") + + def login_hetian(self): + self.browser.get(self.login_url) + self.browser.find_element_by_id("userEmail").clear() + self.browser.find_element_by_id("userEmail").send_keys(self.username) + self.browser.find_element_by_id("passwordIn").clear() + self.browser.find_element_by_id("passwordIn").send_keys(self.old_password) + self.browser.get_screenshot_as_file(self.username + '/' + "login.png") + self.browser.find_element_by_id("registButIn").click() + self.browser.get(self.url) + print("login_hetian running ok!") + + def get_personl_information_page(self): + grzx_btn = self.browser.find_element_by_xpath("/html/body/div[1]/div[1]/div/div/div[2]/ul/li[2]/a") + self.browser.execute_script("$(arguments[0]).click()", grzx_btn) + self.browser.get("http://hetianlab.com/getUserInfo.do") + 
print("get_personl_information_page running ok!") + + def get_password_setting_page(self): + mmsz_btn = self.browser.find_element_by_xpath("/html/body/div[2]/div/div[1]/ul/ul[3]/li[2]") + self.browser.execute_script("$(arguments[0]).click()", mmsz_btn) + self.browser.find_element_by_id("person").click() + self.browser.find_element_by_class_name("check") + print("get_password_setting_page running ok!") + + def yield_new_password(self): + strings = list("abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()") + return "".join(random.choices(strings, k=6)) + + def setting_password(self): + self.browser.find_element_by_id("oldpwd").clear() + self.browser.find_element_by_id("oldpwd").send_keys(self.old_password) + self.browser.find_element_by_id("newpwd").clear() + self.browser.find_element_by_id("newpwd").send_keys(self.new_password) + self.browser.find_element_by_id("quepwd").clear() + self.browser.find_element_by_id("quepwd").send_keys(self.new_password) + print("setting_password running ok!") + + def get_v_code(self): + status = self.browser.get_screenshot_as_file(self.username + '/' + "v_code.png") + if status: + img = Image.open(self.username + '/' + "v_code.png") + img.show() + self.v_code = input("请输入验证码: ") + self.browser.find_element_by_class_name("code").send_keys(self.v_code) + else: + raise("截屏失败!") + print("get_v_code running ok!") + + def submit_data(self): + self.browser.find_element_by_id("submitbtn").click() + self.browser.get_screenshot_as_file(self.username + '/' + "result.png") + self.browser.quit() + print("submit_data running ok!") + + def make_portfolio(self): + if not os.path.exists(self.username): + os.makedirs(self.username) + print("make_portfolio running ok!") + + def save_success_data(self): + with open("./username_and_password_data_successed.log", "a+") as fp: + fp.write( + "username" + ": {}".format(self.username) + "\t" + "password" + ": {}".format(self.new_password) + + "\n" + ) + print("save_success_data running ok!") + + def save_failed_data(self): + with open("./username_and_password_data_failed.log", "a+") as fp: + fp.write( + "username" + ": {}".format(self.username) + "\n" + ) + print("save_failed_data running ok!") + + def main(self): + try: + self.make_portfolio() + self.login_hetian() + self.get_personl_information_page() + self.get_password_setting_page() + self.setting_password() + self.get_v_code() + self.submit_data() + self.save_success_data() + except: + self.save_failed_data() + + +def yield_usernames(n): + prefix = "ctf2018_gzhu" + postfix = "@dh.com" + for num in range(1, n): + if num < 10: + infix = '0' + str(num) + else: + infix = str(num) + yield prefix + infix + postfix + + +if __name__ == "__main__": + for username in yield_usernames(100): + ctfer = fuckCTF(username, "111111") + ctfer.main() From a6590adc1e44f647bc02daff4ef299d6615577c6 Mon Sep 17 00:00:00 2001 From: yhf Date: 2018年8月20日 11:01:42 +0800 Subject: [PATCH 09/50] Update fuckCTF.py --- spiderFile/fuckCTF.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spiderFile/fuckCTF.py b/spiderFile/fuckCTF.py index 8e7f3a5..5e42979 100644 --- a/spiderFile/fuckCTF.py +++ b/spiderFile/fuckCTF.py @@ -119,5 +119,5 @@ def yield_usernames(n): if __name__ == "__main__": for username in yield_usernames(100): - ctfer = fuckCTF(username, "111111") + ctfer = fuckCTF(username, "******") ctfer.main() From 3e5a3e0200907a85137cb433c4049103663810c3 Mon Sep 17 00:00:00 2001 From: yhf Date: 2018年8月20日 15:19:08 +0800 Subject: [PATCH 10/50] Update fuckCTF.py --- spiderFile/fuckCTF.py | 7 +++++++ 
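A note on the `yield_new_password` helper in `fuckCTF.py` above: it draws six characters with `random.choices`, which is fine for bulk account resets but not cryptographically strong. Below is a minimal sketch of the same idea using the standard-library `secrets` module — the alphabet mirrors the script's, the rest is illustrative:

```python
# Sketch: same idea as yield_new_password() in fuckCTF.py above, but with
# the secrets module, which is designed for passwords and tokens.
import secrets

ALPHABET = "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()"

def new_password(length=6):
    """Draw `length` characters uniformly at random from ALPHABET."""
    return "".join(secrets.choice(ALPHABET) for _ in range(length))

print(new_password())  # output varies per run, e.g. 'k)3qa%'
```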
1 file changed, 7 insertions(+) diff --git a/spiderFile/fuckCTF.py b/spiderFile/fuckCTF.py index 5e42979..17e4b64 100644 --- a/spiderFile/fuckCTF.py +++ b/spiderFile/fuckCTF.py @@ -1,3 +1,10 @@ + +""" +author: 杨航锋 +date:2018年8月19日 +""" + + import os import random from PIL import Image From 6a9cf1f0f2723f007f71f98d6b9c35b8f8f51c4c Mon Sep 17 00:00:00 2001 From: yhf Date: 2018年8月20日 18:57:56 +0800 Subject: [PATCH 11/50] Update fuckCTF.py --- spiderFile/fuckCTF.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spiderFile/fuckCTF.py b/spiderFile/fuckCTF.py index 17e4b64..6142056 100644 --- a/spiderFile/fuckCTF.py +++ b/spiderFile/fuckCTF.py @@ -1,7 +1,8 @@ """ author: 杨航锋 -date:2018年8月19日 +date : 2018年8月19日 +mood : 嗯,比较无聊,甚至还有点想吃黄焖鸡米饭😋 """ From 3ee30111aa1551deea01b8bde85bc8e4b694150c Mon Sep 17 00:00:00 2001 From: yhf Date: 2018年8月20日 19:56:58 +0800 Subject: [PATCH 12/50] Update fuckCTF.py --- spiderFile/fuckCTF.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/spiderFile/fuckCTF.py b/spiderFile/fuckCTF.py index 6142056..a89e269 100644 --- a/spiderFile/fuckCTF.py +++ b/spiderFile/fuckCTF.py @@ -8,6 +8,8 @@ import os import random +import functools + from PIL import Image from selenium import webdriver @@ -19,7 +21,7 @@ def __init__(self, username, old_password): self.login_url = "http://hetianlab.com/loginLab.do" self.username = username self.old_password = old_password - self.new_password = (self.yield_new_password(), "111111")[0] + self.new_password = (self.yield_new_password(), "******")[0] self.options = webdriver.FirefoxOptions() self.options.add_argument("-headless") self.browser = webdriver.Firefox(options=self.options) @@ -49,9 +51,10 @@ def get_password_setting_page(self): self.browser.find_element_by_class_name("check") print("get_password_setting_page running ok!") + @gen_decorator def yield_new_password(self): strings = list("abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()") - return "".join(random.choices(strings, k=6)) + yield "".join(random.choices(strings, k=6)) def setting_password(self): self.browser.find_element_by_id("oldpwd").clear() @@ -113,7 +116,20 @@ def main(self): except: self.save_failed_data() - + +def gen_decorator(gen): + @functools.wraps(gen) + def inner(*args, **kwargs): + return next(gen(*args, **kwargs)) + return inner + + +@gen_decorator +def yield_new_password(): + strings = list("abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()") + yield "".join(random.choices(strings, k=6)) + + def yield_usernames(n): prefix = "ctf2018_gzhu" postfix = "@dh.com" From bc0c59b3039ed2af3555546a0d8366bc8842420d Mon Sep 17 00:00:00 2001 From: HangfengYang Date: 2018年8月20日 20:11:45 +0800 Subject: [PATCH 13/50] :yum::yum::yum: --- spiderFile/fuckCTF.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/spiderFile/fuckCTF.py b/spiderFile/fuckCTF.py index a89e269..d894db2 100644 --- a/spiderFile/fuckCTF.py +++ b/spiderFile/fuckCTF.py @@ -122,12 +122,6 @@ def gen_decorator(gen): def inner(*args, **kwargs): return next(gen(*args, **kwargs)) return inner - - -@gen_decorator -def yield_new_password(): - strings = list("abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()") - yield "".join(random.choices(strings, k=6)) def yield_usernames(n): From a30b6030412adb4a0ea821ff31ff21a0901f2546 Mon Sep 17 00:00:00 2001 From: HangfengYang Date: 2018年8月21日 10:36:04 +0800 Subject: [PATCH 14/50] :yum::yum::yum: --- spiderFile/fuckCTF.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git 
a/spiderFile/fuckCTF.py b/spiderFile/fuckCTF.py index d894db2..6c501ae 100644 --- a/spiderFile/fuckCTF.py +++ b/spiderFile/fuckCTF.py @@ -21,7 +21,7 @@ def __init__(self, username, old_password): self.login_url = "http://hetianlab.com/loginLab.do" self.username = username self.old_password = old_password - self.new_password = (self.yield_new_password(), "******")[0] + self.new_password = (yield_new_password(), "******")[0] self.options = webdriver.FirefoxOptions() self.options.add_argument("-headless") self.browser = webdriver.Firefox(options=self.options) @@ -51,11 +51,6 @@ def get_password_setting_page(self): self.browser.find_element_by_class_name("check") print("get_password_setting_page running ok!") - @gen_decorator - def yield_new_password(self): - strings = list("abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()") - yield "".join(random.choices(strings, k=6)) - def setting_password(self): self.browser.find_element_by_id("oldpwd").clear() self.browser.find_element_by_id("oldpwd").send_keys(self.old_password) @@ -122,12 +117,18 @@ def gen_decorator(gen): def inner(*args, **kwargs): return next(gen(*args, **kwargs)) return inner + + +@gen_decorator +def yield_new_password(): + strings = list("abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()") + yield "".join(random.choices(strings, k=6)) - + def yield_usernames(n): prefix = "ctf2018_gzhu" postfix = "@dh.com" - for num in range(1, n): + for num in range(n): if num < 10: infix = '0' + str(num) else: @@ -137,5 +138,5 @@ def yield_usernames(n): if __name__ == "__main__": for username in yield_usernames(100): - ctfer = fuckCTF(username, "******") + ctfer = fuckCTF(username, "111111") ctfer.main() From 8b29fdf6e60f1e9007724308b5a8d23a6d577055 Mon Sep 17 00:00:00 2001 From: yhf Date: 2018年8月21日 11:32:10 +0800 Subject: [PATCH 15/50] Update fuckCTF.py --- spiderFile/fuckCTF.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spiderFile/fuckCTF.py b/spiderFile/fuckCTF.py index 6c501ae..76597f1 100644 --- a/spiderFile/fuckCTF.py +++ b/spiderFile/fuckCTF.py @@ -138,5 +138,5 @@ def yield_usernames(n): if __name__ == "__main__": for username in yield_usernames(100): - ctfer = fuckCTF(username, "111111") + ctfer = fuckCTF(username, "******") ctfer.main() From b94801a967b236c5d7289904d6a67ce0608ea6a2 Mon Sep 17 00:00:00 2001 From: yhf Date: 2018年12月28日 11:09:40 +0800 Subject: [PATCH 16/50] Create LICENSE --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..02bfa5d --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 yhf + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From dd2c6c7f7b6fc4ce2a04088d9cbc71df14a762b6 Mon Sep 17 00:00:00 2001 From: yhf Date: 2019年1月22日 16:30:25 +0800 Subject: [PATCH 17/50] update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e332982..11e32c8 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -``` +```shell ( )\ ) ) ) ( ( (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( From 601fd91f9f50470103cd898b883fd729087131ab Mon Sep 17 00:00:00 2001 From: yhf Date: 2019年1月23日 17:04:17 +0800 Subject: [PATCH 18/50] add one_update.py --- spiderFile/one_update.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 spiderFile/one_update.py diff --git a/spiderFile/one_update.py b/spiderFile/one_update.py new file mode 100644 index 0000000..d457785 --- /dev/null +++ b/spiderFile/one_update.py @@ -0,0 +1,38 @@ +import re +import requests as rq + +ROOT_URL = "http://wufazhuce.com/one/" +URL_NUM = 14 + +def yield_url(ROOT_URL, URL_NUM): + return ROOT_URL + str(URL_NUM) + +def get_html(url): + return rq.get(url).content.decode("utf-8") + +def get_data(html): + img_url_regex = re.compile('') + cite_regex = re.compile('
(.*?)
', re.S) + img_url = re.findall(img_url_regex, html)[0] + cite = re.findall(cite_regex, html)[0].strip() + return img_url, cite + +def save_data(img_url, cite, URL_NUM): + with open("./{}.jpg".format(URL_NUM), "wb") as fp: + fp.write(rq.get(img_url).content) + with open("./cite{}.txt".format(URL_NUM), "w") as fp: + fp.write(cite) + return URL_NUM + 1 + +def main(ROOT_URL, URL_NUM, number): + for _ in range(number): + url = yield_url(ROOT_URL, URL_NUM) + html = get_html(url) + img_url, cite = get_data(html) + URL_NUM = save_data(img_url, cite, URL_NUM) + +if __name__ == "__main__": + try: + main(ROOT_URL, URL_NUM, 20) + except: + pass From bdb5d02d7111659c0a43f6fdf0cd2c3f261f7994 Mon Sep 17 00:00:00 2001 From: yhf Date: 2019年1月23日 17:04:48 +0800 Subject: [PATCH 19/50] update README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 11e32c8..ceed4fe 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,8 @@ ##### 15.[fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): 通过selenium模拟登入合天网站,自动修改原始密码。 +##### 16.[one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): 更新抓取one文艺网站的代码,添加一句箴言的抓取。 + --- # spiderAPI模块简介 #### 本模块提供一些网站的API爬虫接口,功能可能不是很全因此可塑性很大智慧的你如果有兴趣可以继续改进。 From 9b598859c520cf510ca1e65485b5e8327b21764d Mon Sep 17 00:00:00 2001 From: yhf Date: 2019年1月24日 16:33:08 +0800 Subject: [PATCH 20/50] update readme file. --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ceed4fe..5fdc214 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ —————— by yanghangfeng ``` #

PythonCrawler: 用 Python 编写的爬虫项目集合

+

@@ -67,8 +68,11 @@ --- # spiderAPI模块简介 + #### 本模块提供一些网站的API爬虫接口,功能可能不是很全因此可塑性很大智慧的你如果有兴趣可以继续改进。 + ##### 1.大众点评 + ```python from spiderAPI.dianping import * @@ -100,7 +104,8 @@ enableips=get_enableips() ``` ##### 3.百度地图 -百度地图提供的API,对查询有一些限制,这里找出了web上查询的接口 + +百度地图提供的API,对查询有一些限制,这里找出了web上查询的接口。 ```python from spiderAPI.baidumap import * From c7d9ed106529bfba0eca81a788e889343970d34f Mon Sep 17 00:00:00 2001 From: yhf Date: 2019年1月25日 15:49:29 +0800 Subject: [PATCH 21/50] update readme file. --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 5fdc214..e1549be 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ |__/ —————— by yanghangfeng ``` -#

PythonCrawler: 用 Python 编写的爬虫项目集合

+#

PythonCrawler: 用 Python 编写的爬虫项目集合:bug:

@@ -31,7 +31,6 @@

- # spiderFile模块简介 ##### 1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): 抓取百度的‘高清摄影’图片。 From 7df29c9aa457686f170305761df5dff901563a29 Mon Sep 17 00:00:00 2001 From: yhf Date: 2019年1月27日 15:31:17 +0800 Subject: [PATCH 22/50] update --- spiderFile/baidu_sy_img.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spiderFile/baidu_sy_img.py b/spiderFile/baidu_sy_img.py index faaf6e2..b663ea0 100644 --- a/spiderFile/baidu_sy_img.py +++ b/spiderFile/baidu_sy_img.py @@ -42,11 +42,11 @@ def get_img(page, headers): reg = re.compile('http://.*?\.jpg') imglist1 = re.findall(reg, page) imglist2 = imglist1[0: len(imglist1): 3] -# [img_url_list.append(i) for i in imglist if not i in img_url_list] + # [img_url_list.append(i) for i in imglist if not i in img_url_list] x = 0 for imgurl in imglist2: bin = requests.get(imgurl, headers=headers).content - with open('E:/Pic2/%s.jpg' % x, 'wb') as file: + with open('./%s.jpg' % x, 'wb') as file: file.write(bin) x += 1 From fba4d61bdc3e9e4fa651ebfcf8b0563f664587af Mon Sep 17 00:00:00 2001 From: yhf Date: 2019年1月28日 08:49:28 +0800 Subject: [PATCH 23/50] update readme file. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e1549be..35e4c3b 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ ##### 1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): 抓取百度的‘高清摄影’图片。 -##### 2. [baidu_wm_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): 抓取百度图片‘唯美意境’模块。 +##### 2. [baidu_wm_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): 抓取百度图片`唯美意境`模块。 ##### 3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): 抓取百度贴吧某话题下的所有图片。 From ed73a121534c187ca9735a148f1770c2de575ba7 Mon Sep 17 00:00:00 2001 From: yhf Date: 2019年2月19日 13:42:21 +0800 Subject: [PATCH 24/50] update README --- README.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 35e4c3b..bf1dee5 100644 --- a/README.md +++ b/README.md @@ -33,37 +33,37 @@ # spiderFile模块简介 -##### 1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): 抓取百度的‘高清摄影’图片。 +1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的‘高清摄影’图片。** -##### 2. [baidu_wm_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): 抓取百度图片`唯美意境`模块。 +2. [baidu_wm_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): **抓取百度图片`唯美意境`模块。** -##### 3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): 抓取百度贴吧某话题下的所有图片。 +3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): **抓取百度贴吧某话题下的所有图片。** -##### 4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): 抓取整个网站的图片。 +4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): **抓取整个网站的图片。** -##### 5. [lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): 任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。 +5. 
[lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): **任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。** -##### 6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): 基于本学校官网的url漏洞,获取所有注册学生学籍证件照。 +6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **基于本学校官网的url漏洞,获取所有注册学生学籍证件照。** -##### 7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): 大批量抓取京东商品id和标签。 +7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): **大批量抓取京东商品id和标签。** -##### 8. [ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): 抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。 +8. [ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): **抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。** -##### 9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): 模拟登陆学校官网,抓取成绩并计算平均学分绩。 +9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): **模拟登陆学校官网,抓取成绩并计算平均学分绩。** -##### 10. [github_hot.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/github_hot.py): 抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。 +10. [github_hot.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/github_hot.py): **抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。** -##### 11.[xz_picture_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/xz_picture_spider.py): 应一位知友的请求,抓取某网站上面所有的写真图片。 +11.[xz_picture_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/xz_picture_spider.py): **应一位知友的请求,抓取某网站上面所有的写真图片。** -##### 12.[one_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_img.py): 抓取one文艺网站的图片。 +12.[one_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_img.py): **抓取one文艺网站的图片。** -##### 13.[get_baike.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_baike.py): 任意输入一个关键词抓取百度百科的介绍。 +13.[get_baike.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_baike.py): **任意输入一个关键词抓取百度百科的介绍。** -##### 14.[kantuSpider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/kantuSpider.py): 抓取看图网站上的所有图片。 +14.[kantuSpider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/kantuSpider.py): **抓取看图网站上的所有图片。** -##### 15.[fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): 通过selenium模拟登入合天网站,自动修改原始密码。 +15.[fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): **通过selenium模拟登入合天网站,自动修改原始密码。** -##### 16.[one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): 更新抓取one文艺网站的代码,添加一句箴言的抓取。 +16.[one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。** --- # spiderAPI模块简介 From 16e884afcf68c4f003bf79220ae168d5fd6e75c4 Mon Sep 17 00:00:00 2001 From: yhf Date: 2019年2月19日 13:43:52 +0800 Subject: [PATCH 25/50] update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bf1dee5..47e5c43 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ # spiderFile模块简介 -1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的‘高清摄影’图片。** +1. 
[baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的`高清摄影`图片。** 2. [baidu_wm_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): **抓取百度图片`唯美意境`模块。** From db1bb529108f37548067e31ba276f91d3c05c4e7 Mon Sep 17 00:00:00 2001 From: yhf Date: 2019年2月19日 13:46:14 +0800 Subject: [PATCH 26/50] update README --- README.md | 45 +++++++++++++++------------------------------ 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 47e5c43..a6621e5 100644 --- a/README.md +++ b/README.md @@ -34,36 +34,21 @@ # spiderFile模块简介 1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的`高清摄影`图片。** - -2. [baidu_wm_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): **抓取百度图片`唯美意境`模块。** - -3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): **抓取百度贴吧某话题下的所有图片。** - -4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): **抓取整个网站的图片。** - -5. [lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): **任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。** - -6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **基于本学校官网的url漏洞,获取所有注册学生学籍证件照。** - -7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): **大批量抓取京东商品id和标签。** - -8. [ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): **抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。** - -9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): **模拟登陆学校官网,抓取成绩并计算平均学分绩。** - -10. [github_hot.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/github_hot.py): **抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。** - -11.[xz_picture_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/xz_picture_spider.py): **应一位知友的请求,抓取某网站上面所有的写真图片。** - -12.[one_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_img.py): **抓取one文艺网站的图片。** - -13.[get_baike.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_baike.py): **任意输入一个关键词抓取百度百科的介绍。** - -14.[kantuSpider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/kantuSpider.py): **抓取看图网站上的所有图片。** - -15.[fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): **通过selenium模拟登入合天网站,自动修改原始密码。** - -16.[one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。** +2. [baidu_wm_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_wm_img.py): **抓取百度图片`唯美意境`模块。** +3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): **抓取百度贴吧某话题下的所有图片。** +4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): **抓取整个网站的图片。** +5. [lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): **任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。** +6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **基于本学校官网的url漏洞,获取所有注册学生学籍证件照。** +7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): **大批量抓取京东商品id和标签。** +8. 
[ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): **抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。** +9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): **模拟登陆学校官网,抓取成绩并计算平均学分绩。** +10. [github_hot.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/github_hot.py): **抓取github上面热门语言所对应的项目,并把项目简介和项目主页地址保存到本地文件。** +11. [xz_picture_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/xz_picture_spider.py): **应一位知友的请求,抓取某网站上面所有的写真图片。** +12. [one_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_img.py): **抓取one文艺网站的图片。** +13. [get_baike.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_baike.py): **任意输入一个关键词抓取百度百科的介绍。** +14. [kantuSpider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/kantuSpider.py): **抓取看图网站上的所有图片。** +15. [fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): **通过selenium模拟登入合天网站,自动修改原始密码。** +16. [one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。** --- # spiderAPI模块简介 From 3ef305ba7b5b79f36dd1740d0f4043d0e4ba3faf Mon Sep 17 00:00:00 2001 From: yhf Date: Thu, 9 May 2019 09:04:09 +0800 Subject: [PATCH 27/50] add get_history_weather.py:leaves: --- spiderFile/get_history_weather.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 spiderFile/get_history_weather.py diff --git a/spiderFile/get_history_weather.py b/spiderFile/get_history_weather.py new file mode 100644 index 0000000..77176fc --- /dev/null +++ b/spiderFile/get_history_weather.py @@ -0,0 +1,31 @@ +import re +import pandas as pd +import requests as rq +from bs4 import BeautifulSoup + + +def get_data(url): + html = rq.get(url).content.decode("gbk") + soup = BeautifulSoup(html, "html.parser") + tr_list = soup.find_all("tr") + dates, conditions, temperatures = [], [], [] + for data in tr_list[1:]: + sub_data = data.text.split() + dates.append(sub_data[0]) + conditions.append("".join(sub_data[1:3])) + temperatures.append("".join(sub_data[3:6])) + _data = pd.DataFrame() + _data["日期"] = dates + _data["天气状况"] = conditions + _data["气温"] = temperatures + return _data + +# 获取广州市2019年第一季度天气状况 +data_1_month = get_data("http://www.tianqihoubao.com/lishi/guangzhou/month/201901.html") +data_2_month = get_data("http://www.tianqihoubao.com/lishi/guangzhou/month/201902.html") +data_3_month = get_data("http://www.tianqihoubao.com/lishi/guangzhou/month/201903.html") + + +data = pd.concat([data_1_month, data_2_month, data_3_month]).reset_index(drop=True) + +data.to_csv("guangzhou_history_weather_data.csv", index=False, encoding="utf-8") From d0a29c2deb36d9986321f38f97ff352c1d589135 Mon Sep 17 00:00:00 2001 From: yhf Date: Thu, 9 May 2019 09:05:22 +0800 Subject: [PATCH 28/50] update README:fire: --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index a6621e5..b2c5dc4 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ 14. [kantuSpider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/kantuSpider.py): **抓取看图网站上的所有图片。** 15. [fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): **通过selenium模拟登入合天网站,自动修改原始密码。** 16. [one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。** +17. 
[get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **抓取广州市2019年第一季度的天气数据。** --- # spiderAPI模块简介 From be62473e586df0f910bf05fc375e0daf1eda8374 Mon Sep 17 00:00:00 2001 From: yhf Date: Mon, 9 Nov 2020 09:59:00 +0800 Subject: [PATCH 29/50] Create search_useful_camera_ip_address.py --- spiderFile/search_useful_camera_ip_address.py | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 spiderFile/search_useful_camera_ip_address.py diff --git a/spiderFile/search_useful_camera_ip_address.py b/spiderFile/search_useful_camera_ip_address.py new file mode 100644 index 0000000..652b180 --- /dev/null +++ b/spiderFile/search_useful_camera_ip_address.py @@ -0,0 +1,92 @@ +import re +import tqdm +import time +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.common.exceptions import NoAlertPresentException, TimeoutException + +# 扫描网站可自己寻找,代码仅演示逻辑 +country = "IN" #印度 +city = "" +login_url = "" +query_url = "" +city_url = "" +USER_NAME = "" +PASSWORD = "" + +# 无头浏览器配置 +chrome_options = Options() +chrome_options.add_argument("--headless") +chrome_options.add_argument("--disable-gpu") +chrome_options.add_argument("log-level=3") +browser = webdriver.Chrome(chrome_options=chrome_options) +browser.set_page_load_timeout(10) + +#登录模块 +browser.get(login_url) +WebDriverWait(browser, 30).until( + EC.presence_of_element_located((By.XPATH, '//*[@name="login_submit"]')) +) +browser.find_element_by_id("username").clear() +browser.find_element_by_id("username").send_keys(USER_NAME) +browser.find_element_by_id("password").clear() +browser.find_element_by_id("password").send_keys(PASSWORD) +browser.find_element_by_name("login_submit").click() + +#抓取潜在的摄像头url,默认抓取两页 +if city: + query_url += city_url + +latent_camera_url = [] +browser.get(query_url) +WebDriverWait(browser, 30).until( + EC.presence_of_element_located((By.CLASS_NAME, 'button')) +) +html = browser.page_source +latent_camera_url += re.findall(' Date: Mon, 9 Nov 2020 10:03:18 +0800 Subject: [PATCH 30/50] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b2c5dc4..f3cfa1e 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ 15. [fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): **通过selenium模拟登入合天网站,自动修改原始密码。** 16. [one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。** 17. [get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **抓取广州市2019年第一季度的天气数据。** - +18. 
[search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **模拟登入某扫描网站获取潜在的摄像头IP地址,然后使用弱密码验证筛选出可登录的摄像头IP地址。** --- # spiderAPI模块简介 From 375a7bd08425abd0b820135f26bf0459ac3b41f3 Mon Sep 17 00:00:00 2001 From: yhf Date: Mon, 9 Nov 2020 10:04:26 +0800 Subject: [PATCH 31/50] Update README.md --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index f3cfa1e..4fd02b1 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ ```shell - ( - )\ ) ) ) ( ( - (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( - /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( - (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ - | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) - | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| - |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| - |__/ - —————— by yanghangfeng + ( + )\ ) ) ) ( ( + (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( + /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( + (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ + | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) + | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| + |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| + |__/ + —————— by yanghangfeng ``` #

PythonCrawler: 用 Python 编写的爬虫项目集合:bug:

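The `get_top_sec_com.py` file created in the patch below renders its market-cap ranking as an image by drawing a DataFrame onto a frameless, tickless Matplotlib axes; note that the initial version calls a bare `table(...)`, an undefined name until patch 34 switches the call to `pd.plotting.table`. A self-contained sketch of the working technique, with made-up sample data:

```python
# Minimal sketch of the DataFrame-to-image technique used by
# get_top_sec_com.py below: draw the table with pandas.plotting.table on
# an axes whose frame and ticks are hidden, then save the figure.
# The sample data here is made up for illustration.
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({"company": ["A", "B", "C"],
                   "market_cap": [120.5, 88.0, 64.2]},
                  index=range(1, 4))

fig = plt.figure(dpi=200)
ax = fig.add_subplot(111, frame_on=False)  # hide the axes box
ax.xaxis.set_visible(False)                # hide ticks and labels
ax.yaxis.set_visible(False)
pd.plotting.table(ax, df, loc="center", cellLoc="center")
plt.savefig("rank_demo.png", bbox_inches="tight")
```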
From 47370eee4e4e6a5b7bc80aa54945bd26792a4b88 Mon Sep 17 00:00:00 2001 From: yhf Date: Sun, 7 Feb 2021 10:54:20 +0800 Subject: [PATCH 32/50] Create get_top_sec_com.py --- spiderFile/get_top_sec_com.py | 67 +++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 spiderFile/get_top_sec_com.py diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py new file mode 100644 index 0000000..0007e77 --- /dev/null +++ b/spiderFile/get_top_sec_com.py @@ -0,0 +1,67 @@ +import re +import os +import joblib +import requests as rq + +import pandas as pd +import matplotlib.pyplot as plt + +class getTopSecCom: + def __init__(self, top=None): + self.headers = {"Referer": "http://quote.eastmoney.com/", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"} + self.bk_url = "http://71.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124034348162124675374_1612595298605&pn=1&pz=85&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f62&fs=b:BK0655&fields=f12,f14&_=1612595298611" + self.shares_api = "https://xueqiu.com/S/" + self.top = top + if not os.path.exists("./useful_sec_com_list"): + self.useful_sec_com_list = self.get_sec_com_code() + else: + with open("./useful_sec_com_list", "rb") as fp: + self.useful_sec_com_list = joblib.load(fp) + + def get_sec_com_code(self): + html = rq.get(self.bk_url, headers=self.headers).content.decode("utf-8") + sec_com_list = eval(re.findall("\[(.*?)\]", html)[0]) + useful_sec_com_list = [[i["f12"], i["f14"]] for i in sec_com_list if "ST" not in i["f14"]] + + # 0和3开头的为深证上市股票前缀为sz,6开头的为上证上市股票前缀为sh + for sec_com in useful_sec_com_list: + if sec_com[0][0] == "6": + sec_com[0] = "sh" + sec_com[0] + else: + sec_com[0] = "sz" + sec_com[0] + with open("useful_sec_com_list", "wb") as fp: + joblib.dump(useful_sec_com_list, fp) + return useful_sec_com_list + + def get_shares_details(self): + all_shares = [] + for sec_com in self.useful_sec_com_list: + url = self.shares_api + sec_com[0] + response = rq.get(url, headers=headers).content.decode("utf-8") + market_value = re.search(" 总市值:(.*?)亿", response) + if market_value: + all_shares.append([*sec_com, market_value.groups()[0]]) + return all_shares + + def yield_picture(self, save_path): + all_shares = self.get_shares_details() + df = pd.DataFrame(all_shares, columns=["股票代码", "公司", "市值(亿)"]) + df["市值(亿)"] = df["市值(亿)"].astype(float) + df.sort_values(by="市值(亿)", ascending=False, inplace=True) + height = 0.18 * df.shape[0] + if self.top and 0< self.top <= df.shape[0]: + df = df.iloc[:self.top, :] + height = 0.18 * self.top + df.index = range(1, df.shape[0]+1) + + plt.rcParams['font.sans-serif'] = ['SimHei'] + plt.rcParams['axes.unicode_minus'] = False + + + fig = plt.figure(figsize=(2.5, height), dpi=400) + ax = fig.add_subplot(111, frame_on=False) + ax.xaxis.set_visible(False) + ax.yaxis.set_visible(False) + _ = table(ax, df, loc="center") + fig.savefig(save_path) From 4d86db327934e60e75223219652440ed60c799c2 Mon Sep 17 00:00:00 2001 From: yhf Date: Sun, 7 Feb 2021 10:56:46 +0800 Subject: [PATCH 33/50] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4fd02b1..87ed4b0 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,7 @@ 16. [one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。** 17. 
[get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **抓取广州市2019年第一季度的天气数据。** 18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **模拟登入某扫描网站获取潜在的摄像头IP地址,然后使用弱密码验证筛选出可登录的摄像头IP地址。** +19. [get_top_sec_com.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_top_sec_com.py): **获取A股市场网络安全版块公司市值排名情况,并以图片格式保存下来。** --- # spiderAPI模块简介 From ac3c78a8b82168931d9ec36e7f7669528a9aa347 Mon Sep 17 00:00:00 2001 From: yhf Date: 2021年2月28日 10:56:40 +0800 Subject: [PATCH 34/50] Update get_top_sec_com.py --- spiderFile/get_top_sec_com.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py index 0007e77..3a3186c 100644 --- a/spiderFile/get_top_sec_com.py +++ b/spiderFile/get_top_sec_com.py @@ -38,7 +38,7 @@ def get_shares_details(self): all_shares = [] for sec_com in self.useful_sec_com_list: url = self.shares_api + sec_com[0] - response = rq.get(url, headers=headers).content.decode("utf-8") + response = rq.get(url, headers=self.headers).content.decode("utf-8") market_value = re.search(" 总市值:(.*?)亿", response) if market_value: all_shares.append([*sec_com, market_value.groups()[0]]) @@ -49,10 +49,10 @@ def yield_picture(self, save_path): df = pd.DataFrame(all_shares, columns=["股票代码", "公司", "市值(亿)"]) df["市值(亿)"] = df["市值(亿)"].astype(float) df.sort_values(by="市值(亿)", ascending=False, inplace=True) - height = 0.18 * df.shape[0] + height = 0.2 * df.shape[0] if self.top and 0< self.top <= df.shape[0]: df = df.iloc[:self.top, :] - height = 0.18 * self.top + height = 0.2 * self.top df.index = range(1, df.shape[0]+1) plt.rcParams['font.sans-serif'] = ['SimHei'] @@ -63,5 +63,5 @@ def yield_picture(self, save_path): ax = fig.add_subplot(111, frame_on=False) ax.xaxis.set_visible(False) ax.yaxis.set_visible(False) - _ = table(ax, df, loc="center") - fig.savefig(save_path) + _ = pd.plotting.table(ax, df, loc="center", cellLoc="center") + plt.savefig(save_path) From 57c0937cc3349facd0f03ec1b21c2d833d1fe8d4 Mon Sep 17 00:00:00 2001 From: yhf Date: 2021年4月16日 15:46:10 +0800 Subject: [PATCH 35/50] Update get_top_sec_com.py add async function. 
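Patch 35 below fans one `aiohttp` request out per stock with `asyncio.create_task` and collects the results with `asyncio.wait`. A self-contained sketch of that fan-out pattern follows, with placeholder URLs; unlike the patch, it shares a single `ClientSession` across all tasks, which reuses one connection pool instead of opening a new session per request:

```python
# Sketch of the asyncio + aiohttp fan-out used in the patch below:
# one task per URL, all awaited together. The URLs are placeholders.
import asyncio
import aiohttp

async def fetch(session, url):
    async with session.get(url) as resp:
        body = await resp.text()
        return url, len(body)

async def crawl(urls):
    # One shared session for every task (the patch opens one per request).
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(fetch(session, u)) for u in urls]
        return await asyncio.gather(*tasks)

if __name__ == "__main__":
    for url, size in asyncio.run(crawl(["https://example.com"] * 3)):
        print(url, size)
```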
--- spiderFile/get_top_sec_com.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py index 3a3186c..be0f706 100644 --- a/spiderFile/get_top_sec_com.py +++ b/spiderFile/get_top_sec_com.py @@ -1,6 +1,8 @@ import re import os import joblib +import asyncio +import aiohttp import requests as rq import pandas as pd @@ -33,6 +35,26 @@ def get_sec_com_code(self): with open("useful_sec_com_list", "wb") as fp: joblib.dump(useful_sec_com_list, fp) return useful_sec_com_list + + async def async_get_shares_details(self, sec_com, url): + async with aiohttp.ClientSession() as session: + async with session.get(url, headers=self.headers) as response: + html = await response.text() + market_value = re.search(" 总市值:(.*?)亿", html) + if market_value: + return [*sec_com, market_value.groups()[0]] + + async def async_get_all_shares(self): + tasks = [] + for sec_com in self.useful_sec_com_list: + url = self.shares_api + sec_com[0] + tasks.append( + asyncio.create_task( + self.async_get_shares_details(sec_com, url) + ) + ) + done, pendding = await asyncio.wait(tasks) + return [share.result() for share in done if share.result()] def get_shares_details(self): all_shares = [] @@ -45,7 +67,8 @@ def get_shares_details(self): return all_shares def yield_picture(self, save_path): - all_shares = self.get_shares_details() + # all_shares = self.get_shares_details() # 同步代码 + all_shares = asyncio.run(self.async_get_all_shares()) # 异步代码 df = pd.DataFrame(all_shares, columns=["股票代码", "公司", "市值(亿)"]) df["市值(亿)"] = df["市值(亿)"].astype(float) df.sort_values(by="市值(亿)", ascending=False, inplace=True) From 4e2dc05f1935659700358b10604ac6622aced224 Mon Sep 17 00:00:00 2001 From: yhf Date: 2021年4月16日 16:01:25 +0800 Subject: [PATCH 36/50] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 87ed4b0..9f1e6e2 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,7 @@ 16. [one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。** 17. [get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **抓取广州市2019年第一季度的天气数据。** 18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **模拟登入某扫描网站获取潜在的摄像头IP地址,然后使用弱密码验证筛选出可登录的摄像头IP地址。** -19. [get_top_sec_com.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_top_sec_com.py): **获取A股市场网络安全版块公司市值排名情况,并以图片格式保存下来。** +19. 
[get_top_sec_com.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_top_sec_com.py): **异步编程获取A股市场网络安全版块公司市值排名情况,并以图片格式保存下来。** --- # spiderAPI模块简介 From 27f856ffcf3dc76ef6762c41973d889583604e69 Mon Sep 17 00:00:00 2001 From: yhf Date: 2021年4月17日 10:12:58 +0800 Subject: [PATCH 37/50] Create get_tj_accident_info.py --- spiderFile/get_tj_accident_info.py | 77 ++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 spiderFile/get_tj_accident_info.py diff --git a/spiderFile/get_tj_accident_info.py b/spiderFile/get_tj_accident_info.py new file mode 100644 index 0000000..b8b2237 --- /dev/null +++ b/spiderFile/get_tj_accident_info.py @@ -0,0 +1,77 @@ +import re +import joblib +import asyncio +import aiohttp +import requests as rq +from bs4 import BeautifulSoup + +def yield_all_page_url(root_url, page=51): + """生成所有的页面url + @param root_url: 首页url + type root_url: str + @param page: 爬取的页面个数 + type page: int + """ + # 观察网站翻页结构可知 + page_url_list = [f"{root_url}index_{i}.html" for i in range(1, page)] + # 添加首页url + page_url_list.insert(0, root_url) + return page_url_list + +async def get_info_page_url(url, session): + regex = re.compile("
') + html = rq.get(url, headers=HEADERS).content.decode("utf-8") + soup = BeautifulSoup(html) + title = re.search(title_regex, html) + content_1 = soup.find("div", class_="TRS_UEDITOR TRS_WEB") + content_2 = soup.find("div", class_="view TRS_UEDITOR trs_paper_default trs_word") + content_3 = soup.find("div", class_="view TRS_UEDITOR trs_paper_default trs_web") + if content_1: + content = content_1.text + elif content_2: + content = content_2.text + elif content_3: + content = content_3.text + else: + content = "" + return {"title": title.groups()[0], "content": content} + +def get_all_data(all_info_page_url_list): + all_data = [] + for i, url in enumerate(all_info_page_url_list): + all_data.append(get_data(url)) + print(i, url, all_data[-1]) + joblib.dump(all_data, "all_data.joblib") + + +if __name__ == "__main__": + root_url = "http://yjgl.tj.gov.cn/ZWGK6939/SGXX3106/" + agent_part_1 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + agent_part_2 = "(KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36" + HEADERS = {"Host": "yjgl.tj.gov.cn", + "Connection": "keep-alive", + "User-Agent": agent_part_1 + agent_part_2, + "Referer": "http://static.bshare.cn/"} + page_url_list = yield_all_page_url(root_url, page=51) + all_info_page_url_list = asyncio.run(get_all_info_page_url(root_url, page_url_list)) + joblib.dump("all_info_page_url_list", all_info_page_url_list) From cf927e594a2cb1ff224f95347a513c729e2b71ed Mon Sep 17 00:00:00 2001 From: yhf Date: 2021年4月17日 10:15:39 +0800 Subject: [PATCH 38/50] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 9f1e6e2..4062693 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ 17. [get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **抓取广州市2019年第一季度的天气数据。** 18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **模拟登入某扫描网站获取潜在的摄像头IP地址,然后使用弱密码验证筛选出可登录的摄像头IP地址。** 19. [get_top_sec_com.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_top_sec_com.py): **异步编程获取A股市场网络安全版块公司市值排名情况,并以图片格式保存下来。** +20. 
+20. [get_tj_accident_info.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_tj_accident_info.py): **同步和异步编程结合获取天津市应急管理局所有事故信息。**
 ---
 # spiderAPI模块简介

From 198e817f146d211c9d4c3a470de44eb372d7c291 Mon Sep 17 00:00:00 2001
From: yhf
Date: Wed, 28 Apr 2021 12:14:53 +0800
Subject: [PATCH 39/50] Update get_top_sec_com.py

---
 spiderFile/get_top_sec_com.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py
index be0f706..5229bbd 100644
--- a/spiderFile/get_top_sec_com.py
+++ b/spiderFile/get_top_sec_com.py
@@ -87,4 +87,4 @@ def yield_picture(self, save_path):
         ax.xaxis.set_visible(False)
         ax.yaxis.set_visible(False)
         _ = pd.plotting.table(ax, df, loc="center", cellLoc="center")
-        plt.savefig(save_path)
+        plt.savefig(save_path, bbox_inches="tight")

From 66304e7b63b028a9a031428ea7b1d3859d12627a Mon Sep 17 00:00:00 2001
From: yhf
Date: Fri, 14 May 2021 16:45:31 +0800
Subject: [PATCH 40/50] Update get_top_sec_com.py

---
 spiderFile/get_top_sec_com.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py
index 5229bbd..caf9ac2 100644
--- a/spiderFile/get_top_sec_com.py
+++ b/spiderFile/get_top_sec_com.py
@@ -1,5 +1,6 @@
 import re
 import os
+import time
 import joblib
 import asyncio
 import aiohttp
@@ -71,20 +72,22 @@ def yield_picture(self, save_path):
         all_shares = asyncio.run(self.async_get_all_shares())  # 异步代码
         df = pd.DataFrame(all_shares, columns=["股票代码", "公司", "市值(亿)"])
         df["市值(亿)"] = df["市值(亿)"].astype(float)
+        date = time.strftime("%Y年%m月%d日", time.localtime())
         df.sort_values(by="市值(亿)", ascending=False, inplace=True)
-        height = 0.2 * df.shape[0]
-        if self.top and 0< self.top <= df.shape[0]:
-            df = df.iloc[:self.top, :]
-            height = 0.2 * self.top
         df.index = range(1, df.shape[0]+1)
         plt.rcParams['font.sans-serif'] = ['SimHei']
         plt.rcParams['axes.unicode_minus'] = False
-        fig = plt.figure(figsize=(2.5, height), dpi=400)
+        fig = plt.figure(dpi=400)
         ax = fig.add_subplot(111, frame_on=False)
         ax.xaxis.set_visible(False)
-        ax.yaxis.set_visible(False)
-        _ = pd.plotting.table(ax, df, loc="center", cellLoc="center")
+        ax.yaxis.set_visible(False)
+        _ = pd.plotting.table(ax, df, loc="best", cellLoc="center")
+        ax.set_title(f"{date}A股网安版块公司市值排名", fontsize=10)
         plt.savefig(save_path, bbox_inches="tight")
+
+
+if __name__ == "__main__":
+    m = getTopSecCom()
+    m.yield_picture("rank.png")

From 5bb06b8864226e6977501bf08f56db889e0f8e7c Mon Sep 17 00:00:00 2001
From: yhf
Date: Fri, 14 May 2021 16:46:33 +0800
Subject: [PATCH 41/50] Update get_top_sec_com.py

---
 spiderFile/get_top_sec_com.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py
index caf9ac2..f1fce0a 100644
--- a/spiderFile/get_top_sec_com.py
+++ b/spiderFile/get_top_sec_com.py
@@ -8,6 +8,8 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 
+# import nest_asyncio
+# nest_asyncio.apply()
 
 class getTopSecCom:
     def __init__(self, top=None):
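Two details in these three patches are easy to miss. First, the commented-out `nest_asyncio` lines address a real pitfall: `asyncio.run` raises a `RuntimeError` when an event loop is already running, as it is inside Jupyter, and `nest_asyncio.apply()` is the usual workaround there. Second, the ranking picture is produced by drawing the DataFrame with `pd.plotting.table` on an axes-less figure and letting `bbox_inches="tight"` crop the margins. A minimal sketch of that trick with made-up, ASCII-only sample data (the patch additionally sets the `SimHei` font so its Chinese labels render):

```python
import pandas as pd
import matplotlib
matplotlib.use("Agg")            # render to a file, no display needed
import matplotlib.pyplot as plt

# Made-up rows purely for illustration.
df = pd.DataFrame({"company": ["DemoA", "DemoB"], "cap_bn": [123.4, 56.7]},
                  index=[1, 2])

fig = plt.figure(dpi=200)
ax = fig.add_subplot(111, frame_on=False)
ax.xaxis.set_visible(False)      # the table carries its own labels
ax.yaxis.set_visible(False)
pd.plotting.table(ax, df, loc="center", cellLoc="center")
plt.savefig("table.png", bbox_inches="tight")  # crop surrounding whitespace
```

Without `bbox_inches="tight"`, the saved image keeps the full (mostly empty) figure canvas around the table, which is exactly what PATCH 39 was fixing.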
From bfa05ebfd961c00a5840812a536860eba7f92faf Mon Sep 17 00:00:00 2001
From: yhf
Date: Sun, 2 Jan 2022 09:11:53 +0800
Subject: [PATCH 42/50] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 4062693..bab2523 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
      |__/
                           —————— by yanghangfeng
 ```
-#<p align="center">PythonCrawler: 用 python编写的爬虫项目集合:bug:</p>
+#<p align="center">PythonCrawler: 用 python编写的爬虫项目集合:bug:(本项目代码仅作为爬虫技术学习之用,学习者千万要遵循中华人民共和国法律!)</p>
@@ -38,7 +38,7 @@
 3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): **抓取百度贴吧某话题下的所有图片。**
 4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): **抓取整个网站的图片。**
 5. [lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): **任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。**
-6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **基于本学校官网的url漏洞,获取所有注册学生学籍证件照。**
+6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **自动化获取学籍证件照。**
 7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): **大批量抓取京东商品id和标签。**
 8. [ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): **抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。**
 9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): **模拟登陆学校官网,抓取成绩并计算平均学分绩。**

From 4630508b964288937b0c68dd9a845dcbbc0a605b Mon Sep 17 00:00:00 2001
From: yhf
Date: Sun, 2 Jan 2022 09:13:18 +0800
Subject: [PATCH 43/50] Update student_img.py

---
 spiderFile/student_img.py | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/spiderFile/student_img.py b/spiderFile/student_img.py
index d3135ea..a66436d 100644
--- a/spiderFile/student_img.py
+++ b/spiderFile/student_img.py
@@ -1,29 +1,6 @@
 import requests
 
 """
-思路:去官网自己的主页,看自己的照片的url然后你懂的。
+思路:去官网自己的主页,看自己的学籍照片的url。
 """
-url = ''
-banji = []
-zhuanye = []
-for a in range(10):
-    for b in range(10):
-        banji.append(str(a) + '0' + str(b))
-for c in range(10):
-    zhuanye.append('20' + str(c))
-for year in range(2011, 2015):
-    for xh in zhuanye:
-        for nj in banji:
-            for i in range(1, 35):
-                if i < 10:
-                    xuehao = str(year) + str(xh) + str(nj) + '0' + str(i)
-                    student_url = url + xuehao
-                    with open('E:/student_img/%s.jpeg' % xuehao, 'wb') as file:
-                        file.write(requests.get(student_url).content)
-                else:
-                    xuehao = str(year) + str(xh) + str(nj) + str(i)
-                    student_url = url + xuehao
-                    with open('E:/student_img/%s.jpeg' % xuehao, 'wb') as file:
-                        file.write(requests.get(student_url).content)
-print('OK!')

From 483c276b7c700f16ec7b7f5a62b194a93dedf26e Mon Sep 17 00:00:00 2001
From: yhf
Date: Sun, 2 Jan 2022 09:14:46 +0800
Subject: [PATCH 44/50] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index bab2523..36c5fdb 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@
 15. [fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): **通过selenium模拟登入合天网站,自动修改原始密码。**
 16. [one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **更新抓取one文艺网站的代码,添加一句箴言的抓取。**
 17. [get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **抓取广州市2019年第一季度的天气数据。**
-18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **模拟登入某扫描网站获取潜在的摄像头IP地址,然后使用弱密码验证筛选出可登录的摄像头IP地址。**
+18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **摄像头弱密码安全科普。**
 19. [get_top_sec_com.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_top_sec_com.py): **异步编程获取A股市场网络安全版块公司市值排名情况,并以图片格式保存下来。**
 20. [get_tj_accident_info.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_tj_accident_info.py): **同步和异步编程结合获取天津市应急管理局所有事故信息。**

From 81a5eed902dd2514464ec7da5e5fe465eb8b082f Mon Sep 17 00:00:00 2001
From: yhf
Date: Fri, 8 Jul 2022 15:43:54 +0800
Subject: [PATCH 45/50] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 36c5fdb..6b92523 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
      |__/
                           —————— by yanghangfeng
 ```
-#<p align="center">PythonCrawler: 用 python编写的爬虫项目集合:bug:(本项目代码仅作为爬虫技术学习之用,学习者千万要遵循中华人民共和国法律!)</p>
+#<p align="center">PythonCrawler: 用 python编写的爬虫项目集合:bug:(本项目代码仅作为爬虫技术学习之用,学习者务必遵循中华人民共和国法律!)</p>
@@ -38,7 +38,7 @@
 3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): **抓取百度贴吧某话题下的所有图片。**
 4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): **抓取整个网站的图片。**
 5. [lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): **任意输入关键字,一键抓取与关键字相关的职位招聘信息,并保存到本地文件。**
-6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **自动化获取学籍证件照。**
+6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **自动化获取自己学籍证件照。**
 7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): **大批量抓取京东商品id和标签。**
 8. [ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): **抓取学校官网所有校园招聘信息,并保存为html格式,图片也会镶嵌在html中。**
 9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): **模拟登陆学校官网,抓取成绩并计算平均学分绩。**

From 0d60014d61a5b159257a366a9088752d6c6059b3 Mon Sep 17 00:00:00 2001
From: yhf
Date: Mon, 24 Oct 2022 14:59:17 +0800
Subject: [PATCH 46/50] Update README.md

---
 README.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6b92523..277bb18 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,13 @@
-
+对于很多小伙伴咨询IP代理的问题,推荐一个产品,链接:http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf
+产品介绍:
+1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池。
+2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景。
+3、支持HTTP/HTTPS/Socks5协议
+4、真实住宅IP,支持从制定国家城市访问目标网站,隐藏真实网络环境,保护隐私,24小时持续过滤并更新,IP纯净度高,快速响应,无限并发,99.9%的成功率,确保高效稳定连接,让您的业务得心应手
+5、支持海量IP免费试用

 # spiderFile模块简介

From bf967800e286a4241518bc3304d0c593fc7d3062 Mon Sep 17 00:00:00 2001
From: yhf
Date: Mon, 24 Oct 2022 15:01:33 +0800
Subject: [PATCH 47/50] Update README.md

---
 README.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 277bb18..dd07248 100644
--- a/README.md
+++ b/README.md
@@ -29,13 +29,14 @@
-对于很多小伙伴咨询IP代理的问题,推荐一个产品,链接:http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf
+对于很多小伙伴咨询IP代理的问题,推荐一个产品,[相关链接点击直达](http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf)
 产品介绍:
-1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池。
+1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池;
-2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景。
+2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景;
-3、支持HTTP/HTTPS/Socks5协议
+3、支持HTTP/HTTPS/Socks5协议;
 4、真实住宅IP,支持从制定国家城市访问目标网站,隐藏真实网络环境,保护隐私,24小时持续过滤并更新,IP纯净度高,快速响应,无限并发,99.9%的成功率,确保高效稳定连接,让您的业务得心应手
-5、支持海量IP免费试用
+;
+5、支持海量IP免费试用。

 # spiderFile模块简介

From 72ba185ce7a774c06515a08db02ff494b6c49e Mon Sep 17 00:00:00 2001
From: yhf
Date: Mon, 24 Oct 2022 15:30:14 +0800
Subject: [PATCH 48/50] Update README.md

---
 README.md | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index dd07248..7dc2c5b 100644
--- a/README.md
+++ b/README.md
@@ -29,13 +29,14 @@
-对于很多小伙伴咨询IP代理的问题,推荐一个产品,[相关链接点击直达](http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf) -产品介绍: -1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池; -2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景; -3、支持HTTP/HTTPS/Socks5协议; + +由于很多小伙伴都咨询IP代理的问题,在这里推荐大家一个好用的产品,[相关链接点击直达](http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf) +产品介绍: +1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池; +2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景; +3、支持HTTP/HTTPS/Socks5协议; 4、真实住宅IP,支持从制定国家城市访问目标网站,隐藏真实网络环境,保护隐私,24小时持续过滤并更新,IP纯净度高,快速响应,无限并发,99.9%的成功率,确保高效稳定连接,让您的业务得心应手 -; +; 5、支持海量IP免费试用。 # spiderFile模块简介 From f9315ea9a0ec52e1e24aa62211ebfdd797903f2c Mon Sep 17 00:00:00 2001 From: yhf Date: Thu, 2 Mar 2023 19:02:48 +0800 Subject: [PATCH 49/50] Update README.md --- README.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/README.md b/README.md index 7dc2c5b..c54c2a1 100644 --- a/README.md +++ b/README.md @@ -30,15 +30,6 @@
-由于很多小伙伴都咨询IP代理的问题,在这里推荐大家一个好用的产品,[相关链接点击直达](http://www.ipidea.net/?utm-source=yhf&utm-keyword=?yhf)
-产品介绍:
-1、覆盖220+的国家和地区,9000万真实住宅IP资源,汇聚成大规模代理服务池;
-2、提供动态住宅代理、静态住宅代理、数据中心、移动代理等多种解决方案,满足电子商务、市场调查、抓取索引、网站测试、广告验证、seo监控优化等多个业务场景;
-3、支持HTTP/HTTPS/Socks5协议;
-4、真实住宅IP,支持从制定国家城市访问目标网站,隐藏真实网络环境,保护隐私,24小时持续过滤并更新,IP纯净度高,快速响应,无限并发,99.9%的成功率,确保高效稳定连接,让您的业务得心应手
-;
-5、支持海量IP免费试用。
 
 # spiderFile模块简介
 1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的`高清摄影`图片。**

From fac1ffc9c9e6a04875b55d54cd67dbf72ac39db2 Mon Sep 17 00:00:00 2001
From: yhf
Date: Thu, 17 Apr 2025 14:14:18 +0800
Subject: [PATCH 50/50] Update README.md

---
 README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/README.md b/README.md
index c54c2a1..6f96817 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,18 @@
+# IPWO全球代理资源 | 为采集、跨境与测试项目提供支持(免费试用,爬虫使用强烈推荐!!!)
+### 官网地址
+[👉 访问 IPWO 官网](https://www.ipwo.net/?code=WSESV2ONN)
+### 产品简介
+* 免费试用,先体验再选择
+* 9000万+真实住宅IP,覆盖220+国家和地区
+* 支持动态住宅代理、静态住宅代理(ISP)
+* 适用于数据抓取、电商、广告验证、SEO监控等场景
+* 支持HTTP/HTTPS/SOCKS5协议,兼容性强
+* 纯净IP池,实时更新,99.9%连接成功率
+* 支持指定国家城市地区访问,保护隐私
+
 # spiderFile模块简介
 1. [baidu_sy_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/baidu_sy_img.py): **抓取百度的`高清摄影`图片。**