diff --git a/README.md b/README.md
index b2c5dc4..6f96817 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,16 @@
 ```shell
-(
- )\ )             )           )   (                (
-(()/(   (     ( /(  ( /(  )\  (    )    (    (     )\   (    (
- /(_)))\ )   )\()))\())  (    (  (((_)  )(  ( /( )\))( ((_)  ))\  )(
-(_)) (()/(  (_))/((_)\   )\   )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\
-| _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_)
-| _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_|
-|_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_|
-   |__/
-—————— by yanghangfeng
+ (
+  )\ )             )           )   (                (
+ (()/(   (     ( /(  ( /(  )\  (    )    (    (     )\   (    (
+  /(_)))\ )   )\()))\())  (    (  (((_)  )(  ( /( )\))( ((_)  ))\  )(
+ (_)) (()/(  (_))/((_)\   )\   )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\
+ | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_)
+ | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_|
+ |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_|
+    |__/
+ —————— by yanghangfeng
 ```
-# PythonCrawler: a collection of crawler projects written in Python :bug:
+# PythonCrawler: a collection of crawler projects written in Python :bug: (The code in this project is for learning crawling techniques only; users must comply with the laws of the People's Republic of China!)
@@ -30,6 +30,17 @@
 
+# IPWO global proxy resources | support for scraping, cross-border, and testing projects (free trial; strongly recommended for crawlers!!!)
+### Official site
+[👉 Visit the IPWO site](https://www.ipwo.net/?code=WSESV2ONN)
+### Product overview
+* Free trial: try it before you commit
+* 90M+ real residential IPs covering 220+ countries and regions
+* Supports rotating residential proxies and static residential (ISP) proxies
+* Fits data scraping, e-commerce, ad verification, SEO monitoring, and similar scenarios
+* Supports HTTP/HTTPS/SOCKS5 for broad compatibility
+* Clean IP pool, refreshed in real time, 99.9% connection success rate
+* Supports targeting specific countries, cities, and regions while protecting privacy
 
 # spiderFile module overview
@@ -38,7 +49,7 @@
 3. [get_photos.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_photos.py): **Grab every image under a given Baidu Tieba topic.**
 4. [get_web_all_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_web_all_img.py): **Crawl all images from an entire website.**
 5. [lagou_position_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/lagou_position_spider.py): **Enter any keyword and grab all related job postings in one pass, saving them to a local file.**
-6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **Exploit a URL flaw in the school's official site to fetch the ID photos of all registered students.**
+6. [student_img.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/student_img.py): **Automatically fetch your own student ID photo.**
 7. [JD_spider.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/JD_spider.py): **Bulk-crawl JD.com product ids and tags.**
 8. [ECUT_pos_html.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_pos_html.py): **Crawl every campus-recruitment notice from the school's official site and save it as HTML, with images embedded.**
 9. [ECUT_get_grade.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/ECUT_get_grade.py): **Simulate logging in to the school site, grab grades, and compute the credit-weighted GPA.**
@@ -50,7 +61,9 @@
 15. [fuckCTF.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/fuckCTF.py): **Log in to the Hetian site via selenium and automatically change the initial password.**
 16. [one_update.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/one_update.py): **Updated crawler for the ONE literary site, adding the daily aphorism.**
 17. [get_history_weather.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_history_weather.py): **Grab Guangzhou weather data for the first quarter of 2019.**
-
+18. [search_useful_camera_ip_address.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/search_useful_camera_ip_address.py): **A security primer on weak camera passwords.**
+19. [get_top_sec_com.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_top_sec_com.py): **Use async programming to rank the A-share cybersecurity sector by market cap and save the ranking as an image.**
+20. [get_tj_accident_info.py](https://github.com/yhangf/PythonCrawler/blob/master/spiderFile/get_tj_accident_info.py): **Combine sync and async programming to fetch every accident report from the Tianjin Emergency Management Bureau.**
 ---
 # spiderAPI module overview
diff --git a/spiderFile/get_tj_accident_info.py b/spiderFile/get_tj_accident_info.py
new file mode 100644
index 0000000..b8b2237
--- /dev/null
+++ b/spiderFile/get_tj_accident_info.py
@@ -0,0 +1,77 @@
+import re
+import joblib
+import asyncio
+import aiohttp
+import requests as rq
+from bs4 import BeautifulSoup
+
+def yield_all_page_url(root_url, page=51):
+    """Build the url of every list page.
+    @param root_url: url of the section's first page
+    type root_url: str
+    @param page: number of pages to crawl
+    type page: int
+    """
+    # Follows the site's pagination scheme.
+    page_url_list = [f"{root_url}index_{i}.html" for i in range(1, page)]
+    # Prepend the first page itself.
+    page_url_list.insert(0, root_url)
+    return page_url_list
+
+async def get_info_page_url(url, session):
+    # Detail-page links are relative to the section root, e.g.
+    # "./202103/t20210301_123456.html" (assumed pattern).
+    regex = re.compile(r'<a href="\./(\d+/t\d+_\d+\.html)"')
+    async with session.get(url, headers=HEADERS) as response:
+        html = await response.text()
+    return regex.findall(html)
+
+async def get_all_info_page_url(root_url, page_url_list):
+    # Fetch every list page concurrently, then join the relative
+    # detail paths onto the section root.
+    async with aiohttp.ClientSession() as session:
+        tasks = [get_info_page_url(url, session) for url in page_url_list]
+        results = await asyncio.gather(*tasks)
+    return [root_url + path for paths in results for path in paths]
+
+def get_data(url):
+    # The detail pages carry the title in a <meta> tag (assumed tag name).
+    title_regex = re.compile('<meta name="ArticleTitle" content="(.*?)"')
+    html = rq.get(url, headers=HEADERS).content.decode("utf-8")
+    soup = BeautifulSoup(html, "html.parser")
+    title = re.search(title_regex, html)
+    content_1 = soup.find("div", class_="TRS_UEDITOR TRS_WEB")
+    content_2 = soup.find("div", class_="view TRS_UEDITOR trs_paper_default trs_word")
+    content_3 = soup.find("div", class_="view TRS_UEDITOR trs_paper_default trs_web")
+    if content_1:
+        content = content_1.text
+    elif content_2:
+        content = content_2.text
+    elif content_3:
+        content = content_3.text
+    else:
+        content = ""
+    return {"title": title.groups()[0] if title else "", "content": content}
+
+def get_all_data(all_info_page_url_list):
+    all_data = []
+    for i, url in enumerate(all_info_page_url_list):
+        all_data.append(get_data(url))
+        print(i, url, all_data[-1])
+    joblib.dump(all_data, "all_data.joblib")
+
+
+if __name__ == "__main__":
+    root_url = "http://yjgl.tj.gov.cn/ZWGK6939/SGXX3106/"
+    agent_part_1 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+    agent_part_2 = "(KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
+    HEADERS = {"Host": "yjgl.tj.gov.cn",
+               "Connection": "keep-alive",
+               "User-Agent": agent_part_1 + agent_part_2,
+               "Referer": "http://static.bshare.cn/"}
+    page_url_list = yield_all_page_url(root_url, page=51)
+    all_info_page_url_list = asyncio.run(get_all_info_page_url(root_url, page_url_list))
+    # joblib.dump takes (value, filename) in that order.
+    joblib.dump(all_info_page_url_list, "all_info_page_url_list")
diff --git a/spiderFile/get_top_sec_com.py b/spiderFile/get_top_sec_com.py
new file mode 100644
index 0000000..f1fce0a
--- /dev/null
+++ b/spiderFile/get_top_sec_com.py
@@ -0,0 +1,95 @@
+import re
+import os
+import time
+import joblib
+import asyncio
+import aiohttp
+import requests as rq
+
+import pandas as pd
+import matplotlib.pyplot as plt
+# import nest_asyncio
+# nest_asyncio.apply()
+
+class getTopSecCom:
+    def __init__(self, top=None):
+        self.headers = {"Referer": "http://quote.eastmoney.com/",
+                        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"}
+        self.bk_url = "http://71.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124034348162124675374_1612595298605&pn=1&pz=85&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f62&fs=b:BK0655&fields=f12,f14&_=1612595298611"
+        self.shares_api = "https://xueqiu.com/S/"
+        self.top = top
+        if not os.path.exists("./useful_sec_com_list"):
+            self.useful_sec_com_list = self.get_sec_com_code()
+        else:
+            with open("./useful_sec_com_list", "rb") as fp:
+                self.useful_sec_com_list = joblib.load(fp)
+
+    def get_sec_com_code(self):
+        html = rq.get(self.bk_url, headers=self.headers).content.decode("utf-8")
+        # The endpoint answers with JSONP; pull the stock array out of the payload.
+        sec_com_list = eval(re.findall(r"\[(.*?)\]", html)[0])
+        useful_sec_com_list = [[i["f12"], i["f14"]] for i in sec_com_list if "ST" not in i["f14"]]
+
+        # Codes starting with 0 or 3 are Shenzhen-listed (prefix "sz");
+        # codes starting with 6 are Shanghai-listed (prefix "sh").
+        for sec_com in useful_sec_com_list:
+            if sec_com[0][0] == "6":
+                sec_com[0] = "sh" + sec_com[0]
+            else:
+                sec_com[0] = "sz" + sec_com[0]
+        with open("useful_sec_com_list", "wb") as fp:
+            joblib.dump(useful_sec_com_list, fp)
+        return useful_sec_com_list
+
+    async def async_get_shares_details(self, sec_com, url):
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url, headers=self.headers) as response:
+                html = await response.text()
+                market_value = re.search("总市值:(.*?)亿", html)
+                if market_value:
+                    return [*sec_com, market_value.groups()[0]]
+
+    async def async_get_all_shares(self):
+        tasks = []
+        for sec_com in self.useful_sec_com_list:
+            url = self.shares_api + sec_com[0]
+            tasks.append(
+                asyncio.create_task(
+                    self.async_get_shares_details(sec_com, url)
+                )
+            )
+        done, pending = await asyncio.wait(tasks)
+        return [share.result() for share in done if share.result()]
+
+    def get_shares_details(self):
+        all_shares = []
+        for sec_com in self.useful_sec_com_list:
+            url = self.shares_api + sec_com[0]
+            response = rq.get(url, headers=self.headers).content.decode("utf-8")
+            market_value = re.search("总市值:(.*?)亿", response)
+            if market_value:
+                all_shares.append([*sec_com, market_value.groups()[0]])
+        return all_shares
+
+    def yield_picture(self, save_path):
+        # all_shares = self.get_shares_details()  # synchronous version
+        all_shares = asyncio.run(self.async_get_all_shares())  # asynchronous version
+        df = pd.DataFrame(all_shares, columns=["股票代码", "公司", "市值(亿)"])
+        df["市值(亿)"] = df["市值(亿)"].astype(float)
+        date = time.strftime("%Y年%m月%d日", time.localtime())
+        df.sort_values(by="市值(亿)", ascending=False, inplace=True)
+        df.index = range(1, df.shape[0]+1)
+
+        # Make matplotlib render the Chinese labels.
+        plt.rcParams['font.sans-serif'] = ['SimHei']
+        plt.rcParams['axes.unicode_minus'] = False
+
+        fig = plt.figure(dpi=400)
+        ax = fig.add_subplot(111, frame_on=False)
+        ax.xaxis.set_visible(False)
+        ax.yaxis.set_visible(False)
+        _ = pd.plotting.table(ax, df, loc="best", cellLoc="center")
+        ax.set_title(f"{date}A股网安版块公司市值排名", fontsize=10)
+        plt.savefig(save_path, bbox_inches="tight")
+
+if __name__ == "__main__":
+    m = getTopSecCom()
+    m.yield_picture("rank.png")
diff --git a/spiderFile/search_useful_camera_ip_address.py b/spiderFile/search_useful_camera_ip_address.py
new file mode 100644
index 0000000..652b180
--- /dev/null
+++ b/spiderFile/search_useful_camera_ip_address.py
@@ -0,0 +1,92 @@
+import re
+import tqdm
+import time
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import NoAlertPresentException, TimeoutException
+
+# Find a scanning site yourself; this code only demonstrates the logic.
+country = "IN"  # India
+city = ""
+login_url = ""
+query_url = ""
+city_url = ""
+USER_NAME = ""
+PASSWORD = ""
+
+# Headless-browser configuration.
+chrome_options = Options()
+chrome_options.add_argument("--headless")
+chrome_options.add_argument("--disable-gpu")
+chrome_options.add_argument("log-level=3")
+browser = webdriver.Chrome(chrome_options=chrome_options)
+browser.set_page_load_timeout(10)
+
+# Log in.
+browser.get(login_url)
+WebDriverWait(browser, 30).until(
+    EC.presence_of_element_located((By.XPATH, '//*[@name="login_submit"]'))
+)
+browser.find_element_by_id("username").clear()
+browser.find_element_by_id("username").send_keys(USER_NAME)
+browser.find_element_by_id("password").clear()
+browser.find_element_by_id("password").send_keys(PASSWORD)
+browser.find_element_by_name("login_submit").click()
+
+# Collect candidate camera urls; two pages by default.
+if city:
+    query_url += city_url
+
+latent_camera_url = []
+browser.get(query_url)
+WebDriverWait(browser, 30).until(
+    EC.presence_of_element_located((By.CLASS_NAME, 'button'))
+)
+html = browser.page_source
+# Assumed pattern: each search result links to a camera at http://IP:port.
+latent_camera_url += re.findall(r'<a href="(http://\d+\.\d+\.\d+\.\d+:\d+)"', html)
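---

A few sketches to go with the changes above. First, the proxy section: none of the spiders in this diff are proxy-aware, but both `requests` and `aiohttp` accept a gateway endpoint directly. A minimal sketch, assuming a hypothetical `PROXY` url; the real host, port, and credentials come from whichever provider you use (for `requests`, a `socks5://` scheme also works once `pysocks` is installed):

```python
import asyncio
import aiohttp
import requests as rq

# Hypothetical gateway endpoint; substitute your provider's
# host, port, and credentials.
PROXY = "http://USER:PASS@gate.example.com:7777"

# requests: route both schemes through the gateway.
html = rq.get("http://example.com",
              proxies={"http": PROXY, "https": PROXY}, timeout=10).text

# aiohttp: pass the proxy per request.
async def fetch(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url, proxy=PROXY) as response:
            return await response.text()

print(len(html), len(asyncio.run(fetch("http://example.com"))))
```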

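Second, `get_sec_com_code` in `get_top_sec_com.py` parses the eastmoney JSONP response with `eval`, which executes whatever the server returns. A stricter sketch, under the assumption that the payload keeps its `data.diff` layout (with `f12` as code and `f14` as name, exactly as the script uses them):

```python
import re
import json

def parse_jsonp(text):
    # Strip the "jQuery...(...)" callback wrapper and parse the body as JSON.
    body = re.search(r"\((\{.*\})\)", text, re.S).group(1)
    return json.loads(body)

# Usage with the raw response text fetched from bk_url:
# data = parse_jsonp(html)
# stocks = [[d["f12"], d["f14"]]
#           for d in data["data"]["diff"] if "ST" not in d["f14"]]
```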
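Finally, `get_tj_accident_info.py` persists its results with `joblib`, so once `get_all_data` has written `all_data.joblib` the crawled records can be reloaded for offline analysis:

```python
import joblib

# Each record is {"title": ..., "content": ...}, as returned by get_data.
all_data = joblib.load("all_data.joblib")
print(len(all_data), "records")

# Example: list the first few reports whose title mentions a keyword.
for record in [d for d in all_data if "事故" in d["title"]][:5]:
    print(record["title"])
```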