# coding:utf-8
# wxCrawler.py
import sys
import time

import requests
import json
import urllib3

import utils
from articles import Articles


class WxCrawler(object):
    """Crawl the paginated article list of a WeChat Official Account."""
    urllib3.disable_warnings()

    def __init__(self, appmsg_token, biz, cookie, begin_page_index=0, end_page_index=100):
        # first page to crawl
        self.begin_page_index = begin_page_index
        # last page to crawl
        self.end_page_index = end_page_index
        # running count of crawled items
        self.num = 1

        self.appmsg_token = appmsg_token
        self.biz = biz
        self.headers = {
            "User-Agent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile",
            "Cookie": cookie
        }
        self.cookie = cookie

    def article_list(self, context):
        # 'general_msg_list' is itself a JSON string, so the response is decoded twice
        articles = json.loads(context).get('general_msg_list')
        return json.loads(articles)

    def run(self):

        # pagination URL: offset advances 10 articles per page
        page_url = "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={}&f=json&offset={}&count=10&is_ok=1&scene=&uin=777&key=777&pass_ticket={}&wxtoken=&appmsg_token=" + self.appmsg_token + "&x5=0&f=json"
        # turn the cookie string into a dict
        wx_dict = utils.str_to_dict(self.cookie, join_symbol='; ', split_symbol='=')
        # request one page of the article list
        response = requests.get(page_url.format(self.biz, self.begin_page_index * 10, wx_dict['pass_ticket']), headers=self.headers, verify=False)
        # parse the article list into a dict
        articles = self.article_list(response.text)
        info = Articles(self.appmsg_token, self.cookie)

        result = []
        for a in articles['list']:
            if 'app_msg_ext_info' in a and '' != a.get('app_msg_ext_info').get('content_url', ''):
                # headline article of this push
                read_num, old_like_num, like_num = info.read_like_nums(a.get('app_msg_ext_info').get('content_url'))
                result.append(str(self.num) + '条,' + a.get('app_msg_ext_info').get('title') + ',' + str(read_num) + ',' + str(old_like_num) + ',' + str(like_num))
                time.sleep(2)

            if 'app_msg_ext_info' in a:
                # secondary articles bundled in the same push
                for m in a.get('app_msg_ext_info').get('multi_app_msg_item_list', []):
                    read_num, old_like_num, like_num = info.read_like_nums(m.get('content_url'))
                    result.append(str(self.num) + '条的副条,' + m.get('title') + ',' + str(read_num) + ',' + str(old_like_num) + ',' + str(like_num))

                    time.sleep(3)

            self.num = self.num + 1

        self.write_file(result)

        self.is_exit_or_continue()
        # recurse to fetch the next page
        self.run()

    def write_file(self, result):
        # append rows as UTF-8 so the Chinese titles survive on any platform
        with open('微信公众号.csv', 'a', encoding='utf-8') as f:
            for row in result:
                f.write(row + '\n')

    def is_exit_or_continue(self):
        self.begin_page_index = self.begin_page_index + 1

        if self.begin_page_index > self.end_page_index:
            print('Official Account export finished, ' + str(self.end_page_index) + ' pages exported in total')
            sys.exit()
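

# Usage sketch (an assumption, not part of the original module): the token,
# biz and cookie values below are placeholders that have to be captured from
# a real logged-in WeChat session (e.g. with a debugging proxy); the cookie
# string must contain pass_ticket, which run() reads from it.
if __name__ == '__main__':
    crawler = WxCrawler(
        appmsg_token='<appmsg_token from a captured request>',
        biz='<__biz of the target Official Account>',
        cookie='<full Cookie header, including pass_ticket=...>',
        begin_page_index=0,
        end_page_index=2)
    crawler.run()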