Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 40da46e

Browse files
committed
no message
1 parent 1ae563a commit 40da46e

File tree

8 files changed

+256
-0
lines changed

8 files changed

+256
-0
lines changed

‎moumoubaimifan/wxCrawler/articles.py‎

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# articles.py
2+
import html
3+
import requests
4+
import utils
5+
6+
from urllib.parse import urlsplit
7+
8+
class Articles(object):
    """Fetch read/like statistics for a single article.

    Statistics are requested from the ``mp.weixin.qq.com/mp/getappmsgext``
    endpoint using the ``appmsg_token`` and cookie supplied at construction.
    """

    # Seconds to wait for the statistics endpoint before giving up, so one
    # stalled connection cannot hang the whole crawl.
    REQUEST_TIMEOUT = 10

    def __init__(self, appmsg_token, cookie):
        """
        :param appmsg_token: token appended to the request URL (time-limited)
        :param cookie: raw ``Cookie`` header value sent with every request
        """
        # The token is time-limited and has to be re-captured once it expires.
        self.appmsg_token = appmsg_token

        self.headers = {
            "User-Agent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile",
            "Cookie": cookie
        }

        # Form fields sent with every request. Treated as read-only defaults:
        # per-article parameters are merged into a copy, never into this dict.
        self.data = {
            "is_only_read": "1",
            "is_temp_url": "0",
            "appmsg_type": "9",
        }

    def read_like_nums(self, article_url):
        """
        Return the counters reported for one article.

        :param article_url: article url
        :return: tuple ``(read_num, old_like_num, like_num)`` taken from the
            ``appmsgstat`` object of the response
        """
        appmsgstat = self.get_appmsgext(article_url)["appmsgstat"]
        return appmsgstat["read_num"], appmsgstat["old_like_num"], appmsgstat["like_num"]

    def get_params(self, article_url):
        """
        Extract the query-string parameters of an article url.

        :param article_url: article url (HTML entities such as ``&amp;`` are
            unescaped before parsing)
        :return: dict mapping parameter name to value
        """
        article_url = html.unescape(article_url)
        return utils.str_to_dict(urlsplit(article_url).query, "&", "=")

    def get_appmsgext(self, article_url):
        """
        Request the statistics payload for one article.

        :param article_url: article url
        :return: decoded JSON response (contains an ``appmsgstat`` key)
        :raises Exception: when the response has no ``appmsgstat`` key; the
            decoded response is attached as the exception argument
        """
        appmsgext_url = "https://mp.weixin.qq.com/mp/getappmsgext?appmsg_token={}&x5=0".format(self.appmsg_token)

        # Build a fresh payload per request so parameters from one article
        # cannot leak into the request made for the next one.
        payload = dict(self.data)
        payload.update(self.get_params(article_url))

        appmsgext_json = requests.post(
            appmsgext_url, headers=self.headers, data=payload,
            timeout=self.REQUEST_TIMEOUT).json()

        if "appmsgstat" not in appmsgext_json:
            raise Exception(appmsgext_json)
        return appmsgext_json
61+
62+
63+
if __name__ == '__main__':
    import os
    import sys

    # Session credentials are secrets and expire quickly, so they are read
    # from the environment instead of being committed to the source file.
    appmsg_token = os.environ.get("WX_APPMSG_TOKEN")
    cookie = os.environ.get("WX_COOKIE")
    if not appmsg_token or not cookie or len(sys.argv) < 2:
        sys.exit("usage: WX_APPMSG_TOKEN=... WX_COOKIE=... python articles.py <article_url>")

    info = Articles(appmsg_token, cookie)
    read_num, old_like_num, like_num = info.read_like_nums(sys.argv[1])
    print(read_num, old_like_num, like_num)

‎moumoubaimifan/wxCrawler/cookie.txt‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
https://mp.weixin.qq.com/mp/getappmsgext?f=json&mock=&uin=777&key=777&pass_ticket=H9Osk2CMhrlH34mQ3w2PLv/RAVoiDxweAdyGh/Woa1qwGy2jGATJ6hhg7syTQ9nk&wxtoken=777&devicetype=android-29&clientversion=27001037&__biz=MzU1NDk2MzQyNg==&enterid=1594059276&appmsg_token=1068_ar7adip424A4GEBXj3ICCnwArRb2kU4Y5Y5m9QrePtCDqEng3qndGHMimpnfnqU7wbKfavW3dYDqknmJ&x5=0&f=json
2+
MultiDictView[['rewardsn', ''], ['wxtokenkey', '777'], ['wxuin', '1681274216'], ['devicetype', 'android-29'], ['version', '27001037'], ['lang', 'zh_CN'], ['pass_ticket', 'H9Osk2CMhrlH34mQ3w2PLv/RAVoiDxweAdyGh/Woa1qwGy2jGATJ6hhg7syTQ9nk'], ['wap_sid2', 'COjq2KEGEnBPTHRVOHlYV2U4dnRqaWZqRXBqaWkydmtSVUtDSjNVYVQteFRnd0N5V2RQQkMxUWdOZktORlZoLU5pTEpOZndCTWdTRHNYbktjRnpZa09nOTBoUWZTajFzX1VWaFRxNEZWbUdwbDM3VjNtWXNCQUFBMNfUjfgFOA1AAQ==']]

‎moumoubaimifan/wxCrawler/main.py‎

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# coding:utf-8
2+
# main.py
3+
from read_cookie import ReadCookie
4+
from wxCrawler import WxCrawler
5+
6+
"""程序启动类"""
7+
if __name__ == '__main__':
    import sys

    # The capture file location may be given as the first command-line
    # argument; without one the original hard-coded path is used.
    cookie_path = sys.argv[1] if len(sys.argv) > 1 else 'E:/python/cookie.txt'
    cookie = ReadCookie(cookie_path)

    # Capture the request first, then parse what was written to the file.
    cookie.write_cookie()
    appmsg_token, biz, cookie_str = cookie.parse_cookie()
    wx = WxCrawler(appmsg_token, biz, cookie_str)
    wx.run()
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# read_cookie.py
2+
import re
3+
import os
4+
5+
class ReadCookie(object):
    """
    Launch write_cookie.py through mitmdump and parse the file it produces.

    The file is expected to hold the captured request url on its first line
    and the ``str()`` of the request's cookie collection on its second line.
    """

    def __init__(self, outfile):
        """
        :param outfile: path of the capture file to write and later parse
        """
        self.outfile = outfile

    def parse_cookie(self):
        """
        Parse the capture file.

        :return: tuple ``(appmsg_token, biz, cookie_str)`` where ``cookie_str``
            is formatted as ``name=value; name=value``
        :raises ValueError: if the file has fewer than two lines, the url lacks
            ``appmsg_token`` or ``__biz``, or the cookie line has no list
        """
        import ast

        # Context manager so the handle is closed even when parsing fails.
        with open(self.outfile) as f:
            lines = f.read().splitlines()
        if len(lines) < 2:
            raise ValueError("capture file {!r} must contain a url line and a cookie line".format(self.outfile))
        url_line, cookie_line = lines[0], lines[1]

        # Match up to the next '&' OR the end of the line, so a parameter that
        # happens to be last in the query string is still found.
        token_match = re.search(r"[?&]appmsg_token=([^&\s]*)", url_line)
        biz_match = re.search(r"[?&]__biz=([^&\s]*)", url_line)
        if token_match is None or biz_match is None:
            raise ValueError("appmsg_token or __biz not found in captured url")

        # The cookie line looks like "MultiDictView[['k', 'v'], ...]". Parse the
        # bracketed part as a literal instead of slicing fixed offsets, which
        # broke on a trailing newline and on values containing ", ".
        start = cookie_line.find("[")
        if start < 0:
            raise ValueError("cookie line does not contain a list of pairs")
        pairs = ast.literal_eval(cookie_line[start:].strip())
        cookie_str = "; ".join("{}={}".format(name, value) for name, value in pairs)

        return token_match.group(1), biz_match.group(1), cookie_str

    def write_cookie(self):
        """
        Run write_cookie.py under mitmdump and wait for it to finish.

        :return:
        """
        import subprocess

        # Directory of this file; write_cookie.py is expected next to it.
        path = os.path.split(os.path.realpath(__file__))[0]
        # mitmdump -s <script> -w <outfile> <filter>. The argument order is
        # kept because write_cookie.py locates the output path by position.
        # An argument list (no shell) keeps paths with spaces intact.
        command = [
            "mitmdump",
            "-s", os.path.join(path, "write_cookie.py"),
            "-w", self.outfile,
            "mp.weixin.qq.com/mp/getappmsgext",
        ]
        subprocess.run(command)
41+
42+
43+
if __name__ == '__main__':
    # Capture into cookie.txt in the working directory, then show what was
    # parsed out of it.
    reader = ReadCookie('cookie.txt')
    reader.write_cookie()
    token, biz_id, cookies = reader.parse_cookie()
    # print() joins its arguments with a single space, so each label after
    # the first starts on a new line.
    report = ["appmsg_token:" + token, "\nbiz:" + biz_id, "\ncookie:" + cookies]
    print(*report)

‎moumoubaimifan/wxCrawler/utils.py‎

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# utils.py
2+
# 工具模块,将字符串变成字典
3+
def str_to_dict(s, join_symbol="\n", split_symbol=":"):
    """
    Convert a delimited ``key<split_symbol>value`` string into a dict.

    :param s: text to parse
    :param join_symbol: separator between items
    :param split_symbol: separator between a key and its value; only the
        first occurrence inside an item is used
    :return: dict of key to value; blank items are skipped, values are
        stripped, and an item without ``split_symbol`` maps to ``""``
    """
    data = {}
    for item in s.split(join_symbol):
        item = item.strip()
        if not item:
            continue
        # partition() never raises: an item with no separator (for example a
        # bare query flag) becomes a key with an empty value.
        key, _, value = item.partition(split_symbol)
        data[key] = value.strip()
    return data
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# coding: utf-8
2+
import urllib
3+
import sys
4+
5+
from mitmproxy import http
6+
7+
# command: mitmdump -s write_cookie.py -w outfile mp.weixin.qq.com/mp/getappmsgext
8+
9+
class WriterCookie:
    """
    mitmproxy addon that writes the url and cookies of the first
    ``getappmsgext`` request it sees to a file, then stops the process.
    """

    def __init__(self, outfile):
        """
        :param outfile: path of the file to write; opened (and truncated) here
        """
        self.f = open(outfile, "w")

    def response(self, flow: "http.HTTPFlow") -> None:
        """
        Inspect a completed response.

        :param flow: flow instance; only ``flow.request.url`` and
            ``flow.request.cookies`` are read
        """
        # "import urllib" alone does not guarantee the parse submodule is
        # loaded, so import it explicitly before use.
        import urllib.parse

        # Nothing more to record once the capture file has been closed.
        if self.f.closed:
            return

        url = urllib.parse.unquote(flow.request.url)

        # Write the url on the first line and the cookies on the second.
        if "mp.weixin.qq.com/mp/getappmsgext" in url:
            self.f.write(url + '\n')
            self.f.write(str(flow.request.cookies))
            self.f.close()
            # sys.exit() rather than the site-provided exit() helper, which is
            # meant for interactive use and is not always available.
            sys.exit()
32+
33+
def _capture_path(argv):
    """Return the argument that follows ``-w`` in *argv*, else ``argv[4]``."""
    if "-w" in argv:
        position = argv.index("-w") + 1
        if position < len(argv):
            return argv[position]
    # Fallback: the position used by
    # "mitmdump -s write_cookie.py -w outfile <filter>".
    return argv[4]


# The capture file is the value given to mitmdump's -w option.
addons = [WriterCookie(_capture_path(sys.argv))]
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# coding:utf-8
2+
# wxCrawler.py
3+
import os
4+
import time
5+
6+
import requests
7+
import json
8+
import urllib3
9+
10+
import utils
11+
from articles import Articles
12+
13+
14+
class WxCrawler(object):
    """
    Page through an account's article list and export per-article statistics.

    Each page is requested from ``mp.weixin.qq.com/mp/profile_ext``; for every
    article found, the counters returned by ``Articles.read_like_nums`` are
    appended to a CSV file.
    """
    urllib3.disable_warnings()

    # Seconds to wait for a list page before giving up.
    REQUEST_TIMEOUT = 10

    def __init__(self, appmsg_token, biz, cookie, begin_page_index = 0, end_page_index = 100):
        """
        :param appmsg_token: token sent with every request
        :param biz: value of the ``__biz`` url parameter
        :param cookie: raw cookie string, ``name=value; name=value``
        :param begin_page_index: first page to fetch
        :param end_page_index: last page to fetch (inclusive)
        """
        # First page to fetch; advanced by is_exit_or_continue().
        self.begin_page_index = begin_page_index
        # Last page to fetch.
        self.end_page_index = end_page_index
        # 1-based counter of list entries processed so far.
        self.num = 1

        self.appmsg_token = appmsg_token
        self.biz = biz
        self.headers = {
            "User-Agent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile",
            "Cookie": cookie
        }
        self.cookie = cookie

    def article_list(self, context):
        """
        Decode a list-page response.

        :param context: response body text (JSON)
        :return: the decoded ``general_msg_list`` value, which is itself a
            JSON document embedded as a string
        """
        articles = json.loads(context).get('general_msg_list')
        return json.loads(articles)

    def run(self):
        """
        Fetch pages from ``begin_page_index`` to ``end_page_index`` and append
        one batch of rows to the CSV file per page.

        Pages are processed in a loop rather than by recursion, so the number
        of pages is not bounded by the interpreter's recursion limit.
        """
        # Paging url; placeholders are biz, offset, pass_ticket, appmsg_token.
        page_url = (
            "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={}&f=json"
            "&offset={}&count=10&is_ok=1&scene=&uin=777&key=777&pass_ticket={}"
            "&wxtoken=&appmsg_token={}&x5=0&f=json"
        )
        # The pass_ticket is carried inside the cookie string.
        wx_dict = utils.str_to_dict(self.cookie, join_symbol='; ', split_symbol='=')
        pass_ticket = wx_dict['pass_ticket']

        while True:
            # NOTE: verify=False disables TLS certificate verification; the
            # matching warning is silenced by the call in the class body.
            response = requests.get(
                page_url.format(self.biz, self.begin_page_index * 10, pass_ticket, self.appmsg_token),
                headers=self.headers, verify=False, timeout=self.REQUEST_TIMEOUT)
            articles = self.article_list(response.text)
            info = Articles(self.appmsg_token, self.cookie)

            result = []
            for a in articles['list']:
                ext_info = a.get('app_msg_ext_info')
                if ext_info:
                    # Main article of this entry (skipped when it has no url).
                    if '' != ext_info.get('content_url', ''):
                        read_num, old_like_num, like_num = info.read_like_nums(ext_info.get('content_url'))
                        result.append(str(self.num) + '条,' + ext_info.get('title') + ',' + str(read_num) + ',' + str(old_like_num) + ',' + str(like_num))
                        # Pause between statistics requests.
                        time.sleep(2)

                    # Secondary articles published in the same entry.
                    for m in ext_info.get('multi_app_msg_item_list', []):
                        read_num, old_like_num, like_num = info.read_like_nums(m.get('content_url'))
                        result.append(str(self.num) + '条的副条,' + m.get('title') + ',' + str(read_num) + ',' + str(old_like_num) + ',' + str(like_num))
                        time.sleep(3)

                self.num = self.num + 1

            self.write_file(result)

            if not self.is_exit_or_continue():
                break

    def write_file(self, result):
        """
        Append rows to the CSV file in the working directory.

        :param result: iterable of already formatted row strings
        """
        # Explicit UTF-8 so titles outside the platform's default code page
        # do not raise UnicodeEncodeError.
        with open('微信公众号.csv', 'a', encoding='utf-8') as f:
            for row in result:
                f.write(row + '\n')

    def is_exit_or_continue(self):
        """
        Advance to the next page and report whether the crawl should go on.

        :return: ``True`` while pages remain, ``False`` once the page index has
            passed ``end_page_index`` (a completion message is printed)
        """
        self.begin_page_index = self.begin_page_index + 1

        if self.begin_page_index > self.end_page_index:
            print('公众号导出结束,共导出了' + str(self.end_page_index) + '页')
            # The original called os.exit(), which does not exist and raised
            # AttributeError; signalling the caller lets run() return normally.
            return False
        return True

‎moumoubaimifan/wxCrawler/微信公众号.csv‎

Whitespace-only changes.

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /