# coding:utf-8
# wxCrawler.py
import sys
import time

import requests
import json
import urllib3

import utils
from articles import Articles


class WxCrawler(object):
    """Crawl the paginated article list of a WeChat Official Account."""
    urllib3.disable_warnings()

    def __init__(self, appmsg_token, biz, cookie, begin_page_index=0, end_page_index=100):
        # first page to crawl
        self.begin_page_index = begin_page_index
        # last page to crawl
        self.end_page_index = end_page_index
        # running count of crawled items
        self.num = 1

        self.appmsg_token = appmsg_token
        self.biz = biz
        self.headers = {
            "User-Agent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile",
            "Cookie": cookie
        }
        self.cookie = cookie

    def article_list(self, context):
        # 'general_msg_list' is itself a JSON string, so the response is decoded twice
        articles = json.loads(context).get('general_msg_list')
        return json.loads(articles)

    def run(self):

        # pagination URL: offset advances 10 articles per page
        page_url = "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={}&f=json&offset={}&count=10&is_ok=1&scene=&uin=777&key=777&pass_ticket={}&wxtoken=&appmsg_token=" + self.appmsg_token + "&x5=0&f=json"
        # turn the cookie string into a dict
        wx_dict = utils.str_to_dict(self.cookie, join_symbol='; ', split_symbol='=')
        # request one page of the article list
        response = requests.get(page_url.format(self.biz, self.begin_page_index * 10, wx_dict['pass_ticket']), headers=self.headers, verify=False)
        # parse the article list into a dict
        articles = self.article_list(response.text)
        info = Articles(self.appmsg_token, self.cookie)

        result = []
        for a in articles['list']:
            if 'app_msg_ext_info' in a and '' != a.get('app_msg_ext_info').get('content_url', ''):
                # headline article of this push
                read_num, old_like_num, like_num = info.read_like_nums(a.get('app_msg_ext_info').get('content_url'))
                result.append(str(self.num) + '条,' + a.get('app_msg_ext_info').get('title') + ',' + str(read_num) + ',' + str(old_like_num) + ',' + str(like_num))
                time.sleep(2)

            if 'app_msg_ext_info' in a:
                # secondary articles bundled in the same push
                for m in a.get('app_msg_ext_info').get('multi_app_msg_item_list', []):
                    read_num, old_like_num, like_num = info.read_like_nums(m.get('content_url'))
                    result.append(str(self.num) + '条的副条,' + m.get('title') + ',' + str(read_num) + ',' + str(old_like_num) + ',' + str(like_num))

                    time.sleep(3)

            self.num = self.num + 1

        self.write_file(result)

        self.is_exit_or_continue()
        # recurse to fetch the next page
        self.run()

    def write_file(self, result):
        # append rows as UTF-8 so the Chinese titles survive on any platform
        with open('微信公众号.csv', 'a', encoding='utf-8') as f:
            for row in result:
                f.write(row + '\n')

    def is_exit_or_continue(self):
        self.begin_page_index = self.begin_page_index + 1

        if self.begin_page_index > self.end_page_index:
            print('Official Account export finished, ' + str(self.end_page_index) + ' pages exported in total')
            sys.exit()
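

# Usage sketch (an assumption, not part of the original module): the token,
# biz and cookie values below are placeholders that have to be captured from
# a real logged-in WeChat session (e.g. with a debugging proxy); the cookie
# string must contain pass_ticket, which run() reads from it.
if __name__ == '__main__':
    crawler = WxCrawler(
        appmsg_token='<appmsg_token from a captured request>',
        biz='<__biz of the target Official Account>',
        cookie='<full Cookie header, including pass_ticket=...>',
        begin_page_index=0,
        end_page_index=2)
    crawler.run()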