diff --git a/12306.py b/12306.py
new file mode 100644
index 00000000..3ca4806f
--- /dev/null
+++ b/12306.py
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+"""
+@author: liuyw
+"""
+from splinter.browser import Browser
+from time import sleep
+
+class huoche(object):
+    driver_name = ''
+    executable_path = ''
+    # 用户名、密码
+    username = u"xxx"
+    passwd = u"xxx"
+    # cookie 值需要自己去找,下面两个分别是沈阳、哈尔滨
+    starts = u"%u6C88%u9633%2CSYT"
+    ends = u"%u54C8%u5C14%u6EE8%2CHBB"
+    # 时间格式:2018-01-19
+    dtime = u"2018-01-19"
+    # 车次,选择第几趟,0 则从上至下依次点击
+    order = 0
+    # 乘客名
+    users = [u"xxx", u"xxx"]
+    # 席别、票种
+    xb = u"二等座"
+    pz = u"成人票"
+
+    # 网址
+    ticket_url = "https://kyfw.12306.cn/otn/leftTicket/init"
+    login_url = "https://kyfw.12306.cn/otn/login/init"
+    initmy_url = "https://kyfw.12306.cn/otn/index/initMy12306"
+    buy = "https://kyfw.12306.cn/otn/confirmPassenger/initDc"
+
+    def __init__(self):
+        self.driver_name = 'chrome'
+        self.executable_path = 'D:/chromedriver'
+
+    def login(self):
+        self.driver.visit(self.login_url)
+        self.driver.fill("loginUserDTO.user_name", self.username)
+        self.driver.fill("userDTO.password", self.passwd)
+        print(u"等待验证码,自行输入...")
+        # 登录成功后会跳转到"我的12306",以此判断登录是否完成
+        while True:
+            if self.driver.url != self.initmy_url:
+                sleep(1)
+            else:
+                break
+
+    def start(self):
+        self.driver = Browser(driver_name=self.driver_name, executable_path=self.executable_path)
+        self.driver.driver.set_window_size(1400, 1000)
+        self.login()
+        self.driver.visit(self.ticket_url)
+        try:
+            print(u"购票页面开始...")
+            # 通过 cookie 加载出发站、到达站和出发日期
+            self.driver.cookies.add({"_jc_save_fromStation": self.starts})
+            self.driver.cookies.add({"_jc_save_toStation": self.ends})
+            self.driver.cookies.add({"_jc_save_fromDate": self.dtime})
+
+            self.driver.reload()
+
+            count = 0
+            if self.order != 0:
+                # 指定车次:循环点查询,直到能点到第 order 个"预订"
+                while self.driver.url == self.ticket_url:
+                    self.driver.find_by_text(u"查询").click()
+                    count += 1
+                    print(u"循环点击查询... 第 %s 次" % count)
+                    try:
+                        self.driver.find_by_text(u"预订")[self.order - 1].click()
+                    except Exception as e:
+                        print(e)
+                        print(u"还没开始预订")
+                        continue
+            else:
+                # 不指定车次:从上至下依次点击所有"预订"
+                while self.driver.url == self.ticket_url:
+                    self.driver.find_by_text(u"查询").click()
+                    count += 1
+                    print(u"循环点击查询... 第 %s 次" % count)
+                    try:
+                        for i in self.driver.find_by_text(u"预订"):
+                            i.click()
+                            sleep(1)
+                    except Exception as e:
+                        print(e)
+                        print(u"还没开始预订 %s" % count)
+                        continue
+            print(u"开始预订...")
+            sleep(1)
+            print(u'开始选择用户...')
+            for user in self.users:
+                self.driver.find_by_text(user).last.click()
+
+            print(u"提交订单...")
+            sleep(1)
+            # 选择票种和席别
+            self.driver.find_by_text(self.pz).click()
+            self.driver.find_by_text(self.xb).click()
+            sleep(1)
+            self.driver.find_by_id('submitOrder_id').click()
+            print(u"开始选座...")
+            self.driver.find_by_id('1D').last.click()
+            self.driver.find_by_id('1F').last.click()
+
+            sleep(1.5)
+            print(u"确认选座...")
+            self.driver.find_by_id('qr_submit_id').click()
+
+        except Exception as e:
+            print(e)
+
+if __name__ == '__main__':
+    ticket = huoche()
+    ticket.start()
\ No newline at end of file
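# 补充示例(非原脚本内容):starts / ends 这类 cookie 值其实是"站名,电报码"的
# 非标准 %uXXXX 编码。假设已知站名和电报码,可以这样生成
# _jc_save_fromStation / _jc_save_toStation 的值:
def encode_station(name, code):
    """把 (站名, 电报码) 编码成 12306 的站点 cookie 值。"""
    encoded = ''.join('%%u%04X' % ord(ch) for ch in name)
    return encoded + '%2C' + code

print(encode_station(u'沈阳', 'SYT'))    # -> %u6C88%u9633%2CSYT
print(encode_station(u'哈尔滨', 'HBB'))  # -> %u54C8%u5C14%u6EE8%2CHBB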
diff --git a/2020/README.md b/2020/README.md
new file mode 100644
index 00000000..d2185705
--- /dev/null
+++ b/2020/README.md
@@ -0,0 +1,38 @@
+# Python Spider 2020
+
+这个项目持续的时间比较长,陆陆续续地,很多实战示例早已失效。
+
+网络爬虫是一门比较通用的基础技术,各个领域都会有所涉及。比如我是做视觉算法的,也需要用到网络爬虫,例如调用 API 接口清洗数据等,这本质上也是一个小的爬虫程序。
+
+为了给各位提供更好的学习示例,我决定重写这一系列教程:对失效的示例重新找例子,并查缺补漏,完善这一系列教程。
+
+2020 年,最新版的 Python3 网络爬虫实战系列教程。
+
+原创文章每周最少两篇,**后续最新文章**会在[【公众号】](https://cuijiahua.com/wp-content/uploads/2020/05/gzh-w.jpg)首发,视频[【B站】](https://space.bilibili.com/331507846)首发,大家可以加我[【微信】](https://cuijiahua.com/wp-content/uploads/2020/05/gzh-w.jpg)进**交流群**,技术交流或提意见都可以,欢迎**Star**!
+
+微信群 | 公众号 | B站 | 知乎 | CSDN | 头条 | 掘金(二维码图片)
+
+## Python3 网络爬虫教程 2020
+
+| 文章 | 公众号 | 代码 |
+| :------ | :--------: | :--------: |
+| Python3 网络爬虫(一):初识网络爬虫之夜探老王家 | [公众号](https://mp.weixin.qq.com/s/1rcq9RQYuAuHFg1w1j8HXg "Python3 网络爬虫(一)") | no |
+| Python3 网络爬虫(二):下载小说的正确姿势 | [公众号](https://mp.weixin.qq.com/s/5e2_r0QXUISVp9GdDsqbzg "Python3 网络爬虫(二)") | [Code](https://github.com/Jack-Cherish/python-spider/tree/master/2020/xbqg "Python3 网络爬虫(二)") |
+| Python3 网络爬虫(三):漫画下载,动态加载、反爬虫这都不叫事!| [公众号](https://mp.weixin.qq.com/s/wyS-OP04K3Vs9arSelRlyA "Python3 网络爬虫(三)") | [Code](https://github.com/Jack-Cherish/python-spider/tree/master/2020/dmzj "Python3 网络爬虫(三)") |
+| Python3 网络爬虫(四):视频下载,那些事儿!| [公众号](https://mp.weixin.qq.com/s/_geNA6Dwo4kx25X7trJzlg "Python3 网络爬虫(四)") | [Code](https://github.com/Jack-Cherish/python-spider/tree/master/2020/zycjw "Python3 网络爬虫(四)") |
+| Python3 网络爬虫(五):老板,需要特殊服务吗?| [公众号](https://mp.weixin.qq.com/s/PPTSnIHV71b-wB3oRiYnIA "Python3 网络爬虫(五)") | [Code](https://github.com/Jack-Cherish/python-spider/tree/master/2020/api "Python3 网络爬虫(五)") |
+| Python3 网络爬虫(六):618,爱他/她,就清空他/她的购物车!| [公众号](https://mp.weixin.qq.com/s/lXXDfzyLVrf3f-aqJN1C3A "Python3 网络爬虫(六)") | [Code](https://github.com/Jack-Cherish/python-spider/tree/master/2020/taobao "Python3 网络爬虫(六)") |
+| 宝藏B站UP主,视频弹幕尽收囊中!| [公众号](https://mp.weixin.qq.com/s/aWratg1j9RBAjIghoY66yQ "宝藏B站UP主,视频弹幕尽收囊中!") | [Code](https://github.com/Jack-Cherish/python-spider/tree/master/2020/bilibili "宝藏B站UP主,视频弹幕尽收囊中!") |
+
+更多精彩,敬请期待!
diff --git a/2020/api/api.py b/2020/api/api.py
new file mode 100644
index 00000000..4ed08497
--- /dev/null
+++ b/2020/api/api.py
@@ -0,0 +1,65 @@
+import requests
+import base64
+import json
+import cv2
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+beautify_url = "https://api-cn.faceplusplus.com/facepp/v2/beautify"
+# 你创建的应用的 API Key 和 API Secret(也叫 Secret Key)
+AK = ''
+SK = ''
+
+# 以下均为可选参数,不填写时默认 50
+# 美白程度 0 - 100
+whitening = 80
+# 磨皮程度 0 - 100
+smoothing = 80
+# 瘦脸程度 0 - 100
+thinface = 20
+# 小脸程度 0 - 100
+shrink_face = 50
+# 大眼程度 0 - 100
+enlarge_eye = 50
+# 去眉毛程度 0 - 100
+remove_eyebrow = 50
+# 滤镜名称,不填写时默认无滤镜
+filter_type = ''
+
+# 二进制方式打开图片,转 base64
+img_name = 'test_1.png'
+with open(img_name, 'rb') as f:
+    img_base64 = base64.b64encode(f.read())
+
+# 使用 whitening、smoothing、thinface 三个可选参数,其他用默认值
+data = {
+    'api_key': AK,
+    'api_secret': SK,
+    'image_base64': img_base64,
+    'whitening': whitening,
+    'smoothing': smoothing,
+    'thinface': thinface,
+    }
+
+r = requests.post(url=beautify_url, data=data)
+html = json.loads(r.text)
+
+# 解析返回的 base64 图片(cv2 解码得到的是 BGR 通道顺序)
+base64_data = html['result']
+imgData = base64.b64decode(base64_data)
+nparr = np.frombuffer(imgData, np.uint8)
+img_res = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+img_res_RGB = cv2.cvtColor(img_res, cv2.COLOR_BGR2RGB)
+
+# 原始图片,同样转成 RGB 供 matplotlib 显示
+img = cv2.imread(img_name)
+img_RGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+# 显示图片
+fig, axs = plt.subplots(nrows=1, ncols=2, sharex=False, sharey=False, figsize=(10, 10))
+axs[0].imshow(img_RGB)
+axs[1].imshow(img_res_RGB)
+plt.show()
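# 补充示例(非原脚本内容):按 Face++ 公开接口的惯例(这里是按其文档约定做的假设),
# 调用失败时返回的 JSON 带 error_message 字段而没有 result 字段。
# 实际使用时最好在取 html['result'] 之前先检查,避免 KeyError 掩盖真正的报错:
if 'error_message' in html:
    raise RuntimeError('Face++ 接口返回错误:%s' % html['error_message'])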
diff --git a/2020/api/test_1.png b/2020/api/test_1.png
new file mode 100644
index 00000000..38e8def3
Binary files /dev/null and b/2020/api/test_1.png differ
diff --git a/2020/bilibili/download.py b/2020/bilibili/download.py
new file mode 100644
index 00000000..b8aff376
--- /dev/null
+++ b/2020/bilibili/download.py
@@ -0,0 +1,120 @@
+# -*-coding:utf-8 -*-
+# Website: https://cuijiahua.com
+# Author: Jack Cui
+# Date: 2020年07月22日
+import requests
+import json
+import re
+import math
+import xml2ass
+import time
+from contextlib import closing
+
+from bs4 import BeautifulSoup
+
+import os
+from win32com.client import Dispatch
+
+# 后面的请求都带 verify=False,这里先屏蔽 InsecureRequestWarning
+requests.packages.urllib3.disable_warnings()
+
+def addTasktoXunlei(down_url):
+    # 通过 COM 接口把下载地址推给迅雷
+    flag = False
+    o = Dispatch('ThunderAgent.Agent64.1')
+    try:
+        o.AddTask(down_url, "", "", "", "", -1, 0, 5)
+        o.CommitTasks()
+        flag = True
+    except Exception as e:
+        print(e)
+        print("AddTask is fail!")
+    return flag
+
+def get_download_url(arcurl):
+    # 微信搜索 JackCui-AI 关注公众号,后台回复「B 站」获取视频解析地址
+    jiexi_url = 'xxx'
+    payload = {'url': arcurl}
+    jiexi_req = requests.get(jiexi_url, params=payload)
+    jiexi_bf = BeautifulSoup(jiexi_req.text, 'lxml')
+    jiexi_dn_url = jiexi_bf.iframe.get('src')
+    dn_req = requests.get(jiexi_dn_url)
+    dn_bf = BeautifulSoup(dn_req.text, 'lxml')
+    video_script = dn_bf.find('script', src=None)
+    DPlayer = str(video_script.string)
+    download_url = re.findall(r'\'(http[s]?:(?:[a-zA-Z]|[0-9]|[$-_@.&~+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)\'', DPlayer)[0]
+    download_url = download_url.replace('\\', '')
+    return download_url
+
+space_url = 'https://space.bilibili.com/280793434'
+search_url = 'https://api.bilibili.com/x/space/arc/search'
+mid = space_url.split('/')[-1]
+sess = requests.Session()
+search_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
+                  'Accept-Language': 'zh-CN,zh;q=0.9',
+                  'Accept-Encoding': 'gzip, deflate, br',
+                  'Accept': 'application/json, text/plain, */*'}
+
+# 先用 ps=1 请求一次,拿到 UP 主的视频总数
+ps = 1
+pn = 1
+search_params = {'mid': mid,
+                 'ps': ps,
+                 'tid': 0,
+                 'pn': pn}
+req = sess.get(url=search_url, headers=search_headers, params=search_params, verify=False)
+info = json.loads(req.text)
+video_count = info['data']['page']['count']
+
+# 再按每页 10 个分页,取出所有视频的标题和链接
+ps = 10
+page = math.ceil(video_count/ps)
+videos_list = []
+for pn in range(1, page+1):
+    search_params = {'mid': mid,
+                     'ps': ps,
+                     'tid': 0,
+                     'pn': pn}
+    req = sess.get(url=search_url, headers=search_headers, params=search_params, verify=False)
+    info = json.loads(req.text)
+    vlist = info['data']['list']['vlist']
+    for video in vlist:
+        title = video['title']
+        bvid = video['bvid']
+        vurl = 'https://www.bilibili.com/video/' + bvid
+        videos_list.append([title, vurl])
+print('共 %d 个视频' % len(videos_list))
+all_video = {}
+# 下载前 10 个视频
+for video in videos_list[:10]:
+    download_url = get_download_url(video[1])
+    print(video[0] + ':' + download_url)
+    # 记录"迅雷落地文件名 -> 视频标题"的映射,最后统一重命名
+    xunlei_video_name = download_url.split('?')[0].split('/')[-1]
+    filename = video[0]
+    for c in u' ́☆❤◦\\/:*?"| ':
+        filename = filename.replace(c, '')
+    save_video_name = filename + '.mp4'
+    all_video[xunlei_video_name] = save_video_name
+
+    addTasktoXunlei(download_url)
+    # 弹幕下载:oid 取自下载地址,弹幕为 XML 格式
+    danmu_name = filename + '.xml'
+    danmu_ass = filename + '.ass'
+    oid = download_url.split('/')[6]
+    danmu_url = 'https://api.bilibili.com/x/v1/dm/list.so?oid={}'.format(oid)
+    danmu_header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
+                    'Accept': '*/*',
+                    'Accept-Encoding': 'gzip, deflate, br',
+                    'Accept-Language': 'zh-CN,zh;q=0.9'}
+    with closing(sess.get(danmu_url, headers=danmu_header, stream=True, verify=False)) as response:
+        if response.status_code == 200:
+            with open(danmu_name, 'wb') as file:
+                for data in response.iter_content(chunk_size=1024):
+                    file.write(data)
+        else:
+            print('链接异常')
+    time.sleep(0.5)
+    xml2ass.Danmaku2ASS(danmu_name, danmu_ass, 1280, 720)
+# 视频重命名:等迅雷把文件下完后,改成对应的视频标题
+for key, item in all_video.items():
+    while key not in os.listdir('./'):
+        time.sleep(1)
os.rename(key, item) diff --git a/2020/bilibili/xml2ass.py b/2020/bilibili/xml2ass.py new file mode 100644 index 00000000..eac3f861 --- /dev/null +++ b/2020/bilibili/xml2ass.py @@ -0,0 +1,802 @@ +# The original author of this program, Danmaku2ASS, is StarBrilliant. +# This file is released under General Public License version 3. +# You should have received a copy of General Public License text alongside with +# this program. If not, you can obtain it at http://gnu.org/copyleft/gpl.html . +# This program comes with no warranty, the author will not be resopnsible for +# any damage or problems caused by this program. + +import argparse +import calendar +import gettext +import io +import json +import logging +import math +import os +import random +import re +import sys +import time +import xml.dom.minidom + + +if sys.version_info < (3,): + raise RuntimeError('at least Python 3.0 is required') + +gettext.install('danmaku2ass', os.path.join(os.path.dirname(os.path.abspath(os.path.realpath(sys.argv[0] or 'locale'))), 'locale')) + +def SeekZero(function): + def decorated_function(file_): + file_.seek(0) + try: + return function(file_) + finally: + file_.seek(0) + return decorated_function + + +def EOFAsNone(function): + def decorated_function(*args, **kwargs): + try: + return function(*args, **kwargs) + except EOFError: + return None + return decorated_function + + +@SeekZero +@EOFAsNone +def ProbeCommentFormat(f): + tmp = f.read(1) + if tmp == '[': + return 'Acfun' + # It is unwise to wrap a JSON object in an array! + # See this: http://haacked.com/archive/2008/11/20/anatomy-of-a-subtle-json-vulnerability.aspx/ + # Do never follow what Acfun developers did! + elif tmp == '{': + tmp = f.read(14) + if tmp == '"status_code":': + return 'Tudou' + elif tmp == '"root":{"total': + return 'sH5V' + elif tmp == '<': + tmp = f.read(1) + if tmp == '?': + tmp = f.read(38) + if tmp == 'xml version="1.0" encoding="UTF-8"?>\n<': + return 'Bilibili' # Komica, with the same file format as Bilibili + elif tmp == 'xml version="1.0" encoding="UTF-8"?>\n<': + return 'MioMio' + elif tmp == 'p': + return 'Niconico' # Himawari Douga, with the same file format as Niconico Douga + + +# +# ReadComments**** protocol +# +# Input: +# f: Input file +# fontsize: Default font size +# +# Output: +# yield a tuple: +# (timeline, timestamp, no, comment, pos, color, size, height, width) +# timeline: The position when the comment is replayed +# timestamp: The UNIX timestamp when the comment is submitted +# no: A sequence of 1, 2, 3, ..., used for sorting +# comment: The content of the comment +# pos: 0 for regular moving comment, +# 1 for bottom centered comment, +# 2 for top centered comment, +# 3 for reversed moving comment +# color: Font color represented in 0xRRGGBB, +# e.g. 0xffffff for white +# size: Font size +# height: The estimated height in pixels +# i.e. (comment.count('\n')+1)*size +# width: The estimated width in pixels +# i.e. CalculateLength(comment)*size +# +# After implementing ReadComments****, make sure to update ProbeCommentFormat +# and CommentFormatMap. 
+# + + +def ReadCommentsNiconico(f, fontsize): + NiconicoColorMap = {'red': 0xff0000, 'pink': 0xff8080, 'orange': 0xffcc00, 'yellow': 0xffff00, 'green': 0x00ff00, 'cyan': 0x00ffff, 'blue': 0x0000ff, 'purple': 0xc000ff, 'black': 0x000000, 'niconicowhite': 0xcccc99, 'white2': 0xcccc99, 'truered': 0xcc0033, 'red2': 0xcc0033, 'passionorange': 0xff6600, 'orange2': 0xff6600, 'madyellow': 0x999900, 'yellow2': 0x999900, 'elementalgreen': 0x00cc66, 'green2': 0x00cc66, 'marineblue': 0x33ffcc, 'blue2': 0x33ffcc, 'nobleviolet': 0x6633cc, 'purple2': 0x6633cc} + dom = xml.dom.minidom.parse(f) + comment_element = dom.getElementsByTagName('chat') + for comment in comment_element: + try: + c = str(comment.childNodes[0].wholeText) + if c.startswith('/'): + continue # ignore advanced comments + pos = 0 + color = 0xffffff + size = fontsize + for mailstyle in str(comment.getAttribute('mail')).split(): + if mailstyle == 'ue': + pos = 1 + elif mailstyle == 'shita': + pos = 2 + elif mailstyle == 'big': + size = fontsize*1.44 + elif mailstyle == 'small': + size = fontsize*0.64 + elif mailstyle in NiconicoColorMap: + color = NiconicoColorMap[mailstyle] + yield (max(int(comment.getAttribute('vpos')), 0)*0.01, int(comment.getAttribute('date')), int(comment.getAttribute('no')), c, pos, color, size, (c.count('\n')+1)*size, CalculateLength(c)*size) + except (AssertionError, AttributeError, IndexError, TypeError, ValueError): + logging.warning(_('Invalid comment: %s') % comment.toxml()) + continue + + +def ReadCommentsAcfun(f, fontsize): + comment_element = json.load(f) + for i, comment in enumerate(comment_element): + try: + p = str(comment['c']).split(',') + assert len(p)>= 6 + assert p[2] in ('1', '2', '4', '5', '7') + size = int(p[3])*fontsize/25.0 + if p[2] != '7': + c = str(comment['m']).replace('\\r', '\n').replace('\r', '\n') + yield (float(p[0]), int(p[5]), i, c, {'1': 0, '2': 0, '4': 2, '5': 1}[p[2]], int(p[1]), size, (c.count('\n')+1)*size, CalculateLength(c)*size) + else: + c = dict(json.loads(comment['m'])) + yield (float(p[0]), int(p[5]), i, c, 'acfunpos', int(p[1]), size, 0, 0) + except (AssertionError, AttributeError, IndexError, TypeError, ValueError): + logging.warning(_('Invalid comment: %r') % comment) + continue + + +def ReadCommentsBilibili(f, fontsize): + dom = xml.dom.minidom.parse(f) + comment_element = dom.getElementsByTagName('d') + for i, comment in enumerate(comment_element): + try: + p = str(comment.getAttribute('p')).split(',') + assert len(p)>= 5 + assert p[1] in ('1', '4', '5', '6', '7') + if p[1] != '7': + c = str(comment.childNodes[0].wholeText).replace('/n', '\n') + size = int(p[2])*fontsize/25.0 + yield (float(p[0]), int(p[4]), i, c, {'1': 0, '4': 2, '5': 1, '6': 3}[p[1]], int(p[3]), size, (c.count('\n')+1)*size, CalculateLength(c)*size) + else: # positioned comment + c = str(comment.childNodes[0].wholeText) + yield (float(p[0]), int(p[4]), i, c, 'bilipos', int(p[3]), int(p[2]), 0, 0) + except (AssertionError, AttributeError, IndexError, TypeError, ValueError): + logging.warning(_('Invalid comment: %s') % comment.toxml()) + continue + + +def ReadCommentsTudou(f, fontsize): + comment_element = json.load(f) + for i, comment in enumerate(comment_element['comment_list']): + try: + assert comment['pos'] in (3, 4, 6) + c = str(comment['data']) + assert comment['size'] in (0, 1, 2) + size = {0: 0.64, 1: 1, 2: 1.44}[comment['size']]*fontsize + yield (int(comment['replay_time']*0.001), int(comment['commit_time']), i, c, {3: 0, 4: 2, 6: 1}[comment['pos']], int(comment['color']), size, 
(c.count('\n')+1)*size, CalculateLength(c)*size) + except (AssertionError, AttributeError, IndexError, TypeError, ValueError): + logging.warning(_('Invalid comment: %r') % comment) + continue + + +def ReadCommentsMioMio(f, fontsize): + NiconicoColorMap = {'red': 0xff0000, 'pink': 0xff8080, 'orange': 0xffc000, 'yellow': 0xffff00, 'green': 0x00ff00, 'cyan': 0x00ffff, 'blue': 0x0000ff, 'purple': 0xc000ff, 'black': 0x000000} + dom = xml.dom.minidom.parse(f) + comment_element = dom.getElementsByTagName('data') + for i, comment in enumerate(comment_element): + try: + message = comment.getElementsByTagName('message')[0] + c = str(message.childNodes[0].wholeText) + pos = 0 + size = int(message.getAttribute('fontsize'))*fontsize/25.0 + yield (float(comment.getElementsByTagName('playTime')[0].childNodes[0].wholeText), int(calendar.timegm(time.strptime(comment.getElementsByTagName('times')[0].childNodes[0].wholeText, '%Y-%m-%d %H:%M:%S')))-28800, i, c, {'1': 0, '4': 2, '5': 1}[message.getAttribute('mode')], int(message.getAttribute('color')), size, (c.count('\n')+1)*size, CalculateLength(c)*size) + except (AssertionError, AttributeError, IndexError, TypeError, ValueError): + logging.warning(_('Invalid comment: %s') % comment.toxml()) + continue + + +def ReadCommentsSH5V(f, fontsize): + comment_element = json.load(f) + for i, comment in enumerate(comment_element["root"]["bgs"]): + try: + c_at = str(comment['at']) + c_type = str(comment['type']) + c_date = str(comment['timestamp']) + c_color = str(comment['color']) + c = str(comment['text']) + size = fontsize + if c_type != '7': + yield (float(c_at), int(c_date), i, c, {'0': 0, '1': 0, '4': 2, '5': 1}[c_type], int(c_color[1:], 16), size, (c.count('\n')+1)*size, CalculateLength(c)*size) + else: + c_x = float(comment['x']) + c_y = float(comment['y']) + size = int(comment['size']) + dur = int(comment['dur']) + data1 = float(comment['data1']) + data2 = float(comment['data2']) + data3 = int(comment['data3']) + data4 = int(comment['data4']) + yield (float(c_at), int(c_date), i, c, 'sH5Vpos', int(c_color[1:], 16), size, 0, 0, c_x, c_y, dur, data1, data2, data3, data4) + except (AssertionError, AttributeError, IndexError, TypeError, ValueError): + logging.warning(_('Invalid comment: %r') % comment) + continue + + +CommentFormatMap = {None: None, 'Niconico': ReadCommentsNiconico, 'Acfun': ReadCommentsAcfun, 'Bilibili': ReadCommentsBilibili, 'Tudou': ReadCommentsTudou, 'MioMio': ReadCommentsMioMio, 'sH5V': ReadCommentsSH5V} + + +def WriteCommentBilibiliPositioned(f, c, width, height, styleid): + #BiliPlayerSize = (512, 384) # Bilibili player version 2010 + #BiliPlayerSize = (540, 384) # Bilibili player version 2012 + BiliPlayerSize = (672, 438) # Bilibili player version 2014 + ZoomFactor = GetZoomFactor(BiliPlayerSize, (width, height)) + + def GetPosition(InputPos, isHeight): + isHeight = int(isHeight) # True -> 1 + if isinstance(InputPos, int): + return ZoomFactor[0]*InputPos+ZoomFactor[isHeight+1] + elif isinstance(InputPos, float): + if InputPos> 1: + return ZoomFactor[0]*InputPos+ZoomFactor[isHeight+1] + else: + return BiliPlayerSize[isHeight]*ZoomFactor[0]*InputPos+ZoomFactor[isHeight+1] + else: + try: + InputPos = int(InputPos) + except ValueError: + InputPos = float(InputPos) + return GetPosition(InputPos, isHeight) + + try: + comment_args = safe_list(json.loads(c[3])) + text = ASSEscape(str(comment_args[4]).replace('/n', '\n')) + from_x = comment_args.get(0, 0) + from_y = comment_args.get(1, 0) + to_x = comment_args.get(7, from_x) + to_y = 
comment_args.get(8, from_y) + from_x = round(GetPosition(from_x, False)) + from_y = round(GetPosition(from_y, True)) + to_x = round(GetPosition(to_x, False)) + to_y = round(GetPosition(to_y, True)) + alpha = safe_list(str(comment_args.get(2, '1')).split('-')) + from_alpha = float(alpha.get(0, 1)) + to_alpha = float(alpha.get(1, from_alpha)) + from_alpha = 255-round(from_alpha*255) + to_alpha = 255-round(to_alpha*255) + rotate_z = int(comment_args.get(5, 0)) + rotate_y = int(comment_args.get(6, 0)) + lifetime = float(comment_args.get(3, 4500)) + duration = int(comment_args.get(9, lifetime*1000)) + delay = int(comment_args.get(10, 0)) + fontface = comment_args.get(12) + isborder = comment_args.get(11, 'true') + styles = [] + if (from_x, from_y) == (to_x, to_y): + styles.append('\\pos(%s, %s)' % (from_x, from_y)) + else: + styles.append('\\move(%s, %s, %s, %s, %s, %s)' % (from_x, from_y, to_x, to_y, delay, delay+duration)) + styles.append('\\frx%s\\fry%s\\frz%s\\fax%s\\fay%s' % ConvertFlashRotation(rotate_y, rotate_z, (from_x-ZoomFactor[1])/(width-ZoomFactor[1]*2), (from_y-ZoomFactor[2])/(height-ZoomFactor[2]*2))) + if (from_x, from_y) != (to_x, to_y): + styles.append('\\t(%s, %s, ' % (delay, delay+duration)) + styles.append('\\frx%s\\fry%s\\frz%s\\fax%s\\fay%s' % ConvertFlashRotation(rotate_y, rotate_z, (to_x-ZoomFactor[1])/(width-ZoomFactor[1]*2), (to_y-ZoomFactor[2])/(height-ZoomFactor[2]*2))) + styles.append(')') + if fontface: + styles.append('\\fn%s' % ASSEscape(fontface)) + styles.append('\\fs%s' % round(c[6]*ZoomFactor[0])) + if c[5] != 0xffffff: + styles.append('\\c&H%02X%02X%02X&' % (c[5] & 0xff, (c[5]>> 8) & 0xff, (c[5]>> 16) & 0xff)) + if c[5] == 0x000000: + styles.append('\3円c&HFFFFFF&') + if from_alpha == to_alpha: + styles.append('\\alpha&H%02X' % from_alpha) + elif (from_alpha, to_alpha) == (255, 0): + styles.append('\\fad(%s,0)' % (lifetime*1000)) + elif (from_alpha, to_alpha) == (0, 255): + styles.append('\\fad(0, %s)' % (lifetime*1000)) + else: + styles.append('\\fade(%(from_alpha)s, %(to_alpha)s, %(to_alpha)s, 0, %(end_time)s, %(end_time)s, %(end_time)s)' % {'from_alpha': from_alpha, 'to_alpha': to_alpha, 'end_time': lifetime*1000}) + if isborder == 'false': + styles.append('\\bord0') + f.write('Dialogue: -1,%(start)s,%(end)s,%(styleid)s,,0,0,0,,{%(styles)s}%(text)s\n' % {'start': ConvertTimestamp(c[0]), 'end': ConvertTimestamp(c[0]+lifetime), 'styles': ''.join(styles), 'text': text, 'styleid': styleid}) + except (IndexError, ValueError) as e: + try: + logging.warning(_('Invalid comment: %r') % c[3]) + except IndexError: + logging.warning(_('Invalid comment: %r') % c) + + +def WriteCommentAcfunPositioned(f, c, width, height, styleid): + AcfunPlayerSize = (560, 400) + ZoomFactor = GetZoomFactor(AcfunPlayerSize, (width, height)) + + def GetPosition(InputPos, isHeight): + isHeight = int(isHeight) # True -> 1 + return AcfunPlayerSize[isHeight]*ZoomFactor[0]*InputPos*0.001+ZoomFactor[isHeight+1] + + def GetTransformStyles(x=None, y=None, scale_x=None, scale_y=None, rotate_z=None, rotate_y=None, color=None, alpha=None): + styles = [] + if x is not None and y is not None: + styles.append('\\pos(%s, %s)' % (x, y)) + if scale_x is not None: + styles.append('\\fscx%s' % scale_x) + if scale_y is not None: + styles.append('\\fscy%s' % scale_y) + if rotate_z is not None and rotate_y is not None: + assert x is not None + assert y is not None + styles.append('\\frx%s\\fry%s\\frz%s\\fax%s\\fay%s' % ConvertFlashRotation(rotate_y, rotate_z, (x-ZoomFactor[1])/(width-ZoomFactor[1]*2), 
(y-ZoomFactor[2])/(height-ZoomFactor[2]*2))) + if color is not None: + styles.append('\\c&H%02X%02X%02X&' % (color & 0xff, (color>> 8) & 0xff, (color>> 16) & 0xff)) + if color == 0x000000: + styles.append('\3円c&HFFFFFF&') + if alpha is not None: + alpha = 255-round(alpha*255) + styles.append('\\alpha&H%02X' % alpha) + return styles + + def FlushCommentLine(f, text, styles, start_time, end_time, styleid): + if end_time> start_time: + f.write('Dialogue: -1,%(start)s,%(end)s,%(styleid)s,,0,0,0,,{%(styles)s}%(text)s\n' % {'start': ConvertTimestamp(start_time), 'end': ConvertTimestamp(end_time), 'styles': ''.join(styles), 'text': text, 'styleid': styleid}) + + try: + comment_args = c[3] + text = ASSEscape(str(comment_args['n']).replace('\r', '\n').replace('\r', '\n')) + common_styles = [] + anchor = {0: 7, 1: 8, 2: 9, 3: 4, 4: 5, 5: 6, 6: 1, 7: 2, 8: 3}.get(comment_args.get('c', 0), 7) + if anchor != 7: + common_styles.append('\\an%s' % anchor) + font = comment_args.get('w') + if font: + font = dict(font) + fontface = font.get('f') + if fontface: + common_styles.append('\\fn%s' % ASSEscape(str(fontface))) + fontbold = bool(font.get('b')) + if fontbold: + common_styles.append('\\b1') + common_styles.append('\\fs%s' % round(c[6]*ZoomFactor[0])) + isborder = bool(comment_args.get('b', True)) + if not isborder: + common_styles.append('\\bord0') + to_pos = dict(comment_args.get('p', {'x': 0, 'y': 0})) + to_x = round(GetPosition(int(to_pos.get('x', 0)), False)) + to_y = round(GetPosition(int(to_pos.get('y', 0)), True)) + to_scale_x = round(float(comment_args.get('e', 1.0))*100) + to_scale_y = round(float(comment_args.get('f', 1.0))*100) + to_rotate_z = float(comment_args.get('r', 0.0)) + to_rotate_y = float(comment_args.get('k', 0.0)) + to_color = c[5] + to_alpha = float(comment_args.get('a', 1.0)) + from_time = float(comment_args.get('t', 0.0)) + action_time = float(comment_args.get('l', 3.0)) + actions = list(comment_args.get('z', [])) + transform_styles = GetTransformStyles(to_x, to_y, to_scale_x, to_scale_y, to_rotate_z, to_rotate_y, to_color, to_alpha) + FlushCommentLine(f, text, common_styles+transform_styles, c[0]+from_time, c[0]+from_time+action_time, styleid) + for action in actions: + action = dict(action) + from_x, from_y = to_x, to_y + from_scale_x, from_scale_y = to_scale_x, to_scale_y + from_rotate_z, from_rotate_y = to_rotate_z, to_rotate_y + from_color, from_alpha = to_color, to_alpha + from_time += action_time + action_time = float(action.get('l', 0.0)) + action_styles = [] + if 'x' in action: + to_x = round(GetPosition(int(action['x']), False)) + if 'y' in action: + to_y = round(GetPosition(int(action['y']), True)) + if 'f' in action: + to_scale_x = round(float(action['f'])*100) + action_styles.append('\\fscx%s' % to_scale_x) + if 'g' in action: + to_scale_y = round(float(action['g'])*100) + action_styles.append('\\fscy%s' % to_scale_y) + if 'c' in action: + to_color = int(action['c']) + action_styles.append('\\c&H%02X%02X%02X&' % (to_color & 0xff, (to_color>> 8) & 0xff, (to_color>> 16) & 0xff)) + if 't' in action: + to_alpha = float(action['t']) + action_styles.append('\\alpha&H%02X' % (255-round(to_alpha*255))) + if 'd' in action: + to_rotate_z = float(action['d']) + if 'e' in action: + to_rotate_y = float(action['e']) + if ('x' in action) or ('y' in action): + transform_styles = GetTransformStyles(None, None, from_scale_x, from_scale_y, None, None, from_color, from_alpha) + transform_styles.append('\\move(%s, %s, %s, %s)' % (from_x, from_y, to_x, to_y)) + 
action_styles.append('\\frx%s\\fry%s\\frz%s\\fax%s\\fay%s' % ConvertFlashRotation(to_rotate_y, to_rotate_z, (to_x-ZoomFactor[1])/(width-ZoomFactor[1]*2), (to_y-ZoomFactor[2])/(width-ZoomFactor[2]*2))) + elif ('d' in action) or ('e' in action): + action_styles.append('\\frx%s\\fry%s\\frz%s\\fax%s\\fay%s' % ConvertFlashRotation(to_rotate_y, to_rotate_z, (to_x-ZoomFactor[1])/(width-ZoomFactor[1]*2), (to_y-ZoomFactor[2])/(width-ZoomFactor[2]*2))) + else: + transform_styles = GetTransformStyles(from_x, from_y, from_scale_x, from_scale_y, from_rotate_z, from_rotate_y, from_color, from_alpha) + if action_styles: + transform_styles.append('\\t(%s)' % (''.join(action_styles))) + FlushCommentLine(f, text, common_styles+transform_styles, c[0]+from_time, c[0]+from_time+action_time, styleid) + except (IndexError, ValueError) as e: + logging.warning(_('Invalid comment: %r') % c[3]) + + +def WriteCommentSH5VPositioned(f, c, width, height, styleid): + + def GetTransformStyles(x=None, y=None, fsize=None, rotate_z=None, rotate_y=None, color=None, alpha=None): + styles = [] + if x is not None and y is not None: + styles.append('\\pos(%s, %s)' % (x, y)) + if fsize is not None: + styles.append('\\fs%s' % fsize) + if rotate_y is not None and rotate_z is not None: + styles.append('\\frz%s' % round(rotate_z)) + styles.append('\\fry%s' % round(rotate_y)) + if color is not None: + styles.append('\\c&H%02X%02X%02X&' % (color & 0xff, (color>> 8) & 0xff, (color>> 16) & 0xff)) + if color == 0x000000: + styles.append('\3円c&HFFFFFF&') + if alpha is not None: + alpha = 255-round(alpha*255) + styles.append('\\alpha&H%02X' % alpha) + return styles + + def FlushCommentLine(f, text, styles, start_time, end_time, styleid): + if end_time> start_time: + f.write('Dialogue: -1,%(start)s,%(end)s,%(styleid)s,,0,0,0,,{%(styles)s}%(text)s\n' % {'start': ConvertTimestamp(start_time), 'end': ConvertTimestamp(end_time), 'styles': ''.join(styles), 'text': text, 'styleid': styleid}) + + try: + text = ASSEscape(str(c[3])) + to_x = round(float(c[9])*width) + to_y = round(float(c[10])*height) + to_rotate_z = -int(c[14]) + to_rotate_y = -int(c[15]) + to_color = c[5] + to_alpha = float(c[12]) + #Note: Alpha transition hasn't been worked out yet. + to_size = round(int(c[6])*math.sqrt(width*height/307200)) + #Note: Because sH5V's data is the absolute size of font,temporarily solve by it at present.[*math.sqrt(width/640*height/480)] + #But it seems to be working fine... 
+ from_time = float(c[0]) + action_time = float(c[11])/1000 + transform_styles = GetTransformStyles(to_x, to_y, to_size, to_rotate_z, to_rotate_y, to_color, to_alpha) + FlushCommentLine(f, text, transform_styles, from_time, from_time+action_time, styleid) + except (IndexError, ValueError) as e: + logging.warning(_('Invalid comment: %r') % c[3]) + + +# Result: (f, dx, dy) +# To convert: NewX = f*x+dx, NewY = f*y+dy +def GetZoomFactor(SourceSize, TargetSize): + try: + if (SourceSize, TargetSize) == GetZoomFactor.Cached_Size: + return GetZoomFactor.Cached_Result + except AttributeError: + pass + GetZoomFactor.Cached_Size = (SourceSize, TargetSize) + try: + SourceAspect = SourceSize[0]/SourceSize[1] + TargetAspect = TargetSize[0]/TargetSize[1] + if TargetAspect < SourceAspect: # narrower + ScaleFactor = TargetSize[0]/SourceSize[0] + GetZoomFactor.Cached_Result = (ScaleFactor, 0, (TargetSize[1]-TargetSize[0]/SourceAspect)/2) + elif TargetAspect> SourceAspect: # wider + ScaleFactor = TargetSize[1]/SourceSize[1] + GetZoomFactor.Cached_Result = (ScaleFactor, (TargetSize[0]-TargetSize[1]*SourceAspect)/2, 0) + else: + GetZoomFactor.Cached_Result = (TargetSize[0]/SourceSize[0], 0, 0) + return GetZoomFactor.Cached_Result + except ZeroDivisionError: + GetZoomFactor.Cached_Result = (1, 0, 0) + return GetZoomFactor.Cached_Result + + +# Calculation is based on https://github.com/jabbany/CommentCoreLibrary/issues/5#issuecomment-40087282 +# and https://github.com/m13253/danmaku2ass/issues/7#issuecomment-41489422 +# Input: X relative horizonal coordinate: 0 for left edge, 1 for right edge. +# Y relative vertical coordinate: 0 for top edge, 1 for bottom edge. +# FOV = 1.0/math.tan(100*math.pi/360.0) +# Result: (rotX, rotY, rotZ, shearX, shearY) +def ConvertFlashRotation(rotY, rotZ, X, Y, FOV=math.tan(2*math.pi/9.0)): + def WrapAngle(deg): + return 180-((180-deg)%360) + def CalcPerspectiveCorrection(alpha, X, FOV=FOV): + alpha = WrapAngle(alpha) + if FOV is None: + return alpha + if 0 <= alpha <= 180: + costheta = (FOV*math.cos(alpha*math.pi/180.0)-X*math.sin(alpha*math.pi/180.0))/(FOV+max(2, abs(X)+1)*math.sin(alpha*math.pi/180.0)) + try: + if costheta> 1: + costheta = 1 + raise ValueError + elif costheta < -1: + costheta = -1 + raise ValueError + except ValueError: + logging.error('Clipped rotation angle: (alpha=%s, X=%s), it is a bug!' % (alpha, X)) + theta = math.acos(costheta)*180/math.pi + else: + costheta = (FOV*math.cos(alpha*math.pi/180.0)-X*math.sin(alpha*math.pi/180.0))/(FOV-max(2, abs(X)+1)*math.sin(alpha*math.pi/180.0)) + try: + if costheta> 1: + costheta = 1 + raise ValueError + elif costheta < -1: + costheta = -1 + raise ValueError + except ValueError: + logging.error('Clipped rotation angle: (alpha=%s, X=%s), it is a bug!' 
% (alpha, X)) + theta = -math.acos(costheta)*180/math.pi + return WrapAngle(theta) + X = 2*X-1 + Y = 2*Y-1 + rotY = WrapAngle(rotY) + rotZ = WrapAngle(rotZ) + if rotY == 0 or rotZ == 0: + outX = 0 + outY = -rotY # Positive value means clockwise in Flash + outZ = -rotZ + else: + rotY = rotY*math.pi/180.0 + rotZ = rotZ*math.pi/180.0 + outY = math.atan2(-math.sin(rotY)*math.cos(rotZ), math.cos(rotY))*180/math.pi + outZ = math.atan2(-math.cos(rotY)*math.sin(rotZ), math.cos(rotZ))*180/math.pi + outX = math.asin(math.sin(rotY)*math.sin(rotZ))*180/math.pi + if FOV is not None: + #outX = CalcPerspectiveCorrection(outX, -Y, FOV*0.75) + outY = CalcPerspectiveCorrection(outY, X, FOV) + return (WrapAngle(round(outX)), WrapAngle(round(outY)), WrapAngle(round(outZ)), 0, round(-0.75*Y*math.sin(outY*math.pi/180.0), 3)) + + +def ProcessComments(comments, f, width, height, bottomReserved, fontface, fontsize, alpha, lifetime, reduced, progress_callback): + styleid = 'Danmaku2ASS_%04x' % random.randint(0, 0xffff) + WriteASSHead(f, width, height, fontface, fontsize, alpha, styleid) + rows = [[None]*(height-bottomReserved+1) for i in range(4)] + for idx, i in enumerate(comments): + if progress_callback and idx % 1000 == 0: + progress_callback(idx, len(comments)) + if isinstance(i[4], int): + row = 0 + rowmax = height-bottomReserved-i[7] + while row <= rowmax: + freerows = TestFreeRows(rows, i, row, width, height, bottomReserved, lifetime) + if freerows>= i[7]: + MarkCommentRow(rows, i, row) + WriteComment(f, i, row, width, height, bottomReserved, fontsize, lifetime, styleid) + break + else: + row += freerows or 1 + else: + if not reduced: + row = FindAlternativeRow(rows, i, height, bottomReserved) + MarkCommentRow(rows, i, row) + WriteComment(f, i, row, width, height, bottomReserved, fontsize, lifetime, styleid) + elif i[4] == 'bilipos': + WriteCommentBilibiliPositioned(f, i, width, height, styleid) + elif i[4] == 'acfunpos': + WriteCommentAcfunPositioned(f, i, width, height, styleid) + elif i[4] == 'sH5Vpos': + WriteCommentSH5VPositioned(f, i, width, height, styleid) + else: + logging.warning(_('Invalid comment: %r') % i[3]) + if progress_callback: + progress_callback(len(comments), len(comments)) + + +def TestFreeRows(rows, c, row, width, height, bottomReserved, lifetime): + res = 0 + rowmax = height-bottomReserved + targetRow = None + if c[4] in (1, 2): + while row < rowmax and res < c[7]: + if targetRow != rows[c[4]][row]: + targetRow = rows[c[4]][row] + if targetRow and targetRow[0]+lifetime> c[0]: + break + row += 1 + res += 1 + else: + try: + thresholdTime = c[0]-lifetime*(1-width/(c[8]+width)) + except ZeroDivisionError: + thresholdTime = c[0]-lifetime + while row < rowmax and res < c[7]: + if targetRow != rows[c[4]][row]: + targetRow = rows[c[4]][row] + try: + if targetRow and (targetRow[0]> thresholdTime or targetRow[0]+targetRow[8]*lifetime/(targetRow[8]+width)> c[0]): + break + except ZeroDivisionError: + pass + row += 1 + res += 1 + return res + + +def FindAlternativeRow(rows, c, height, bottomReserved): + res = 0 + for row in range(height-bottomReserved-math.ceil(c[7])): + if not rows[c[4]][row]: + return row + elif rows[c[4]][row][0] < rows[c[4]][res][0]: + res = row + return res + + +def MarkCommentRow(rows, c, row): + try: + for i in range(row, row+math.ceil(c[7])): + rows[c[4]][i] = c + except IndexError: + pass + + +def WriteASSHead(f, width, height, fontface, fontsize, alpha, styleid): + f.write( +''' +[Script Info] +; Script generated by Danmaku2ASS +; https://github.com/m13253/danmaku2ass 
+Script Updated By: Danmaku2ASS (https://github.com/m13253/danmaku2ass)
+ScriptType: v4.00+
+WrapStyle: 2
+Collisions: Normal
+PlayResX: %(width)s
+PlayResY: %(height)s
+ScaledBorderAndShadow: yes
+
+[V4+ Styles]
+Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
+Style: %(styleid)s, %(fontface)s, %(fontsize)s, &H%(alpha)02XFFFFFF, &H%(alpha)02XFFFFFF, &H%(alpha)02X000000, &H%(alpha)02X000000, 0, 0, 0, 0, 100, 100, 0.00, 0.00, 1, %(outline)s, 0, 7, 0, 0, 0, 0
+
+[Events]
+Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+''' % {'width': width, 'height': height, 'fontface': fontface, 'fontsize': round(fontsize), 'alpha': 255-round(alpha*255), 'outline': round(fontsize/25), 'styleid': styleid}
+    )
+
+
+def WriteComment(f, c, row, width, height, bottomReserved, fontsize, lifetime, styleid):
+    text = ASSEscape(c[3])
+    styles = []
+    if c[4] == 1:
+        styles.append('\\an8\\pos(%(halfwidth)s, %(row)s)' % {'halfwidth': round(width/2), 'row': row})
+    elif c[4] == 2:
+        styles.append('\\an2\\pos(%(halfwidth)s, %(row)s)' % {'halfwidth': round(width/2), 'row': ConvertType2(row, height, bottomReserved)})
+    elif c[4] == 3:
+        styles.append('\\move(%(neglen)s, %(row)s, %(width)s, %(row)s)' % {'width': width, 'row': row, 'neglen': -math.ceil(c[8])})
+    else:
+        styles.append('\\move(%(width)s, %(row)s, %(neglen)s, %(row)s)' % {'width': width, 'row': row, 'neglen': -math.ceil(c[8])})
+    if not (-1 < c[6]-fontsize < 1):
+        styles.append('\\fs%s' % round(c[6]))
+    if c[5] != 0xffffff:
+        styles.append('\\c&H%02X%02X%02X&' % (c[5] & 0xff, (c[5] >> 8) & 0xff, (c[5] >> 16) & 0xff))
+        if c[5] == 0x000000:
+            styles.append('\\3c&HFFFFFF&')
+    f.write('Dialogue: 2,%(start)s,%(end)s,%(styleid)s,,0000,0000,0000,,{%(styles)s}%(text)s\n' % {'start': ConvertTimestamp(c[0]), 'end': ConvertTimestamp(c[0]+lifetime), 'styles': ''.join(styles), 'text': text, 'styleid': styleid})
+
+
+def ASSEscape(s):
+    return '\\N'.join((i or ' ' for i in str(s).replace('\\', '\\\\').replace('{', '\\{').replace('}', '\\}').split('\n')))
+
+
+def CalculateLength(s):
+    return max(map(len, s.split('\n')))  # May not be accurate
+
+
+def ConvertTimestamp(timestamp):
+    timestamp = round(timestamp*100.0)
+    hour, minute = divmod(timestamp, 360000)
+    minute, second = divmod(minute, 6000)
+    second, centsecond = divmod(second, 100)
+    return '%d:%02d:%02d.%02d' % (int(hour), int(minute), int(second), int(centsecond))
+
+
+def ConvertType2(row, height, bottomReserved):
+    return height-bottomReserved-row
+
+
+def ConvertToFile(filename_or_file, *args, **kwargs):
+    if isinstance(filename_or_file, bytes):
+        filename_or_file = str(bytes(filename_or_file).decode('utf-8', 'replace'))
+    if isinstance(filename_or_file, str):
+        return open(filename_or_file, *args, **kwargs)
+    else:
+        return filename_or_file
+
+
+def FilterBadChars(f):
+    s = f.read()
+    s = re.sub('[\\x00-\\x08\\x0b\\x0c\\x0e-\\x1f]', '\ufffd', s)
+    return io.StringIO(s)
+
+
+class safe_list(list):
+    def get(self, index, default=None):
+        try:
+            return self[index]
+        except IndexError:
+            return default
+
+
+def export(func):
+    global __all__
+    try:
+        __all__.append(func.__name__)
+    except NameError:
+        __all__ = [func.__name__]
+    return func
+
+
+@export
+def Danmaku2ASS(input_files, output_file, stage_width, stage_height, reserve_blank=0, font_face=_('(FONT) sans-serif')[7:], font_size=25.0, text_opacity=1.0, comment_duration=5.0, is_reduce_comments=False, progress_callback=None):
+    fo = None
+    comments = ReadComments(input_files, font_size)
+    try:
+        if output_file:
+            fo = ConvertToFile(output_file, 'w', encoding='utf-8-sig', errors='replace', newline='\r\n')
+        else:
+            fo = sys.stdout
+        ProcessComments(comments, fo, stage_width, stage_height, reserve_blank, font_face, font_size, text_opacity, comment_duration, is_reduce_comments, progress_callback)
+    finally:
+        if output_file and fo != output_file:
+            fo.close()
+
+
+@export
+def ReadComments(input_files, font_size=25.0, progress_callback=None):
+    if isinstance(input_files, bytes):
+        input_files = str(bytes(input_files).decode('utf-8', 'replace'))
+    if isinstance(input_files, str):
+        input_files = [input_files]
+    else:
+        input_files = list(input_files)
+    comments = []
+    for idx, i in enumerate(input_files):
+        if progress_callback:
+            progress_callback(idx, len(input_files))
+        with ConvertToFile(i, 'r', encoding='utf-8', errors='replace') as f:
+            CommentProcessor = GetCommentProcessor(f)
+            if not CommentProcessor:
+                raise ValueError(_('Unknown comment file format: %s') % i)
+            comments.extend(CommentProcessor(FilterBadChars(f), font_size))
+    if progress_callback:
+        progress_callback(len(input_files), len(input_files))
+    comments.sort()
+    return comments
+
+
+@export
+def GetCommentProcessor(input_file):
+    return CommentFormatMap[ProbeCommentFormat(input_file)]
+
+
+def main():
+    if len(sys.argv) == 1:
+        sys.argv.append('--help')
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-o', '--output', metavar=_('OUTPUT'), help=_('Output file'))
+    parser.add_argument('-s', '--size', metavar=_('WIDTHxHEIGHT'), required=True, help=_('Stage size in pixels'))
+    parser.add_argument('-fn', '--font', metavar=_('FONT'), help=_('Specify font face [default: %s]') % _('(FONT) sans-serif')[7:], default=_('(FONT) sans-serif')[7:])
+    parser.add_argument('-fs', '--fontsize', metavar=_('SIZE'), help=(_('Default font size [default: %s]') % 25), type=float, default=25.0)
+    parser.add_argument('-a', '--alpha', metavar=_('ALPHA'), help=_('Text opacity'), type=float, default=1.0)
+    parser.add_argument('-l', '--lifetime', metavar=_('SECONDS'), help=_('Duration of comment display [default: %s]') % 5, type=float, default=5.0)
+    parser.add_argument('-p', '--protect', metavar=_('HEIGHT'), help=_('Reserve blank on the bottom of the stage'), type=int, default=0)
+    parser.add_argument('-r', '--reduce', action='store_true', help=_('Reduce the amount of comments if stage is full'))
+    parser.add_argument('file', metavar=_('FILE'), nargs='+', help=_('Comment file to be processed'))
+    args = parser.parse_args()
+    try:
+        width, height = str(args.size).split('x', 1)
+        width = int(width)
+        height = int(height)
+    except ValueError:
+        raise ValueError(_('Invalid stage size: %r') % args.size)
+    Danmaku2ASS(args.file, args.output, width, height, args.protect, args.font, args.fontsize, args.alpha, args.lifetime, args.reduce)
+
+
+if __name__ == '__main__':
+    main()
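# 补充示例(非原文件内容):作为模块调用时的用法,download.py 里就是这样把
# B 站弹幕 XML 转成 ASS 的;关键字参数的含义见上面 main() 里的命令行定义:
import xml2ass
xml2ass.Danmaku2ASS('弹幕.xml', '弹幕.ass', 1280, 720,
                    reserve_blank=0, font_size=25.0,
                    text_opacity=1.0, comment_duration=5.0)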
diff --git a/2020/dmzj/cartoon.py b/2020/dmzj/cartoon.py
new file mode 100644
index 00000000..a1546a0b
--- /dev/null
+++ b/2020/dmzj/cartoon.py
@@ -0,0 +1,74 @@
+import requests
+import os
+import re
+from bs4 import BeautifulSoup
+from contextlib import closing
+from tqdm import tqdm
+import time
+
+"""
+    Author:
+        Jack Cui
+    Wechat:
+        https://mp.weixin.qq.com/s/OCWwRVDFNslIuKyiCVUoTA
+"""
+
+# 创建保存目录
+save_dir = '妖神记'
+if save_dir not in os.listdir('./'):
+    os.mkdir(save_dir)
+target_url = "https://www.dmzj.com/info/yaoshenji.html"
+
+# 获取动漫章节链接和章节名
+r = requests.get(url=target_url)
+bs = BeautifulSoup(r.text, 'lxml')
+list_con_li = bs.find('ul', class_="list_con_li")
+cartoon_list = list_con_li.find_all('a')
+chapter_names = []
+chapter_urls = []
+for cartoon in cartoon_list:
+    href = cartoon.get('href')
+    name = cartoon.text
+    chapter_names.insert(0, name)
+    chapter_urls.insert(0, href)
+
+# 下载漫画
+for i, url in enumerate(tqdm(chapter_urls)):
+    download_header = {
+        'Referer': url
+    }
+    name = chapter_names[i]
+    # 去掉章节名里的 "."
+    while '.' in name:
+        name = name.replace('.', '')
+    chapter_save_dir = os.path.join(save_dir, name)
+    if name not in os.listdir(save_dir):
+        os.mkdir(chapter_save_dir)
+    r = requests.get(url=url)
+    html = BeautifulSoup(r.text, 'lxml')
+    script_info = html.script
+    # 图片 id 为 13 或 14 位数字,13 位的先补一个 0,统一按整数排序
+    pics = re.findall(r'\d{13,14}', str(script_info))
+    for j, pic in enumerate(pics):
+        if len(pic) == 13:
+            pics[j] = pic + '0'
+    pics = sorted(pics, key=lambda x: int(x))
+    chapterpic_hou = re.findall(r'\|(\d{5})\|', str(script_info))[0]
+    chapterpic_qian = re.findall(r'\|(\d{4})\|', str(script_info))[0]
+    for idx, pic in enumerate(pics):
+        # 末位是补上的 0,说明真实 id 是 13 位,拼 URL 前去掉
+        if pic[-1] == '0':
+            url = 'https://images.dmzj.com/img/chapterpic/' + chapterpic_qian + '/' + chapterpic_hou + '/' + pic[:-1] + '.jpg'
+        else:
+            url = 'https://images.dmzj.com/img/chapterpic/' + chapterpic_qian + '/' + chapterpic_hou + '/' + pic + '.jpg'
+        pic_name = '%03d.jpg' % (idx + 1)
+        pic_save_path = os.path.join(chapter_save_dir, pic_name)
+        with closing(requests.get(url, headers=download_header, stream=True)) as response:
+            chunk_size = 1024
+            content_size = int(response.headers['content-length'])
+            if response.status_code == 200:
+                with open(pic_save_path, "wb") as file:
+                    for data in response.iter_content(chunk_size=chunk_size):
+                        file.write(data)
+            else:
+                print('链接异常')
+        time.sleep(10)
\ No newline at end of file
diff --git a/2020/images/gzh-1.jpg b/2020/images/gzh-1.jpg
new file mode 100644
index 00000000..b49e5753
Binary files /dev/null and b/2020/images/gzh-1.jpg differ
diff --git a/2020/taobao/1.png b/2020/taobao/1.png
new file mode 100644
index 00000000..2d207c97
Binary files /dev/null and b/2020/taobao/1.png differ
diff --git a/2020/taobao/taobao_login.py b/2020/taobao/taobao_login.py
new file mode 100644
index 00000000..a24d2016
--- /dev/null
+++ b/2020/taobao/taobao_login.py
@@ -0,0 +1,99 @@
+from selenium import webdriver
+import logging
+import time
+from selenium.common.exceptions import NoSuchElementException, WebDriverException
+from selenium.webdriver import ActionChains
+
+import pyautogui
+pyautogui.PAUSE = 0.5
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+"""
+微信公众号 JackCui-AI
+更多精彩教程、源码尽在微信公众号
+"""
+
+class taobao():
+    def __init__(self):
+        # 注意用原始字符串,否则路径里的 \t 等会被当成转义字符
+        self.browser = webdriver.Chrome(r"path\to\your\chromedriver.exe")
+        # 最大化窗口
+        self.browser.maximize_window()
+        self.browser.implicitly_wait(5)
+        self.domain = 'http://www.taobao.com'
+        self.action_chains = ActionChains(self.browser)
+
+    def login(self, username, password):
+        while True:
+            self.browser.get(self.domain)
+            time.sleep(1)
+
+            # 熟悉 xpath 的话,下面几步也可以用 class/id 定位来简化:
+            # self.browser.find_element_by_class_name('h').click()
+            # self.browser.find_element_by_id('fm-login-id').send_keys(username)
+            # self.browser.find_element_by_id('fm-login-password').send_keys(password)
+            self.browser.find_element_by_xpath('//*[@id="J_SiteNavLogin"]/div[1]/div[1]/a[1]').click()
+            self.browser.find_element_by_xpath('//*[@id="fm-login-id"]').send_keys(username)
+            self.browser.find_element_by_xpath('//*[@id="fm-login-password"]').send_keys(password)
+            time.sleep(1)
+
+            try:
+                # 出现验证码,滑动验证
+                slider = self.browser.find_element_by_xpath("//span[contains(@class, 'btn_slide')]")
+                if slider.is_displayed():
+                    # 拖拽滑块
+                    self.action_chains.drag_and_drop_by_offset(slider, 258, 0).perform()
+                    time.sleep(0.5)
+                    # 释放滑块,相当于拖拽之后松开鼠标
+                    self.action_chains.release().perform()
+            except (NoSuchElementException, WebDriverException):
+                logger.info('未出现登录验证码')
+
+            # 用 xpath 也能定位到登录按钮,但直接点击无法登录,需要用 pyautogui 完成点击事件
+            # self.browser.find_element_by_class_name('password-login').click()
+            # self.browser.find_element_by_xpath('//*[@id="login-form"]/div[4]/button').click()
+            # 1.png 是登录按钮的截图,pyautogui 在屏幕上做模板匹配后点击
+            coords = pyautogui.locateOnScreen('1.png')
+            x, y = pyautogui.center(coords)
+            pyautogui.leftClick(x, y)
+
+            nickname = self.get_nickname()
+            if nickname:
+                logger.info('登录成功,昵称为:' + nickname)
+                break
+            logger.debug('登录出错,5s后继续登录')
+            time.sleep(5)
+
+    def get_nickname(self):
+        self.browser.get(self.domain)
+        time.sleep(0.5)
+        try:
+            return self.browser.find_element_by_class_name('site-nav-user').text
+        except NoSuchElementException:
+            return ''
+
+    def clear_cart(self):
+        cart = self.browser.find_element_by_xpath('//*[@id="J_MiniCart"]')
+        if cart.is_displayed():
+            cart.click()
+        select = self.browser.find_element_by_xpath('//*[@id="J_SelectAll1"]/div/label')
+        if select.is_displayed():
+            select.click()
+        time.sleep(0.5)
+        go = self.browser.find_element_by_xpath('//*[@id="J_Go"]')
+        if go.is_displayed():
+            go.click()
+        submit = self.browser.find_element_by_xpath('//*[@id="submitOrderPC_1"]/div/a[2]')
+        if submit.is_displayed():
+            submit.click()
+
+
+if __name__ == '__main__':
+    # 填入自己的用户名、密码
+    username = 'username'
+    password = 'password'
+    tb = taobao()
+    tb.login(username, password)
+    # tb.clear_cart()
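# 补充示例(非原脚本内容):一次性把滑块拖满 258px 容易被风控判定为机器操作。
# 一个常见的替代思路(仅为示意,原脚本并未使用)是分段、带随机停顿地拖动:
import random
import time
from selenium.webdriver import ActionChains

def drag_slider_slowly(browser, slider, distance=258):
    """按住滑块,分多段随机小步拖到底,模拟人手操作。"""
    ActionChains(browser).click_and_hold(slider).perform()
    moved = 0
    while moved < distance:
        step = min(random.randint(15, 40), distance - moved)
        ActionChains(browser).move_by_offset(step, 0).perform()
        moved += step
        time.sleep(random.uniform(0.05, 0.2))
    ActionChains(browser).release().perform()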
diff --git a/2020/xbqg/xbqg_spider.py b/2020/xbqg/xbqg_spider.py
new file mode 100644
index 00000000..5dcd10b7
--- /dev/null
+++ b/2020/xbqg/xbqg_spider.py
@@ -0,0 +1,40 @@
+import requests
+from tqdm import tqdm
+from bs4 import BeautifulSoup
+
+"""
+    Author:
+        Jack Cui
+    Wechat:
+        https://mp.weixin.qq.com/s/OCWwRVDFNslIuKyiCVUoTA
+"""
+
+def get_content(target):
+    # 单章正文在 id 为 content 的 div 里,段落之间以四个全角空格分隔
+    req = requests.get(url=target)
+    req.encoding = 'utf-8'
+    html = req.text
+    bf = BeautifulSoup(html, 'lxml')
+    texts = bf.find('div', id='content')
+    content = texts.text.strip().split('\xa0'*4)
+    return content
+
+if __name__ == '__main__':
+    server = 'https://www.xsbiquge.com'
+    book_name = '诡秘之主.txt'
+    target = 'https://www.xsbiquge.com/15_15338/'
+    req = requests.get(url=target)
+    req.encoding = 'utf-8'
+    html = req.text
+    chapter_bs = BeautifulSoup(html, 'lxml')
+    chapters = chapter_bs.find('div', id='list')
+    chapters = chapters.find_all('a')
+    for chapter in tqdm(chapters):
+        chapter_name = chapter.string
+        url = server + chapter.get('href')
+        content = get_content(url)
+        with open(book_name, 'a', encoding='utf-8') as f:
+            f.write(chapter_name)
+            f.write('\n')
+            f.write('\n'.join(content))
+            f.write('\n')
\ No newline at end of file
diff --git a/2020/zycjw/video_download.py b/2020/zycjw/video_download.py
new file mode 100644
index 00000000..89914ab7
--- /dev/null
+++ b/2020/zycjw/video_download.py
@@ -0,0 +1,64 @@
+import os
+import ffmpy3
+import requests
+from bs4 import BeautifulSoup
+from multiprocessing.dummy import Pool as ThreadPool
+
+search_keyword = '越狱第一季'
+search_url = 'http://www.jisudhw.com/index.php'
+search_params = {
+    'm': 'vod-search'
+}
+search_headers = {
+    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
+    'Referer': 'http://www.jisudhw.com/',
+    'Origin': 'http://www.jisudhw.com',
+    'Host': 'www.jisudhw.com'
+}
+search_data = {
+    'wd': search_keyword,
+    'submit': 'search'
+}
+
+
+video_dir = ''
+
+r = requests.post(url=search_url, params=search_params, headers=search_headers, data=search_data)
+r.encoding = 'utf-8'
+server = 'http://www.jisudhw.com'
+search_html = BeautifulSoup(r.text, 'lxml')
+search_spans = search_html.find_all('span', class_='xing_vb4')
+for span in search_spans:
+    url = server + span.a.get('href')
+    name = span.a.string
+    print(name)
+    print(url)
+    video_dir = name
+    if name not in os.listdir('./'):
+        os.mkdir(name)
+
+    # 详情页里每集的 m3u8 地址放在 input 标签的 value 里
+    detail_url = url
+    r = requests.get(url=detail_url)
+    r.encoding = 'utf-8'
+    detail_bf = BeautifulSoup(r.text, 'lxml')
+    num = 1
+    search_res = {}
+    for each_url in detail_bf.find_all('input'):
+        if 'm3u8' in each_url.get('value'):
+            url = each_url.get('value')
+            if url not in search_res.keys():
+                search_res[url] = num
+                print('第%03d集:' % num)
+                print(url)
+                num += 1
+
+def downVideo(url):
+    num = search_res[url]
+    name = os.path.join(video_dir, '第%03d集.mp4' % num)
+    ffmpy3.FFmpeg(inputs={url: None}, outputs={name: None}).run()
+
+# 开一个 8 线程的线程池并发下载
+pool = ThreadPool(8)
+results = pool.map(downVideo, search_res.keys())
+pool.close()
+pool.join()
\ No newline at end of file
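# 补充示例(非原脚本内容):ffmpy3.FFmpeg(inputs={url: None}, outputs={name: None})
# 等价于执行 `ffmpeg -i <m3u8 地址> <输出文件>`,即边下边转封装。
# 如果只想复制音视频流、不重新编码,可以给输出传选项字符串
# (能否成功取决于具体的流,此处 URL 为假设值):
import ffmpy3
ffmpy3.FFmpeg(
    inputs={'http://example.com/index.m3u8': None},
    outputs={'第001集.mp4': '-c copy'}
).run()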
diff --git a/Netease/Netease.py b/Netease/Netease.py
new file mode 100644
index 00000000..1c5aacee
--- /dev/null
+++ b/Netease/Netease.py
@@ -0,0 +1,236 @@
+# -*- coding:utf-8 -*-
+import requests, sys, click, re, base64, binascii, json, os
+from Crypto.Cipher import AES
+from http import cookiejar
+
+"""
+Website:http://cuijiahua.com
+Author:Jack Cui
+Refer:https://github.com/darknessomi/musicbox
+"""
+
+class Encrypted():
+    """
+    加密算法(网易云 weapi 的登录参数加密)
+    """
+    def __init__(self):
+        self.modulus = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
+        self.nonce = '0CoJUm6Qyw8W8jud'
+        self.pub_key = '010001'
+
+    # 登录加密算法,基于 https://github.com/stkevintan/nw_musicbox 脚本实现
+    def encrypted_request(self, text):
+        text = json.dumps(text)
+        sec_key = self.create_secret_key(16)
+        enc_text = self.aes_encrypt(self.aes_encrypt(text, self.nonce), sec_key.decode('utf-8'))
+        enc_sec_key = self.rsa_encrypt(sec_key, self.pub_key, self.modulus)
+        data = {'params': enc_text, 'encSecKey': enc_sec_key}
+        return data
+
+    def aes_encrypt(self, text, secKey):
+        pad = 16 - len(text) % 16
+        text = text + chr(pad) * pad
+        encryptor = AES.new(secKey.encode('utf-8'), AES.MODE_CBC, b'0102030405060708')
+        ciphertext = encryptor.encrypt(text.encode('utf-8'))
+        ciphertext = base64.b64encode(ciphertext).decode('utf-8')
+        return ciphertext
+
+    def rsa_encrypt(self, text, pubKey, modulus):
+        text = text[::-1]
+        rs = pow(int(binascii.hexlify(text), 16), int(pubKey, 16), int(modulus, 16))
+        return format(rs, 'x').zfill(256)
+
+    def create_secret_key(self, size):
+        return binascii.hexlify(os.urandom(size))[:16]
+
+
+class Song():
+    """
+    歌曲对象,用于存储歌曲的信息
+    """
+    def __init__(self, song_id, song_name, song_num, song_url=None):
+        self.song_id = song_id
+        self.song_name = song_name
+        self.song_num = song_num
+        self.song_url = '' if song_url is None else song_url
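# 补充示例(非原脚本内容):Encrypted.encrypted_request 的用法,
# 下面 Crawler.post_request 内部就是这样组装 POST 表单的:
ep = Encrypted()
data = ep.encrypted_request({'s': u'七月上', 'type': 1, 'offset': 0, 'sub': 'false', 'limit': 9})
# data 形如 {'params': '...', 'encSecKey': '...'},可直接作为表单 POST 给 weapi 接口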
+class Crawler():
+    """
+    网易云爬取 API
+    """
+    def __init__(self, timeout=60, cookie_path='.'):
+        self.headers = {
+            'Accept': '*/*',
+            'Accept-Encoding': 'gzip,deflate,sdch',
+            'Accept-Language': 'zh-CN,zh;q=0.8,gl;q=0.6,zh-TW;q=0.4',
+            'Connection': 'keep-alive',
+            'Content-Type': 'application/x-www-form-urlencoded',
+            'Host': 'music.163.com',
+            'Referer': 'http://music.163.com/search/',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
+        }
+        self.session = requests.Session()
+        self.session.headers.update(self.headers)
+        self.session.cookies = cookiejar.LWPCookieJar(cookie_path)
+        self.download_session = requests.Session()
+        self.timeout = timeout
+        self.ep = Encrypted()
+
+    def post_request(self, url, params):
+        """
+        Post 请求
+        :return: 字典
+        """
+        data = self.ep.encrypted_request(params)
+        resp = self.session.post(url, data=data, timeout=self.timeout)
+        result = resp.json()
+        if result['code'] != 200:
+            click.echo('post_request error')
+        else:
+            return result
+
+    def search(self, search_content, search_type, limit=9):
+        """
+        搜索 API
+        :params search_content: 搜索内容
+        :params search_type: 搜索类型
+        :params limit: 返回结果数量
+        :return: 字典
+        """
+        url = 'http://music.163.com/weapi/cloudsearch/get/web?csrf_token='
+        params = {'s': search_content, 'type': search_type, 'offset': 0, 'sub': 'false', 'limit': limit}
+        result = self.post_request(url, params)
+        return result
+
+    def search_song(self, song_name, song_num, quiet=True, limit=9):
+        """
+        根据音乐名搜索
+        :params song_name: 音乐名
+        :params song_num: 下载的歌曲数
+        :params quiet: 自动选择匹配最优结果
+        :params limit: 返回结果数量
+        :return: Song 对象
+        """
+        result = self.search(song_name, search_type=1, limit=limit)
+
+        if result['result']['songCount'] <= 0:
+            click.echo('Song {} not existed.'.format(song_name))
+        else:
+            songs = result['result']['songs']
+            if quiet:
+                song_id, song_name = songs[0]['id'], songs[0]['name']
+                song = Song(song_id=song_id, song_name=song_name, song_num=song_num)
+                return song
+
+    def get_song_url(self, song_id, bit_rate=320000):
+        """
+        获得歌曲的下载地址
+        :params song_id: 音乐 ID
+        :params bit_rate: {'MD 128k': 128000, 'HD 320k': 320000}
+        :return: 歌曲下载地址
+        """
+        url = 'http://music.163.com/weapi/song/enhance/player/url?csrf_token='
+        csrf = ''
+        params = {'ids': [song_id], 'br': bit_rate, 'csrf_token': csrf}
+        result = self.post_request(url, params)
+        # 歌曲下载地址
+        song_url = result['data'][0]['url']
+
+        # 歌曲不存在(版权原因 url 为空)
+        if song_url is None:
+            click.echo('Song {} is not available due to copyright issue.'.format(song_id))
+        else:
+            return song_url
+
+    def get_song_by_url(self, song_url, song_name, song_num, folder):
+        """
+        下载歌曲到本地
+        :params song_url: 歌曲下载地址
+        :params song_name: 歌曲名字
+        :params song_num: 下载的歌曲数
+        :params folder: 保存路径
+        """
+        if not os.path.exists(folder):
+            os.makedirs(folder)
+        fpath = os.path.join(folder, str(song_num) + '_' + song_name + '.mp3')
+        if sys.platform == 'win32' or sys.platform == 'cygwin':
+            # Windows 下去掉文件名中的非法字符
+            valid_name = re.sub(r'[<>:"/\\|?*]', '', song_name)
+            if valid_name != song_name:
+                click.echo('{} will be saved as: {}.mp3'.format(song_name, valid_name))
+                fpath = os.path.join(folder, str(song_num) + '_' + valid_name + '.mp3')
+
+        if not os.path.exists(fpath):
+            resp = self.download_session.get(song_url, timeout=self.timeout, stream=True)
+            length = int(resp.headers.get('content-length'))
+            label = 'Downloading {} {}kb'.format(song_name, int(length/1024))
+
+            with click.progressbar(length=length, label=label) as progressbar:
+                with open(fpath, 'wb') as song_file:
+                    for chunk in resp.iter_content(chunk_size=1024):
+                        if chunk:
+                            song_file.write(chunk)
+                            progressbar.update(1024)
+
+
+class Netease():
+    """
+    网易云音乐下载
+    """
+    def __init__(self, timeout, folder, quiet, cookie_path):
+        self.crawler = Crawler(timeout, cookie_path)
+        self.folder = '.' if folder is None else folder
+        self.quiet = quiet
+
+    def download_song_by_search(self, song_name, song_num):
+        """
+        根据歌曲名进行搜索
+        :params song_name: 歌曲名字
+        :params song_num: 下载的歌曲数
+        """
+        try:
+            song = self.crawler.search_song(song_name, song_num, self.quiet)
+        except Exception as e:
+            click.echo('download_song_by_search error: %s' % e)
+            return
+        # 如果找到了音乐,则下载
+        if song is not None:
+            self.download_song_by_id(song.song_id, song.song_name, song.song_num, self.folder)
+
+    def download_song_by_id(self, song_id, song_name, song_num, folder='.'):
+        """
+        通过歌曲的 ID 下载
+        :params song_id: 歌曲 ID
+        :params song_name: 歌曲名
+        :params song_num: 下载的歌曲数
+        :params folder: 保存地址
+        """
+        try:
+            url = self.crawler.get_song_url(song_id)
+            # 去掉文件名中的非法字符
+            song_name = song_name.replace('/', '')
+            song_name = song_name.replace('.', '')
+            self.crawler.get_song_by_url(url, song_name, song_num, folder)
+        except Exception as e:
+            click.echo('download_song_by_id error: %s' % e)
+
+
+if __name__ == '__main__':
+    timeout = 60
+    output = 'Musics'
+    quiet = True
+    cookie_path = 'Cookie'
+    netease = Netease(timeout, output, quiet, cookie_path)
+    music_list_name = 'music_list.txt'
+    # 如果 music 列表存在,那么开始下载
+    if os.path.exists(music_list_name):
+        with open(music_list_name, 'r') as f:
+            music_list = list(map(lambda x: x.strip(), f.readlines()))
+        for song_num, song_name in enumerate(music_list):
+            netease.download_song_by_search(song_name, song_num + 1)
+    else:
+        click.echo('music_list.txt not exist.')
\ No newline at end of file
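# 补充示例(非原脚本内容):不走 music_list.txt,直接在代码里下载单曲,
# 参数与上面 __main__ 中的用法一致:
netease = Netease(timeout=60, folder='Musics', quiet=True, cookie_path='Cookie')
netease.download_song_by_search(u'七月上', 1)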
注:2020年最新连载教程请移步:[Python Spider 2020](https://github.com/Jack-Cherish/python-spider/tree/master/2020 "Python Spider 2020") -* 贵有恒,何必三更起五更睡;最无益,只怕一日暴十寒。 +免责声明: -* [我的博客](http://blog.csdn.net/c406495762 "悬停显示") +大家请以学习为目的使用本仓库,爬虫违法违规的案件:https://github.com/HiddenStrawberry/Crawler_Illegal_Cases_In_China -* Python3爬虫 +本仓库的所有内容仅供学习和参考之用,禁止用于商业用途。任何人或组织不得将本仓库的内容用于非法用途或侵犯他人合法权益。本仓库所涉及的爬虫技术仅用于学习和研究,不得用于对其他平台进行大规模爬虫或其他非法行为。对于因使用本仓库内容而引起的任何法律责任,本仓库不承担任何责任。使用本仓库的内容即表示您同意本免责声明的所有条款和条件。 -## 声明 +# Python Spider -* 软件均仅用于学习交流,请勿用于任何商业用途! +原创文章每周最少两篇,**后续最新文章**会在[【公众号】](https://cuijiahua.com/wp-content/uploads/2020/05/gzh-w.jpg)首发,视频[【B站】](https://space.bilibili.com/331507846)首发,大家可以加我[【微信】](https://cuijiahua.com/wp-content/uploads/2020/05/gzh-w.jpg)进**交流群**,技术交流或提意见都可以,欢迎**Star**! -## 介绍 - -* biqukan.py:《笔趣看》盗版小说网站,爬取小说工具 +

+ 微信群 + 公众号 + B站 + 知乎 + CSDN + 头条 + 掘金 +

- 第三方依赖库安装: +## 声明 - pip3 install beautifulsoup4 +* 代码、教程**仅限于学习交流,请勿用于任何商业用途!** - 使用方法: +## 目录 - python biqukan.py +* [爬虫小工具](#爬虫小工具) + * [文件下载小助手](https://github.com/Jack-Cherish/python-spider/blob/master/downloader.py "悬停显示") +* [爬虫实战](#爬虫实战) + * [笔趣看小说下载](https://github.com/Jack-Cherish/python-spider/blob/master/biqukan.py "悬停显示") + * [百度文库免费文章下载助手_rev1](https://github.com/Jack-Cherish/python-spider/blob/master/baiduwenku.py "悬停显示") + * [百度文库免费文章下载助手_rev2](https://github.com/Jack-Cherish/python-spider/blob/master/baiduwenku_pro_1.py "悬停显示") + * [《帅啊》网帅哥图片下载](https://github.com/Jack-Cherish/python-spider/blob/master/shuaia.py "悬停显示") + * [构建代理IP池](https://github.com/Jack-Cherish/python-spider/blob/master/daili.py "悬停显示") + * [《火影忍者》漫画下载](https://github.com/Jack-Cherish/python-spider/tree/master/cartoon "悬停显示") + * [财务报表下载小助手](https://github.com/Jack-Cherish/python-spider/blob/master/financical.py "悬停显示") + * [一小时入门网络爬虫](https://github.com/Jack-Cherish/python-spider/tree/master/one_hour_spider "悬停显示") + * [抖音App视频下载](https://github.com/Jack-Cherish/python-spider/tree/master/douyin "悬停显示") + * [GEETEST验证码识别](https://github.com/Jack-Cherish/python-spider/blob/master/geetest.py "悬停显示") + * [12306抢票小助手](https://github.com/Jack-Cherish/python-spider/blob/master/12306.py "悬停显示") + * [百万英雄答题辅助系统](https://github.com/Jack-Cherish/python-spider/tree/master/baiwan "悬停显示") + * [网易云音乐免费音乐批量下载](https://github.com/Jack-Cherish/python-spider/tree/master/Netease "悬停显示") + * [B站免费视频和弹幕批量下载](https://github.com/Jack-Cherish/python-spider/tree/master/bilibili "悬停显示") + * [京东商品晒单图下载](https://github.com/Jack-Cherish/python-spider/tree/master/dingdong "悬停显示") + * [正方教务管理系统个人信息查询](https://github.com/Jack-Cherish/python-spider/tree/master/zhengfang_system_spider "悬停显示") +* [其它](#其它) -* video_downloader: 爱奇艺等主流视频网站的VIP视频破解助手(暂只支持PC和手机在线观看VIP视频!) +## 爬虫小工具 - 感谢Python3二维码生成器作者:https://github.com/sylnsfar/qrcode - - 编译好的软件下载连接:http://pan.baidu.com/s/1eR4Y7aM 解压密码:`c406495762` - - 无需Python3环境,在Windows下,解压即用![软件使用方法](http://blog.csdn.net/c406495762/article/details/71334633 "悬停显示") +* downloader.py:文件下载小助手 + + 一个可以用于下载图片、视频、文件的小工具,有下载进度显示功能。稍加修改即可添加到自己的爬虫中。 - 源码可查看`video_downloader`,运行源码需要搭建Python3环境,并安装相应第三方依赖库: + 动态示意图: - 在`video_downloader`文件夹下,安装第三方依赖库: + ![image](https://raw.githubusercontent.com/Jack-Cherish/Pictures/master/9.gif) + +## 爬虫实战 + + * biqukan.py:《笔趣看》盗版小说网站,爬取小说工具 + + 第三方依赖库安装: - pip3 install -r requirements.txt + pip3 install beautifulsoup4 使用方法: - - python movie_downloader.py - 运行环境: - - Windows, Python3 - - Linux, Python3 - - Mac, Python3 + python biqukan.py -* baiduwenku.py: 百度文库word文章爬取 + * baiduwenku.py: 百度文库word文章爬取 原理说明:http://blog.csdn.net/c406495762/article/details/72331737 - 代码不完善,没有进行打包,不具通用性,纯属娱乐,以后有时间会完善。 + 代码不完善,没有进行打包,不具通用性,纯属娱乐。 -* shuaia.py: 爬取《帅啊》网,帅哥图片 + * shuaia.py: 爬取《帅啊》网,帅哥图片 《帅啊》网URL:http://www.shuaia.net/index.html @@ -64,12 +85,12 @@ pip3 install requests beautifulsoup4 -* daili.py: 构建代理IP池 + * daili.py: 构建代理IP池 原理说明:http://blog.csdn.net/c406495762/article/details/72793480 -* carton: 使用Scrapy爬取《火影忍者》漫画 + * carton: 使用Scrapy爬取《火影忍者》漫画 代码可以爬取整个《火影忍者》漫画所有章节的内容,保存到本地。更改地址,可以爬取其他漫画。保存地址可以在settings.py中修改。 @@ -77,10 +98,165 @@ 原理说明:http://blog.csdn.net/c406495762/article/details/72858983 + * hero.py: 《王者荣耀》推荐出装查询小助手 + + 网页爬取已经会了,想过爬取手机APP里的内容吗? 
+ + 原理说明:http://blog.csdn.net/c406495762/article/details/76850843 + + * financical.py: 财务报表下载小助手 + + 爬取的数据存入数据库会吗?《跟股神巴菲特学习炒股之财务报表入库(MySQL)》也许能给你一些思路。 + + 原理说明:http://blog.csdn.net/c406495762/article/details/77801899 + 动态示意图: + ![image](https://raw.githubusercontent.com/Jack-Cherish/Pictures/master/10.gif) + * one_hour_spider:一小时入门Python3网络爬虫。 + + 原理说明: + + * 知乎:https://zhuanlan.zhihu.com/p/29809609 + * CSDN:http://blog.csdn.net/c406495762/article/details/78123502 + + 本次实战内容有: + + * 网络小说下载(静态网站)-biqukan + * 优美壁纸下载(动态网站)-unsplash + * 视频下载 + + * douyin.py:抖音App视频下载 + + 抖音App的视频下载,就是普通的App爬取。 + 原理说明: + * 个人网站:http://cuijiahua.com/blog/2018/03/spider-5.html + + * douyin_pro:抖音App视频下载(升级版) + + 抖音App的视频下载,添加视频解析网站,支持无水印视频下载,使用第三方平台解析。 + + 原理说明: + + * 个人网站:http://cuijiahua.com/blog/2018/03/spider-5.html + + * douyin:抖音App视频下载(升级版2) + + 抖音App的视频下载,添加视频解析网站,支持无水印视频下载,通过url解析,无需第三方平台。 + + 原理说明: + + * 个人网站:http://cuijiahua.com/blog/2018/03/spider-5.html + + 动态示意图: + + ![image](https://github.com/Jack-Cherish/Pictures/blob/master/14.gif) + + * geetest.py:GEETEST验证码识别 + + 原理说明: + + 无 + + * 12306.py:用Python抢火车票简单代码 + + 可以自己慢慢丰富,蛮简单,有爬虫基础很好操作,没有原理说明。 + + * baiwan:百万英雄辅助答题 + + 效果图: + + ![image](https://github.com/Jack-Cherish/Pictures/blob/master/11.gif) + + 原理说明: + + * 个人网站:http://cuijiahua.com/blog/2018/01/spider_3.html + + 功能介绍: + + 服务器端,使用Python(baiwan.py)通过抓包获得的接口获取答题数据,解析之后通过百度知道搜索接口匹配答案,将最终匹配的结果写入文件(file.txt)。 + + 手机抓包不会的朋友,可以看下我的早期[手机APP抓包教程](http://blog.csdn.net/c406495762/article/details/76850843 "悬停显示")。 + + Node.js(app.js)每隔1s读取一次file.txt文件,并将读取结果通过socket.io推送给客户端(index.html)。 + + 亲测答题延时在3s左右。 + + 声明:没做过后端和前端,花了一天时间,现学现卖弄好的,javascript也是现看现用,百度的程序,调试调试而已。可能有很多用法比较low的地方,用法不对,请勿见怪,有大牛感兴趣,可以自行完善。 + + * Netease:根据歌单下载网易云音乐 + + 效果图: + + ![image](https://github.com/Jack-Cherish/Pictures/blob/master/13.gif) + + 原理说明: + + 暂无 + + 功能介绍: + + 根据music_list.txt文件里的歌单的信息下载网易云音乐,将自己喜欢的音乐进行批量下载。 + + * bilibili:B站视频和弹幕批量下载 + + 原理说明: + + 暂无 + + 使用说明: + + python bilibili.py -d 猫 -k 猫 -p 10 + + 三个参数: + -d 保存视频的文件夹名 + -k B站搜索的关键字 + -p 下载搜索结果前多少页 + + * jingdong:京东商品晒单图下载 + + 效果图: + + ![image](https://github.com/Jack-Cherish/Pictures/blob/master/jd.gif) + + 原理说明: + + 暂无 + + 使用说明: + + python jd.py -k 芒果 + + 三个参数: + -d 保存图片的路径,默认为fd.py文件所在文件夹 + -k 搜索关键词 + -n 下载商品的晒单图个数,即n个商店的晒单图 + + * zhengfang_system_spider:对正方教务管理系统个人课表,个人学生成绩,绩点等简单爬取 + + 效果图: + + ![image](/zhengfang_system_spider/screenshot/zf.png) + + 原理说明: + + 暂无 + + 使用说明: + + cd zhengfang_system_spider + pip install -r requirements.txt + python spider.py + +## 其它 + + * 欢迎 Pull requests,感谢贡献。 + + 更多精彩,敬请期待! 
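+   附:上文「百万英雄答题辅助系统」条目描述的答案匹配思路,可以用下面的极简示意来理解(函数名与示例数据均为编辑虚构,仅演示统计逻辑;完整实现见 baiwan/baiwan.py):
+
+       def recommend(question, options, search_texts):
+           """统计各备选项在百度知道搜索结果文本中出现的次数;
+           题干含否定词时,反选出现次数最少的选项(与 baiwan.py 思路大致一致)。"""
+           counts = {opt: sum(opt in text for text in search_texts) for opt in options}
+           negations = ['没有', '不是', '不对', '不正确', '错误', '不包括', '不包含', '不在', '错']
+           if any(word in question for word in negations):
+               return min(counts, key=counts.get)
+           return max(counts, key=counts.get)
+
+       # 示例(虚构数据):题干含「不是」,故返回命中次数最少的「长城」
+       print(recommend('下列哪个不是编程语言?', ['Python', 'Java', '长城'],
+                       ['Python 是编程语言', 'Java 是编程语言']))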
+ +[画像:wechat] diff --git a/baiduwenku_pro_1.py b/baiduwenku_pro_1.py new file mode 100644 index 00000000..18a79459 --- /dev/null +++ b/baiduwenku_pro_1.py @@ -0,0 +1,101 @@ +import requests +import re +import json +import os + +session = requests.session() + + +def fetch_url(url): + return session.get(url).content.decode('gbk') + + +def get_doc_id(url): + return re.findall('view/(.*).html', url)[0] + + +def parse_type(content): + return re.findall(r"docType.*?\:.*?\'(.*?)\',円", content)[0] + + +def parse_title(content): + return re.findall(r"title.*?\:.*?\'(.*?)\',円", content)[0] + + +def parse_doc(content): + result = '' + url_list = re.findall('(https.*?0.json.*?)\\\\x22}', content) + url_list = [addr.replace("\\\\\\/", "/") for addr in url_list] + for url in url_list[:-5]: + content = fetch_url(url) + y = 0 + txtlists = re.findall('"c":"(.*?)".*?"y":(.*?),', content) + for item in txtlists: + if not y == item[1]: + y = item[1] + n = '\n' + else: + n = '' + result += n + result += item[0].encode('utf-8').decode('unicode_escape', 'ignore') + return result + + +def parse_txt(doc_id): + content_url = 'https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=' + doc_id + content = fetch_url(content_url) + md5 = re.findall('"md5sum":"(.*?)"', content)[0] + pn = re.findall('"totalPageNum":"(.*?)"', content)[0] + rsign = re.findall('"rsign":"(.*?)"', content)[0] + content_url = 'https://wkretype.bdimg.com/retype/text/' + doc_id + '?rn=' + pn + '&type=txt' + md5 + '&rsign=' + rsign + content = json.loads(fetch_url(content_url)) + result = '' + for item in content: + for i in item['parags']: + result += i['c'].replace('\\r', '\r').replace('\\n', '\n') + return result + + +def parse_other(doc_id): + content_url = "https://wenku.baidu.com/browse/getbcsurl?doc_id=" + doc_id + "&pn=1&rn=99999&type=ppt" + content = fetch_url(content_url) + url_list = re.findall('{"zoom":"(.*?)","page"', content) + url_list = [item.replace("\\", '') for item in url_list] + if not os.path.exists(doc_id): + os.mkdir(doc_id) + for index, url in enumerate(url_list): + content = session.get(url).content + path = os.path.join(doc_id, str(index) + '.jpg') + with open(path, 'wb') as f: + f.write(content) + print("图片保存在" + doc_id + "文件夹") + + +def save_file(filename, content): + with open(filename, 'w', encoding='utf8') as f: + f.write(content) + print('已保存为:' + filename) + + +# test_txt_url = 'https://wenku.baidu.com/view/cbb4af8b783e0912a3162a89.html?from=search' +# test_ppt_url = 'https://wenku.baidu.com/view/2b7046e3f78a6529657d5376.html?from=search' +# test_pdf_url = 'https://wenku.baidu.com/view/dd6e15c1227916888586d795.html?from=search' +# test_xls_url = 'https://wenku.baidu.com/view/eb4a5bb7312b3169a551a481.html?from=search' +def main(): + url = input('请输入要下载的文库URL地址') + content = fetch_url(url) + doc_id = get_doc_id(url) + type = parse_type(content) + title = parse_title(content) + if type == 'doc': + result = parse_doc(content) + save_file(title + '.txt', result) + elif type == 'txt': + result = parse_txt(doc_id) + save_file(title + '.txt', result) + else: + parse_other(doc_id) + + +if __name__ == "__main__": + main() diff --git a/baiwan/app.js b/baiwan/app.js new file mode 100644 index 00000000..9220fc16 --- /dev/null +++ b/baiwan/app.js @@ -0,0 +1,52 @@ +var http = require('http'); +var fs = require('fs'); +var schedule = require("node-schedule"); +var message = {}; +var count = 0; +var server = http.createServer(function (req,res){ + fs.readFile('./index.html',function(error,data){ + 
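// 读取 index.html 成功后,接下来两行将其以 text/html 响应返回给浏览器(编辑补充的说明注释)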
res.writeHead(200,{'Content-Type':'text/html'}); + res.end(data,'utf-8'); + }); +}).listen(80); +console.log('Server running!'); +var lineReader = require('line-reader'); +function messageGet(){ + lineReader.eachLine('file.txt', function(line, last) { + count++; + var name = 'line' + count; + console.log(name); + console.log(line); + message[name] = line; + }); + if(count == 25){ + count = 0; + } + else{ + for(var i = count+1; i <= 25; i++){ + var name = 'line' + i; + message[name] = 'f'; + } + count = 0; + } +} +var io = require('socket.io').listen(server); +var rule = new schedule.RecurrenceRule(); +var times = []; +for(var i=1; i<1800; i++){ + times.push(i); +} +rule.second = times; +schedule.scheduleJob(rule, function(){ + messageGet(); +}); +io.sockets.on('connection',function(socket){ + // console.log('User connected' + count + 'user(s) present'); + socket.emit('users',message); + socket.broadcast.emit('users',message); + + socket.on('disconnect',function(){ + console.log('User disconnected'); + //socket.broadcast.emit('users',message); + }); +}); diff --git a/baiwan/baiwan.py b/baiwan/baiwan.py new file mode 100644 index 00000000..50d0ec18 --- /dev/null +++ b/baiwan/baiwan.py @@ -0,0 +1,169 @@ +# -*-coding:utf-8 -*- +import requests +from lxml import etree +from bs4 import BeautifulSoup +import urllib +import time, re, types, os + + +""" +代码写的匆忙,本来想再重构下,完善好注释再发,但是比较忙,想想算了,所以自行完善吧!写法很不规范,勿见怪。 + +作者: Jack Cui +Website:http://cuijiahua.com +注: 本软件仅用于学习交流,请勿用于任何商业用途! +""" + +class BaiWan(): + def __init__(self): + # 百度知道搜索接口 + self.baidu = 'http://zhidao.baidu.com/search?' + # 百万英雄及接口,每个人的接口都不一样,里面包含的手机信息,因此不公布,请自行抓包,有疑问欢迎留言:http://cuijiahua.com/liuyan.html + self.api = 'https://api-spe-ttl.ixigua.com/xxxxxxx={}'.format(int(time.time()*1000)) + + # 获取答案并解析问题 + def get_question(self): + to = True + while to: + list_dir = os.listdir('./') + if 'question.txt' not in list_dir: + fw = open('question.txt', 'w') + fw.write('百万英雄尚未出题请稍后!') + fw.close() + go = True + while go: + req = requests.get(self.api, verify=False) + req.encoding = 'utf-8' + html = req.text + + print(html) + if '*' in html: + question_start = html.index('*') + try: + + question_end = html.index('?') + except: + question_end = html.index('?') + question = html[question_start:question_end][2:] + if question != None: + fr = open('question.txt', 'r') + text = fr.readline() + fr.close() + if text != question: + print(question) + go = False + with open('question.txt', 'w') as f: + f.write(question) + else: + time.sleep(1) + else: + to = False + else: + to = False + + temp = re.findall(r'[\u4e00-\u9fa5a-zA-Z0-9\+\-\*/]', html[question_end+1:]) + b_index = [] + print(temp) + + for index, each in enumerate(temp): + if each == 'B': + b_index.append(index) + elif each == 'P' and (len(temp) - index) <= 3 : + b_index.append(index) + break + + if len(b_index) == 4: + a = ''.join(temp[b_index[0] + 1:b_index[1]]) + b = ''.join(temp[b_index[1] + 1:b_index[2]]) + c = ''.join(temp[b_index[2] + 1:b_index[3]]) + alternative_answers = [a,b,c] + + if '下列' in question: + question = a + ' ' + b + ' ' + c + ' ' + question.replace('下列', '') + elif '以下' in question: + question = a + ' ' + b + ' ' + c + ' ' + question.replace('以下', '') + else: + alternative_answers = [] + # 根据问题和备选答案搜索答案 + self.search(question, alternative_answers) + time.sleep(1) + + def search(self, question, alternative_answers): + print(question) + print(alternative_answers) + infos = {"word":question} + # 调用百度接口 + url = self.baidu + 'lm=0&rn=10&pn=0&fr=search&ie=gbk&' + 
urllib.parse.urlencode(infos, encoding='GB2312') + print(url) + headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36', + } + sess = requests.Session() + req = sess.get(url = url, headers=headers, verify=False) + req.encoding = 'gbk' + # print(req.text) + bf = BeautifulSoup(req.text, 'lxml') + answers = bf.find_all('dd',class_='dd answer') + for answer in answers: + print(answer.text) + + # 推荐答案 + recommend = '' + if alternative_answers != []: + best = [] + print('\n') + for answer in answers: + # print(answer.text) + for each_answer in alternative_answers: + if each_answer in answer.text: + best.append(each_answer) + print(each_answer,end=' ') + # print(answer.text) + print('\n') + break + statistics = {} + for each in best: + if each not in statistics.keys(): + statistics[each] = 1 + else: + statistics[each] += 1 + errors = ['没有', '不是', '不对', '不正确','错误','不包括','不包含','不在','错'] + error_list = list(map(lambda x: x in question, errors)) + print(error_list) + if sum(error_list)>= 1: + for each_answer in alternative_answers: + if each_answer not in statistics.items(): + recommend = each_answer + print('推荐答案:', recommend) + break + elif statistics != {}: + recommend = sorted(statistics.items(), key=lambda e:e[1], reverse=True)[0][0] + print('推荐答案:', recommend) + + # 写入文件 + with open('file.txt', 'w') as f: + f.write('问题:' + question) + f.write('\n') + f.write('*' * 50) + f.write('\n') + if alternative_answers != []: + f.write('选项:') + for i in range(len(alternative_answers)): + f.write(alternative_answers[i]) + f.write(' ') + f.write('\n') + f.write('*' * 50) + f.write('\n') + f.write('参考答案:\n') + for answer in answers: + f.write(answer.text) + f.write('\n') + f.write('*' * 50) + f.write('\n') + if recommend != '': + f.write('最终答案请自行斟酌!\t') + f.write('推荐答案:' + sorted(statistics.items(), key=lambda e:e[1], reverse=True)[0][0]) + + +if __name__ == '__main__': + bw = BaiWan() + bw.get_question() \ No newline at end of file diff --git a/baiwan/file.txt b/baiwan/file.txt new file mode 100644 index 00000000..2f1d34a3 --- /dev/null +++ b/baiwan/file.txt @@ -0,0 +1,21 @@ +���⣺�������Ǽ��1⁄4��� +************************************************** +ѡ�7��23�� 8��1�� 10��1�� +************************************************** +�ο��𰸣� + +�Ƽ����� +��������8��1�� ÿ���İ���һ�����й��������ž����������գ�����Ҳ�С���һ�������ڡ� August 1, anniversary of the founding of the Chinese People's Liberation Army��֪�������Ⱦ��������뵽���http://baike.baidu.com/view/23211.htm +[��ϸ] + +�������ã��й��������ž��Ľ�������ÿ���İ���һ�գ����ư�һ�����ڣ������İ�һ�պ����������� +�𣺽�������8��1�գ���������7��1�ա� ÿ����8��1�����й��������ž����������գ��׳ơ���һ�������ڡ�1927��8��1�գ��й����������챱�����������x����ܶ��� ������ ��Ҷͦ ������ ����е��쵼�£��ڽ����ς�������װ���壬���췴�Թ��񵳷�����... +������7��30�� +�𣺰���һ���ǽ����ڣ�ǰһ�첻��7��31��ô +����1927��8��1������һ���ς�����,�������й���������װ�������񵳷����ɵĵ�һǹ,��־���й��������й������������쵼��װ��������ɦ��,��־���й����͵��������ӵĵ�����ÿ���İ���һ�����й��������ž����������� +��������Դ���й�����������������ɦ���й����������쵼���ς����塣1927��8��1�յ��ς����壬�������й���������װ�������񵳷����ɵĵ�һǹ����־���й��������й������������쵼��װ��������ɦ�ڣ���־���й����͵��������ӵĵ����� 1933��7�£�... +����Ԫ��1��1��������8��1��������10��1�� +�𣺰���һ�Ž����� +���������� 2015��8��1�� ũ�� ����ʮ�� ������ 2016��8��1�� ũ�� ����إ�� ������ ÿ���İ���һ�����й��������ž����������գ�����Ҳ�С���һ�������ڡ�1933��7��11�գ��л���ά�����͹���ɦ������������������������ίԱ��6��30�յĽ��飬����8��1... 
+************************************************** +���մ������������ã� �Ƽ��𰸣�8��1�� \ No newline at end of file diff --git a/baiwan/index.html b/baiwan/index.html new file mode 100644 index 00000000..4625d193 --- /dev/null +++ b/baiwan/index.html @@ -0,0 +1,219 @@ + + + + + + Jack Cui答题辅助系统 + +
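<!-- 编辑注:本页为答题辅助系统的前端展示页,约定接收 app.js 经 socket.io 推送的 'users' 消息(line1~line25,内容来自 file.txt),并填入下方表格 -->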
+

百万英雄答题辅助系统

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +


+ diff --git a/baiwan/question.txt b/baiwan/question.txt new file mode 100644 index 00000000..46b7f930 --- /dev/null +++ b/baiwan/question.txt @@ -0,0 +1 @@ +�������Ǽ��1⁄4��� \ No newline at end of file diff --git a/bilibili/README.md b/bilibili/README.md new file mode 100644 index 00000000..787470fd --- /dev/null +++ b/bilibili/README.md @@ -0,0 +1,24 @@ +## 功能 + +下载B站视频和弹幕,将xml原生弹幕转换为ass弹幕文件,支持plotplayer等播放器的弹幕播放。 + +## 作者 + +* Website: [http://cuijiahua.com](http://cuijiahua.com "悬停显示") +* Author: Jack Cui +* Date: 2018年6月12日 + +## 更新 + +* 2018年09月12日:添加FFmpeg分段视频合并 + +## 使用说明 + +FFmpeg下载,并配置环境变量。http://ffmpeg.org/ + + python bilibili.py -d 猫 -k 猫 -p 10 + + 三个参数: + -d 保存视频的文件夹名 + -k B站搜索的关键字 + -p 下载搜索结果前多少页 diff --git a/bilibili/bilibili.py b/bilibili/bilibili.py new file mode 100644 index 00000000..469766e5 --- /dev/null +++ b/bilibili/bilibili.py @@ -0,0 +1,211 @@ +# -*-coding:utf-8 -*- +# Website: http://cuijiahua.com +# Author: Jack Cui +# Date: 2018年6月9日 + +import requests, json, re, sys, os, urllib, argparse, time +from urllib.request import urlretrieve +from contextlib import closing +from urllib import parse +import xml2ass + +class BiliBili: + def __init__(self, dirname, keyword): + self.dn_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36', + 'Accept': '*/*', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'zh-CN,zh;q=0.9', + 'Referer': 'https://search.bilibili.com/all?keyword=%s' % parse.quote(keyword)} + + self.search_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36', + 'Accept-Language': 'zh-CN,zh;q=0.9', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept': 'application/json, text/plain, */*'} + + self.video_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36', + 'Accept-Language': 'zh-CN,zh;q=0.9', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'} + + self.danmu_header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36', + 'Accept': '*/*', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'zh-CN,zh;q=0.9'} + + self.sess = requests.Session() + + self.dir = dirname + + def video_downloader(self, video_url, video_name): + """ + 视频下载 + Parameters: + video_url: 带水印的视频地址 + video_name: 视频名 + Returns: + 无 + """ + size = 0 + with closing(self.sess.get(video_url, headers=self.dn_headers, stream=True, verify=False)) as response: + chunk_size = 1024 + content_size = int(response.headers['content-length']) + if response.status_code == 200: + sys.stdout.write(' [文件大小]:%0.2f MB\n' % (content_size / chunk_size / 1024)) + video_name = os.path.join(self.dir, video_name) + with open(video_name, 'wb') as file: + for data in response.iter_content(chunk_size = chunk_size): + file.write(data) + size += len(data) + file.flush() + + sys.stdout.write(' [下载进度]:%.2f%%' % float(size / content_size * 100) + '\r') + # sys.stdout.flush() + if size / content_size == 1: + print('\n') + else: + print('链接异常') + + def search_video(self, search_url): + """ + 搜索接口 + Parameters: + search_url: 带水印的视频地址 + Returns: + titles:视频名列表 + arcurls: 视频播放地址列表 + """ + req = self.sess.get(url=search_url, headers=self.search_headers, verify=False) + html = json.loads(req.text) 
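# 搜索接口返回 JSON:视频列表位于 data.result 字段,下面逐条取出标题(title)与播放地址(arcurl)(编辑补充注释)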
+ videos = html["data"]['result'] + titles = [] + arcurls = [] + for video in videos: + titles.append(video['title'].replace('','').replace('','')) + arcurls.append(video['arcurl']) + return titles, arcurls + + def get_download_url(self, arcurl): + """ + 获取视频下载地址 + Parameters: + arcurl: 视频播放地址 + oid:弹幕地址参数 + Returns: + download_url:视频下载地址 + """ + req = self.sess.get(url=arcurl, headers=self.video_headers, verify=False) + pattern = '.__playinfo__=(.*)") + tac = _tac_re.search(share_user.text).group(1) + _dytk_re = re.compile(r"dytk\s*:\s*'(.+)'") + dytk = _dytk_re.search(share_user.text).group(1) + _nickname_re = re.compile(r'

(.+?)<\/p>') + nickname = _nickname_re.search(share_user.text).group(1) + data = { + 'tac': tac.split('|')[0], + 'user_id': user_id, + } + req = requests.post(sign_api, data=data) + while req.status_code != 200: + req = requests.post(sign_api, data=data) + sign = req.json().get('signature') + user_url_prefix = 'https://www.iesdouyin.com/web/api/v2/aweme/like' if type_flag == 'f' else 'https://www.iesdouyin.com/web/api/v2/aweme/post' + print('解析视频链接中') + while has_more != 0: + user_url = user_url_prefix + '/?user_id=%s&sec_uid=&count=21&max_cursor=%s&aid=1128&_signature=%s&dytk=%s' % (user_id, max_cursor, sign, dytk) + req = requests.get(user_url, headers=self.headers) + while req.status_code != 200: + req = requests.get(user_url, headers=self.headers) + html = json.loads(req.text) + for each in html['aweme_list']: + try: + url = 'https://aweme.snssdk.com/aweme/v1/play/?video_id=%s&line=0&ratio=720p&media_type=4&vr_type=0&improve_bitrate=0&is_play_url=1&is_support_h265=0&source=PackSourceEnum_PUBLISH' + vid = each['video']['vid'] + video_url = url % vid + except: + continue + share_desc = each['desc'] + if os.name == 'nt': + for c in r'\/:*?"|': + nickname = nickname.replace(c, '').strip().strip('\.') + share_desc = share_desc.replace(c, '').strip() + share_id = each['aweme_id'] + if share_desc in ['抖音-原创音乐短视频社区', 'TikTok', '']: + video_names.append(share_id + '.mp4') + else: + video_names.append(share_id + '-' + share_desc + '.mp4') + share_url = 'https://www.iesdouyin.com/share/video/%s' % share_id + share_urls.append(share_url) + video_urls.append(video_url) + max_cursor = html['max_cursor'] + has_more = html['has_more'] + + return video_names, video_urls, share_urls, nickname + + def get_download_url(self, video_url, watermark_flag): + """ + 获得带水印的视频播放地址 + Parameters: + video_url:带水印的视频播放地址 + Returns: + download_url: 带水印的视频下载地址 + """ + # 带水印视频 + if watermark_flag == True: + download_url = video_url.replace('/play/', '/playwm/') + # 无水印视频 + else: + download_url = video_url.replace('/playwm/', '/play/') + + return download_url + + def video_downloader(self, video_url, video_name, watermark_flag=False): + """ + 视频下载 + Parameters: + video_url: 带水印的视频地址 + video_name: 视频名 + watermark_flag: 是否下载带水印的视频 + Returns: + 无 + """ + size = 0 + video_url = self.get_download_url(video_url, watermark_flag=watermark_flag) + with closing(requests.get(video_url, headers=self.headers1, stream=True)) as response: + chunk_size = 1024 + content_size = int(response.headers['content-length']) + if response.status_code == 200: + sys.stdout.write(' [文件大小]:%0.2f MB\n' % (content_size / chunk_size / 1024)) + + with open(video_name, 'wb') as file: + for data in response.iter_content(chunk_size = chunk_size): + file.write(data) + size += len(data) + file.flush() + + sys.stdout.write(' [下载进度]:%.2f%%' % float(size / content_size * 100) + '\r') + sys.stdout.flush() + + def run(self): + """ + 运行函数 + Parameters: + None + Returns: + None + """ + self.hello() + print('UID取得方式:\n分享用户页面,用浏览器打开短链接,原始链接中/share/user/后的数字即是UID') + user_id = input('请输入UID (例如60388937600):') + user_id = user_id if user_id else '60388937600' + watermark_flag = input('是否下载带水印的视频 (0-否(默认), 1-是):') + watermark_flag = watermark_flag if watermark_flag!='' else '0' + watermark_flag = bool(int(watermark_flag)) + type_flag = input('f-收藏的(默认), p-上传的:') + type_flag = type_flag if type_flag!='' else 'f' + save_dir = input('保存路径 (例如"E:/Download/", 默认"./Download/"):') + save_dir = save_dir if save_dir else "./Download/" + video_names, video_urls, share_urls, 
nickname = self.get_video_urls(user_id, type_flag) + nickname_dir = os.path.join(save_dir, nickname) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + if nickname not in os.listdir(save_dir): + os.mkdir(nickname_dir) + if type_flag == 'f': + if 'favorite' not in os.listdir(nickname_dir): + os.mkdir(os.path.join(nickname_dir, 'favorite')) + print('视频下载中:共有%d个作品!\n' % len(video_urls)) + for num in range(len(video_urls)): + print(' 解析第%d个视频链接 [%s] 中,请稍后!\n' % (num + 1, share_urls[num])) + if '\\' in video_names[num]: + video_name = video_names[num].replace('\\', '') + elif '/' in video_names[num]: + video_name = video_names[num].replace('/', '') + else: + video_name = video_names[num] + video_path = os.path.join(nickname_dir, video_name) if type_flag!='f' else os.path.join(nickname_dir, 'favorite', video_name) + if os.path.isfile(video_path): + print('视频已存在') + else: + self.video_downloader(video_urls[num], video_path, watermark_flag) + print('\n') + print('下载完成!') + + def hello(self): + """ + 打印欢迎界面 + Parameters: + None + Returns: + None + """ + print('*' * 100) + print('\t\t\t\t抖音App视频下载小助手') + print('\t\t作者:Jack Cui、steven7851') + print('*' * 100) + + +if __name__ == '__main__': + douyin = DouYin() + douyin.run() diff --git a/douyin/fuck-byted-acrawler.js b/douyin/fuck-byted-acrawler.js new file mode 100644 index 00000000..66be77c8 --- /dev/null +++ b/douyin/fuck-byted-acrawler.js @@ -0,0 +1,194 @@ +// Referer:https://raw.githubusercontent.com/loadchange/amemv-crawler/master/fuck-byted-acrawler.js +function generateSignature(userId) { + this.navigator = { + userAgent: "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1" + } + var e = {} + + var r = (function () { + function e(e, a, r) { + return (b[e] || (b[e] = t("x,y", "return x " + e + " y")))(r, a) + } + + function a(e, a, r) { + return (k[r] || (k[r] = t("x,y", "return new x[y](" + Array(r + 1).join(",x[++y]").substr(1) + ")")))(e, a) + } + + function r(e, a, r) { + var n, t, s = {}, b = s.d = r ? r.d + 1 : 0; + for (s["$" + b] = s, t = 0; t < b; t++) s[n = "$" + t] = r[n]; + for (t = 0, b = s.length = a.length; t < b; t++) s[t] = a[t]; + return c(e, 0, s) + } + + function c(t, b, k) { + function u(e) { + v[x++] = e + } + + function f() { + return g = t.charCodeAt(b++) - 32, t.substring(b, b += g) + } + + function l() { + try { + y = c(t, b, k) + } catch (e) { + h = e, y = l + } + } + + for (var h, y, d, g, v = [], x = 0; ;) switch (g = t.charCodeAt(b++) - 32) { + case 1: + u(!v[--x]); + break; + case 4: + v[x++] = f(); + break; + case 5: + u(function (e) { + var a = 0, r = e.length; + return function () { + var c = a < r; + return c && u(e[a++]), c + } + }(v[--x])); + break; + case 6: + y = v[--x], u(v[--x](y)); + break; + case 8: + if (g = t.charCodeAt(b++) - 32, l(), b += g, g = t.charCodeAt(b++) - 32, y === c) b += g; else if (y !== l) return y; + break; + case 9: + v[x++] = c; + break; + case 10: + u(s(v[--x])); + break; + case 11: + y = v[--x], u(v[--x] + y); + break; + case 12: + for (y = f(), d = [], g = 0; g < y.length; g++) d[g] = y.charCodeAt(g) ^ g + y.length; + u(String.fromCharCode.apply(null, d)); + break; + case 13: + y = v[--x], h = delete v[--x][y]; + break; + case 14: + v[x++] = t.charCodeAt(b++) - 32; + break; + case 59: + u((g = t.charCodeAt(b++) - 32) ? 
(y = x, v.slice(x -= g, y)) : []); + break; + case 61: + u(v[--x][t.charCodeAt(b++) - 32]); + break; + case 62: + g = v[--x], k[0] = 65599 * k[0] + k[1].charCodeAt(g)>>> 0; + break; + case 65: + h = v[--x], y = v[--x], v[--x][y] = h; + break; + case 66: + u(e(t[b++], v[--x], v[--x])); + break; + case 67: + y = v[--x], d = v[--x], u((g = v[--x]).x === c ? r(g.y, y, k) : g.apply(d, y)); + break; + case 68: + u(e((g = t[b++]) < "<" ? (b--, f()) : g + g, v[--x], v[--x])); + break; + case 70: + u(!1); + break; + case 71: + v[x++] = n; + break; + case 72: + v[x++] = +f(); + break; + case 73: + u(parseInt(f(), 36)); + break; + case 75: + if (v[--x]) { + b++; + break + } + case 74: + g = t.charCodeAt(b++) - 32 << 16>> 16, b += g; + break; + case 76: + u(k[t.charCodeAt(b++) - 32]); + break; + case 77: + y = v[--x], u(v[--x][y]); + break; + case 78: + g = t.charCodeAt(b++) - 32, u(a(v, x -= g + 1, g)); + break; + case 79: + g = t.charCodeAt(b++) - 32, u(k["$" + g]); + break; + case 81: + h = v[--x], v[--x][f()] = h; + break; + case 82: + u(v[--x][f()]); + break; + case 83: + h = v[--x], k[t.charCodeAt(b++) - 32] = h; + break; + case 84: + v[x++] = !0; + break; + case 85: + v[x++] = void 0; + break; + case 86: + u(v[x - 1]); + break; + case 88: + h = v[--x], y = v[--x], v[x++] = h, v[x++] = y; + break; + case 89: + u(function () { + function e() { + return r(e.y, arguments, k) + } + + return e.y = f(), e.x = c, e + }()); + break; + case 90: + v[x++] = null; + break; + case 91: + v[x++] = h; + break; + case 93: + h = v[--x]; + break; + case 0: + return v[--x]; + default: + u((g << 16>> 16) - 16) + } + } + + var n = this, t = n.Function, s = Object.keys || function (e) { + var a = {}, r = 0; + for (var c in e) a[r++] = c; + return a.length = r, a + }, b = {}, k = {}; + return r + })() + ('gr$Daten Иb/s!l y͒yĹg,(lfi~ah`{mv,-n|jqewVxp{rvmmx,&effkx[!cs"l".Pq%widthl"@q&heightl"vr*getContextx$"2d[!cs#l#,*;?|u.|uc{uq$fontl#vr(fillTextx$$龘ฑภ경2<[#c}l#2q*shadowblurl#1q-shadowoffsetxl#$$limeq+shadowcolorl#vr#arcx88802[%c}l#vr&strokex[ c}l"v,)}eOmyoZB]mx[ cs!0s$l$Pb>>s!0s%yA0s"l"l!r&lengthb&l!l Bd>&+l!l &+l!l 6d>&+l!l &+ s,y=o!o!]/q"13o!l q"10o!],l 2d>& s.{s-yMo!o!]0q"13o!]*Ld>>b|s!o!l q"10o!],l!& s/yIo!o!].q"13o!],o!]*Jd>>b|&o!]+l &+ s0l-l!&l-l!i\'1z141z4b/@d= self.total: + end_str = '\n' + self.status = status or self.fin_status + print(self.__get_info(), end=end_str, ) + + +if __name__ == '__main__': + #url = 'http://www.demongan.com/source/game/二十四点.zip' + #filename = '二十四点.zip' + print('*' * 100) + print('\t\t\t\t欢迎使用文件下载小助手') + print('作者:Jack-Cui\n博客:http://blog.csdn.net/c406495762') + print('*' * 100) + url = input('请输入需要下载的文件链接:\n') + filename = url.split('/')[-1] + with closing(requests.get(url, stream=True)) as response: + chunk_size = 1024 + content_size = int(response.headers['content-length']) + if response.status_code == 200: + print('文件大小:%0.2f KB' % (content_size / chunk_size)) + progress = ProgressBar("%s下载进度" % filename + , total = content_size + , unit = "KB" + , chunk_size = chunk_size + , run_status = "正在下载" + , fin_status = "下载完成") + + with open(filename, "wb") as file: + for data in response.iter_content(chunk_size=chunk_size): + file.write(data) + progress.refresh(count=len(data)) + else: + print('链接异常') \ No newline at end of file diff --git a/financical.py b/financical.py new file mode 100644 index 00000000..172b80dc --- /dev/null +++ b/financical.py @@ -0,0 +1,193 @@ +#-*- coding:UTF-8 -*- +import sys +import pymysql +import requests +import json +import re +from bs4 import 
BeautifulSoup + +""" +类说明:获取财务数据 + +Author: + Jack Cui +Blog: + http://blog.csdn.net/c406495762 +Zhihu: + https://www.zhihu.com/people/Jack--Cui/ +Modify: + 2017年08月31日 +""" +class FinancialData(): + + def __init__(self): + #服务器域名 + self.server = 'http://quotes.money.163.com/' + self.cwnb = 'http://quotes.money.163.com/hkstock/cwsj_' + #主要财务指标 + self.cwzb_dict = {'EPS':'基本每股收益','EPS_DILUTED':'摊薄每股收益','GROSS_MARGIN':'毛利率', + 'CAPITAL_ADEQUACY':'资本充足率','LOANS_DEPOSITS':'贷款回报率','ROTA':'总资产收益率', + 'ROEQUITY':'净资产收益率','CURRENT_RATIO':'流动比率','QUICK_RATIO':'速动比率', + 'ROLOANS':'存贷比','INVENTORY_TURNOVER':'存货周转率','GENERAL_ADMIN_RATIO':'管理费用比率', + 'TOTAL_ASSET2TURNOVER':'资产周转率','FINCOSTS_GROSSPROFIT':'财务费用比率','TURNOVER_CASH':'销售现金比率','YEAREND_DATE':'报表日期'} + #利润表 + self.lrb_dict = {'TURNOVER':'总营收','OPER_PROFIT':'经营利润','PBT':'除税前利润', + 'NET_PROF':'净利润','EPS':'每股基本盈利','DPS':'每股派息', + 'INCOME_INTEREST':'利息收益','INCOME_NETTRADING':'交易收益','INCOME_NETFEE':'费用收益','YEAREND_DATE':'报表日期'} + #资产负债表 + self.fzb_dict = { + 'FIX_ASS':'固定资产','CURR_ASS':'流动资产','CURR_LIAB':'流动负债', + 'INVENTORY':'存款','CASH':'现金及银行存结','OTHER_ASS':'其他资产', + 'TOTAL_ASS':'总资产','TOTAL_LIAB':'总负债','EQUITY':'股东权益', + 'CASH_SHORTTERMFUND':'库存现金及短期资金','DEPOSITS_FROM_CUSTOMER':'客户存款', + 'FINANCIALASSET_SALE':'可供出售之证券','LOAN_TO_BANK':'银行同业存款及贷款', + 'DERIVATIVES_LIABILITIES':'金融负债','DERIVATIVES_ASSET':'金融资产','YEAREND_DATE':'报表日期'} + #现金流表 + self.llb_dict = { + 'CF_NCF_OPERACT':'经营活动产生的现金流','CF_INT_REC':'已收利息','CF_INT_PAID':'已付利息', + 'CF_INT_REC':'已收股息','CF_DIV_PAID':'已派股息','CF_INV':'投资活动产生现金流', + 'CF_FIN_ACT':'融资活动产生现金流','CF_BEG':'期初现金及现金等价物','CF_CHANGE_CSH':'现金及现金等价物净增加额', + 'CF_END':'期末现金及现金等价物','CF_EXCH':'汇率变动影响','YEAREND_DATE':'报表日期'} + #总表 + self.table_dict = {'cwzb':self.cwzb_dict,'lrb':self.lrb_dict,'fzb':self.fzb_dict,'llb':self.llb_dict} + #请求头 + self.headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', + 'Accept-Encoding': 'gzip, deflate', + 'Accept-Language': 'zh-CN,zh;q=0.8', + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36'} + + """ + 函数说明:获取股票页面信息 + + Author: + Jack Cui + Parameters: + url - 股票财务数据界面地址 + Returns: + name - 股票名 + table_name_list - 财务报表名称 + table_date_list - 财务报表年限 + url_list - 财务报表查询连接 + Blog: + http://blog.csdn.net/c406495762 + Zhihu: + https://www.zhihu.com/people/Jack--Cui/ + Modify: + 2017年08月31日 + """ + def get_informations(self, url): + req = requests.get(url = url, headers = self.headers) + req.encoding = 'utf-8' + html = req.text + page_bf = BeautifulSoup(html, 'lxml') + #股票名称,股票代码 + name = page_bf.find_all('span', class_ = 'name')[0].string + # code = page_bf.find_all('span', class_ = 'code')[0].string + # code = re.findall('\d+',code)[0] + + #存储各个表名的列表 + table_name_list = [] + table_date_list = [] + each_date_list = [] + url_list = [] + #表名和表时间 + table_name = page_bf.find_all('div', class_ = 'titlebar3') + for each_table_name in table_name: + #表名 + table_name_list.append(each_table_name.span.string) + #表时间 + for each_table_date in each_table_name.div.find_all('select', id = re.compile('.+1$')): + url_list.append(re.findall('(\w+)1',each_table_date.get('id'))[0]) + for each_date in each_table_date.find_all('option'): + each_date_list.append(each_date.string) + table_date_list.append(each_date_list) + each_date_list = [] + return name,table_name_list,table_date_list,url_list + + """ + 函数说明:财务报表入库 + + Author: + Jack Cui + Parameters: + name - 股票名 + table_name_list - 财务报表名称 + 
table_date_list - 财务报表年限 + url_list - 财务报表查询连接 + Returns: + 无 + Blog: + http://blog.csdn.net/c406495762 + Zhihu: + https://www.zhihu.com/people/Jack--Cui/ + Modify: + 2017年08月31日 + """ + def insert_tables(self, name, table_name_list,table_date_list, url_list): + #打开数据库连接:host-连接主机地址,port-端口号,user-用户名,passwd-用户密码,db-数据库名,charset-编码 + conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='yourpasswd',db='financialdata',charset='utf8') + #使用cursor()方法获取操作游标 + cursor = conn.cursor() + #插入信息 + for i in range(len(table_name_list)): + sys.stdout.write(' [正在下载 ] %s' % table_name_list[i] + '\r') + #获取数据地址 + url = self.server + 'hk/service/cwsj_service.php?symbol={}&start={}&end={}&type={}&unit=yuan'.format(code,table_date_list[i][-1],table_date_list[i][0],url_list[i]) + req_table = requests.get(url = url, headers = self.headers) + table = req_table.json() + nums = len(table) + value_dict = {} + for num in range(nums): + sys.stdout.write(' [正在下载 %.2f%%] ' % (((num+1) / nums)*100) + '\r') + sys.stdout.flush() + value_dict['股票名'] = name + value_dict['股票代码'] = code + for key, value in table[i].items(): + if key in self.table_dict[url_list[i]]: + value_dict[self.table_dict[url_list[i]][key]] = value + + sql1 = """ + INSERT INTO %s (`股票名`,`股票代码`,`报表日期`) VALUES ('%s','%s','%s')""" % (url_list[i],value_dict['股票名'],value_dict['股票代码'],value_dict['报表日期']) + try: + cursor.execute(sql1) + # 执行sql语句 + conn.commit() + except: + # 发生错误时回滚 + conn.rollback() + + for key, value in value_dict.items(): + if key not in ['股票名','股票代码','报表日期']: + sql2 = """ + UPDATE %s SET %s='%s' WHERE `股票名`='%s' AND `报表日期`='%s'""" % (url_list[i],key,value,value_dict['股票名'],value_dict['报表日期']) + try: + cursor.execute(sql2) + # 执行sql语句 + conn.commit() + except: + # 发生错误时回滚 + conn.rollback() + value_dict = {} + print(' [下载完成 ') + + # 关闭数据库连接 + cursor.close() + conn.close() + +if __name__ == '__main__': + print('*' * 100) + print('\t\t\t\t\t财务数据下载助手\n') + print('作者:Jack-Cui\n') + print('About Me:\n') + print(' 知乎:https://www.zhihu.com/people/Jack--Cui') + print(' Blog:http://blog.csdn.net/c406495762') + print(' Gihub:https://github.com/Jack-Cherish\n') + print('*' * 100) + fd = FinancialData() + #上市股票地址 + code = input('请输入股票代码:') + + name,table_name_list,table_date_list,url_list = fd.get_informations(fd.cwnb + code + '.html') + print('\n %s:(%s)财务数据下载中!\n' % (name,code)) + fd.insert_tables(name,table_name_list,table_date_list,url_list) + print('\n %s:(%s)财务数据下载完成!' 
% (name,code)) \ No newline at end of file diff --git a/geetest.py b/geetest.py new file mode 100644 index 00000000..e78fc867 --- /dev/null +++ b/geetest.py @@ -0,0 +1,328 @@ +# -*-coding:utf-8 -*- +import random +import re +import time +# 图片转换 +import base64 +from urllib.request import urlretrieve + +from bs4 import BeautifulSoup + +import PIL.Image as image +from selenium import webdriver +from selenium.webdriver import ActionChains +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait + +def save_base64img(data_str, save_name): + """ + 将 base64 数据转化为图片保存到指定位置 + :param data_str: base64 数据,不包含类型 + :param save_name: 保存的全路径 + """ + img_data = base64.b64decode(data_str) + file = open(save_name, 'wb') + file.write(img_data) + file.close() + + +def get_base64_by_canvas(driver, class_name, contain_type): + """ + 将 canvas 标签内容转换为 base64 数据 + :param driver: webdriver 对象 + :param class_name: canvas 标签的类名 + :param contain_type: 返回的数据是否包含类型 + :return: base64 数据 + """ + # 防止图片未加载完就下载一张空图 + bg_img = '' + while len(bg_img) < 5000: + getImgJS = 'return document.getElementsByClassName("' + class_name + '")[0].toDataURL("image/png");' + bg_img = driver.execute_script(getImgJS) + time.sleep(0.5) + # print(bg_img) + if contain_type: + return bg_img + else: + return bg_img[bg_img.find(',') + 1:] + + +def save_bg(driver, bg_path="bg.png", bg_): + """ + 保存包含缺口的背景图 + :param driver: webdriver 对象 + :param bg_path: 保存路径 + :param bg_class: 背景图的 class 属性 + :return: 保存路径 + """ + bg_img_data = get_base64_by_canvas(driver, bg_class, False) + save_base64img(bg_img_data, bg_path) + return bg_path + + +def save_full_bg(driver, full_bg_path="fbg.png", full_bg_): + """ + 保存完整的的背景图 + :param driver: webdriver 对象 + :param full_bg_path: 保存路径 + :param full_bg_class: 完整背景图的 class 属性 + :return: 保存路径 + """ + bg_img_data = get_base64_by_canvas(driver, full_bg_class, False) + save_base64img(bg_img_data, full_bg_path) + return full_bg_path + +class Crack(): + def __init__(self,keyword): + self.url = '*' + self.browser = webdriver.Chrome('D:\\chromedriver.exe') + self.wait = WebDriverWait(self.browser, 100) + self.keyword = keyword + self.BORDER = 6 + + def open(self): + """ + 打开浏览器,并输入查询内容 + """ + self.browser.get(self.url) + keyword = self.wait.until(EC.presence_of_element_located((By.ID, 'keyword_qycx'))) + bowton = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'btn'))) + keyword.send_keys(self.keyword) + bowton.click() + + def get_images(self, bg_filename = 'bg.jpg', fullbg_filename = 'fullbg.jpg'): + """ + 获取验证码图片 + :return: 图片的location信息 + """ + bg = [] + fullgb = [] + while bg == [] and fullgb == []: + bf = BeautifulSoup(self.browser.page_source, 'lxml') + bg = bf.find_all('div', class_ = 'gt_cut_bg_slice') + fullgb = bf.find_all('div', class_ = 'gt_cut_fullbg_slice') + bg_url = re.findall('url\(\"(.*)\"\);', bg[0].get('style'))[0].replace('webp', 'jpg') + fullgb_url = re.findall('url\(\"(.*)\"\);', fullgb[0].get('style'))[0].replace('webp', 'jpg') + bg_location_list = [] + fullbg_location_list = [] + for each_bg in bg: + location = {} + location['x'] = int(re.findall('background-position: (.*)px (.*)px;',each_bg.get('style'))[0][0]) + location['y'] = int(re.findall('background-position: (.*)px (.*)px;',each_bg.get('style'))[0][1]) + bg_location_list.append(location) + for each_fullgb in fullgb: + location = {} + location['x'] = int(re.findall('background-position: (.*)px 
(.*)px;',each_fullgb.get('style'))[0][0]) + location['y'] = int(re.findall('background-position: (.*)px (.*)px;',each_fullgb.get('style'))[0][1]) + fullbg_location_list.append(location) + + urlretrieve(url = bg_url, filename = bg_filename) + print('缺口图片下载完成') + urlretrieve(url = fullgb_url, filename = fullbg_filename) + print('背景图片下载完成') + return bg_location_list, fullbg_location_list + + def get_merge_image(self, filename, location_list): + """ + 根据位置对图片进行合并还原 + :filename:图片 + :location_list:图片位置 + """ + im = image.open(filename) + new_im = image.new('RGB', (260,116)) + im_list_upper=[] + im_list_down=[] + + for location in location_list: + if location['y'] == -58: + im_list_upper.append(im.crop((abs(location['x']),58,abs(location['x']) + 10, 166))) + if location['y'] == 0: + im_list_down.append(im.crop((abs(location['x']),0,abs(location['x']) + 10, 58))) + + new_im = image.new('RGB', (260,116)) + + x_offset = 0 + for im in im_list_upper: + new_im.paste(im, (x_offset,0)) + x_offset += im.size[0] + + x_offset = 0 + for im in im_list_down: + new_im.paste(im, (x_offset,58)) + x_offset += im.size[0] + + new_im.save(filename) + + return new_im + + def get_merge_image(self, filename, location_list): + """ + 根据位置对图片进行合并还原 + :filename:图片 + :location_list:图片位置 + """ + im = image.open(filename) + new_im = image.new('RGB', (260,116)) + im_list_upper=[] + im_list_down=[] + + for location in location_list: + if location['y']==-58: + im_list_upper.append(im.crop((abs(location['x']),58,abs(location['x'])+10,166))) + if location['y']==0: + im_list_down.append(im.crop((abs(location['x']),0,abs(location['x'])+10,58))) + + new_im = image.new('RGB', (260,116)) + + x_offset = 0 + for im in im_list_upper: + new_im.paste(im, (x_offset,0)) + x_offset += im.size[0] + + x_offset = 0 + for im in im_list_down: + new_im.paste(im, (x_offset,58)) + x_offset += im.size[0] + + new_im.save(filename) + + return new_im + + def is_pixel_equal(self, img1, img2, x, y): + """ + 判断两个像素是否相同 + :param image1: 图片1 + :param image2: 图片2 + :param x: 位置x + :param y: 位置y + :return: 像素是否相同 + """ + # 取两个图片的像素点 + pix1 = img1.load()[x, y] + pix2 = img2.load()[x, y] + threshold = 60 + if (abs(pix1[0] - pix2[0] < threshold) and abs(pix1[1] - pix2[1] < threshold) and abs(pix1[2] - pix2[2] < threshold)): + return True + else: + return False + + def get_gap(self, img1, img2): + """ + 获取缺口偏移量 + :param img1: 不带缺口图片 + :param img2: 带缺口图片 + :return: + """ + left = 43 + for i in range(left, img1.size[0]): + for j in range(img1.size[1]): + if not self.is_pixel_equal(img1, img2, i, j): + left = i + return left + return left + + def get_track(self, distance): + """ + 根据偏移量获取移动轨迹 + :param distance: 偏移量 + :return: 移动轨迹 + """ + # 移动轨迹 + track = [] + # 当前位移 + current = 0 + # 减速阈值 + mid = distance * 4 / 5 + # 计算间隔 + t = 0.2 + # 初速度 + v = 0 + + while current < distance: + if current < mid: + # 加速度为正2 + a = 2 + else: + # 加速度为负3 + a = -3 + # 初速度v0 + v0 = v + # 当前速度v = v0 + at + v = v0 + a * t + # 移动距离x = v0t + 1/2 * a * t^2 + move = v0 * t + 1 / 2 * a * t * t + # 当前位移 + current += move + # 加入轨迹 + track.append(round(move)) + return track + + def get_slider(self): + """ + 获取滑块 + :return: 滑块对象 + """ + while True: + try: + slider = self.browser.find_element_by_xpath("//div[@]") + break + except: + time.sleep(0.5) + return slider + + def move_to_gap(self, slider, track): + """ + 拖动滑块到缺口处 + :param slider: 滑块 + :param track: 轨迹 + :return: + """ + ActionChains(self.browser).click_and_hold(slider).perform() + while track: + x = random.choice(track) + 
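# 注意:random.choice 以随机顺序消耗轨迹段,总位移不变,但会打乱"先加速后减速"的轨迹形状;如需按序滑动,可改用 track.pop(0)(编辑注)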
ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform() + track.remove(x) + time.sleep(0.5) + ActionChains(self.browser).release().perform() + + def crack(self): + # 打开浏览器 + self.open() + + # 保存的图片名字 + bg_filename = 'bg.jpg' + fullbg_filename = 'fullbg.jpg' + + # 获取图片 + bg_location_list, fullbg_location_list = self.get_images(bg_filename, fullbg_filename) + + # 根据位置对图片进行合并还原 + # 方法1 + # bg_img = self.get_merge_image(bg_filename, bg_location_list) + # fullbg_img = self.get_merge_image(fullbg_filename, fullbg_location_list) + # 方法2 + bg_img = save_bg(self.browser) + full_bg_img = save_full_bg(self.browser) + + # 获取缺口位置 + # 方法1 + # gap = self.get_gap(fullbg_img, bg_img) + # 方法2 + gap = self.get_gap(image.open(full_bg_img), image.open(bg_img)) + print('缺口位置', gap) + + track = self.get_track(gap-self.BORDER) + print('滑动滑块') + print(track) + + # # 点按呼出缺口 + # slider = self.get_slider() + # # 拖动滑块到缺口处 + # self.move_to_gap(slider, track) + +if __name__ == '__main__': + print('开始验证') + crack = Crack(u'中国移动') + crack.crack() + print('验证成功') diff --git a/hero.py b/hero.py new file mode 100644 index 00000000..678a04ac --- /dev/null +++ b/hero.py @@ -0,0 +1,155 @@ +#-*- coding: UTF-8 -*- +from urllib.request import urlretrieve +import requests +import os + +""" +函数说明:下载《英雄联盟盒子》中的英雄图片 + +Parameters: + url - GET请求地址,通过Fiddler抓包获取 + header - headers信息 +Returns: + 无 +Author: + Jack Cui +Blog: + http://blog.csdn.net/c406495762 +Modify: + 2017-08-07 +""" +def hero_imgs_download(url, header): + req = requests.get(url = url, headers = header).json() + hero_num = len(req['list']) + print('一共有%d个英雄' % hero_num) + hero_images_path = 'hero_images' + for each_hero in req['list']: + hero_photo_url = each_hero['cover'] + hero_name = each_hero['name'] + '.jpg' + filename = hero_images_path + '/' + hero_name + if hero_images_path not in os.listdir(): + os.makedirs(hero_images_path) + urlretrieve(url = hero_photo_url, filename = filename) + +""" +函数说明:打印所有英雄的名字和ID + +Parameters: + url - GET请求地址,通过Fiddler抓包获取 + header - headers信息 +Returns: + 无 +Author: + Jack Cui +Blog: + http://blog.csdn.net/c406495762 +Modify: + 2017-08-07 +""" +def hero_list(url, header): + print('*' * 100) + print('\t\t\t\t欢迎使用《王者荣耀》出装下助手!') + print('*' * 100) + req = requests.get(url = url, headers = header).json() + flag = 0 + for each_hero in req['list']: + flag += 1 + print('%s的ID为:%-7s' % (each_hero['name'], each_hero['hero_id']), end = '\t\t') + if flag == 3: + print('\n', end = '') + flag = 0 + +""" +函数说明:根据equip_id查询武器名字和价格 + +Parameters: + equip_id - 武器的ID + weapon_info - 存储所有武器的字典 +Returns: + weapon_name - 武器的名字 + weapon_price - 武器的价格 +Author: + Jack Cui +Blog: + http://blog.csdn.net/c406495762 +Modify: + 2017-08-07 +""" +def seek_weapon(equip_id, weapon_info): + for each_weapon in weapon_info: + if each_weapon['equip_id'] == str(equip_id): + weapon_name = each_weapon['name'] + weapon_price = each_weapon['price'] + return weapon_name, weapon_price + + +""" +函数说明:获取并打印出装信息 + +Parameters: + url - GET请求地址,通过Fiddler抓包获取 + header - headers信息 + weapon_info - 存储所有武器的字典 +Returns: + 无 +Author: + Jack Cui +Blog: + http://blog.csdn.net/c406495762 +Modify: + 2017-08-07 +""" +def hero_info(url, header, weapon_info): + req = requests.get(url = url, headers = header).json() + print('\n历史上的%s:\n %s' % (req['info']['name'], req['info']['history_intro'])) + for each_equip_choice in req['info']['equip_choice']: + print('\n%s:\n %s' % (each_equip_choice['title'], each_equip_choice['description'])) + total_price = 0 + flag = 0 + for each_weapon 
in each_equip_choice['list']: + flag += 1 + weapon_name, weapon_price = seek_weapon(each_weapon['equip_id'], weapon_info) + print('%s:%s' % (weapon_name, weapon_price), end = '\t') + if flag == 3: + print('\n', end = '') + flag = 0 + total_price += int(weapon_price) + print('神装套件价格共计:%d' % total_price) + + +""" +函数说明:获取武器信息 + +Parameters: + url - GET请求地址,通过Fiddler抓包获取 + header - headers信息 +Returns: + weapon_info_dict - 武器信息 +Author: + Jack Cui +Blog: + http://blog.csdn.net/c406495762 +Modify: + 2017-08-07 +""" +def hero_weapon(url, header): + req = requests.get(url = url, headers = header).json() + weapon_info_dict = req['list'] + return weapon_info_dict + + +if __name__ == '__main__': + headers = {'Accept-Charset': 'UTF-8', + 'Accept-Encoding': 'gzip,deflate', + 'User-Agent': 'Dalvik/2.1.0 (Linux; U; Android 6.0.1; MI 5 MIUI/V8.1.6.0.MAACNDI)', + 'X-Requested-With': 'XMLHttpRequest', + 'Content-type': 'application/x-www-form-urlencoded', + 'Connection': 'Keep-Alive', + 'Host': 'gamehelper.gm825.com'} + weapon_url = "http://gamehelper.gm825.com/wzry/equip/list?channel_id=90009a&app_id=h9044j&game_id=7622&game_name=%E7%8E%8B%E8%80%85%E8%8D%A3%E8%80%80&vcode=12.0.3&version_code=1203&cuid=2654CC14D2D3894DBF5808264AE2DAD7&ovr=6.0.1&device=Xiaomi_MI+5&net_type=1&client_id=1Yfyt44QSqu7PcVdDduBYQ%3D%3D&info_ms=fBzJ%2BCu4ZDAtl4CyHuZ%2FJQ%3D%3D&info_ma=XshbgIgi0V1HxXTqixI%2BKbgXtNtOP0%2Fn1WZtMWRWj5o%3D&mno=0&info_la=9AChHTMC3uW%2BfY8%2BCFhcFw%3D%3D&info_ci=9AChHTMC3uW%2BfY8%2BCFhcFw%3D%3D&mcc=0&clientversion=&bssid=VY%2BeiuZRJ%2FwaXmoLLVUrMODX1ZTf%2F2dzsWn2AOEM0I4%3D&os_level=23&os_id=dc451556fc0eeadb&resolution=1080_1920&dpi=480&client_ip=192.168.0.198&pdunid=a83d20d8" + heros_url = "http://gamehelper.gm825.com/wzry/hero/list?channel_id=90009a&app_id=h9044j&game_id=7622&game_name=%E7%8E%8B%E8%80%85%E8%8D%A3%E8%80%80&vcode=12.0.3&version_code=1203&cuid=2654CC14D2D3894DBF5808264AE2DAD7&ovr=6.0.1&device=Xiaomi_MI+5&net_type=1&client_id=1Yfyt44QSqu7PcVdDduBYQ%3D%3D&info_ms=fBzJ%2BCu4ZDAtl4CyHuZ%2FJQ%3D%3D&info_ma=XshbgIgi0V1HxXTqixI%2BKbgXtNtOP0%2Fn1WZtMWRWj5o%3D&mno=0&info_la=9AChHTMC3uW%2BfY8%2BCFhcFw%3D%3D&info_ci=9AChHTMC3uW%2BfY8%2BCFhcFw%3D%3D&mcc=0&clientversion=&bssid=VY%2BeiuZRJ%2FwaXmoLLVUrMODX1ZTf%2F2dzsWn2AOEM0I4%3D&os_level=23&os_id=dc451556fc0eeadb&resolution=1080_1920&dpi=480&client_ip=192.168.0.198&pdunid=a83d20d8" + hero_list(heros_url, headers) + hero_id = input("请输入要查询的英雄ID:") + hero_url = "http://gamehelper.gm825.com/wzry/hero/detail?hero_id={}&channel_id=90009a&app_id=h9044j&game_id=7622&game_name=%E7%8E%8B%E8%80%85%E8%8D%A3%E8%80%80&vcode=12.0.3&version_code=1203&cuid=2654CC14D2D3894DBF5808264AE2DAD7&ovr=6.0.1&device=Xiaomi_MI+5&net_type=1&client_id=1Yfyt44QSqu7PcVdDduBYQ%3D%3D&info_ms=fBzJ%2BCu4ZDAtl4CyHuZ%2FJQ%3D%3D&info_ma=XshbgIgi0V1HxXTqixI%2BKbgXtNtOP0%2Fn1WZtMWRWj5o%3D&mno=0&info_la=9AChHTMC3uW%2BfY8%2BCFhcFw%3D%3D&info_ci=9AChHTMC3uW%2BfY8%2BCFhcFw%3D%3D&mcc=0&clientversion=&bssid=VY%2BeiuZRJ%2FwaXmoLLVUrMODX1ZTf%2F2dzsWn2AOEM0I4%3D&os_level=23&os_id=dc451556fc0eeadb&resolution=1080_1920&dpi=480&client_ip=192.168.0.198&pdunid=a83d20d8".format(hero_id) + weapon_info_dict = hero_weapon(weapon_url, headers) + hero_info(hero_url, headers, weapon_info_dict) \ No newline at end of file diff --git a/one_hour_spider/biquge20180731.py b/one_hour_spider/biquge20180731.py new file mode 100644 index 00000000..5cc55760 --- /dev/null +++ b/one_hour_spider/biquge20180731.py @@ -0,0 +1,52 @@ +# -*- coding:utf-8 -*- +import requests +from bs4 import BeautifulSoup +import os + +""" 
+从www.biqubao.com笔趣阁爬取小说,楼主教程中的网址我当时没打开, +就参照楼主教程,爬取了笔趣阁小说网的内容。 + 2018-07-31 +""" + +if __name__=='__main__': + #所要爬取的小说主页,每次使用时,修改该网址即可,同时保证本地保存根路径存在即可 + + # 本地保存爬取的文本根路径 + save_path = 'G:/pythonlearn' + #笔趣阁网站根路径 + index_path='https://www.biqubao.com' + + req=requests.get(url=target) + #查看request默认的编码,发现与网站response不符,改为网站使用的gdk + print(req.encoding) + req.encoding = 'gbk' + #解析html + soup=BeautifulSoup(req.text,"html.parser") + list_tag=soup.div(id="list") + print('list_tag:',list_tag) + #获取小说名称 + story_title=list_tag[0].dl.dt.string + # 根据小说名称创建一个文件夹,如果不存在就新建 + dir_path=save_path+'/'+story_title + if not os.path.exists(dir_path): + os.path.join(save_path,story_title) + os.mkdir(dir_path) + #开始循环每一个章节,获取章节名称,与章节对应的网址 + for dd_tag in list_tag[0].dl.find_all('dd'): + #章节名称 + chapter_name=dd_tag.string + #章节网址 + chapter_url=index_path+dd_tag.a.get('href') + #访问该章节详情网址,爬取该章节正文 + chapter_req = requests.get(url=chapter_url) + chapter_req.encoding = 'gbk' + chapter_soup = BeautifulSoup(chapter_req.text, "html.parser") + #解析出来正文所在的标签 + content_tag = chapter_soup.div.find(id="content") + #获取正文文本,并将空格替换为换行符 + content_text = str(content_tag.text.replace('\xa0','\n')) + #将当前章节,写入以章节名字命名的txt文件 + with open(dir_path+'/'+chapter_name+'.txt', 'w') as f: + f.write('本文网址:'+chapter_url) + f.write(content_text) \ No newline at end of file diff --git a/one_hour_spider/biqukan.py b/one_hour_spider/biqukan.py new file mode 100644 index 00000000..c066d58c --- /dev/null +++ b/one_hour_spider/biqukan.py @@ -0,0 +1,87 @@ +# -*- coding:UTF-8 -*- +from bs4 import BeautifulSoup +import requests, sys + +""" +类说明:下载《笔趣看》网小说《一念永恒》 +Parameters: + 无 +Returns: + 无 +Modify: + 2017-09-13 +""" +class downloader(object): + + def __init__(self): + self.server = 'http://www.biqukan.com/' + self.target = 'http://www.biqukan.com/1_1094/' + self.names = [] #存放章节名 + self.urls = [] #存放章节链接 + self.nums = 0 #章节数 + + """ + 函数说明:获取下载链接 + Parameters: + 无 + Returns: + 无 + Modify: + 2017-09-13 + """ + def get_download_url(self): + req = requests.get(url = self.target) + html = req.text + div_bf = BeautifulSoup(html) + div = div_bf.find_all('div', class_ = 'listmain') + a_bf = BeautifulSoup(str(div[0])) + a = a_bf.find_all('a') + self.nums = len(a[15:]) #剔除不必要的章节,并统计章节数 + for each in a[15:]: + self.names.append(each.string) + self.urls.append(self.server + each.get('href')) + + """ + 函数说明:获取章节内容 + Parameters: + target - 下载连接(string) + Returns: + texts - 章节内容(string) + Modify: + 2017-09-13 + """ + def get_contents(self, target): + req = requests.get(url = target) + html = req.text + bf = BeautifulSoup(html) + texts = bf.find_all('div', class_ = 'showtxt') + texts = texts[0].text.replace('\xa0'*8,'\n\n') + return texts + + """ + 函数说明:将爬取的文章内容写入文件 + Parameters: + name - 章节名称(string) + path - 当前路径下,小说保存名称(string) + text - 章节内容(string) + Returns: + 无 + Modify: + 2017-09-13 + """ + def writer(self, name, path, text): + write_flag = True + with open(path, 'a', encoding='utf-8') as f: + f.write(name + '\n') + f.writelines(text) + f.write('\n\n') + +if __name__ == "__main__": + dl = downloader() + dl.get_download_url() + print('《一年永恒》开始下载:') + for i in range(dl.nums): + dl.writer(dl.names[i], '一念永恒.txt', dl.get_contents(dl.urls[i])) + sys.stdout.write(" 已下载:%.3f%%" % float(i/dl.nums*100) + '\r') + sys.stdout.flush() + print('《一年永恒》下载完成') diff --git a/one_hour_spider/unsplash.py b/one_hour_spider/unsplash.py new file mode 100644 index 00000000..bd630601 --- /dev/null +++ b/one_hour_spider/unsplash.py @@ -0,0 +1,64 @@ +# -*- coding:UTF-8 -*- +import 
requests, json, time, sys +from contextlib import closing + +class get_photos(object): + + def __init__(self): + self.photos_id = [] + self.download_server = 'https://unsplash.com/photos/xxx/download?force=trues' + self.target = 'http://unsplash.com/napi/feeds/home' + self.headers = {'authorization':'Client-ID c94869b36aa272dd62dfaeefed769d4115fb3189a9d1ec88ed457207747be626'} + + """ + 函数说明:获取图片ID + Parameters: + 无 + Returns: + 无 + Modify: + 2017-09-13 + """ + def get_ids(self): + req = requests.get(url=self.target, headers=self.headers, verify=False) + html = json.loads(req.text) + next_page = html['next_page'] + for each in html['photos']: + self.photos_id.append(each['id']) + time.sleep(1) + for i in range(5): + req = requests.get(url=next_page, headers=self.headers, verify=False) + html = json.loads(req.text) + next_page = html['next_page'] + for each in html['photos']: + self.photos_id.append(each['id']) + time.sleep(1) + + + """ + 函数说明:图片下载 + Parameters: + 无 + Returns: + 无 + Modify: + 2017-09-13 + """ + def download(self, photo_id, filename): + headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36'} + target = self.download_server.replace('xxx', photo_id) + with closing(requests.get(url=target, stream=True, verify = False, headers = self.headers)) as r: + with open('%d.jpg' % filename, 'ab+') as f: + for chunk in r.iter_content(chunk_size = 1024): + if chunk: + f.write(chunk) + f.flush() + +if __name__ == '__main__': + gp = get_photos() + print('获取图片连接中:') + gp.get_ids() + print('图片下载中:') + for i in range(len(gp.photos_id)): + print(' 正在下载第%d张图片' % (i+1)) + gp.download(gp.photos_id[i], (i+1)) \ No newline at end of file diff --git a/one_hour_spider/unsplash20180731.py b/one_hour_spider/unsplash20180731.py new file mode 100644 index 00000000..2b6c7279 --- /dev/null +++ b/one_hour_spider/unsplash20180731.py @@ -0,0 +1,35 @@ +# -*- coding:utf-8 -*- +import requests +import json +import os +from contextlib import closing + +""" +从https://unsplash.com/爬取壁纸代码,使用时我是开启了代理软件 +国内网速貌似有些限制,很慢 + 2018-07-31 +""" + +# 本地保存图片根路径(请确保根路径存在) +save_path = 'G:/pythonlearn' +dir_path=save_path+'/'+'unsplash-image' +if not os.path.exists(dir_path): + os.path.join(save_path, 'unsplash-image') + os.mkdir(dir_path) +n=10 +#n建议从第2页开始,因为第一页的per_page可能是1,不是12 +while n>2: + print('当前爬取第'+str(n)+'次加载图片(本次共12张)') + url='https://unsplash.com/napi/photos?page='+str(n)+'&per_page=12&order_by=latest' + req=requests.get(url=url) + html=json.loads(req.text) + for each in html: + downloadurl=each['links']["download"] + jpgrep=requests.get(url=downloadurl) + with closing(requests.get(url=downloadurl, stream=True)) as r: + with open(dir_path+'/'+each['id']+'.jpg', 'ab+') as f: + for chunk in r.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + f.flush() + n=n-1 \ No newline at end of file diff --git a/one_hour_spider/vidoe_downloader.py b/one_hour_spider/vidoe_downloader.py new file mode 100644 index 00000000..883b8609 --- /dev/null +++ b/one_hour_spider/vidoe_downloader.py @@ -0,0 +1,90 @@ +#-*- coding:UTF-8 -*- +import requests,re, json, sys +from bs4 import BeautifulSoup +from urllib import request + +class video_downloader(): + def __init__(self, url): + self.server = 'http://api.xfsub.com' + self.api = 'http://api.xfsub.com/xfsub_api/?url=' + self.get_url_api = 'http://api.xfsub.com/xfsub_api/url.php' + self.url = url.split('#')[0] + self.headers = {'Referer': 'http://api.xfsub.com/xfsub_api/?url=%s?qqdrsign=055a4' % self.url} + 
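# 解析接口地址 = api 前缀 + 原始视频页 URL(URL 已在上面去掉了 # 之后的部分)(编辑补充注释)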
diff --git a/one_hour_spider/vidoe_downloader.py b/one_hour_spider/vidoe_downloader.py
new file mode 100644
index 00000000..883b8609
--- /dev/null
+++ b/one_hour_spider/vidoe_downloader.py
@@ -0,0 +1,90 @@
+#-*- coding:UTF-8 -*-
+import requests, re, json, sys
+from bs4 import BeautifulSoup
+from urllib import request
+
+class video_downloader():
+    def __init__(self, url):
+        self.server = 'http://api.xfsub.com'
+        self.api = 'http://api.xfsub.com/xfsub_api/?url='
+        self.get_url_api = 'http://api.xfsub.com/xfsub_api/url.php'
+        self.url = url.split('#')[0]
+        self.headers = {'Referer': 'http://api.xfsub.com/xfsub_api/?url=%s?qqdrsign=055a4' % self.url}
+        self.target = self.api + self.url
+        self.s = requests.session()
+
+    def get_key(self):
+        """Fetch the player page and pull the key/time/url parameters out of it.
+        Modify: 2017-09-18
+        """
+        req = self.s.get(url=self.target)
+        req.encoding = 'utf-8'
+        # The parameters are embedded in a JS call; grab its JSON argument and keep it in self.info.
+        self.info = json.loads(re.findall(r'"url.php",\ (.+),', req.text)[0])
+
+    def get_url(self):
+        """POST the parameters back and parse the XML reply for the real video address.
+        Returns: video_url - where the video is stored
+        Modify: 2017-09-18
+        """
+        data = {'time': self.info['time'],
+                'key': self.info['key'],
+                'url': self.info['url'],
+                'type': ''}
+        req = self.s.post(url=self.get_url_api, data=data, headers=self.headers)
+        url = self.server + json.loads(req.text)['url']
+        req = self.s.get(url=url, headers=self.headers)
+        bf = BeautifulSoup(req.text, 'xml')  # the reply is XML, so parse it as XML
+        video_url = bf.find('file').string   # the <file> node holds the video address
+        return video_url
+
+    def Schedule(self, a, b, c):
+        """urlretrieve reporthook: a = blocks so far, b = block size, c = total size.
+        Modify: 2017-09-18
+        """
+        per = 100.0 * a * b / c
+        if per > 100:
+            per = 100  # the final block can overshoot, so clamp to 100%
+        sys.stdout.write('  ' + '%.2f%% 已经下载的大小:%d 文件大小:%d' % (per, a * b, c) + '\r')
+        sys.stdout.flush()
+
+    def video_download(self, url, filename):
+        """Download the video at url to filename, reporting progress via Schedule().
+        Modify: 2017-09-18
+        """
+        request.urlretrieve(url=url, filename=filename, reporthook=self.Schedule)
+
+
+if __name__ == '__main__':
+    url = 'http://www.iqiyi.com/v_19rr7qhfg0.html#vfrm=19-9-0-1'
+    vd = video_downloader(url)
+    filename = '加勒比海盗5'
+    print('%s下载中:' % filename)
+    vd.get_key()
+    video_url = vd.get_url()
+    print('  获取地址成功:%s' % video_url)
+    vd.video_download(video_url, filename + '.mp4')
+    print('\n下载完成!')
diff --git a/video_downloader/requirements.txt b/video_downloader/requirements.txt
index 5c39ca9e..8c4dbe5e 100644
--- a/video_downloader/requirements.txt
+++ b/video_downloader/requirements.txt
@@ -1,4 +1,4 @@
-imageio==1.5
-numpy==1.11.1
-Pillow==3.3.1
-beautifulsoup4==4.3.2
\ No newline at end of file
+imageio
+numpy
+Pillow
+beautifulsoup4
diff --git a/zhengfang_system_spider/README.md b/zhengfang_system_spider/README.md
new file mode 100644
index 00000000..29eb71aa
--- /dev/null
+++ b/zhengfang_system_spider/README.md
@@ -0,0 +1,38 @@
+# ZhengFang_System_Spider
+A simple scraper for the ZhengFang (正方) academic administration system: personal timetable, grades and GPA.
+
+## Requirements
+python 3.6
+### Python libraries
+HTTP requests: requests, urllib
+Data extraction: re, lxml, bs4
+Storage: os, sys
+Captcha handling: PIL
+
+## Download
+Run the following in a terminal:
+```bash
+git clone git@github.com:Jack-Cherish/python-spider.git
+```
+
+## Usage
+
+### Install the dependencies
+```bash
+pip install -r requirements.txt
+```
+
+### Run
+From the repository root:
+```
+cd zhengfang_system_spider
+python spider.py
+```
+When prompted, enter your school's academic-affairs site, your student id and password, and the captcha.
+
+![running](/zhengfang_system_spider/screenshot/spider.png)
+
+A few seconds later, zhengfang.txt is created in the zhengfang_system_spider folder; the timetable, grades and GPA are all saved in that text file.
+
+![result](/zhengfang_system_spider/screenshot/zf.png)
diff --git a/zhengfang_system_spider/code.jpg b/zhengfang_system_spider/code.jpg
new file mode 100644
index 00000000..15f5ddfb
Binary files /dev/null and b/zhengfang_system_spider/code.jpg differ
diff --git a/zhengfang_system_spider/requirements.txt b/zhengfang_system_spider/requirements.txt
new file mode 100644
index 00000000..522810d0
--- /dev/null
+++ b/zhengfang_system_spider/requirements.txt
@@ -0,0 +1,4 @@
+lxml==4.6.3
+requests==2.20.0
+Pillow>=6.2.2
+beautifulsoup4==4.6.0
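The flow the README describes can also be driven from another script instead of the interactive prompts. A minimal sketch, assuming it is run from inside `zhengfang_system_spider/`; the credentials and base URL below are placeholders, and Login() will still open code.jpg and wait for the captcha to be typed in:

```python
# Drive spider.py from code rather than answering input() one field at a time.
from spider import Who, University

student = Who('2016123456789', 'my-password')        # placeholder credentials
univ = University(student, 'http://115.236.84.162')  # your school's ZhengFang base URL
if univ.Login():             # pops up code.jpg and asks for the captcha
    univ.GetClass()          # timetable   -> zhengfang.txt
    univ.GetGrade()          # grades/GPA  -> appended to zhengfang.txt
    univ.GradeTestResults()  # CET results -> appended to zhengfang.txt
```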
diff --git a/zhengfang_system_spider/screenshot/spider.png b/zhengfang_system_spider/screenshot/spider.png
new file mode 100644
index 00000000..87995828
Binary files /dev/null and b/zhengfang_system_spider/screenshot/spider.png differ
diff --git a/zhengfang_system_spider/screenshot/zf.png b/zhengfang_system_spider/screenshot/zf.png
new file mode 100644
index 00000000..c4a3787b
Binary files /dev/null and b/zhengfang_system_spider/screenshot/zf.png differ
diff --git a/zhengfang_system_spider/spider.py b/zhengfang_system_spider/spider.py
new file mode 100644
index 00000000..e4c005d0
--- /dev/null
+++ b/zhengfang_system_spider/spider.py
@@ -0,0 +1,229 @@
+#!/usr/bin/env python
+#-*- coding: utf-8 -*-
+
+__author__ = 'ZYSzys'
+
+import requests
+import re
+import os
+import sys
+import urllib
+import getpass
+from lxml import etree
+from PIL import Image
+from bs4 import BeautifulSoup
+
+
+class Who:
+    def __init__(self, user, pswd):
+        self.user = user
+        self.pswd = pswd
+
+
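+# Illustrative note: Tool.replace() below flattens one timetable cell's HTML
+# into the 'course---slot---teacher---room' records written to zhengfang.txt,
+# e.g. (tags simplified):
+#   '<a>数据库原理与技术B</a><br>周一第6,7节{第1-12周}<br>刘丽娟'
+#   becomes '数据库原理与技术B---周一第6,7节{第1-12周}---刘丽娟'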
+class Tool:
+    # NOTE: the literal HTML tags in these patterns were lost when the source
+    # was rendered; the tags below are reconstructions inferred from the
+    # '---'-joined output in zhengfang.txt, not the author's exact originals.
+    rma = re.compile('<a.*?>|</a>')
+    rmtb = re.compile('<br>|<br/>|<br />')
+    rmtr = re.compile('<tr.*?>|</tr>|<td.*?>|</td>|<th.*?>|</th>')
+    # rmtime1/rmtime2 wrapped '.*?' in two lost tags; <font>/<span> are stand-ins.
+    rmtime1 = re.compile('<font.*?>.*?</font>')
+    rmtime2 = re.compile('<span.*?>.*?</span>')
+
+    def replace(self, x):
+        x = re.sub(self.rma, ' ', x)
+        x = re.sub(self.rmtb, '---', x)
+        x = re.sub(self.rmtr, ' ', x)
+        x = re.sub(self.rmtime1, '\n', x)
+        x = re.sub(self.rmtime2, '', x)
+        return x.strip()
+
+
+def Getgrade(response):
+    html = response.content
+    soup = BeautifulSoup(html, 'lxml')
+    trs = soup.find(id="Datagrid1").findAll("tr")
+    Grades = []
+    keys = []
+    tds = trs[0].findAll("td")
+    tds = tds[:2] + tds[3:5] + tds[6:9]
+    for td in tds:
+        keys.append(td.string)
+    for tr in trs[1:]:
+        tds = tr.findAll("td")
+        tds = tds[:2] + tds[3:5] + tds[6:9]
+        values = []
+        for td in tds:
+            values.append(td.string)
+        one = dict((key, value) for key, value in zip(keys, values))
+        Grades.append(one)
+    return Grades
+
+
+def Getgradetestresults(trs):
+    results = []
+    k = []
+    for td in trs[0].xpath('.//td/text()'):
+        k.append(td)
+    trs = trs[1:]
+    for tr in trs:
+        tds = tr.xpath('.//td/text()')
+        v = []
+        for td in tds:
+            v.append(td)
+        one = dict((i, j) for i, j in zip(k, v))
+        results.append(one)
+    return results
+
+
+class University:
+    def __init__(self, student, baseurl):
+        self.student = student
+        self.baseurl = baseurl
+        self.session = requests.session()
+        self.session.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
+
+    def Login(self):
+        url = self.baseurl + '/default2.aspx'
+        res = self.session.get(url)
+        cont = res.content
+        selector = etree.HTML(cont)
+        __VIEWSTATE = selector.xpath('//*[@id="form1"]/input/@value')[0]
+        imgurl = self.baseurl + '/CheckCode.aspx'
+        imgres = self.session.get(imgurl, stream=True)
+        img = imgres.content
+        with open('code.jpg', 'wb') as f:
+            f.write(img)
+        jpg = Image.open('{}/code.jpg'.format(os.getcwd()))
+        jpg.show()
+        jpg.close()
+        code = input('输入验证码:')
+        RadioButtonList1 = u"学生"
+        data = {
+            "__VIEWSTATE": __VIEWSTATE,
+            "txtUserName": self.student.user,
+            "TextBox1": self.student.pswd,
+            "TextBox2": self.student.pswd,
+            "txtSecretCode": code,
+            "RadioButtonList1": RadioButtonList1,
+            "Button1": "",
+            "lbLanguage": ""
+        }
+        loginres = self.session.post(url, data=data)
+        logcont = loginres.text
+        # Reconstructed pattern (the original was lost in rendering): it matched
+        # the post-login link 'xs_main.aspx?xh=<student id>'; the [17:29] slice
+        # below then extracts the twelve-character id from the match.
+        pattern = re.compile(r'/xs_main\.aspx\?xh=\w+', re.S)
+        res = re.findall(pattern, logcont)
+        try:
+            if res[0][17:29] == self.student.user:
+                print('Login succeeded!')
+        except IndexError:
+            print('Login failed! Maybe a wrong password?')
+            return
+        # Likewise reconstructed: the student's name sits in <span id="xhxm">.
+        pattern = re.compile('<span id="xhxm">(.*?)</span>')
+        xhxm = re.findall(pattern, logcont)
+        name = xhxm[0].replace('同学', '')
+        self.student.urlname = urllib.parse.quote_plus(str(name))
+        return True
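+
+    # The ZhengFang backend is classic ASP.NET: every POST has to echo back the
+    # __VIEWSTATE token scraped from the page that served the form, and the
+    # server checks the Referer header -- hence each method below resets
+    # self.session.headers['Referer'] before requesting anything.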
+    def GetClass(self):
+        self.session.headers['Referer'] = self.baseurl + \
+            '/xs_main.aspx?xh=' + self.student.user
+        kburl = self.baseurl + '/xskbcx.aspx?xh=' + self.student.user + \
+            '&xm=' + self.student.urlname + '&gnmkdm=N121603'
+        kbresponse = self.session.get(kburl)
+        kbcont = kbresponse.text
+        # Reconstructed pattern (the tags were lost in rendering): capture each
+        # timetable cell's inner HTML for Tool.replace() to flatten.
+        pattern = re.compile('<td.*?>(.*?)</td>', re.S)
+        contents = re.findall(pattern, kbcont)
+        tool = Tool()
+        f = open(os.getcwd() + '/zhengfang.txt', 'w', encoding='utf-8')
+        f.write(u'本学期课表:' + '\n')
+        cnt = 1
+        days = [u'周一', u'周二', u'周三', u'周四', u'周五', u'周六', u'周日']
+        for day in days:
+            for i in contents:
+                if u'星期' in i:
+                    continue
+                elif u'第' in i:
+                    if day in i:
+                        con = tool.replace(i)
+                        f.write(str(cnt) + ':\t' + con + '\n')
+                        cnt += 1
+                else:
+                    continue
+            f.write('\n')
+        f.close()
+        print('Download class succeeded!')
+
+    def GetGrade(self):
+        self.session.headers['Referer'] = self.baseurl + \
+            '/xs_main.aspx?xh=' + self.student.user
+        gradeurl = self.baseurl + '/xscjcx.aspx?xh=' + self.student.user + \
+            '&xm=' + self.student.urlname + '&gnmkdm=N121605'
+        graderesponse = self.session.get(gradeurl)
+        gradecont = graderesponse.content
+        soup = BeautifulSoup(gradecont, 'lxml')
+        __VIEWSTATE = soup.findAll(name="input")[2]["value"]
+        self.session.headers['Referer'] = gradeurl
+        data = {
+            "__EVENTTARGET": "",
+            "__EVENTARGUMENT": "",
+            "__VIEWSTATE": __VIEWSTATE,
+            "hidLanguage": "",
+            "ddlXN": "",
+            "ddlXQ": "",
+            "ddl_kcxz": "",
+            "btn_zcj": u'历年成绩'
+        }
+        grares = self.session.post(gradeurl, data=data)
+        grades = Getgrade(grares)
+        totup = 0
+        totdown = 0
+        f = open(os.getcwd() + '/zhengfang.txt', 'a+', encoding='utf-8')
+        f.write('\n\n\n' + u'历年成绩:' + '\n')
+        for i in grades[0]:
+            f.write('%-13s\t' % i)
+        f.write('\n')
+        for each in grades:
+            for one in each:
+                f.write('%-15s\t' % each[one])
+            f.write('\n')
+            totup = totup + float(each[u'绩点']) * float(each[u'学分'])
+            totdown = totdown + float(each[u'学分'])
+        f.write('\n' + u'平均绩点: ' + '%.2f\t\t\t' % (totup / totdown) +
+                u'总学分绩点: ' + '%.2f\t\t\t' % totup + u'总学分: ' + '%.2f\n' % totdown)
+        f.close()
+        print('Download grade succeeded!')
+
+    def GradeTestResults(self):
+        self.session.headers['Referer'] = self.baseurl + \
+            '/xs_main.aspx?xh=' + self.student.user
+        gtrurl = self.baseurl + '/xsdjkscx.aspx?xh=' + self.student.user + \
+            '&xm=' + self.student.urlname + '&gnmkdm=N121606'
+        gtrresponse = self.session.get(gtrurl)
+        gtrcontent = gtrresponse.text
+        gtrhtml = etree.HTML(gtrcontent)
+        trs = gtrhtml.xpath('//table[@class="datelist"]/tr')
+        f = open(os.getcwd() + '/zhengfang.txt', 'a+', encoding='utf-8')
+        f.write('\n\n\n' + u'等级考试成绩:' + '\n')
+        results = Getgradetestresults(trs)
+        for one in results[0]:
+            f.write('%-10s\t' % one)
+        f.write('\n')
+        for each in results:
+            for one in each:
+                f.write('%-10s\t' % each[one])
+            f.write('\n')
+        f.close()
+        print('Download grade test results succeeded!')
+
+
+if __name__ == "__main__":
+    url = input("学校教务网站(如http://115.236.84.162):")
+    user = input("学号:")
+    pswd = getpass.getpass("密码:")
+    who = Who(user, pswd)
+    univ = University(who, url)
+    if univ.Login():
+        univ.GetClass()
+        univ.GetGrade()
+        univ.GradeTestResults()
diff --git a/zhengfang_system_spider/zhengfang.txt b/zhengfang_system_spider/zhengfang.txt
new file mode 100644
index
00000000..8a47c301 --- /dev/null +++ b/zhengfang_system_spider/zhengfang.txt @@ -0,0 +1,85 @@ +本学期课表: +1: 电工电子技术基础AII---周一第1,2节{第2-16周|双周}---郜园园/章云(章云,郜园园)---学10609(实验室) +2: 计算机网络A---周一第3,4节{第2-16周|双周}---吴晓平(吴晓平)---学10311(实验室) +3: 数据库原理与技术B---周一第6,7节{第1-12周}---刘丽娟(刘丽娟)---学1502(智慧教室) +4: 数据库原理与技术B---周一第8节{第1-12周}---刘丽娟(刘丽娟)---学1502(智慧教室) + +5: 数据库原理与技术B---周二第1,2节{第1-16周}---刘丽娟(刘丽娟)---学10309(实验室) +6: 计算机网络A---周二第3,4,5节{第1-16周}---吴晓平(吴晓平)---教1512(多媒体) +7: J2EE程序设计---周二第6,7节{第1-16周}---陈文辉(陈文辉)---教5402(多媒体) + +8: 大学体育(篮球)---周三第3,4节{第1-17周}---田晓鹏---东湖风雨操场 +9: J2EE程序设计---周三第6,7节{第1-16周}---陈文辉(陈文辉)---学10309(实验室) +10: 毛泽东思想和中国特色社会主义理论体系概论---周三第8,9节{第2-16周}---张国泉---教1401(多媒体)---2018年06月30日(10:20-11:10)---学10203(实验室) + +11: 中国文化英语---周四第1,2节{第1-16周}---陈献---教5302(多媒体) +12: 电工电子技术基础AII---周四第3,4,5节{第1-16周}---郜园园/章云(章云,郜园园)---教5403(多媒体) +13: 物联网工程概论A---周四第6,7节{第2-16周|双周}---孔汶汶/张建锋/冯海林/吴剑(孔汶汶,张建锋)---学10603(实验室) + +14: 毛泽东思想和中国特色社会主义理论体系概论---周五第3,4节{第2-16周}---张国泉---教1401(多媒体)---2018年06月30日(10:20-11:10)---学10203(实验室) +15: 物联网工程概论A---周五第8,9节{第1-16周}---孔汶汶/张建锋/冯海林/吴剑(孔汶汶,张建锋)---教1512(多媒体) + + +16: 思辨与创新(网络课程)---周日第12节{第1-15周}---网络教师---------用经济学智慧解读中国(网络课程)---周日第12节{第1-15周}---网络教师--- + + + + +历年成绩: +学年 学期 课程名称 课程性质 学分 绩点 成绩 +2016-2017 1 思想道德修养与法律基础 必修 3 2.80 78 +2016-2017 1 形势与政策 必修 0.5 4.50 优 +2016-2017 1 大学生心理健康教育 必修 1 3.30 83 +2016-2017 1 大学生职业发展 必修 0.5 3.80 88 +2016-2017 1 高级语言程序设计 必修 4 4.40 94 +2016-2017 1 学业指导 必修 0.5 4.40 94 +2016-2017 1 大学计算机基础A 必修 1 3.90 89 +2016-2017 1 高等数学AI 必修 4 4.50 95 +2016-2017 1 线性代数A 必修 3 4.60 96 +2016-2017 1 大学英语BI 必修 4 3.60 86 +2016-2017 1 军事理论 必修 0.5 3.50 良 +2016-2017 1 军事技能训练 必修 0.5 4.50 95 +2016-2017 1 大学体育I 必修 0.75 2.30 73 +2016-2017 2 马克思主义基本原理概论 必修 3 2.20 72 +2016-2017 2 形势与政策 必修 0.5 4.50 优 +2016-2017 2 信息技术导论 必修 1.5 4.40 94 +2016-2017 2 数据结构C 必修 3.5 4.10 91 +2016-2017 2 数据结构C实习 必修 1 4.50 优 +2016-2017 2 应用文写作 必修 2 3.90 89 +2016-2017 2 高等数学AII 必修 5 4.20 92 +2016-2017 2 大学物理AI 必修 3 2.90 79 +2016-2017 2 大学英语BII 必修 4 3.20 82 +2016-2017 2 大学体育II 必修 0.75 2.50 75 +2017-2018 1 中国近现代史纲要 必修 2 2.30 73 +2017-2018 1 形势与政策 必修 0.5 4.50 优 +2017-2018 1 电工电子技术基础AI 必修 3.5 2.60 76 +2017-2018 1 概率论与数理统计A 必修 4 4.70 97 +2017-2018 1 大学物理AII 必修 3 3.30 83 +2017-2018 1 大学物理A实验 必修 1.5 3.40 84 +2017-2018 1 英语报刊选读 必修 2 3.40 84 +2017-2018 1 大学体育III 必修 0.75 3.00 80 +2017-2018 1 生命科学与生物技术导论B 选修 2 3.50 85 +2017-2018 1 动物福利B(双语) 选修 2 4.50 95 +2017-2018 1 JAVA程序设计B 选修 3 4.50 95 +2017-2018 1 面向对象程序设计B 选修 3.5 4.10 91 +2017-2018 1 专业认知实习 选修 0.5 4.70 97 +2017-2018 1 个人理财规划(网络课程) 选修 2 4.50 优 +2017-2018 1 淘宝店铺设计与制作(实验室开放项目) 选修 1 3.20 82 +2017-2018 2 大学体育(篮球) 必修 0.75 2.10 71 +2017-2018 2 电工电子技术基础AII 必修 3.5 2.30 73 +2017-2018 2 电工电子技术基础实习A 必修 1 2.50 中 +2017-2018 2 中国文化英语 必修 2 3.20 82 +2017-2018 2 数据库原理与技术B 选修 3 4.10 91 +2017-2018 2 数据库原理与技术实习B 选修 1 3.50 良 +2017-2018 2 思辨与创新(网络课程) 选修 2 3.90 89 +2017-2018 2 用经济学智慧解读中国(网络课程) 选修 3 4.80 97.97 + +平均绩点: 3.70 总学分绩点: 351.68 总学分: 95.00 + + + +等级考试成绩: +学年 学期 等级考试名称 准考证号 考试日期 成绩 听力成绩 阅读成绩 写作成绩 综合成绩 +2016-2017 1 英语四级 330391162105502 2016年12月17日 563 212 190 161 0 +2016-2017 2 英语六级 330391171213315 2017年6月17日 434 112 195 127 0 +2017-2018 1 英语六级 330391172204918 2017年12月16日 415 135 151 129 0
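The 平均绩点 line near the bottom of zhengfang.txt is just the credit-weighted mean that GetGrade() accumulates in totup/totdown. A quick check of that arithmetic, using three (学分, 绩点) rows lifted from the table above:

```python
# 平均绩点 = sum(绩点 * 学分) / sum(学分), exactly as GetGrade() computes it.
records = [
    {'学分': 4.0, '绩点': 4.40},   # 高级语言程序设计
    {'学分': 3.0, '绩点': 4.60},   # 线性代数A
    {'学分': 0.75, '绩点': 2.30},  # 大学体育I
]
totup = sum(r['绩点'] * r['学分'] for r in records)
totdown = sum(r['学分'] for r in records)
print('平均绩点: %.2f  总学分绩点: %.2f  总学分: %.2f'
      % (totup / totdown, totup, totdown))
```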