Commit 806f1e7

committed

no message

1 parent 9605ade · commit 806f1e7 · Copy full SHA for 806f1e7

File tree

1 file changed

+130

-0

lines changed

moumoubaimifan/zhihu
- zhihu.py

1 file changed

+130

-0

lines changed

`‎moumoubaimifan/zhihu/zhihu.py`

Lines changed: 130 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,130 @@`
	`1`	`+# -- coding:utf-8 --`
	`2`	`+`
	`3`	`+import re`
	`4`	`+import requests`
	`5`	`+import os`
	`6`	`+import urllib.request`
	`7`	`+import ssl`
	`8`	`+`
	`9`	`+from urllib.parse import urlsplit`
	`10`	`+from os.path import basename`
	`11`	`+import json`
	`12`	`+`
	`13`	`+ssl._create_default_https_context = ssl._create_unverified_context`
	`14`	`+`
	`15`	`+headers = {`
	`16`	`+ 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",`
	`17`	`+ 'Accept-Encoding': 'gzip, deflate'`
	`18`	`+}`
	`19`	`+`
	`20`	`+def get_image_url(qid, title):`
	`21`	+ answers_url = 'https://www.zhihu.com/api/v4/questions/'+str(qid)+'/answers?include=data%5B%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Cpaid_info%2Cpaid_info_content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B%5D.mark_infos%5B%5D.url%3Bdata%5B%5D.author.follower_count%2Cbadge%5B%5D.topics%3Bdata%5B%5D.settings.table_of_content.enabled&offset={}&limit=10&sort_by=default&platform=desktop'
	`22`	`+ offset = 0`
	`23`	`+ session = requests.Session()`
	`24`	`+`
	`25`	`+ while True:`
	`26`	`+ page = session.get(answers_url.format(offset), headers = headers)`
	`27`	`+ json_text = json.loads(page.text)`
	`28`	`+ answers = json_text['data']`
	`29`	`+`
	`30`	`+ offset += 10`
	`31`	`+`
	`32`	`+ if not answers:`
	`33`	`+ print('获取图片地址完成')`
	`34`	`+ return`
	`35`	`+`
	`36`	`+ pic_re = re.compile('data-original="(.*?)"', re.S)`
	`37`	`+`
	`38`	`+ for answer in answers:`
	`39`	`+ tmp_list = []`
	`40`	`+ pic_urls = re.findall(pic_re, answer['content'])`
	`41`	`+`
	`42`	`+ for item in pic_urls:`
	`43`	`+ # 去掉转移字符 \`
	`44`	`+ pic_url = item.replace("\\", "")`
	`45`	`+ pic_url = pic_url.split('?')[0]`
	`46`	`+`
	`47`	`+ # 去重复`
	`48`	`+ if pic_url not in tmp_list:`
	`49`	`+ tmp_list.append(pic_url)`
	`50`	`+`
	`51`	`+`
	`52`	`+ for pic_url in tmp_list:`
	`53`	`+ if pic_url.endswith('r.jpg'):`
	`54`	`+ print(pic_url)`
	`55`	`+ write_file(title, pic_url)`
	`56`	`+`
	`57`	`+def write_file(title, pic_url):`
	`58`	`+ file_name = title + '.txt'`
	`59`	`+`
	`60`	`+ f = open(file_name, 'a')`
	`61`	`+ f.write(pic_url + '\n')`
	`62`	`+ f.close()`
	`63`	`+`
	`64`	`+def read_file(title):`
	`65`	`+ file_name = title + '.txt'`
	`66`	`+`
	`67`	`+ pic_urls = []`
	`68`	`+`
	`69`	`+ # 判断文件是否存在`
	`70`	`+ if not os.path.exists(file_name):`
	`71`	`+ return pic_urls`
	`72`	`+`
	`73`	`+ with open(file_name, 'r') as f:`
	`74`	`+ for line in f:`
	`75`	`+ url = line.replace("\n", "")`
	`76`	`+ if url not in pic_urls:`
	`77`	`+ pic_urls.append(url)`
	`78`	`+`
	`79`	`+ print("文件中共有{}个不重复的 URL".format(len(pic_urls)))`
	`80`	`+ return pic_urls`
	`81`	`+`
	`82`	`+def download_pic(pic_urls, title):`
	`83`	`+`
	`84`	`+ # 创建文件夹`
	`85`	`+ if not os.path.exists(title):`
	`86`	`+ os.makedirs(title)`
	`87`	`+`
	`88`	`+ error_pic_urls = []`
	`89`	`+ success_pic_num = 0`
	`90`	`+ repeat_pic_num = 0`
	`91`	`+`
	`92`	`+ index = 1`
	`93`	`+`
	`94`	`+ for url in pic_urls:`
	`95`	`+ file_name = os.sep.join((title,basename(urlsplit(url)[2])))`
	`96`	`+`
	`97`	`+ if os.path.exists(file_name):`
	`98`	`+ print("图片{}已存在".format(file_name))`
	`99`	`+ index += 1`
	`100`	`+ repeat_pic_num += 1`
	`101`	`+ continue`
	`102`	`+`
	`103`	`+ try:`
	`104`	`+ urllib.request.urlretrieve(url, file_name)`
	`105`	`+ success_pic_num += 1`
	`106`	`+ index += 1`
	`107`	`+ print("下载{}完成!({}/{})".format(file_name, index, len(pic_urls)))`
	`108`	`+ except:`
	`109`	`+ print("下载{}失败!({}/{})".format(file_name, index, len(pic_urls)))`
	`110`	`+ error_pic_urls.append(url)`
	`111`	`+ index += 1`
	`112`	`+ continue`
	`113`	`+`
	`114`	`+ print("图片全部下载完毕!(成功:{}/重复:{}/失败:{})".format(success_pic_num, repeat_pic_num, len(error_pic_urls)))`
	`115`	`+`
	`116`	`+ if len(error_pic_urls) > 0:`
	`117`	`+ print('下面打印失败的图片地址')`
	`118`	`+ for error_url in error_pic_urls:`
	`119`	`+ print(error_url)`
	`120`	`+`
	`121`	`+if __name__ == '__main__':`
	`122`	`+`
	`123`	`+ qid = 406321189`
	`124`	`+ title = '你们身边有什么素人美女吗(颜值身材巨好的那种)?'`
	`125`	`+`
	`126`	`+ get_image_url(qid, title)`
	`127`	`+`
	`128`	`+ pic_urls = read_file(title)`
	`129`	`+ # 下载文件`
	`130`	`+ download_pic(pic_urls, title)`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 806f1e7

File tree

1 file changed

1 file changed

`‎moumoubaimifan/zhihu/zhihu.py`

0 commit comments