Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 806f1e7

Browse files
committed
no message
1 parent 9605ade commit 806f1e7

File tree

1 file changed

+130
-0
lines changed

1 file changed

+130
-0
lines changed

‎moumoubaimifan/zhihu/zhihu.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
# -*- coding:utf-8 -*-
2+
3+
import re
4+
import requests
5+
import os
6+
import urllib.request
7+
import ssl
8+
9+
from urllib.parse import urlsplit
10+
from os.path import basename
11+
import json
12+
13+
# NOTE(review): this swaps the ssl module's default HTTPS context factory for
# the unverified one, which turns off certificate verification for every
# standard-library HTTPS client in this process that relies on the default
# context (e.g. the urllib.request download in download_pic). It is kept so
# existing behaviour does not change, but it is insecure -- confirm it is
# really required before reusing this module elsewhere.
ssl._create_default_https_context = ssl._create_unverified_context

# HTTP headers sent with every answers-API request made by get_image_url.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
    'Accept-Encoding': 'gzip, deflate'
}
19+
20+
def get_image_url(qid, title, timeout=30):
    """Collect image URLs from every answer of a Zhihu question.

    Pages through the answers API ten answers at a time until an empty page
    is returned. For each answer, every ``data-original="..."`` attribute in
    the answer's HTML ``content`` is extracted, backslashes and the query
    string are stripped, duplicates within the answer are dropped, and each
    URL ending in ``r.jpg`` is printed and appended to ``<title>.txt`` through
    ``write_file``.

    Args:
        qid: Id of the question; interpolated into the API URL via ``str()``.
        title: Stem of the text file the URLs are appended to.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-success HTTP status.
        KeyError: If a successful response carries no ``data`` field.
    """
    answers_url = 'https://www.zhihu.com/api/v4/questions/'+str(qid)+'/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Cpaid_info%2Cpaid_info_content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics%3Bdata%5B*%5D.settings.table_of_content.enabled&offset={}&limit=10&sort_by=default&platform=desktop'

    # Compiled once up front instead of once per fetched page.
    pic_re = re.compile('data-original="(.*?)"', re.S)

    offset = 0
    session = requests.Session()

    while True:
        page = session.get(answers_url.format(offset), headers=headers,
                           timeout=timeout)
        # Fail loudly on an HTTP error instead of a confusing KeyError below.
        page.raise_for_status()
        answers = page.json()['data']

        offset += 10

        if not answers:
            print('获取图片地址完成')
            return

        for answer in answers:
            # A dict keeps first-seen order while giving O(1) de-duplication.
            unique_urls = {}
            for item in pic_re.findall(answer['content']):
                # Drop escape backslashes, then the query string.
                pic_url = item.replace("\\", "").split('?')[0]
                unique_urls.setdefault(pic_url, None)

            for pic_url in unique_urls:
                # Only URLs ending in 'r.jpg' are recorded (presumably the
                # original-size variant -- confirm against the site).
                if pic_url.endswith('r.jpg'):
                    print(pic_url)
                    write_file(title, pic_url)
56+
57+
def write_file(title, pic_url):
    """Append one image URL as a line to ``<title>.txt``.

    Args:
        title: File stem; ``'.txt'`` is appended to form the file name.
        pic_url: URL to record; a trailing newline is added.
    """
    file_name = title + '.txt'

    # The context manager closes the handle even if the write raises.
    with open(file_name, 'a') as f:
        f.write(pic_url + '\n')
63+
64+
def read_file(title):
    """Read the de-duplicated image URLs stored in ``<title>.txt``.

    Args:
        title: File stem; ``'.txt'`` is appended to form the file name.

    Returns:
        The distinct non-blank lines of the file, in first-seen order. An
        empty list is returned (and nothing is printed) when the file does
        not exist.
    """
    file_name = title + '.txt'

    # Nothing recorded yet.
    if not os.path.exists(file_name):
        return []

    pic_urls = []
    seen = set()  # O(1) membership test; the list alone would be O(n) per line

    with open(file_name, 'r') as f:
        for line in f:
            url = line.strip()
            # Skip blank lines so an empty string is never treated as a URL.
            if url and url not in seen:
                seen.add(url)
                pic_urls.append(url)

    print("文件中共有{}个不重复的 URL".format(len(pic_urls)))
    return pic_urls
81+
82+
def download_pic(pic_urls, title):
    """Download every URL in ``pic_urls`` into the directory ``title``.

    Each file is saved under the base name of the URL's path. URLs whose
    target file already exists are skipped. Failures do not abort the run;
    they are collected and printed at the end together with a summary.

    Args:
        pic_urls: Sequence of image URLs (``len()`` is taken for progress).
        title: Directory to download into; created if missing.
    """
    # Create the target directory.
    os.makedirs(title, exist_ok=True)

    error_pic_urls = []
    success_pic_num = 0
    repeat_pic_num = 0
    total = len(pic_urls)

    # enumerate keeps the progress counter equal to the URL's 1-based position
    # on every path (success, failure and skip alike).
    for index, url in enumerate(pic_urls, 1):
        file_name = os.path.join(title, basename(urlsplit(url)[2]))

        if os.path.exists(file_name):
            print("图片{}已存在".format(file_name))
            repeat_pic_num += 1
            continue

        try:
            urllib.request.urlretrieve(url, file_name)
        except Exception:
            # Best-effort download: record the failure and move on, but let
            # KeyboardInterrupt / SystemExit propagate.
            print("下载{}失败!({}/{})".format(file_name, index, total))
            error_pic_urls.append(url)
            # Remove a partially written file, otherwise the next run would
            # report the truncated image as already downloaded.
            if os.path.isfile(file_name):
                try:
                    os.remove(file_name)
                except OSError:
                    pass  # cleanup is itself best-effort
        else:
            success_pic_num += 1
            print("下载{}完成!({}/{})".format(file_name, index, total))

    print("图片全部下载完毕!(成功:{}/重复:{}/失败:{})".format(success_pic_num, repeat_pic_num, len(error_pic_urls)))

    if error_pic_urls:
        print('下面打印失败的图片地址')
        for error_url in error_pic_urls:
            print(error_url)
120+
121+
if __name__ == '__main__':
    # Question to scrape. The title is also used as the stem of the URL list
    # file and as the name of the download directory.
    qid = 406321189
    title = '你们身边有什么素人美女吗(颜值身材巨好的那种)?'

    # Step 1: append the image URLs found in the answers to '<title>.txt'.
    get_image_url(qid, title)

    # Step 2: load the distinct URLs back from that file.
    pic_urls = read_file(title)

    # Step 3: download the files.
    download_pic(pic_urls, title)

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /