-
Notifications
You must be signed in to change notification settings - Fork 104
/
Copy pathzhihu_answer.py
156 lines (147 loc) · 6.17 KB
/
zhihu_answer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from urllib.parse import unquote
from tqdm import tqdm
import requests
import json
import time
import sys
import re
from config import config
import login
sys.path.append('..')
from utils import net, fs
# Silence urllib3 warnings for the whole process. The requests issued in this
# file pass verify=False, which presumably would otherwise emit an
# insecure-request warning on every call.
requests.packages.urllib3.disable_warnings()
def wait_response(url):
    """Retry a GET request, pausing longer each round, until it succeeds.

    Every round first sleeps for the current wait length (30 seconds at
    first, 30 seconds more after each failed round) while showing a tqdm
    countdown, then requests ``url`` with ``login.headers`` and certificate
    verification disabled. Once the wait length exceeds 60 seconds and the
    server still refuses, the JSON error payload is inspected and
    ``login.main()`` is invoked when it asks for a redirect.

    Args:
        url: Address to request.

    Returns:
        The first response whose ``ok`` attribute is true.
    """
    sleep_length = 30
    while True:
        # TODO: the countdown bar is not always rendered on a single line.
        wait_process_bar = tqdm(total=sleep_length, ncols=75)
        wait_process_bar.set_description('等待{}秒'.format(sleep_length))
        for _ in range(sleep_length):
            time.sleep(1)
            wait_process_bar.update(1)
        wait_process_bar.close()

        # Reset on every round so that a request that raised is never
        # confused with the response of an earlier round (or with None).
        response = None
        try:
            response = requests.get(url=url, headers=login.headers, verify=False)
        except Exception as e:  # best effort: report and retry after a longer wait
            print('requests Exception:{}'.format(e))

        if response is not None:
            if response.ok:
                print('获取成功。response.ok:{}'.format(response.ok))
                return response
            print('response.ok:{}'.format(response.ok))
            # After waiting more than 60 seconds, try to verify or log in.
            if sleep_length > 60:
                try:
                    msg = response.json()
                except ValueError:
                    # The error body is not JSON; nothing to inspect.
                    msg = None
                error = msg.get('error') if isinstance(msg, dict) else None
                if isinstance(error, dict) and error.get('redirect'):
                    if error.get('need_login'):
                        print('需要登录。')
                    else:
                        print('需要验证。')
                    login.main()
        sleep_length += 30
""" 分页面获取全部回答,返回一个包含所有分页的 List(每个分页包含若干回答) """
def get_all_answers(question_id):
    """Fetch every answer of a question, one API page at a time.

    Args:
        question_id: Question id inserted into the answers API URL.

    Returns:
        A list holding the decoded JSON payload of each page (every page
        contains several answers). The list is empty when the first page
        could not be fetched or held no data. ``None`` is returned when
        ``net.is_connected()`` reports that there is no connection.
    """
    if not net.is_connected():
        print('请求失败,请检查网络。')
        return None

    answers = []
    fetched = 0
    total = 0
    process_bar = None
    url = 'https://www.zhihu.com/api/v4/questions/{}/answers?include=data[*].content,voteup_count&limit=5&offset=0&sort_by=default'.format(question_id)
    while True:
        try:
            response = requests.get(url=url, headers=login.headers, verify=False)
        except Exception as e:  # e.g. the remote host forcibly closed the connection
            print('获取出错:{}'.format(e))
            print(unquote('url:{}'.format(url), encoding='utf-8'))
            response = wait_response(url)
        if not response.ok:  # the id does not exist, or the crawler was detected
            if '404' in response.text:
                print('获取出错:{}。可能输入了错误的问题 id。'.format(response.text))
                break
            print(unquote('url:{}'.format(url), encoding='utf-8'))
            response = wait_response(url)

        page = response.json()
        if not page['data']:
            break
        answers.append(page)
        fetched += len(page['data'])

        # The progress bar is created from the first page, which carries the
        # question title and the total number of answers.
        if process_bar is None:
            total = page['paging']['totals']
            print('标题:{}'.format(page['data'][0]['question']['title']))
            process_bar = tqdm(total=total, ncols=75)
            process_bar.set_description("获取")
        process_bar.update(len(page['data']))

        # Stop on the last page, otherwise follow the link to the next one.
        if page['paging']['is_end']:
            break
        url = page['paging']['next']

    # No bar exists when the very first page failed or was empty.
    if process_bar is not None:
        process_bar.close()
    print('获取结束\t总回答数:{}\t获取回答数:{}'.format(total, fetched))
    return answers
""" 处理单页数据,返回包含作者、回答内容及点赞数的字符串 """
def get_content(json, config=None):
    """Render one page of answers as a Markdown string.

    Args:
        json: Decoded page payload; its ``'data'`` entry is the list of
            answers. Each answer provides ``'content'`` and, depending on
            the options, ``'author'`` and ``'voteup_count'``.
        config: Optional mapping of switches. A truthy ``'author'`` uses a
            link to the author's home page as the heading, otherwise the
            heading is the running answer number. A truthy
            ``'voteup_count'`` appends the vote count in bold. Missing keys
            count as switched off.

    Returns:
        The concatenated Markdown sections, or ``''`` for an empty page.
    """
    if config is None:
        config = {}
    show_author = config.get('author')
    show_votes = config.get('voteup_count')
    # Strips placeholder <img src="data..."/> tags and the <figure>/<noscript>
    # wrappers. Turning <br/> into newlines was considered too, but it
    # affects the HTML, so it is deliberately not done.
    useless_tags = re.compile(
        r'(<img[^>]*src="data[^>]*\/>|<figure>|<noscript>|<\/figure>|<\/noscript>)')

    sections = []
    for count, answer in enumerate(json['data'], start=1):
        if show_author:
            author = answer['author']
            heading = '[{}]({})'.format(
                author['name'], 'https://www.zhihu.com/people/' + author['url_token'])
        else:
            heading = '回答 {}'.format(count)
        content = useless_tags.sub('', answer['content'])
        votes = '**{}**\n\n'.format(answer['voteup_count']) if show_votes else ''
        sections.append('# {}\n\n{}\n\n{}'.format(heading, content, votes))
    return ''.join(sections)
""" 将回答写入文件,每个回答页面单独存储在一个文件中 """
def write_to_files(answers, config=None):
    """Write the fetched answers to disk, one Markdown file per page.

    A directory ``temp/<question title>/<timestamp>`` is created through
    ``fs.create_dir`` and filled with ``pageN.md`` for every page, an
    ``index.md`` linking to those pages, and ``answers.json`` holding the
    raw data.

    Args:
        answers: List of page payloads; the question title is read from the
            first answer of the first page.
        config: Options forwarded unchanged to ``get_content``.

    Returns:
        The path of ``answers.json`` (convenient for a later word-frequency
        analysis), or ``None`` when there is nothing to write.
    """
    import os  # local import: this file has no file-level ``import os``

    # Nothing was fetched (None, [], or a first page without answers).
    if not answers or not answers[0] or not answers[0].get('data'):
        return None
    if config is None:
        config = {}

    # Question titles regularly contain characters that are not allowed in
    # file names (``?``, ``:``, ``/`` ...); replace them before using the
    # title as a directory name.
    raw_title = str(answers[0]['data'][0]['question']['title'])
    question_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', raw_title)
    question_title = question_title.strip().rstrip('. ') or 'untitled'
    current_time = time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())
    question_dir = os.path.join('temp', question_title, current_time)
    fs.create_dir(question_dir)

    # Create the page files.
    print('创建 pages...')
    file_names = []
    for page_count, page_data in enumerate(answers, start=1):
        file_name = 'page{}'.format(page_count)
        # Render the page, then write it to its own file.
        page_path = os.path.join('.', question_dir, file_name + '.md')
        with open(file=page_path, mode='w', encoding='utf-8') as file:
            file.write(get_content(page_data, config))
        file_names.append(
            '[{file_name}](./{file_name}.md)'.format(file_name=file_name))

    # Create the index that links to every page.
    print('创建 index.md...')
    index_path = os.path.join('.', question_dir, 'index.md')
    with open(file=index_path, mode='w', encoding='utf-8') as file:
        file.write(''.join(name + '\n\n' for name in file_names))

    # Keep the raw data as JSON.
    print('创建 answers.json...')
    json_path = os.path.join('.', question_dir, 'answers.json')
    with open(file=json_path, mode='w', encoding='utf-8') as file:
        file.write(json.dumps(answers, ensure_ascii=False))
    print('路径:{}'.format(question_dir))
    return json_path
if __name__ == '__main__':
    answers = get_all_answers(268384579)
    # get_all_answers returns None when there is no network connection;
    # only write files when something was actually fetched.
    if answers:
        write_to_files(answers, config)