-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathscanner.py
76 lines (66 loc) · 2.64 KB
/
scanner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# 这个程序简单记录所有有效的知乎问题链接
import requests
import re
import random
from time import sleep
# Request headers that make the scanner look like a desktop Chrome browser.
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}

# Patterns are compiled once, outside the scan loop.
# Non-greedy title capture: stop at the first closing </title> instead of the
# last one on the same (possibly minified) line.
_TITLE_RE = re.compile(r'<title data-rh="true">(.*?)</title>')
# The answer count is a plain text node, so forbid '<' inside the capture;
# otherwise the match could start at an unrelated earlier <span> on the line.
_ANSWER_RE = re.compile(r'<span>([^<]*)<!-- --> 个回答</span>')

# Question id to continue from. On a first run (no results file) this default
# is used; the loop increments before fetching, so scanning starts at id + 1.
nummber = 19551349

# Resume support: continue after the id of the last record in the results file.
try:
    data = ''
    # Only the leading URL field (pure ASCII) is parsed, so undecodable bytes
    # in the title field are replaced instead of aborting the resume. This
    # keeps files written under another locale encoding readable.
    with open('zhihu_valid_links.txt', 'r', encoding='utf-8', errors='replace') as f:
        # Stream the file and remember the last non-blank line rather than
        # loading every line into memory.
        for line in f:
            if line.strip():
                data = line
    if not data:
        raise ValueError('results file is empty')
    zhihu_link = data.split('|*|')[0]
    nummber = int(zhihu_link.split('/')[-1].strip())
except (OSError, ValueError) as e:
    # Missing/unreadable file, empty file, or a malformed last record:
    # fall back to the default starting id.
    print('no breakpoint:', e)

# Number of consecutive ids to probe in this run.
num_need = 1000000
num_end = nummber + num_need
print(f'from {nummber} to {num_end}')

while 19550224 <= nummber <= num_end:
    # Random short pause between requests to keep the request rate low.
    sleep(random.uniform(1, 2))
    nummber += 1
    url = 'https://www.zhihu.com/question/' + str(nummber)

    try:
        # A timeout prevents a stalled connection from hanging the scanner.
        response = requests.get(url, headers=headers, timeout=30)
    except Exception as e:
        # Top-level best-effort boundary of a long-running scan: report the
        # failure, back off for several minutes, then retry the same id.
        print('**')
        time_sleep = random.uniform(300, 600)
        print(f'网络错误,暂停{time_sleep}s:', e)
        print('question id:', nummber, ';error:', e)
        sleep(time_sleep)
        nummber -= 1  # undo the increment so this id is requested again
        continue

    if response.status_code != 200:
        print('question id:', nummber, ';error code:', response.status_code)
        continue

    html = response.text
    # Marker text of the "page does not exist" response: skip silently.
    if '你似乎来到了没有知识存在的荒原' in html:
        continue

    title_match = _TITLE_RE.search(html)
    if title_match is None:
        print('question id:', nummber, ';extract title error:', 'title tag not found')
        continue
    title = title_match.group(1)
    print('title:', title)

    # The security-verification page means the scanner is being rate limited:
    # log the id that was blocked and stop.
    if title == "安全验证 - 知乎":
        print('question id:', nummber, ';触发安全验证,请降低速度')
        with open('breakpoint.txt', 'a', encoding='utf-8') as f:
            # One id per line so successive runs do not run together.
            f.write(str(nummber) + '\n')
        break

    # When the answer-count marker is absent the count is recorded as 0
    # (typically a question with no answers yet).
    answer_match = _ANSWER_RE.search(html)
    num_answer = answer_match.group(1) if answer_match else 0
    print('num_answer:', num_answer)

    # Record format: <url>|*|<title>|*|<answer count>, one record per line.
    # Explicit UTF-8 so titles never fail to encode under the locale default.
    with open('zhihu_valid_links.txt', 'a', encoding='utf-8') as f:
        f.write(url + '|*|' + title + '|*|' + str(num_answer) + '\n')
    print(url)
    print('--')