-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdownLoadByDoiAndSave.py
77 lines (67 loc) · 3.06 KB
/
downLoadByDoiAndSave.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# Download a paper's PDF from the web, given its DOI.
# Note: the case where the paper cannot be found must be handled.
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import re
import os
# Download the PDF for the given DOI and save it as <file_name>.pdf under the ./output directory.
def downloadByDoi(doi, file_name):
    """Look up ``doi`` on the Sci-Hub mirror and save the paper it points to.

    The landing page ``https://www.sci-hub.ren/<doi>`` is fetched with
    ``askUrl``, the PDF address is pulled out of it by ``parseHtml`` and the
    result is passed, together with ``file_name``, to ``downLoadPdf``.

    Args:
        doi: DOI string appended verbatim to the mirror address.
        file_name: Name forwarded to ``parseHtml`` (for miss reporting) and to
            ``downLoadPdf`` (as the target file name).
    """
    landing_page = askUrl('https://www.sci-hub.ren/' + doi)
    # parseHtml yields None when no PDF address is present; downLoadPdf
    # treats None as "nothing to download".
    downLoadPdf(parseHtml(landing_page, file_name), file_name)
def parseHtml(html, file_name):
    """Extract the PDF address from a landing page.

    Finds the element whose id is ``pdf`` and returns the value of its
    ``src`` attribute.

    Args:
        html: Markup of the landing page; may be an empty string when the
            page could not be fetched.
        file_name: Label of the paper, used only to report a miss.

    Returns:
        The ``src`` value as a string, or ``None`` when the element is
        missing, has no (or an empty) ``src``, or the markup cannot be parsed.
        On a miss the label is printed and appended to ``./not_found.txt``.
    """
    pdfUrl = None
    try:
        t_list = BeautifulSoup(html, "html.parser").select('#pdf')
        if t_list:
            # Read the attribute from the parsed tag instead of regex-matching
            # the serialised tag: serialisation re-escapes "&" as "&amp;", and
            # a pattern like src="..." can also hit an earlier attribute whose
            # name merely ends in "src" (e.g. data-src).
            pdfUrl = t_list[0].get('src')
    except Exception:
        # Deliberately best-effort: unparsable input counts as "not found".
        # Unlike a bare ``except`` this lets KeyboardInterrupt/SystemExit
        # propagate so a batch run can still be aborted.
        pdfUrl = None

    if pdfUrl:
        return pdfUrl

    print("paper-->" + file_name + " not found!")
    with open("./not_found.txt", "a", encoding='utf-8') as f:
        f.write(file_name)
        f.write("\n")
    return None
def askUrl(url, timeout=30):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait on each blocking socket operation before
            giving up, so a stalled server cannot hang the caller forever.

    Returns:
        The decoded body, or an empty string when the request failed. The
        HTTP status code and/or failure reason is printed on failure.

    Raises:
        ValueError: If ``url`` has no scheme (raised while building the request).
    """
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36 Edg/93.0.961.52',
        "Cookie": 'UM_distinctid=17a29eaff3f963-071e9fe5226b6-7a697d6e-144000-17a29eaff40fd; __gads=ID=3b7918afdf2b673c:T=1624200904:S=ALNI_MbaN4K1VfpLCOqpVRrExY11AKe_MA; _a_d3t6sf=dumciURjNgzwzSDQ5cNhTwWI; CNZZDATA1276057484=1027570305-1624198255-|1632096741'
    }
    DoiRequest = urllib.request.Request(url, headers=headers)
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(DoiRequest, timeout=timeout) as response:
            # errors='replace' keeps a stray non-UTF-8 byte from aborting the
            # whole lookup; the markup structure itself is ASCII.
            return response.read().decode('utf-8', errors='replace')
    except OSError as e:
        # URLError and HTTPError are OSError subclasses; catching OSError also
        # covers timeouts and connection resets raised while reading the body.
        code = getattr(e, 'code', None)
        reason = getattr(e, 'reason', None)
        if code is not None:
            print(code)
        if reason is not None:
            print(reason)
        if code is None and reason is None:
            print(e)
    return ''
def downLoadPdf(url, file_name, timeout=60):
    """Download the document at ``url`` and store it as a PDF under ``./output/``.

    Args:
        url: Address of the PDF, or ``None`` to do nothing. A scheme-less
            address (e.g. ``//host/path``) is retried with ``http:`` prepended.
        file_name: Target name without extension, relative to ``./output/``;
            may contain sub-directories. Colons are removed from it.
        timeout: Seconds to wait on each blocking socket operation.

    Raises:
        ValueError: If ``url`` is rejected for a reason other than a missing
            scheme.
        urllib.error.URLError: If the download itself fails.
        OSError: If the target file cannot be written.
    """
    if url is None:
        return

    # The page embeds the PDF as <embed type="application/pdf" src="..." id="pdf">;
    # many variants exist, but src and id are fixed.
    rootPath = r"./output/"
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36 Edg/93.0.961.52',
        "Cookie": 'UM_distinctid=17a29eaff3f963-071e9fe5226b6-7a697d6e-144000-17a29eaff40fd; __gads=ID=3b7918afdf2b673c:T=1624200904:S=ALNI_MbaN4K1VfpLCOqpVRrExY11AKe_MA; _a_d3t6sf=dumciURjNgzwzSDQ5cNhTwWI; CNZZDATA1276057484=1027570305-1624198255-|1632096741'
    }
    try:
        DoiRequest = urllib.request.Request(url, headers=headers)
    except ValueError as e:
        if "unknown url type" not in str(e):
            # Previously this fell through with no response bound and died
            # with an unrelated NameError; surface the real error instead.
            raise
        DoiRequest = urllib.request.Request("http:" + url, headers=headers)

    # The context manager releases the connection once the body is read.
    with urllib.request.urlopen(DoiRequest, timeout=timeout) as response:
        content = response.read()  # keep raw bytes; do not decode as UTF-8

    file_name = file_name.replace(":", "")
    target = rootPath + file_name + ".pdf"
    # Create ./output/ (and any sub-directory named in file_name) on demand so
    # the write does not fail with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, "wb") as f:
        f.write(content)
#
if __name__ == "__main__":
    # Manual smoke run: fetch one known paper when the file is executed as a script.
    downloadByDoi(doi="10.1109/tit.2014.2354403", file_name="test/download")