-
Notifications
You must be signed in to change notification settings - Fork 1
/
down_woodward.py
117 lines (101 loc) · 3.62 KB
/
down_woodward.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import requests
import pandas as pd
import shutil
from io import StringIO
import sys
import json
import re
from typing import Union, Dict
import random
import time
def generate_jquery_string(version: str = "1.12.4") -> str:
    r"""
    Build a jQuery "expando" identifier the way jQuery itself does.

    Mirrors the JavaScript expression found in
    https://woodward.library.ubc.ca/wp-content/plugins/enable-jquery-migrate-helper/js/jquery/jquery-1.12.4-wp.js?ver=1.12.4-wp
    which is: "jQuery" + (m + Math.random()).replace(/\D/g, "")

    Args:
        version: jQuery version string (``m`` in the JavaScript above).
            Defaults to the version served by the site, "1.12.4".

    Returns:
        "jQuery" followed by every ASCII digit of ``version`` and of a
        random float, e.g. "jQuery112408444218515250481".
    """
    combined = version + str(random.random())
    # [^0-9] rather than str.isdigit: JavaScript's \D keeps ASCII digits only,
    # while str.isdigit would also accept characters such as superscripts.
    return "jQuery" + re.sub(r"[^0-9]", "", combined)
def parse_jsonp(jsonp_str: str) -> Union[Dict, None]:
    """
    Parse a JSONP response and return the decoded JSON payload.

    A JSONP body has the form ``callback(<json>);``; the payload is the text
    between the first "(" and the last ")". If that text is not valid JSON,
    or there is no such wrapper at all, the whole input is decoded as plain
    JSON instead.

    Args:
        jsonp_str: Raw JSONP (or plain JSON) text.

    Returns:
        The decoded JSON value, or None when nothing could be decoded.
    """
    start = jsonp_str.find("(")
    end = jsonp_str.rfind(")")

    candidates = []
    # Only slice when a real "(...)" wrapper exists; with find() == -1 the
    # slice would silently chop the last character off the input.
    if start != -1 and end > start:
        candidates.append(jsonp_str[start + 1 : end])
    # Fallback: the text may already be plain JSON (possibly with parentheses
    # inside string values).
    candidates.append(jsonp_str)

    first_error = None
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as e:
            if first_error is None:
                first_error = e

    print(f"Error decoding JSONP string: {first_error}")
    return None
# Request headers copied from a desktop Edge 121 session; sent with the request
# for the abbreviation dump. Key order is kept as captured.
headers = {
    # Generic request headers.
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Referer": "https://woodward.library.ubc.ca/",
    # Fetch metadata: the browser loads the JSONP endpoint as a <script>.
    "Sec-Fetch-Dest": "script",
    "Sec-Fetch-Mode": "no-cors",
    "Sec-Fetch-Site": "same-site",
    # Browser identity and client hints.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    ),
    "sec-ch-ua": (
        '"Not A(Brand";v="99", '
        '"Microsoft Edge";v="121", '
        '"Chromium";v="121"'
    ),
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
}
def _clean_abbreviation_table(table: "pd.DataFrame") -> "pd.DataFrame":
    """Return *table* without incomplete, duplicated, or "top" rows.

    Rows whose Abbreviation is exactly "top", or a single digit/uppercase
    letter followed by " top", are removed.
    NOTE(review): these look like "back to top" navigation links embedded in
    the HTML table - confirm against the live page.
    """
    return (
        table.dropna()  # drop rows that have NaN in any column
        .drop_duplicates()
        .query('Abbreviation != "top"')
        .query(
            'not Abbreviation.str.contains("^[0-9A-Z] top$")', engine="python"
        )  # .str accessor inside query() needs the python engine
        .reset_index(drop=True)
    )


def main() -> None:
    """Download the journal-abbreviation table and save it as a CSV file.

    Steps: request the JSONP dump, decode it, read the HTML table found under
    the "html" key, clean it, and write the Title/Abbreviation columns to
    ``woodward_library_new.csv`` in the current directory.

    Exits with status 1 when the download, decoding, or table extraction
    fails.
    """
    url = "https://journal-abbreviations.library.ubc.ca/dump.php"
    output_path = "woodward_library_new.csv"

    # The callback name is "<expando>_<ms timestamp>", and "_" carries the
    # same millisecond timestamp.
    jquery_string = generate_jquery_string()
    now_time = int(time.time() * 1000)
    params = {"callback": f"{jquery_string}_{now_time}", "_": str(now_time)}

    try:
        # Without a timeout requests may block forever on a stalled server.
        response = requests.get(url, headers=headers, params=params, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to get data from {url}. ({exc})")
        sys.exit(1)
    if response.status_code != 200:
        print(f"Failed to get data from {url}.")
        sys.exit(1)

    json_data = parse_jsonp(response.text)
    if json_data is None:
        print("Failed to parse JSONP.")
        sys.exit(1)

    # Guard against a payload that is not an object or lacks the "html" key.
    html_data = json_data.get("html") if isinstance(json_data, dict) else None
    if not html_data:
        print("No data found.")
        sys.exit(1)

    try:
        tables = pd.read_html(StringIO(html_data))  # list of DataFrames
    except ValueError:
        # read_html raises ValueError when the markup contains no <table>.
        tables = []
    if not tables:
        print("No data found.")
        sys.exit(1)
    print(f"type(df): {type(tables)}")
    print(f"len(df): {len(tables)}")

    raw_table = tables[0]
    missing = {"Title", "Abbreviation"} - set(raw_table.columns)
    if missing:
        print(f"Missing expected column(s): {sorted(missing)}")
        sys.exit(1)

    data = _clean_abbreviation_table(raw_table)
    print(data.head(10))  # preview of the cleaned data

    result = data[["Title", "Abbreviation"]]
    if result.empty:
        print("No rows left after cleaning; nothing was saved.")
        return

    result.to_csv(
        output_path,
        index=False,
        header=False,
        sep=",",
        quoting=1,  # 1 == csv.QUOTE_ALL: quote every field
        quotechar='"',
        encoding="utf-8-sig",  # UTF-8 with a BOM
    )
    print(f"数据共有{result.shape[0]}行, 保存在./woodward_library_new.csv 文件中!!!")


if __name__ == "__main__":
    main()