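# Get_Passage_W1.py
#
# Scraper for http://www.h528.com/: discovers the category links on the
# home page, walks each category's paginated article lists, extracts the
# text of every article, and streams the results into one JSON file per
# category (<category name>.json, one {"text": ...} object per article).
# Each category runs on its own thread; Ctrl+C sets a shared stop event
# so all threads can wind down cleanly.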
import requests
from bs4 import BeautifulSoup
import json
import re
import threading
import time
import urllib.parse
from tqdm import tqdm
stop_event = threading.Event()
TIMEOUT = 10 # seconds

def get_categories():
    url = 'http://www.h528.com/'
    try:
        response = requests.get(url, timeout=TIMEOUT)
        response.encoding = 'utf-8'  # ensure Chinese text decodes correctly
    except requests.exceptions.RequestException as e:
        print(f"Error requesting the home page: {e}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    categories = []
    # Find all qualifying category links
    for li in soup.find_all('li', class_=re.compile('^cat-item cat-item-')):
        if stop_event.is_set():
            break
        a_tag = li.find('a')
        if a_tag:
            category_url = a_tag['href']
            # Check whether the link contains URL-encoded characters (%-prefixed hex)
            if re.search(r'%[0-9A-Fa-f]{2}', category_url):
                decoded_url = urllib.parse.unquote(category_url)
                # Check whether the decoded link contains Chinese characters
                if re.search(r'[\u4e00-\u9fff]', decoded_url):
                    category_name = a_tag.get_text(strip=True)
                    categories.append({'name': category_name, 'url': category_url})
    return categories

def get_article_links(category_url, page_number):
    if stop_event.is_set():
        return None
    if page_number == 1:
        url = category_url
    else:
        # rstrip('/') avoids a double slash when the category URL ends with '/'
        url = f"{category_url.rstrip('/')}/page/{page_number}"
    try:
        response = requests.get(url, timeout=TIMEOUT)
        response.encoding = 'utf-8'
    except requests.exceptions.RequestException as e:
        print(f"Error requesting page {url}: {e}")
        return None
    if '<h2 class="center">Error 404 - Not Found</h2>' in response.text:
        return None  # termination condition: past the last page
    soup = BeautifulSoup(response.text, 'html.parser')
    article_links = []
    for a in soup.find_all('a', rel='bookmark'):
        if stop_event.is_set():
            break
        if 'Permanent Link to' in a.get('title', ''):
            article_links.append({'url': a['href'], 'title': a.get_text(strip=True)})
    return article_links
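
# Example (hypothetical category URL): with page_number=3 and
# category_url='http://www.h528.com/category/xxx/', the request above
# goes to 'http://www.h528.com/category/xxx/page/3'.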

def remove_invisible_characters(text):
    # Unicode ranges of invisible characters to strip
    invisible_chars = [
        ('\u200b', '\u200f'),  # zero-width and other format characters
        ('\u202a', '\u202e'),  # bidirectional embedding controls
        ('\ufeff', '\ufeff'),  # BOM
    ]
    for start, end in invisible_chars:
        pattern = f'[{start}-{end}]'
        text = re.sub(pattern, '', text)
    # Drop any remaining non-printable characters, but keep newlines
    text = ''.join(ch for ch in text if ch.isprintable() or ch == '\n')
    return text
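
# For example, remove_invisible_characters('ab\u200bcd\ufeff') returns
# 'abcd': the zero-width space and BOM both fall inside the ranges above.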

def get_article_content(article_url):
    if stop_event.is_set():
        return '', ''
    try:
        response = requests.get(article_url, timeout=TIMEOUT)
        response.encoding = 'utf-8'
    except requests.exceptions.RequestException as e:
        print(f"Error requesting article {article_url}: {e}")
        return '', ''
    soup = BeautifulSoup(response.text, 'html.parser')
    # Article title
    title_tag = soup.find('h1', class_='entry-title')
    article_title = title_tag.get_text(strip=True) if title_tag else 'Unknown title'
    content = ''
    for p in soup.find_all('p'):
        if stop_event.is_set():
            break
        # Take the raw contents of the <p> tag and turn <br> variants into newlines
        paragraph = p.decode_contents().replace('<br/>', '\n').replace('<br>', '\n').replace('<br />', '\n')
        # Strip any remaining HTML tags
        paragraph_text = BeautifulSoup(paragraph, 'html.parser').get_text()
        content += paragraph_text + '\n'  # newline after each paragraph
    # Remove pairs of consecutive spaces
    content = re.sub(r' {2}', '', content)
    # Strip invisible Unicode characters
    content = remove_invisible_characters(content)
    return article_title, content.strip()

def process_category(category):
    print(f"Processing category: {category['name']}")
    filename = f"{category['name']}.json"
    # Open the file and write the start of the JSON array
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('[\n')
    page_number = 1
    first_entry = True   # flag for commas between JSON objects
    total_articles = 0   # running article count
    while not stop_event.is_set():
        article_links = get_article_links(category['url'], page_number)
        if not article_links:
            if page_number == 1:
                print(f"Category '{category['name']}': no articles found.")
            else:
                print(f"Category '{category['name']}': all pages processed.")
            break
        num_articles = len(article_links)
        total_articles += num_articles
        # tqdm progress bar for this page
        with tqdm(total=num_articles, desc=f"Category '{category['name']}' page {page_number}", unit="article", ncols=80) as pbar:
            for article_info in article_links:
                if stop_event.is_set():
                    break
                article_url = article_info['url']
                _title, content = get_article_content(article_url)
                if content:
                    # Build the JSON object and append it to the file
                    document = {'text': content}
                    with open(filename, 'a', encoding='utf-8') as f:
                        if not first_entry:
                            f.write(',\n')  # comma + newline before the next object
                        else:
                            first_entry = False
                        json.dump(document, f, ensure_ascii=False)
                pbar.update(1)
        page_number += 1
    # Write the end of the JSON array
    with open(filename, 'a', encoding='utf-8') as f:
        f.write('\n]')
    print(f"Category '{category['name']}': data saved to {filename}.")

def main():
    categories = get_categories()
    if not categories:
        print("No categories were retrieved; exiting.")
        return
    threads = []
    # One worker thread per category
    for category in categories:
        t = threading.Thread(target=process_category, args=(category,))
        threads.append(t)
        t.start()
    try:
        # Poll instead of join() so Ctrl+C is handled promptly
        while any(t.is_alive() for t in threads):
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("\nInterrupted by user; asking all threads to stop...")
        stop_event.set()
    for t in threads:
        t.join()
    print("All threads finished; exiting.")


if __name__ == "__main__":
    main()