-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathzhihu_toc_extractor.py
70 lines (53 loc) · 2.02 KB
/
zhihu_toc_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import argparse
import requests
from bs4 import BeautifulSoup
import time
import os
def create_temp_directory():
    """Create a timestamp-named output directory and return its name.

    The directory is named ``temp_dir_<unix-seconds>`` and is created
    relative to the current working directory.

    Returns:
        The relative directory name that was created (or already existed).
    """
    timestamp = int(time.time())
    temp_dir = f"temp_dir_{timestamp}"
    # The name only has one-second resolution, so two calls (or two runs of
    # the script) within the same second resolve to the same path; tolerate
    # that instead of failing with FileExistsError.
    os.makedirs(temp_dir, exist_ok=True)
    return temp_dir
def extract_toc_and_title(url, timeout=10):
    """Fetch a page and extract its title and table-of-contents headings.

    The title is taken from the first ``<h1 class="Post-Title">`` element;
    the table of contents is the text of every ``<h2>``/``<h3>`` element
    that carries a ``data-into-catalog-status`` attribute, one per line.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A ``(title, toc_content)`` tuple of strings on success, where
        ``title`` falls back to ``"No Title"`` when no title element is
        found. ``None`` when the page could not be fetched (network error
        or a non-200 status code); an error message is printed in that case.
    """
    try:
        # Without a timeout a stalled server would block the run forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, timeouts, malformed URLs, ...: report and let
        # the caller move on to the next URL instead of aborting the batch.
        print(f"Error: Failed to fetch URL {url}. {exc}")
        return None

    if response.status_code != 200:
        print(
            f"Error: Failed to fetch URL {url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    title_element = soup.find('h1', class_='Post-Title')
    title = title_element.get_text() if title_element else "No Title"

    # attrs value True matches any element that has the attribute at all,
    # regardless of its value.
    catalog_elements = soup.find_all(
        ['h2', 'h3'], attrs={"data-into-catalog-status": True})
    toc_content = "".join(
        catalog_element.get_text() + "\n"
        for catalog_element in catalog_elements
    )

    return title, toc_content
def save_to_file(output_dir, url, title, toc_content):
    """Write a page's title and table of contents to a text file.

    The file name is derived from ``url`` by replacing every character
    that is not alphanumeric with an underscore and appending ``.txt``.
    The file is written as UTF-8: the title, a blank line, then the
    table-of-contents text.

    Args:
        output_dir: Directory in which the file is created.
        url: Source URL; used only to build the file name.
        title: Title text written on the first line.
        toc_content: Table-of-contents text written after the blank line.
    """
    safe_chars = [ch if ch.isalnum() else "_" for ch in url]
    target = os.path.join(output_dir, "".join(safe_chars) + ".txt")
    with open(target, "w", encoding="utf-8") as handle:
        handle.write("\n\n".join((title, toc_content)))
def main(argv=None):
    """Command-line entry point: extract and save the TOC of each given URL.

    URLs come from ``--file`` (one URL per line) or, if that is not given,
    from ``--url``. When neither is supplied the help text is printed and
    the process exits with status 1. Results are written into a freshly
    created timestamped directory, one text file per URL.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``
            (the ``argparse`` default).

    Raises:
        SystemExit: With status 1 when no URL source is specified.
    """
    parser = argparse.ArgumentParser(
        description="Zhihu Table of Contents Extractor")
    parser.add_argument(
        "--file", help="Specify a file containing a list of URLs.")
    parser.add_argument("--url", nargs='+',
                        help="Specify one or more URLs separated by spaces")
    args = parser.parse_args(argv)

    if args.file:
        with open(args.file, "r") as url_file:
            # Drop blank lines and surrounding whitespace so an empty
            # string is never handed to the fetcher as a URL.
            urls = [line.strip() for line in url_file if line.strip()]
    elif args.url:
        urls = args.url
    else:
        parser.print_help()
        # SystemExit instead of the site-provided exit(), which is meant
        # for interactive use and is absent under ``python -S``.
        raise SystemExit(1)

    output_dir = create_temp_directory()
    print(f"Output directory: {output_dir}")

    for url in urls:
        result = extract_toc_and_title(url)
        # The extractor returns None when the page could not be fetched;
        # unpacking that directly would raise TypeError and abort the run.
        if not result:
            continue
        title, toc_content = result
        if title or toc_content:
            save_to_file(output_dir, url, title, toc_content)


if __name__ == "__main__":
    main()