"""
Download a collection of Paul Graham essays in EPUB & Markdown.
"""
import csv
import os
import time
import urllib.request

import feedparser
import html2text
import regex as re
from htmldate import find_date
# Aaron Swartz's RSS feed of Paul Graham's essays.
rss = feedparser.parse("http://www.aaronsw.com/2002/feeds/pgessays.rss")

# Configure the HTML-to-Markdown converter.
h = html2text.HTML2Text()
h.ignore_images = True
h.ignore_tables = True
h.escape_all = True
h.reference_links = True
h.mark_code = True

ART_NO = 1
FILE = "./essays.csv"

# Start fresh: drop any previous CSV index and make sure the output
# directory exists.
if os.path.isfile(FILE):
    os.remove(FILE)
os.makedirs("./essays", exist_ok=True)
def update_links_in_md(joined):
    """Link up html2text footnote markers such as ``[1]``.

    The first occurrence of each marker (the in-text reference) becomes a
    link, and the second (the note itself) becomes its anchor target.
    Relies on the global ``title`` slug set in the main loop below.
    """
    matches = re.findall(rb"\[\d+\]", joined)
    if not matches:
        return joined
    for match in set(matches):
        counter = [0]
        note_number = int(match.decode().strip("[]"))
        note_name = f"{title}_note{note_number}"

        def update_links(m):
            counter[0] += 1
            if counter[0] == 1:  # in-text reference -> link to the note
                return bytes(f"[{note_number}](#{note_name})", "utf-8")
            if counter[0] == 2:  # the note itself -> anchor target
                return bytes(f"<a name={note_name}>[{note_number}]</a>", "utf-8")
            return m.group(0)  # leave any further repeats untouched

        joined = re.sub(re.escape(match), update_links, joined)
    return joined
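# For instance (hypothetical input, for illustration only), with
# title = "essay_slug", the bytes
#     b"... as Lisp showed [1] ... [1] The note text."
# come back as
#     b"... as Lisp showed [1](#essay_slug_note1) ...
#        <a name=essay_slug_note1>[1]</a> The note text."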
for entry in reversed(rss.entries):
    URL = entry["link"]
    # Some feed links carry a doubled scheme; strip the bogus prefix.
    if "http://www.paulgraham.com/https://" in URL:
        URL = URL.replace("http://www.paulgraham.com/https://", "https://")
    TITLE = entry["title"]
    try:
        with urllib.request.urlopen(URL) as website:
            content = website.read().decode("unicode_escape")
            parsed = h.handle(content)
            # Publication date, extracted from the downloaded page.
            DATE = find_date(content)
            # Slugify the title for the filename and footnote anchors.
            title = "_".join(TITLE.split(" ")).lower()
            title = re.sub(r"[\W\s]+", "", title)
            with open(f"./essays/{ART_NO:03}_{title}.md", "wb+") as file:
                file.write(f"# {ART_NO:03} {TITLE}\n\n".encode())
                # Drop the stray link back to the index that html2text leaves.
                parsed = parsed.replace("[](index.html) \n \n", "")
                # Re-flow the text: lines with 5-100 non-space characters are
                # treated as soft-wrapped prose and joined; everything else
                # keeps its own line.
                parsed = [
                    (
                        p.replace("\n", " ")
                        if re.match(r"^[\p{Z}\s]*(?:[^\p{Z}\s][\p{Z}\s]*){5,100}$", p)
                        else "\n" + p + "\n"
                    )
                    for p in parsed.split("\n")
                ]
                encoded = " ".join(parsed).encode()
                update_with_links = update_links_in_md(encoded)
                file.write(update_with_links)
                print(f"✅ {ART_NO:03} {TITLE}")
        with open(FILE, "a+", newline="\n") as f:
            csvwriter = csv.writer(
                f, quoting=csv.QUOTE_MINIMAL, delimiter=",", quotechar='"'
            )
            if ART_NO == 1:
                # Write the header row once, before the first record.
                csvwriter.writerow(["Article no.", "Title", "Date", "URL"])
            csvwriter.writerow([ART_NO, TITLE, DATE, URL])
    except Exception as e:
        print(f"❌ {ART_NO:03} {entry['title']}, ({e})")
    ART_NO += 1
    time.sleep(0.05)  # 50 ms between requests: be nice to the server!
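# Expected output: one ./essays/NNN_slug.md file per essay, plus an
# ./essays.csv index with article number, title, date, and URL.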