-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_rss.py
executable file
·264 lines (205 loc) · 8.42 KB
/
generate_rss.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
#!/usr/bin/env python3
"""
A simple-as-possible script to generate an RSS feed out of the latest 10 markdown files
in a given directory.
We depend on Markdown from Daring Fireball being on the $PATH:
https://daringfireball.net/projects/markdown/
We also depend on `date`:
https://www.gnu.org/software/coreutils/manual/html_node/date-invocation.html
"""
import datetime
import math
import os
import pathlib
import subprocess
import sys
import time
import typing
from xml.etree import ElementTree as ET
from xml.etree.ElementTree import Element, ElementTree
# strftime pattern used for the timestamps written into the feed. The time of day
# is spelled out as %H:%M:%S instead of %X, because %X follows the process locale
# and is therefore not guaranteed to produce the HH:MM:SS form feed readers expect.
DATETIME_FORMAT = "%a, %d %b %Y %H:%M:%S %z"

# Fixed-offset timezone equal to the machine's local UTC offset, sampled once when
# the module is imported. It is attached to datetimes produced by this script.
TZ_INFO = datetime.timezone(
    datetime.timedelta(seconds=time.localtime().tm_gmtoff)
)
class InvalidMarkdownFile(Exception):
    """Signals that a file cannot be used as a MarkdownFile.

    Raised when the title or date header is missing, when the date header is not a
    valid ISO date, or when the external Markdown conversion fails.
    """
class MarkdownFile:
    """A wrapper for a file containing Markdown.

    The file must start with a title header (``# Title``) on its first line and a
    date header (``## <ISO date>``) on its third line; the second line is read and
    ignored.

    :param stream: a seekable file-like object that contains the Markdown text. The
        wrapper keeps a reference to it; call :meth:`close` when finished.
    :raises InvalidMarkdownFile: if either header is missing or the date header is
        not a valid ISO date.
    """

    def __init__(self, stream: typing.TextIO):
        title_line = stream.readline()
        stream.readline()  # line between the two headers; its content is not used
        date_line = stream.readline()
        # Rewind so the headers are part of what `content` converts later.
        stream.seek(0)

        if not title_line.startswith("# "):
            raise InvalidMarkdownFile("Missing title header")
        if not date_line.startswith("## "):
            raise InvalidMarkdownFile("Missing date header")

        try:
            date = datetime.datetime.fromisoformat(date_line.lstrip("#").strip())
        except ValueError as e:
            raise InvalidMarkdownFile("Date header is not a valid ISO date") from e

        # A date that states its own UTC offset is kept as written; only a naive
        # date is interpreted as local time. Overwriting an explicit offset would
        # shift the instant the author wrote down.
        if date.tzinfo is None:
            date = date.replace(tzinfo=TZ_INFO)

        self.stream = stream
        self.title = title_line.lstrip("#").strip()
        self.pub_date = date

    @property
    def content(self) -> str:
        """The contents of the backing file-like object, converted to HTML.

        The conversion is done by running the external ``markdown`` command with
        ``--html4tags`` and feeding it the whole file on standard input.

        :raises InvalidMarkdownFile: if the ``markdown`` command exits with a
            non-zero status.
        """
        # Always start from the beginning: without this a second access would read
        # from the end of the stream and convert an empty string.
        self.stream.seek(0)
        try:
            completed_process = subprocess.run(
                ["markdown", "--html4tags"],
                input=self.stream.read(),
                capture_output=True,
                check=True,
                text=True,
            )
        except subprocess.CalledProcessError as e:
            # This is pretty rare - any text is acceptable Markdown input, so a
            # failure here is not a parsing error.
            raise InvalidMarkdownFile(
                f"Could not convert to Markdown {e.stderr}"
            ) from e
        return completed_process.stdout

    def close(self) -> None:
        """Closes the backing file-like object."""
        self.stream.close()
class RssElementTree(ElementTree):
"""An XML ElementTree representing an RSS feed.
:param title: The title of the site the RSS feed represents.
:param description: The description of the RSS feed.
:param url: The URL of the site the RSS feed represents.
:param pub_date: The publication date of the RSS feed. If `None`, it is generated from local
time. If provided, it is not validated.
"""
def __init__(
self, title: str, description: str, url: str, pub_date: str | None = None
):
if pub_date is None:
pub_date = generate_pub_date()
element = Element(
"rss",
attrib={"version": "2.0", "xmlns:atom": "http://www.w3.org/2005/Atom"},
)
channel = Element("channel")
ET.SubElement(channel, "title").text = title
ET.SubElement(channel, "description").text = description
ET.SubElement(channel, "link").text = url
channel.append(
Element(
"atom:link",
attrib={
"href": f"{url}feed.xml",
"rel": "self",
"type": "application/rss+xml",
},
)
)
ET.SubElement(channel, "pubDate").text = pub_date
ET.SubElement(channel, "lastBuildDate").text = pub_date
ET.SubElement(channel, "generator").text = "simple-site"
element.append(channel)
self.channel = channel
self.url = url
return super().__init__(element=element)
def append_item(self, md_file: MarkdownFile, path: pathlib.Path) -> None:
"""Adds `md_file` to the list of RSS items."""
item = Element("item")
str_path = str(path)
if str_path.endswith("/index.md"):
url = self.url + str_path.replace("markdown/", "").replace("/index.html", "/")
else:
url = self.url + str_path.replace("markdown/", "").replace(".md", ".html")
ET.SubElement(item, "title").text = md_file.title
ET.SubElement(item, "description").text = md_file.content
ET.SubElement(item, "pubDate").text = md_file.pub_date.strftime(DATETIME_FORMAT)
ET.SubElement(item, "link").text = url
ET.SubElement(item, "guid", attrib={"isPermaLink": "true"}).text = url
self.channel.append(item)
def collect_md_files(
    directory: pathlib.Path, max_files: int | None
) -> tuple[dict[pathlib.Path, MarkdownFile], list[str]]:
    """Collects Markdown files in `directory`, searching recursively.

    Only files whose name ends in ".md" are considered. Each one is opened and wrapped
    in a MarkdownFile; a file lacking a title or date header is closed again and
    omitted with a warning.

    The collected files are sorted on two fields: first their publication date, taken
    from the date header, then their title. The path breaks any remaining tie.

    :param directory: the directory to recursively search for Markdown files.
    :param max_files: the maximum number of Markdown files to include; `None` or 0
        means no limit. NOTE(review): the limit is applied while walking the tree,
        before sorting, so it caps how many files are collected rather than selecting
        the most recent ones - confirm whether that is intended.
    :returns: a dict mapping each path to its still-open MarkdownFile, in sorted
        order, and a list of warnings for the files that were skipped. The caller is
        responsible for closing the returned MarkdownFiles.
    """
    md_files: list[tuple[pathlib.Path, MarkdownFile]] = []
    skipped: list[str] = []
    stack = [directory]
    target = max_files if max_files else math.inf

    while stack and len(md_files) < target:
        current = stack.pop()
        # The context manager releases the directory handle even on early exit.
        with os.scandir(current) as entries:
            for dir_entry in entries:
                path = pathlib.Path(dir_entry.path)
                if dir_entry.is_dir():
                    stack.append(path)
                    continue
                if path.suffix != ".md":
                    # Not Markdown; opening it as text could fail on binary data.
                    continue

                fp = open(path)
                try:
                    md_file = MarkdownFile(fp)
                except InvalidMarkdownFile as e:
                    fp.close()
                    skipped.append(f"Skipped {path}: {e}")
                except Exception:
                    # Do not leak the handle when the error propagates.
                    fp.close()
                    raise
                else:
                    md_files.append((path, md_file))

                if len(md_files) >= target:
                    break

    # Sort explicitly on the documented fields; a plain sort of the tuples would
    # order by path alone.
    md_files.sort(key=lambda entry: (entry[1].pub_date, entry[1].title, entry[0]))
    return dict(md_files), skipped
def generate_pub_date() -> str:
    """Returns the current local time as a formatted publication date.

    The current wall-clock time is tagged with TZ_INFO and rendered using
    DATETIME_FORMAT.
    """
    local_now = datetime.datetime.now()
    stamped = local_now.replace(tzinfo=TZ_INFO)
    return stamped.strftime(DATETIME_FORMAT)
def main(
    directory: pathlib.Path,
    title: str,
    description: str,
    url: str,
    max_files: int | None = None,
) -> tuple[RssElementTree | None, list[pathlib.Path], list[str]]:
    """Collects Markdown files in `directory` and builds an RSS feed from them.

    File discovery is delegated to `collect_md_files`; every file it returns is closed
    before this function returns. Nothing is written to disk here.

    :param directory: the directory to recursively search for Markdown files.
    :param title: the title for the RSS feed.
    :param description: the description of the RSS feed.
    :param url: the URL of the site the RSS feed represents.
    :param max_files: the maximum number of Markdown files to include in the RSS feed;
        passed through to `collect_md_files`.
    :returns: a 3-tuple of the completed RSS feed (`None` if no usable Markdown file
        was found), the paths of the files included in the feed, and the warnings and
        errors gathered along the way.
    """
    md_files, errors = collect_md_files(directory, max_files)

    if not md_files:
        # Keep the per-file skip reasons: they usually explain why nothing is left.
        errors.append(
            f'No markdown files found in {directory} (we are dumb and only check for ".md" file '
            "extensions)"
        )
        return None, [], errors

    rss_tree = RssElementTree(title, description, url)
    included_files: list[pathlib.Path] = []

    try:
        for path, md_file in md_files.items():
            try:
                rss_tree.append_item(md_file, path)
            except InvalidMarkdownFile as e:
                errors.append(f"Skipped {path}: {e}")
            else:
                included_files.append(path)
    finally:
        # Close every file, including those not reached when an unexpected error
        # aborts the loop.
        for md_file in md_files.values():
            md_file.close()

    return rss_tree, included_files, errors
if __name__ == "__main__":
    # Build the feed for the blog from the Markdown sources under ./markdown/.
    feed, processed, problems = main(
        directory=pathlib.Path("./markdown/"),
        title="Perfect5th, 2.0",
        description='I make-a the software go "beep-boop"',
        url="https://mitchellburton.ca/blog/",
    )

    # Progress goes to stdout, warnings and errors to stderr.
    for processed_path in processed:
        print(f"Processed file: {processed_path}")
    for problem in problems:
        print(problem, file=sys.stderr)

    # main() returns no feed when there was nothing to publish.
    if feed is not None:
        feed.write("./feed.xml", encoding="unicode", xml_declaration=True)
        print("Wrote feed to ./feed.xml")