From 83d5addee6fd485d0b3693fcb763b02905ce081b Mon Sep 17 00:00:00 2001 From: Mitchell Burton Date: Wed, 24 Jan 2024 20:19:50 -0800 Subject: [PATCH] let's try an RSS/Atom feed --- .github/workflows/static.yml | 2 + generate_rss.py | 256 +++++++++++++++++++++++++++++++++++ markdown/simple-site.md | 2 +- 3 files changed, 259 insertions(+), 1 deletion(-) create mode 100755 generate_rss.py diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index bd2f367..45b68d6 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -38,6 +38,8 @@ jobs: sudo apt update sudo apt install markdown ./convert.sh + ./generate_rss.py + mv ./feed.xml ./_site/rss/feed.xml - name: Upload artifact uses: actions/upload-pages-artifact@v3 with: diff --git a/generate_rss.py b/generate_rss.py new file mode 100755 index 0000000..adf684a --- /dev/null +++ b/generate_rss.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python3 +""" +A simple-as-possible script to generate an RSS feed out of the latest 10 markdown files +in a given directory. + +We depend on Markdown from Daring Fireball being on the $PATH: +https://daringfireball.net/projects/markdown/ +We also depend on `date`: +https://www.gnu.org/software/coreutils/manual/html_node/date-invocation.html +""" +import datetime +import math +import os +import pathlib +import subprocess +import sys +import time +import typing +from xml.etree import ElementTree as ET +from xml.etree.ElementTree import Element, ElementTree + +DATETIME_FORMAT = "%a, %d %b %Y %X %z" +TZ_INFO = datetime.timezone( + datetime.timedelta(seconds=time.localtime().tm_gmtoff) +) # whew + + +class InvalidMarkdownFile(Exception): + """Exception raised when a file cannot be parsed as a MarkdownFile.""" + + +class MarkdownFile: + """A wrapper for a file containing Markdown. + + :param stream: a file-like object that contains the Markdown text. + """ + + def __init__(self, stream: typing.TextIO): + title_line = stream.readline() + _ = stream.readline() + date_line = stream.readline() + stream.seek(0) + + if not title_line.startswith("# "): + raise InvalidMarkdownFile("Missing title header") + + if not date_line.startswith("## "): + raise InvalidMarkdownFile("Missing date header") + + self.stream = stream + self.title = title_line.lstrip("#").strip() + + try: + date = datetime.datetime.fromisoformat(date_line.lstrip("#").strip()) + except ValueError: + raise InvalidMarkdownFile("Date header is not a valid ISO date") + + self.pub_date = date.combine(date.date(), date.time(), TZ_INFO) + + @property + def content(self) -> str: + """The contents of the backing file-like object, converted to Markdown.""" + try: + completed_process = subprocess.run( + ["markdown", "--html4tags"], + input=self.stream.read(), + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + # This is pretty rare - all text can be converted to markdown, so it's not a parsing + # error. + raise InvalidMarkdownFile(f"Could not convert to Markdown {e.stderr}") + + return completed_process.stdout + + def close(self) -> None: + """Closes the backing file-like object.""" + return self.stream.close() + + +class RssElementTree(ElementTree): + """An XML ElementTree representing an RSS feed. + + :param title: The title of the site the RSS feed represents. + :param description: The description of the RSS feed. + :param url: The URL of the site the RSS feed represents. + :param pub_date: The publication date of the RSS feed. If `None`, it is generated from local + time. If provided, it is not validated. + """ + + def __init__( + self, title: str, description: str, url: str, pub_date: str | None = None + ): + if pub_date is None: + pub_date = generate_pub_date() + + element = Element( + "rss", + attrib={"version": "2.0", "xlmns:atom": "http://www.w3.org/2005/Atom"}, + ) + + channel = ET.SubElement(element, "channel") + ET.SubElement(channel, "title").text = title + ET.SubElement(channel, "description").text = description + ET.SubElement(channel, "link").text = url + channel.append( + Element( + "atom:link", + attrib={ + "href": f"{url}feed.xml", + "rel": "self", + "type": "application/rss+xml", + }, + ) + ) + ET.SubElement(channel, "pubDate").text = pub_date + ET.SubElement(channel, "lastBuildDate").text = pub_date + ET.SubElement(channel, "generator").text = "simple-site" + + element.append(channel) + + self.channel = channel + + return super().__init__(element=element) + + def append_item(self, md_file: MarkdownFile) -> None: + """Adds `md_file` to the list of RSS items.""" + item = Element("item") + ET.SubElement(item, "title").text = md_file.title + ET.SubElement(item, "description").text = md_file.content + ET.SubElement(item, "pubDate").text = md_file.pub_date.strftime(DATETIME_FORMAT) + + self.channel.append(item) + + +def collect_md_files( + directory: pathlib.Path, max_files: int | None +) -> tuple[dict[pathlib.Path, MarkdownFile], list[str]]: + """Collects Markdown files in `directory`, searching recursively. + + Files are sorted on two fields: first, their publication date, derived from the second line, + then by their title, derived from the first line. If a file is lacking either of these, it is + omitted with a warning. + + :param directory: the directory to recursively search for Markdown files. + :param max_files: the maximum number of Markdown files to include. Files are sorted on two + fields: first, their publication date, derived from the second line, then by their title + """ + md_files: list[tuple[pathlib.Path, MarkdownFile]] = [] + skipped = [] + stack = [directory] + + target = max_files if max_files else math.inf + + while stack and len(md_files) < target: + current = stack.pop() + + for dir_entry in os.scandir(current): + path = pathlib.Path(dir_entry.path) + + if dir_entry.is_dir(): + stack.append(path) + continue + + try: + fp = open(path) + md_files.append((path, MarkdownFile(fp))) + except InvalidMarkdownFile as e: + skipped.append(f"Skipped {path}: {e}") + + if len(md_files) >= target: + break + + md_files.sort() + + return dict(md_files), skipped + + +def generate_pub_date() -> str: + """Generates a properly-formatted publication date based on local time.""" + now = datetime.datetime.now() + now = now.combine(now.date(), now.time(), TZ_INFO) + + return now.strftime(DATETIME_FORMAT) + + +def main( + directory: pathlib.Path, + title: str, + description: str, + url: str, + max_files: int | None = None, +) -> tuple[RssElementTree | None, list[pathlib.Path], list[str]]: + """Collects Markdown files in `directory`, generating and outputting an RSS feed. + Markdown files are identified by ".md" endings only. + + :param directory: the directory to recursively search for Markdown files. + :param title: the title for the RSS feed. + :param description: the description of the RSS feed. + :param url: the URL of the site the RSS feed represents. + :param max_files: the maximum number of Markdown files to include in the RSS feed. Files are + sorted on two fields: first, their publication date, derived from the second line, then by + their title. + + :returns: the completed RSS feed + """ + md_files, errors = collect_md_files(directory, max_files) + + if not md_files: + return ( + None, + [], + [ + f'No markdown files found in {directory} (we are dumb and only check for ".md" file' + "extensions)" + ], + ) + + rss_tree = RssElementTree(title, description, url) + included_files = [] + + for path, md_file in md_files.items(): + try: + rss_tree.append_item(md_file) + included_files.append(path) + except InvalidMarkdownFile as e: + errors.append(f"Skipped {path}: {e}") + + md_file.close() + + return rss_tree, included_files, errors + + +if __name__ == "__main__": + directory = pathlib.Path("./markdown/") + + rss_tree, included, errors = main( + directory, + "Perfect5th, 2.0", + 'I make-a the software go "beep-boop"', + "https://mitchellburton.ca/blog/", + ) + + for path in included: + print(f"Processed file: {path}") + + for error in errors: + print(error, file=sys.stderr) + + if rss_tree is not None: + print() + rss_tree.write(sys.stdout, encoding="unicode", xml_declaration=True) + rss_tree.write("feed.xml", encoding="unicode", xml_declaration=True) + print() diff --git a/markdown/simple-site.md b/markdown/simple-site.md index 6d0ab87..cdf9a00 100644 --- a/markdown/simple-site.md +++ b/markdown/simple-site.md @@ -1,6 +1,6 @@ # Simple Site -*Created 2024-01-22* +## 2024-01-22 A couple days ago, my friend [tendermario][1] sent me a message: