Skip to content

Commit

Permalink
let's try an RSS/Atom feed
Browse files Browse the repository at this point in the history
  • Loading branch information
Perfect5th committed Jan 25, 2024
1 parent 9c6d53c commit 83d5add
Show file tree
Hide file tree
Showing 3 changed files with 259 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .github/workflows/static.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ jobs:
sudo apt update
sudo apt install markdown
./convert.sh
./generate_rss.py
mv ./feed.xml ./_site/rss/feed.xml
- name: Upload artifact
uses: actions/upload-pages-artifact@v3
with:
Expand Down
256 changes: 256 additions & 0 deletions generate_rss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
#!/usr/bin/env python3
"""
A simple-as-possible script to generate an RSS feed out of the latest 10 markdown files
in a given directory.
We depend on Markdown from Daring Fireball being on the $PATH:
https://daringfireball.net/projects/markdown/
We also depend on `date`:
https://www.gnu.org/software/coreutils/manual/html_node/date-invocation.html
"""
import datetime
import math
import os
import pathlib
import subprocess
import sys
import time
import typing
from xml.etree import ElementTree as ET
from xml.etree.ElementTree import Element, ElementTree

DATETIME_FORMAT = "%a, %d %b %Y %X %z"
TZ_INFO = datetime.timezone(
datetime.timedelta(seconds=time.localtime().tm_gmtoff)
) # whew


class InvalidMarkdownFile(Exception):
"""Exception raised when a file cannot be parsed as a MarkdownFile."""


class MarkdownFile:
"""A wrapper for a file containing Markdown.
:param stream: a file-like object that contains the Markdown text.
"""

def __init__(self, stream: typing.TextIO):
title_line = stream.readline()
_ = stream.readline()
date_line = stream.readline()
stream.seek(0)

if not title_line.startswith("# "):
raise InvalidMarkdownFile("Missing title header")

if not date_line.startswith("## "):
raise InvalidMarkdownFile("Missing date header")

self.stream = stream
self.title = title_line.lstrip("#").strip()

try:
date = datetime.datetime.fromisoformat(date_line.lstrip("#").strip())
except ValueError:
raise InvalidMarkdownFile("Date header is not a valid ISO date")

self.pub_date = date.combine(date.date(), date.time(), TZ_INFO)

@property
def content(self) -> str:
"""The contents of the backing file-like object, converted to Markdown."""
try:
completed_process = subprocess.run(
["markdown", "--html4tags"],
input=self.stream.read(),
capture_output=True,
check=True,
text=True,
)
except subprocess.CalledProcessError as e:
# This is pretty rare - all text can be converted to markdown, so it's not a parsing
# error.
raise InvalidMarkdownFile(f"Could not convert to Markdown {e.stderr}")

return completed_process.stdout

def close(self) -> None:
"""Closes the backing file-like object."""
return self.stream.close()


class RssElementTree(ElementTree):
"""An XML ElementTree representing an RSS feed.
:param title: The title of the site the RSS feed represents.
:param description: The description of the RSS feed.
:param url: The URL of the site the RSS feed represents.
:param pub_date: The publication date of the RSS feed. If `None`, it is generated from local
time. If provided, it is not validated.
"""

def __init__(
self, title: str, description: str, url: str, pub_date: str | None = None
):
if pub_date is None:
pub_date = generate_pub_date()

element = Element(
"rss",
attrib={"version": "2.0", "xlmns:atom": "http://www.w3.org/2005/Atom"},
)

channel = ET.SubElement(element, "channel")
ET.SubElement(channel, "title").text = title
ET.SubElement(channel, "description").text = description
ET.SubElement(channel, "link").text = url
channel.append(
Element(
"atom:link",
attrib={
"href": f"{url}feed.xml",
"rel": "self",
"type": "application/rss+xml",
},
)
)
ET.SubElement(channel, "pubDate").text = pub_date
ET.SubElement(channel, "lastBuildDate").text = pub_date
ET.SubElement(channel, "generator").text = "simple-site"

element.append(channel)

self.channel = channel

return super().__init__(element=element)

def append_item(self, md_file: MarkdownFile) -> None:
"""Adds `md_file` to the list of RSS items."""
item = Element("item")
ET.SubElement(item, "title").text = md_file.title
ET.SubElement(item, "description").text = md_file.content
ET.SubElement(item, "pubDate").text = md_file.pub_date.strftime(DATETIME_FORMAT)

self.channel.append(item)


def collect_md_files(
directory: pathlib.Path, max_files: int | None
) -> tuple[dict[pathlib.Path, MarkdownFile], list[str]]:
"""Collects Markdown files in `directory`, searching recursively.
Files are sorted on two fields: first, their publication date, derived from the second line,
then by their title, derived from the first line. If a file is lacking either of these, it is
omitted with a warning.
:param directory: the directory to recursively search for Markdown files.
:param max_files: the maximum number of Markdown files to include. Files are sorted on two
fields: first, their publication date, derived from the second line, then by their title
"""
md_files: list[tuple[pathlib.Path, MarkdownFile]] = []
skipped = []
stack = [directory]

target = max_files if max_files else math.inf

while stack and len(md_files) < target:
current = stack.pop()

for dir_entry in os.scandir(current):
path = pathlib.Path(dir_entry.path)

if dir_entry.is_dir():
stack.append(path)
continue

try:
fp = open(path)
md_files.append((path, MarkdownFile(fp)))
except InvalidMarkdownFile as e:
skipped.append(f"Skipped {path}: {e}")

if len(md_files) >= target:
break

md_files.sort()

return dict(md_files), skipped


def generate_pub_date() -> str:
"""Generates a properly-formatted publication date based on local time."""
now = datetime.datetime.now()
now = now.combine(now.date(), now.time(), TZ_INFO)

return now.strftime(DATETIME_FORMAT)


def main(
directory: pathlib.Path,
title: str,
description: str,
url: str,
max_files: int | None = None,
) -> tuple[RssElementTree | None, list[pathlib.Path], list[str]]:
"""Collects Markdown files in `directory`, generating and outputting an RSS feed.
Markdown files are identified by ".md" endings only.
:param directory: the directory to recursively search for Markdown files.
:param title: the title for the RSS feed.
:param description: the description of the RSS feed.
:param url: the URL of the site the RSS feed represents.
:param max_files: the maximum number of Markdown files to include in the RSS feed. Files are
sorted on two fields: first, their publication date, derived from the second line, then by
their title.
:returns: the completed RSS feed
"""
md_files, errors = collect_md_files(directory, max_files)

if not md_files:
return (
None,
[],
[
f'No markdown files found in {directory} (we are dumb and only check for ".md" file'
"extensions)"
],
)

rss_tree = RssElementTree(title, description, url)
included_files = []

for path, md_file in md_files.items():
try:
rss_tree.append_item(md_file)
included_files.append(path)
except InvalidMarkdownFile as e:
errors.append(f"Skipped {path}: {e}")

md_file.close()

return rss_tree, included_files, errors


if __name__ == "__main__":
directory = pathlib.Path("./markdown/")

rss_tree, included, errors = main(
directory,
"Perfect5th, 2.0",
'I make-a the software go "beep-boop"',
"https://mitchellburton.ca/blog/",
)

for path in included:
print(f"Processed file: {path}")

for error in errors:
print(error, file=sys.stderr)

if rss_tree is not None:
print()
rss_tree.write(sys.stdout, encoding="unicode", xml_declaration=True)
rss_tree.write("feed.xml", encoding="unicode", xml_declaration=True)
print()
2 changes: 1 addition & 1 deletion markdown/simple-site.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Simple Site

*Created 2024-01-22*
## 2024-01-22

A couple days ago, my friend [tendermario][1] sent me a message:

Expand Down

0 comments on commit 83d5add

Please sign in to comment.