diff --git a/README.md b/README.md index 08055fc..64987f6 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,23 @@ pandoc_reader ============= -A pandoc [markdown] reader plugin for [pelican] +A pandoc [markdown][] reader plugin for [pelican][] Requirements ------------ - - [pandoc] in $PATH - + - [pandoc][] in `$PATH` Installation ------------ Instructions for installation of pelican plugins can be obtained from the [pelican plugin manual](https://github.com/getpelican/pelican-plugins/blob/master/Readme.rst). - Configuration ------------- -Additional command line parameters can be passed to pandoc via the PANDOC_ARGS parameter. +Additional command line parameters can be passed to pandoc via the `PANDOC_ARGS` parameter. PANDOC_ARGS = [ '--mathjax', @@ -29,14 +27,19 @@ Additional command line parameters can be passed to pandoc via the PANDOC_ARGS p '--number-sections', ] -Pandoc's markdown extensions can be enabled or disabled via the -PANDOC_EXTENSIONS parameter. +Pandoc's syntactic extensions to Markdown can be enabled or disabled via the +`PANDOC_EXTENSIONS` parameter. PANDOC_EXTENSIONS = [ '+hard_line_breaks', '-citations' ] +File Metadata +------------- + +For compatibility with older versions of this plugin that parsed MultiMarkdown-like title blocks internally, the [`mmd_title_block`][mmd_title_block] syntax extension is enabled by default. Unfortunately, this causes Pandoc to misinterpret YAML metadata and possibly also native title blocks (see [Pandoc issue 2026][]). Therefore, those metadata formats are *disabled* by default. To revert to Pandoc's default behavior (accepting native title blocks and YAML metadata, but not MMD title blocks), include `-mmd_title_block` in `PANDOC_EXTENSIONS`. 
+ Contributing ------------ @@ -50,3 +53,5 @@ Contributing [markdown]: http://daringfireball.net/projects/markdown/ [pandoc]: http://johnmacfarlane.net/pandoc/ [pelican]: http://getpelican.com +[mmd_title_block]: http://johnmacfarlane.net/pandoc/README.html#extension-mmd_title_block +[Pandoc issue 2026]: https://github.com/jgm/pandoc/issues/2026 diff --git a/embed_metadata_filter.py b/embed_metadata_filter.py new file mode 100644 index 0000000..52b2175 --- /dev/null +++ b/embed_metadata_filter.py @@ -0,0 +1,47 @@ +# This is a filter script which embeds all of the metadata parsed by +# Pandoc into the HTML output, where the main body of the reader can +# pick it up. In order to preserve Pandoc's translation of Markdown +# in metadata values, we convert the metadata structure into an HTML +# tree structure. A <hr> separates the translated metadata from the +# document itself. +# +# See http://johnmacfarlane.net/pandoc/scripting.html for documentation +# of the JSON-serialized AST that we are manipulating. + +import json +import sys + +def N(t, c, cls=None): + if cls is not None: c = [ ["", [cls], []], c ] + return { "t": t, "c": c } + +def cvt_metainlines(c): + return N("Plain", [N("Span", c, "metavalue")]) + +def cvt_metamap(c): + return N("DefinitionList", [ ( [N("Str", key)], [[ convert(val) ]] ) + for key, val in sorted(c.items()) ]) + +CONVERTERS = { + "MetaMap": cvt_metamap, + "MetaInlines": cvt_metainlines, + "MetaBool": lambda c: cvt_metainlines([N("Str", str(c).lower())]), + "MetaString": lambda c: cvt_metainlines([N("Str", c)]), + "MetaBlocks": lambda c: N("Div", c, "metavalue"), + "MetaList": lambda c: N("BulletList", [ [convert(item)] for item in c ]) +} + +def convert(item): + return CONVERTERS[item["t"]](item["c"]) + +def main(): + blob = json.load(sys.stdin) + metadata = blob[0]['unMeta'] + rendered = [cvt_metamap(metadata), N("HorizontalRule", [])] + rendered.extend(blob[1]) + blob = [blob[0], rendered] + json.dump(blob, sys.stdout, separators=(',',':')) + +# This filter script is imported by pandoc_reader in order to learn its +# actual filename, so don't do anything unless invoked as __main__. +if __name__ == '__main__': main() diff --git a/pandoc_reader.py b/pandoc_reader.py index 87c7735..98f2133 100644 --- a/pandoc_reader.py +++ b/pandoc_reader.py @@ -1,44 +1,206 @@ import subprocess +import sys + +import logging +logger = logging.getLogger(__name__) + +try: import xml.etree.cElementTree as ET +except ImportError: import xml.etree.ElementTree as ET + +try: from io import StringIO +except ImportError: from cStringIO import StringIO + from pelican import signals from pelican.readers import BaseReader -from pelican.utils import pelican_open + +from . 
import embed_metadata_filter + +def check_command(proc, cmd): + """Roughly as subprocess.check_call does, wait for PROC and throw + an exception if it didn't exit successfully. CMD should be the + command passed to subprocess.Popen.""" + status = proc.wait() + if status: + raise subprocess.CalledProcessError(status, cmd) + +def extract_metadata(text): + """A filter script converts Pandoc's internal representation of the + metadata into an HTML tree structure so that it will make it to + the output, with strings properly formatted. Separate that + tree from the HTML for the document itself, and decode it into + Pelican's desired representation.""" + + def walk_dl(e): + rv = {} + key = None + for child in e: + if child.tag == "dt": + assert key is None + assert len(child) == 0 + key = child.text + else: + assert child.tag == "dd" + assert key is not None + assert len(child) == 1 + rv[key] = walk(child[0]) + key = None + return rv + + def walk_ul(e): + rv = [] + for child in e: + assert child.tag == "li" + assert len(child) == 1 + rv.append(walk(child[0])) + return rv + + def walk_value(e): + assert e.get("class") == "metavalue" + # Setting e.tag and e.tail to None temporarily seems to be the + # least-hassle way to persuade ET.tostring to dump the *contents* + # of e but not e itself. + tag = e.tag + tail = e.tail + try: + e.tag = None + e.tail = None + return (ET.tostring(e, encoding="utf-8", method="html") + .decode("utf-8").strip()) + finally: + e.tag = tag + e.tail = tail + + def walk(e): + if e.tag == "dl": + return walk_dl(e) + elif e.tag == "ul": + return walk_ul(e) + elif e.tag == "div" or e.tag == "span": + return walk_value(e) + else: + logger.error("unexpected metadata structure: " + + ET.tostring(e, encoding="utf-8", method="html") + .decode("utf-8")) + + + metadata, _, document = text.partition("<hr>") + document = document.strip() + + # Remove namespaces from all metadata elements while parsing them. + # This is necessary because Pandoc thinks you have to put an + # xmlns= on every use of <math>, and that makes ET.tostring + # generate tags like <ns0:math>, which an HTML (not XHTML) parser + # will not understand. + it = ET.iterparse(StringIO(metadata)) + for _, el in it: + if "}" in el.tag: + el.tag = el.tag.split("}", 1)[1] + + assert it.root.tag == "dl" + return document, walk(it.root) class PandocReader(BaseReader): enabled = True - file_extensions = ['md', 'markdown', 'mkd', 'mdown'] + file_extensions = ["md", "markdown", "mkd", "mdown"] - def read(self, filename): - with pelican_open(filename) as fp: - text = list(fp.splitlines()) + def memoize_settings(self): + """Load settings and compute the various subprocess invocations we + will be using.""" + if hasattr(self, "pd_extensions"): return - metadata = {} - for i, line in enumerate(text): - kv = line.split(':', 1) - if len(kv) == 2: - name, value = kv[0].lower(), kv[1].strip() - metadata[name] = self.process_metadata(name, value) - else: - content = "\n".join(text[i:]) - break + extra_args = self.settings.get("PANDOC_ARGS", []) + + pos_extensions = set() + neg_extensions = set() + for ext in self.settings.get("PANDOC_EXTENSIONS", []): + if len(ext) >= 2: + if ext[0] == "-": + neg_extensions.add(ext[1:]) + continue + elif ext[0] == "+": + pos_extensions.add(ext[1:]) + continue + logger.error("invalid PANDOC_EXTENSIONS item {!r}".format(ext)) + + # For compatibility with older versions of this plugin that + # parsed vaguely MMD-style metadata blocks themselves, we + # default to +mmd_title_block. Unfortunately, + # +mmd_title_block causes Pandoc to mis-parse YAML and + # possibly also native title blocks (see + # https://github.com/jgm/pandoc/issues/2026). Therefore, + # if there's nothing about title blocks in PANDOC_EXTENSIONS, + # we also explicitly disable YAML and native title blocks. 
+ + if ("mmd_title_block" not in pos_extensions and + "mmd_title_block" not in neg_extensions and + "pandoc_title_block" not in pos_extensions and + "pandoc_title_block" not in neg_extensions and + "yaml_metadata_block" not in pos_extensions and + "yaml_metadata_block" not in neg_extensions): + pos_extensions.add("mmd_title_block") + neg_extensions.add("pandoc_title_block") + neg_extensions.add("yaml_metadata_block") - extra_args = self.settings.get('PANDOC_ARGS', []) - extensions = self.settings.get('PANDOC_EXTENSIONS', '') - if isinstance(extensions, list): - extensions = ''.join(extensions) + both_exts = pos_extensions & neg_extensions + if both_exts: + logger.error("Pandoc syntax extensions both enabled and disabled: " + + " ".join(sorted(both_exts))) + pos_extensions -= both_exts + neg_extensions -= both_exts - pandoc_cmd = ["pandoc", "--from=markdown" + extensions, "--to=html5"] - pandoc_cmd.extend(extra_args) + syntax = "markdown" + if pos_extensions: + syntax += "".join(sorted("+"+ext for ext in pos_extensions)) + if neg_extensions: + syntax += "".join(sorted("-"+ext for ext in neg_extensions)) - proc = subprocess.Popen(pandoc_cmd, - stdin = subprocess.PIPE, - stdout = subprocess.PIPE) + pd_cmd_1 = ["pandoc", "-f", syntax, "-t", "json"] + pd_cmd_2 = ["pandoc", "-f", "json", "-t", "html5"] + # We don't know whether the extra_args are relevant to the reader or + # writer, and it is harmless to supply them to both. 
+ pd_cmd_1.extend(extra_args) + pd_cmd_2.extend(extra_args) - output = proc.communicate(content.encode('utf-8'))[0].decode('utf-8') - status = proc.wait() - if status: - raise subprocess.CalledProcessError(status, pandoc_cmd) + self.pd_cmd_1 = pd_cmd_1 + self.pd_cmd_2 = pd_cmd_2 + self.filt_cmd = [sys.executable, embed_metadata_filter.__file__] + logger.debug("Reader command: " + " ".join(self.pd_cmd_1)) + logger.debug("Writer command: " + " ".join(self.pd_cmd_2)) + logger.debug("Filter command: " + " ".join(self.filt_cmd)) + + def read(self, filename): + self.memoize_settings() + + # We do not use --filter because that requires the filter to + # be directly executable. By constructing a pipeline by hand + # we can use sys.executable and not worry about #! lines or + # execute bits. + PIPE = subprocess.PIPE + fp = None + p1 = None + p2 = None + p3 = None + try: + fp = open(filename, "rb") + p1 = subprocess.Popen(self.pd_cmd_1, stdin=fp, stdout=PIPE) + p2 = subprocess.Popen(self.filt_cmd, stdin=p1.stdout, stdout=PIPE) + p3 = subprocess.Popen(self.pd_cmd_2, stdin=p2.stdout, stdout=PIPE) + + text = p3.stdout.read().decode("utf-8") + + finally: + if fp is not None: fp.close() + if p1 is not None: check_command(p1, self.pd_cmd_1) + if p2 is not None: check_command(p2, self.filt_cmd) + if p3 is not None: check_command(p3, self.pd_cmd_2) + + document, raw_metadata = extract_metadata(text) + metadata = {} + for k, v in raw_metadata.items(): + k = k.lower() + metadata[k] = self.process_metadata(k, v) - return output, metadata + return document, metadata def add_reader(readers): for ext in PandocReader.file_extensions: