diff --git a/README.md b/README.md
index 08055fc..64987f6 100644
--- a/README.md
+++ b/README.md
@@ -1,25 +1,23 @@
pandoc_reader
=============
-A pandoc [markdown] reader plugin for [pelican]
+A pandoc [markdown][] reader plugin for [pelican][]
Requirements
------------
- - [pandoc] in $PATH
-
+ - [pandoc][] in `$PATH`
Installation
------------
Instructions for installation of pelican plugins can be obtained from the [pelican plugin manual](https://github.com/getpelican/pelican-plugins/blob/master/Readme.rst).
-
Configuration
-------------
-Additional command line parameters can be passed to pandoc via the PANDOC_ARGS parameter.
+Additional command line parameters can be passed to pandoc via the `PANDOC_ARGS` parameter.
PANDOC_ARGS = [
'--mathjax',
@@ -29,14 +27,19 @@ Additional command line parameters can be passed to pandoc via the PANDOC_ARGS p
'--number-sections',
]
-Pandoc's markdown extensions can be enabled or disabled via the
-PANDOC_EXTENSIONS parameter.
+Pandoc's syntactic extensions to Markdown can be enabled or disabled via the
+`PANDOC_EXTENSIONS` parameter.
PANDOC_EXTENSIONS = [
'+hard_line_breaks',
'-citations'
]
+File Metadata
+-------------
+
+For compatibility with older versions of this plugin that parsed MultiMarkdown-like title blocks internally, the [`mmd_title_block`][mmd_title_block] syntax extension is enabled by default. Unfortunately, this causes Pandoc to misinterpret YAML metadata and possibly also native title blocks (see [Pandoc issue 2026][]). Therefore, those metadata formats are *disabled* by default. To revert to Pandoc's default behavior (accepting native title blocks and YAML metadata, but not MMD title blocks), include `-mmd_title_block` in `PANDOC_EXTENSIONS`.
+
Contributing
------------
@@ -50,3 +53,5 @@ Contributing
[markdown]: http://daringfireball.net/projects/markdown/
[pandoc]: http://johnmacfarlane.net/pandoc/
[pelican]: http://getpelican.com
+[mmd_title_block]: http://johnmacfarlane.net/pandoc/README.html#extension-mmd_title_block
+[Pandoc issue 2026]: https://github.com/jgm/pandoc/issues/2026
diff --git a/embed_metadata_filter.py b/embed_metadata_filter.py
new file mode 100644
index 0000000..52b2175
--- /dev/null
+++ b/embed_metadata_filter.py
@@ -0,0 +1,47 @@
+# This is a filter script which embeds all of the metadata parsed by
+# Pandoc into the HTML output, where the main body of the reader can
+# pick it up. In order to preserve Pandoc's translation of Markdown
+# in metadata values, we convert the metadata structure into an HTML
+# tree structure. A horizontal rule (<hr>) separates the translated
+# metadata from the document itself.
+#
+# See http://johnmacfarlane.net/pandoc/scripting.html for documentation
+# of the JSON-serialized AST that we are manipulating.
+
+import json
+import sys
+
+def N(t, c, cls=None):
+    """Build one node of the JSON-serialized Pandoc AST.
+
+    T is the node's type tag and C its contents.  When CLS is given,
+    the contents become the pair [["", [CLS], []], C], i.e. C preceded
+    by an attribute triple whose only class is CLS.
+    """
+    contents = c if cls is None else [["", [cls], []], c]
+    return {"t": t, "c": contents}
+
+def cvt_metainlines(c):
+    """Wrap the inline elements C in a Span of class "metavalue", and
+    return that Span as the sole content of a Plain node."""
+    span = N("Span", c, "metavalue")
+    return N("Plain", [span])
+
+def cvt_metamap(c):
+    """Convert the metadata mapping C into a DefinitionList node.
+
+    Entries are emitted in sorted order.  Each entry pairs a one-element
+    list holding the key as a Str node with a nested list holding the
+    converted value.
+    """
+    entries = []
+    for key, val in sorted(c.items()):
+        term = [N("Str", key)]
+        definitions = [[convert(val)]]
+        entries.append((term, definitions))
+    return N("DefinitionList", entries)
+
+def _cvt_metabool(c):
+    # str(True).lower() gives "true"; str(False).lower() gives "false".
+    return cvt_metainlines([N("Str", str(c).lower())])
+
+def _cvt_metastring(c):
+    # A plain string is treated as a single Str inline.
+    return cvt_metainlines([N("Str", c)])
+
+def _cvt_metablocks(c):
+    # Block-level content goes into a Div rather than a Span.
+    return N("Div", c, "metavalue")
+
+def _cvt_metalist(c):
+    # Each list item is converted and wrapped in its own one-element list.
+    return N("BulletList", [[convert(item)] for item in c])
+
+# Dispatch table used by convert(): maps the "t" tag of a metadata node
+# to the function that converts that node's "c" contents.
+CONVERTERS = {
+    "MetaMap": cvt_metamap,
+    "MetaInlines": cvt_metainlines,
+    "MetaBool": _cvt_metabool,
+    "MetaString": _cvt_metastring,
+    "MetaBlocks": _cvt_metablocks,
+    "MetaList": _cvt_metalist,
+}
+
+def convert(item):
+    """Convert the metadata node ITEM by passing its "c" contents to the
+    CONVERTERS entry for its "t" tag.  A tag with no entry raises
+    KeyError."""
+    handler = CONVERTERS[item["t"]]
+    return handler(item["c"])
+
+def main():
+    """Read a JSON document from stdin, prepend the converted metadata
+    and a HorizontalRule to its second element, and write the result
+    to stdout as compact JSON."""
+    doc = json.load(sys.stdin)
+    header = doc[0]
+    # Converted metadata first, then the rule that separates it from
+    # the original contents of doc[1].
+    blocks = [cvt_metamap(header['unMeta']), N("HorizontalRule", [])]
+    blocks += doc[1]
+    json.dump([header, blocks], sys.stdout, separators=(',', ':'))
+
+# pandoc_reader imports this filter script in order to learn its actual
+# filename, so run the filter only when invoked as __main__.
+if __name__ == '__main__':
+    main()
diff --git a/pandoc_reader.py b/pandoc_reader.py
index 87c7735..98f2133 100644
--- a/pandoc_reader.py
+++ b/pandoc_reader.py
@@ -1,44 +1,206 @@
import subprocess
+import sys
+
+import logging
+logger = logging.getLogger(__name__)
+
+try: import xml.etree.cElementTree as ET
+except ImportError: import xml.etree.ElementTree as ET
+
+try: from io import StringIO
+except ImportError: from cStringIO import StringIO
+
from pelican import signals
from pelican.readers import BaseReader
-from pelican.utils import pelican_open
+
+from . import embed_metadata_filter
+
+def check_command(proc, cmd):
+    """Wait for PROC to finish and, roughly as subprocess.check_call
+    does, raise subprocess.CalledProcessError if it did not exit
+    successfully.  CMD should be the command that was passed to
+    subprocess.Popen; it is recorded in the exception."""
+    exit_status = proc.wait()
+    if not exit_status:
+        return
+    raise subprocess.CalledProcessError(exit_status, cmd)
+
+def extract_metadata(text):
+ """A filter script converts Pandoc's internal representation of the
+ metadata into an HTML tree structure so that it will make it to
+ the output, with strings properly formatted. Separate that
+ tree from the HTML for the document itself, and decode it into
+ Pelican's desired representation."""
+
+ def walk_dl(e):
+ rv = {}
+ key = None
+ for child in e:
+ if child.tag == "dt":
+ assert key is None
+ assert len(child) == 0
+ key = child.text
+ else:
+ assert child.tag == "dd"
+ assert key is not None
+ assert len(child) == 1
+ rv[key] = walk(child[0])
+ key = None
+ return rv
+
+ def walk_ul(e):
+ rv = []
+ for child in e:
+ assert child.tag == "li"
+ assert len(child) == 1
+ rv.append(walk(child[0]))
+ return rv
+
+ def walk_value(e):
+ assert e.get("class") == "metavalue"
+ # Setting e.tag and e.tail to None temporarily seems to be the
+ # least-hassle way to persuade ET.tostring to dump the *contents*
+ # of e but not e itself.
+ tag = e.tag
+ tail = e.tail
+ try:
+ e.tag = None
+ e.tail = None
+ return (ET.tostring(e, encoding="utf-8", method="html")
+ .decode("utf-8").strip())
+ finally:
+ e.tag = tag
+ e.tail = tail
+
+ def walk(e):
+ if e.tag == "dl":
+ return walk_dl(e)
+ elif e.tag == "ul":
+ return walk_ul(e)
+ elif e.tag == "div" or e.tag == "span":
+ return walk_value(e)
+ else:
+ logger.error("unexpected metadata structure: " +
+ ET.tostring(e, encoding="utf-8", method="html")
+ .decode("utf-8"))
+
+
+ metadata, _, document = text.partition("
")
+ document = document.strip()
+
+ # Remove namespaces from all metadata elements while parsing them.
+ # This is necessary because Pandoc thinks you have to put an
+ # xmlns= on every use of