-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathpandoc_reader.py
210 lines (177 loc) · 7.47 KB
/
pandoc_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import subprocess
import sys
import logging
logger = logging.getLogger(__name__)
try: import xml.etree.cElementTree as ET
except ImportError: import xml.etree.ElementTree as ET
try: from io import StringIO
except ImportError: from cStringIO import StringIO
from pelican import signals
from pelican.readers import BaseReader
from . import embed_metadata_filter
def check_command(proc, cmd):
    """Wait for PROC to finish and raise if it did not exit successfully.

    This is roughly what subprocess.check_call does, but for a process
    that has already been started.  CMD should be the command that was
    passed to subprocess.Popen; it is only used to label the
    subprocess.CalledProcessError raised on a nonzero exit status.
    """
    exit_status = proc.wait()
    if not exit_status:
        # Zero exit status: the command succeeded, nothing to report.
        return
    raise subprocess.CalledProcessError(exit_status, cmd)
def extract_metadata(text):
    """Split Pandoc's HTML output into the document and its metadata.

    A filter script converts Pandoc's internal representation of the
    metadata into an HTML tree structure (a <dl> placed before the
    first "<hr />") so that it makes it to the output with strings
    properly formatted.  This function separates that tree from the
    HTML for the document itself and decodes it into plain Python
    data: <dl> becomes a dict, <ul> becomes a list, and a <div> or
    <span> of class "metavalue" becomes the HTML string of its
    contents.

    Returns a (document, metadata) pair.  The expected shape of the
    metadata tree is enforced with assert statements.
    """

    def to_html(node):
        # Serialize NODE as HTML text (not XHTML).
        return (ET.tostring(node, encoding="utf-8", method="html")
                .decode("utf-8"))

    def convert_mapping(dl):
        # <dl>: alternating <dt>key</dt> <dd>one-child-value</dd> pairs.
        mapping = {}
        pending_key = None
        for item in dl:
            if item.tag == "dt":
                assert pending_key is None
                assert len(item) == 0
                pending_key = item.text
                continue
            assert item.tag == "dd"
            assert pending_key is not None
            assert len(item) == 1
            mapping[pending_key] = convert(item[0])
            pending_key = None
        return mapping

    def convert_sequence(ul):
        # <ul>: every <li> wraps exactly one value element.
        values = []
        for item in ul:
            assert item.tag == "li"
            assert len(item) == 1
            values.append(convert(item[0]))
        return values

    def convert_leaf(node):
        assert node.get("class") == "metavalue"
        # Temporarily blanking out the tag and tail seems to be the
        # least-hassle way to persuade ET.tostring to dump the
        # *contents* of the node but not the node itself.
        saved_tag = node.tag
        saved_tail = node.tail
        try:
            node.tag = None
            node.tail = None
            return to_html(node).strip()
        finally:
            node.tag = saved_tag
            node.tail = saved_tail

    def convert(node):
        tag = node.tag
        if tag == "dl":
            return convert_mapping(node)
        if tag == "ul":
            return convert_sequence(node)
        if tag == "div" or tag == "span":
            return convert_leaf(node)
        # Anything else is logged and yields None for that value.
        logger.error("unexpected metadata structure: " + to_html(node))
        return None

    header, _sep, body = text.partition("<hr />")
    body = body.strip()

    # Remove namespaces from all metadata elements while parsing them.
    # This is necessary because Pandoc thinks you have to put an
    # xmlns= on every use of <math>, and that makes ET.tostring
    # generate tags like <ns0:math>, which an HTML (not XHTML) parser
    # will not understand.
    parser = ET.iterparse(StringIO(header))
    for _event, element in parser:
        if "}" in element.tag:
            element.tag = element.tag.split("}", 1)[1]

    root = parser.root
    assert root.tag == "dl"
    return body, convert(root)
class PandocReader(BaseReader):
    """Pelican reader that renders Markdown sources through Pandoc.

    Each file is pushed through a three-stage pipeline: Pandoc parses
    the Markdown into its JSON representation, the
    embed_metadata_filter module is run over that JSON, and a second
    Pandoc renders the result as HTML5.  extract_metadata() then
    separates the metadata from the document HTML.

    Settings consulted through self.settings:
      PANDOC_ARGS        list of extra command-line arguments, passed
                         to both Pandoc invocations.
      PANDOC_EXTENSIONS  list of "+name" / "-name" strings enabling or
                         disabling Pandoc Markdown syntax extensions.
    """

    enabled = True
    file_extensions = ["md", "markdown", "mkd", "mdown"]

    def memoize_settings(self):
        """Load settings and compute the various subprocess invocations we
        will be using.

        Sets self.pd_cmd_1 (Markdown -> JSON), self.pd_cmd_2 (JSON ->
        HTML5) and self.filt_cmd (the metadata filter).  The work is
        done once per instance; later calls return immediately.
        """
        # The guard has to test an attribute this method really sets,
        # otherwise the commands are rebuilt (and re-logged) on every
        # call to read().
        if hasattr(self, "filt_cmd"):
            return

        extra_args = self.settings.get("PANDOC_ARGS", [])

        pos_extensions = set()
        neg_extensions = set()
        for ext in self.settings.get("PANDOC_EXTENSIONS", []):
            # A valid item is a sign followed by at least one character.
            sign = ext[0] if len(ext) >= 2 else None
            if sign == "-":
                neg_extensions.add(ext[1:])
            elif sign == "+":
                pos_extensions.add(ext[1:])
            else:
                logger.error("invalid PANDOC_EXTENSIONS item %r", ext)

        # For compatibility with older versions of this plugin that
        # parsed vaguely MMD-style metadata blocks themselves, we
        # default to +mmd_title_block.  Unfortunately,
        # +mmd_title_block causes Pandoc to mis-parse YAML and
        # possibly also native title blocks (see
        # https://github.com/jgm/pandoc/issues/2026).  Therefore,
        # if there's nothing about title blocks in PANDOC_EXTENSIONS,
        # we also explicitly disable YAML and native title blocks.
        title_block_exts = {
            "mmd_title_block",
            "pandoc_title_block",
            "yaml_metadata_block",
        }
        if title_block_exts.isdisjoint(pos_extensions | neg_extensions):
            pos_extensions.add("mmd_title_block")
            neg_extensions.add("pandoc_title_block")
            neg_extensions.add("yaml_metadata_block")

        # An extension that is both enabled and disabled is reported
        # and then dropped from both sets.
        both_exts = pos_extensions & neg_extensions
        if both_exts:
            logger.error("Pandoc syntax extensions both enabled and disabled: "
                         + " ".join(sorted(both_exts)))
            pos_extensions -= both_exts
            neg_extensions -= both_exts

        syntax = "markdown"
        if pos_extensions:
            syntax += "".join(sorted("+" + ext for ext in pos_extensions))
        if neg_extensions:
            syntax += "".join(sorted("-" + ext for ext in neg_extensions))

        pd_cmd_1 = ["pandoc", "-f", syntax, "-t", "json"]
        pd_cmd_2 = ["pandoc", "-f", "json", "-t", "html5"]
        # We don't know whether the extra_args are relevant to the reader or
        # writer, and it is harmless to supply them to both.
        pd_cmd_1.extend(extra_args)
        pd_cmd_2.extend(extra_args)

        self.pd_cmd_1 = pd_cmd_1
        self.pd_cmd_2 = pd_cmd_2
        self.filt_cmd = [sys.executable, embed_metadata_filter.__file__]

        logger.debug("Reader command: %s", " ".join(self.pd_cmd_1))
        logger.debug("Writer command: %s", " ".join(self.pd_cmd_2))
        logger.debug("Filter command: %s", " ".join(self.filt_cmd))

    @staticmethod
    def _reap_pipeline(procs):
        """Close our pipe ends, wait for every child, report the first failure.

        PROCS is a list of (process, command) pairs in pipeline order.
        Every process is waited for even if an earlier one failed.
        Returns the subprocess.CalledProcessError of the first stage
        that exited unsuccessfully, or None if all of them succeeded.
        """
        # Drop our copies of the read ends first, so that no stage can
        # stay blocked writing into a pipe that nobody will ever drain.
        for proc, _cmd in procs:
            if proc.stdout is not None:
                proc.stdout.close()

        first_failure = None
        for proc, cmd in procs:
            try:
                check_command(proc, cmd)
            except subprocess.CalledProcessError as err:
                if first_failure is None:
                    first_failure = err
        return first_failure

    def read(self, filename):
        """Convert FILENAME with Pandoc and return (document, metadata).

        document is the HTML produced for the body of the file.
        metadata is a dict whose keys are the lowercased metadata names
        and whose values have been passed through self.process_metadata.

        Raises subprocess.CalledProcessError if any stage of the
        pipeline exits unsuccessfully (the earliest failing stage is
        the one reported).  If starting the pipeline or reading its
        output raises, that exception propagates after the children
        already started have been cleaned up.
        """
        self.memoize_settings()

        # We do not use --filter because that requires the filter to
        # be directly executable.  By constructing a pipeline by hand
        # we can use sys.executable and not worry about #! lines or
        # execute bits.
        PIPE = subprocess.PIPE
        procs = []  # (process, command) pairs, in pipeline order
        try:
            with open(filename, "rb") as fp:
                p1 = subprocess.Popen(self.pd_cmd_1, stdin=fp, stdout=PIPE)
            procs.append((p1, self.pd_cmd_1))

            p2 = subprocess.Popen(self.filt_cmd, stdin=p1.stdout, stdout=PIPE)
            procs.append((p2, self.filt_cmd))
            # The child now owns its copy of the pipe; closing ours lets
            # p1 receive SIGPIPE if p2 exits early.
            p1.stdout.close()

            p3 = subprocess.Popen(self.pd_cmd_2, stdin=p2.stdout, stdout=PIPE)
            procs.append((p3, self.pd_cmd_2))
            p2.stdout.close()

            raw = p3.stdout.read()
        except BaseException:
            # Do not leave children or pipes behind, but let the
            # original error be the one the caller sees.
            self._reap_pipeline(procs)
            raise

        failure = self._reap_pipeline(procs)
        if failure is not None:
            raise failure

        text = raw.decode("utf-8")
        document, raw_metadata = extract_metadata(text)
        metadata = {}
        for k, v in raw_metadata.items():
            k = k.lower()
            metadata[k] = self.process_metadata(k, v)
        return document, metadata
def add_reader(readers):
    """Map every extension PandocReader handles to PandocReader.

    READERS is the object delivered with the readers_init signal; its
    reader_classes mapping is updated in place.
    """
    reader_cls = PandocReader
    for extension in reader_cls.file_extensions:
        readers.reader_classes[extension] = reader_cls
def register():
    """Connect add_reader to Pelican's readers_init signal.

    Presumably this is the entry point Pelican's plugin loader calls;
    confirm against the Pelican plugin documentation.
    """
    readers_init = signals.readers_init
    readers_init.connect(add_reader)