-
-
Notifications
You must be signed in to change notification settings - Fork 43
/
pandoc-ruby.rb
366 lines (321 loc) · 12 KB
/
pandoc-ruby.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
require 'open3'
require 'tempfile'
require 'timeout'
# Thin wrapper around the `pandoc` command line tool. A converter is built
# from an input string or a list of input files plus any number of pandoc
# options, and `#convert` (or one of the generated `to_*` methods) runs pandoc
# and returns its output as a string.
class PandocRuby
  # The executable used to run pandoc. Assign `PandocRuby.pandoc_path` to use
  # a custom executable path.
  @pandoc_path = 'pandoc'
  class << self
    attr_accessor :pandoc_path
  end

  # The available readers and their corresponding names. The keys are used to
  # generate methods and specify options to Pandoc.
  READERS = {
    'biblatex' => 'BibLaTeX bibliography',
    'bibtex' => 'BibTeX bibliography',
    'commonmark' => 'CommonMark Markdown',
    'commonmark_x' => 'CommonMark Markdown with extensions',
    'creole' => 'Creole 1.0',
    'csljson' => 'CSL JSON bibliography',
    'csv' => 'CSV table',
    'docbook' => 'DocBook',
    'docx' => 'Word docx',
    'dokuwiki' => 'DokuWiki markup',
    'endnotexml' => 'EndNote XML bibliography',
    'epub' => 'EPUB',
    'fb2' => 'FictionBook2 e-book',
    'gfm' => 'GitHub-Flavored Markdown',
    'haddock' => 'Haddock markup',
    'html' => 'HTML',
    'ipynb' => 'Jupyter notebook',
    'jats' => 'JATS XML',
    'jira' => 'Jira wiki markup',
    'json' => 'JSON version of native AST',
    'latex' => 'LaTex',
    'man' => 'roff man',
    'markdown' => "Pandoc's Markdown",
    'markdown_mmd' => 'MultiMarkdown',
    'markdown_phpextra' => 'PHP Markdown Extra',
    'markdown_strict' => 'original unextended Markdown',
    'mediawiki' => 'MediaWiki markup',
    'muse' => 'Muse',
    'native' => 'native Haskell',
    'odt' => 'ODT',
    'opml' => 'OPML',
    'org' => 'Emacs Org mode',
    'ris' => 'RIS bibliography',
    'rst' => 'reStructuredText',
    'rtf' => 'Rich Text Format',
    't2t' => 'txt2tags',
    'textile' => 'Textile',
    'tikiwiki' => 'TikiWiki markup',
    'tsv' => 'TSV table',
    'twiki' => 'TWiki markup',
    'vimwiki' => 'Vimwiki'
  }.freeze

  # The available string writers and their corresponding names. The keys are
  # used to generate methods and specify options to Pandoc.
  STRING_WRITERS = {
    'asciidoc' => 'AsciiDoc',
    'asciidoctor' => 'AsciiDoctor',
    'beamer' => 'LaTeX beamer slide show',
    'biblatex' => 'BibLaTeX bibliography',
    'bibtex' => 'BibTeX bibliography',
    'chunkedhtml' => 'zip archive of multiple linked HTML files',
    'commonmark' => 'CommonMark Markdown',
    'commonmark_x' => 'CommonMark Markdown with extensions',
    'context' => 'ConTeXt',
    'csljson' => 'CSL JSON bibliography',
    'docbook' => 'DocBook 4',
    'docbook4' => 'DocBook 4',
    'docbook5' => 'DocBook 5',
    'dokuwiki' => 'DokuWiki markup',
    'fb2' => 'FictionBook2 e-book',
    'gfm' => 'GitHub-Flavored Markdown',
    'haddock' => 'Haddock markup',
    'html' => 'HTML, i.e. HTML5/XHTML polyglot markup',
    'html5' => 'HTML, i.e. HTML5/XHTML polyglot markup',
    'html4' => 'XHTML 1.0 Transitional',
    'icml' => 'InDesign ICML',
    'ipynb' => 'Jupyter notebook',
    'jats_archiving' => 'JATS XML, Archiving and Interchange Tag Set',
    'jats_articleauthoring' => 'JATS XML, Article Authoring Tag Set',
    'jats_publishing' => 'JATS XML, Journal Publishing Tag Set',
    'jats' => 'alias for jats_archiving',
    'jira' => 'Jira wiki markup',
    'json' => 'JSON version of native AST',
    'latex' => 'LaTex',
    'man' => 'roff man',
    'markdown' => "Pandoc's Markdown",
    'markdown_mmd' => 'MultiMarkdown',
    'markdown_phpextra' => 'PHP Markdown Extra',
    'markdown_strict' => 'original unextended Markdown',
    'markua' => 'Markua',
    'mediawiki' => 'MediaWiki markup',
    'ms' => 'roff ms',
    'muse' => 'Muse',
    'native' => 'native Haskell',
    'opml' => 'OPML',
    'opendocument' => 'OpenDocument',
    'org' => 'Emacs Org mode',
    'pdf' => 'PDF',
    'plain' => 'plain text',
    'pptx' => 'PowerPoint slide show',
    'rst' => 'reStructuredText',
    'rtf' => 'Rich Text Format',
    'texinfo' => 'GNU Texinfo',
    'textile' => 'Textile',
    'slideous' => 'Slideous HTML and JavaScript slide show',
    'slidy' => 'Slidy HTML and JavaScript slide show',
    'dzslides' => 'DZSlides HTML5 + JavaScript slide show',
    'revealjs' => 'reveal.js HTML5 + JavaScript slide show',
    's5' => 'S5 HTML and JavaScript slide show',
    'tei' => 'TEI Simple',
    'xwiki' => 'XWiki markup',
    'zimwiki' => 'ZimWiki markup'
  }.freeze

  # The available binary writers and their corresponding names. The keys are
  # used to generate methods and specify options to Pandoc. Selecting one of
  # these writers routes the conversion through a temporary output file.
  BINARY_WRITERS = {
    'odt' => 'OpenOffice text document',
    'docx' => 'Word docx',
    'epub' => 'EPUB v3',
    'epub2' => 'EPUB v2',
    'epub3' => 'EPUB v3'
  }.freeze

  # All of the available Writers.
  WRITERS = STRING_WRITERS.merge(BINARY_WRITERS)

  # A shortcut method that creates a new PandocRuby object and immediately
  # calls `#convert`. Options passed to this method are passed directly to
  # `#new` and treated the same as if they were passed directly to the
  # initializer.
  def self.convert(*args)
    new(*args).convert
  end

  # Whether the output is collected through a temporary file instead of
  # pandoc's standard output. Defaults to false.
  attr_writer :binary_output
  def binary_output
    @binary_output ||= false
  end

  # The raw options (symbols, strings, hashes) collected so far. Defaults to
  # an empty array.
  attr_writer :options
  def options
    @options ||= []
  end

  # The command line fragment generated from `options` by `#convert`.
  # Defaults to an empty string.
  attr_writer :option_string
  def option_string
    @option_string ||= ''
  end

  # The name of the writer selected with the `to`/`t` option. Defaults to
  # 'html'.
  attr_writer :writer
  def writer
    @writer ||= 'html'
  end

  # The shell-quoted, space-separated list of input files (nil when the input
  # was given as a string).
  attr_accessor :input_files
  # The input text piped to pandoc's standard input (nil when the input was
  # given as a list of files).
  attr_accessor :input_string

  # Create a new PandocRuby converter object. The first argument contains the
  # input either as string or as an array of filenames.
  #
  # Any other arguments will be converted to pandoc options.
  #
  # Usage:
  # new("# A String", :option1 => :value, :option2)
  # new(["/path/to/file.md"], :option1 => :value, :option2)
  # new(["/to/file1.html", "/to/file2.html"], :option1 => :value)
  def initialize(*args)
    case args[0]
    when String
      self.input_string = args.shift
    when Array
      self.input_files = args.shift.map { |f| shell_quote(f) }.join(' ')
    end
    self.options = args
  end

  # Run the conversion. The convert method can take any number of arguments,
  # which will be converted to pandoc options. If options were already
  # specified in an initializer or reader method, they will be combined with
  # any that are passed to this method.
  #
  # Returns a string with the converted content. Raises a RuntimeError
  # carrying pandoc's error output when pandoc fails or times out.
  #
  # Example:
  #
  # PandocRuby.new("# text").convert
  # # => "<h1 id=\"text\">text</h1>\n"
  def convert(*args)
    self.options += args
    # Building the option string also records the writer, the binary flag and
    # the timeout as a side effect (see #set_pandoc_ruby_options).
    self.option_string = prepare_options(options)
    if binary_output
      convert_binary
    else
      convert_string
    end
  end
  alias to_s convert

  # Generate class methods for each of the readers in PandocRuby::READERS.
  # When one of these methods is called, it simply calls the initializer
  # with the `from` option set to the reader key, and returns the object.
  #
  # Example:
  #
  # PandocRuby.markdown("# text")
  # # => #<PandocRuby:0x007 @input_string="# text", @options=[{:from=>"markdown"}]
  class << self
    READERS.each_key do |r|
      define_method(r) do |*args|
        args += [{ :from => r }]
        new(*args)
      end
    end
  end

  # Generate instance methods for each of the writers in PandocRuby::WRITERS.
  # When one of these methods is called, it simply calls the `#convert` method
  # with the `to` option set to the writer key, thereby returning the
  # converted string.
  #
  # Example:
  #
  # PandocRuby.new("# text").to_html
  # # => "<h1 id=\"text\">text</h1>\n"
  WRITERS.each_key do |w|
    define_method(:"to_#{w}") do |*args|
      args += [{ :to => w.to_sym }]
      convert(*args)
    end
  end

  private

  # Wraps a file name in single quotes for the shell. An embedded single
  # quote is written as '\'' (close the quote, emit an escaped quote, reopen)
  # so that the name cannot terminate the quoting early.
  def shell_quote(path)
    "'#{path.to_s.gsub("'") { "'\\''" }}'"
  end

  # Execute the pandoc command for binary writers. A temp file is created
  # and written to, then read back into the program as a string, then the
  # temp file is closed and unlinked.
  def convert_binary
    tmp_file = Tempfile.new('pandoc-conversion')
    begin
      self.options += [{ :output => tmp_file.path }]
      self.option_string = "#{option_string} --output \"#{tmp_file.path}\""
      execute_pandoc
      IO.binread(tmp_file.path)
    ensure
      tmp_file.close
      tmp_file.unlink
    end
  end

  # Execute the pandoc command for string writers.
  def convert_string
    execute_pandoc
  end

  # Wrapper to run pandoc in a consistent, DRY way. The input files, when
  # present, are placed between the executable and the options.
  def execute_pandoc
    if input_files.nil?
      execute("#{PandocRuby.pandoc_path}#{option_string}")
    else
      execute("#{PandocRuby.pandoc_path} #{input_files}#{option_string}")
    end
  end

  # Runs the command, feeding it `input_string` on standard input, and
  # returns what it wrote to standard output.
  #
  # Raises a RuntimeError with the command's error output when it exits
  # unsuccessfully, or with a timeout message when it runs longer than the
  # `timeout` option (roughly one year when the option was never given).
  def execute(command)
    output = error = exit_status = nil
    @timeout ||= 31_557_600
    Open3.popen3(command) do |stdin, stdout, stderr, wait_thr|
      # Drain both pipes concurrently: reading them one after the other
      # deadlocks as soon as the child fills the pipe that is not being read.
      readers = [stdout, stderr].map { |io| Thread.new { io.read } }
      begin
        Timeout.timeout(@timeout) do
          begin
            stdin.puts input_string
          rescue Errno::EPIPE
            # The child exited before reading its input; its exit status and
            # error output (collected below) describe what went wrong.
          ensure
            stdin.close
          end
          output, error = readers.map(&:value)
          exit_status = wait_thr.value
        end
      rescue Timeout::Error => ex
        begin
          Process.kill 9, wait_thr.pid
        rescue Errno::ESRCH
          # The child finished on its own between the timeout and the kill.
        end
        error = "Pandoc timed out after #{@timeout} seconds.\n#{ex}"
      ensure
        # Stop readers that are still blocked so the pipes are not closed
        # underneath them when the popen3 block returns.
        readers.each(&:kill).each(&:join)
      end
    end
    return output if exit_status && exit_status.success?

    # Never raise with a blank message when the command failed silently.
    error = "Pandoc failed with no error output (#{exit_status})" if error.to_s.strip.empty?
    raise error
  end

  # Builds the option string to be passed to pandoc by iterating over the
  # opts passed in. Recursively calls itself in order to handle hash options.
  #
  # Array entries are treated as bare flags, hash entries as flag/argument
  # pairs. A pair whose value is nil or false is emitted as a bare flag.
  def prepare_options(opts = [])
    opts.inject('') do |string, (option, value)|
      string + if value
                 create_option(option, value)
               elsif option.respond_to?(:each_pair)
                 prepare_options(option)
               else
                 create_option(option)
               end
    end
  end

  # Takes a flag and optional argument, uses it to set any relevant options
  # used by the library, and returns string with the option formatted as a
  # command line options. If the option has an argument, it is also included.
  #
  # NOTE(review): the argument is wrapped in double quotes without escaping,
  # so `"`, `$`, backticks and backslashes inside it are still interpreted by
  # the shell that runs the command. Do not pass untrusted values.
  def create_option(flag, argument = nil)
    return '' unless flag

    flag = flag.to_s
    set_pandoc_ruby_options(flag, argument)
    return '' if flag == 'timeout' # pandoc doesn't accept timeouts yet

    if argument.nil?
      format_flag(flag)
    else
      "#{format_flag(flag)} \"#{argument}\""
    end
  end

  # Formats an option flag in order to be used with the pandoc command line
  # tool: single characters become short options, flags that already start
  # with `-` or `+` (e.g. `+RTS`, `-M512M`) are passed through unchanged, and
  # everything else becomes a long option with underscores turned into dashes.
  def format_flag(flag)
    if flag.length == 1
      " -#{flag}"
    elsif flag =~ /\A[-+]/
      " #{flag}"
    else
      " --#{flag.to_s.tr('_', '-')}"
    end
  end

  # Takes an option and optional argument and uses them to set any flags
  # used by PandocRuby: `t`/`to` records the writer and switches on binary
  # output for binary writers, `timeout` records the execution time limit.
  def set_pandoc_ruby_options(flag, argument = nil)
    case flag
    when 't', 'to'
      self.writer = argument.to_s
      self.binary_output = true if BINARY_WRITERS.key?(writer)
    when 'timeout'
      @timeout = argument
    end
  end
end