This repository has been archived by the owner on Aug 22, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert.rb
executable file
·413 lines (329 loc) · 12.3 KB
/
convert.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
#!/bin/ruby
require 'fileutils'
require 'nokogiri'
require 'pandoc-ruby'
require 'yaml'
require 'csv'
require 'shellwords'
require 'ruby-progressbar'
require 'wikicloth'
require 'kramdown'
require 'fuzzystringmatch'
require 'active_support'
# Restrict WikiCloth's escaping to just 'nowiki' tags.
# Ruby warnings are muted while the constant is replaced (so no warning
# about changing constants is printed) and restored right afterwards.
saved_verbosity = $VERBOSE
$VERBOSE = nil
WikiCloth::WikiBuffer::HTMLElement.const_set(:ESCAPED_TAGS, %w[nowiki])
$VERBOSE = saved_verbosity
# Load configuration.
# YAML.load_file yields a falsy value for an empty document, so fall back
# to an empty hash to keep every lookup below (and later) safe.
config = YAML.load_file('config.yml') || {}
new_urls = YAML.load_file('new_urls.yaml') || {}

# Load settings, fall back to defaults otherwise
path = config['output'] || '/tmp/mw2md-output' # output root; git commands run here
path_sub = File.join(path, config['output_subdir'] || '') # converted pages are written here
authors_csv = config['authors_csv'] || 'authors.csv'
dump_xml = config['wiki_xml'] || 'dump.xml'
# History is on unless the config provides a value for it
history = config['history'].nil? ? true : config['history']

# String-similarity scorer, used to compare a page title with its first heading
fuzzymatch = FuzzyStringMatch::JaroWinkler.create(:pure)
# Functions (TODO: Move to a library file)

# Shift Markdown ATX headings so the shallowest one lands on level
# (1 + offset), keeping the relative depth of all deeper headings.
#
# html   - String of Markdown text (despite the parameter name).
# offset - Integer number of levels to keep in front of the shallowest
#          heading (default 0, i.e. promote it to a top-level heading).
#
# Returns the adjusted String, or the input unchanged when no heading is
# counted or the shallowest counted heading is already at level
# (1 + offset) or above.
def fix_headings(html, offset = 0)
  # Depth (number of '#') of every heading line
  depths = html.scan(/^(#+) /).map { |(hashes)| hashes.length }
  # The final heading is left out of the minimum-depth calculation
  # (kept as-is from the previous implementation)
  depths.pop
  shallowest = depths.min
  return html if shallowest.nil? || shallowest <= 1 + offset

  surplus = '#' * (shallowest - 1)
  prefix = '#' * offset
  # (#*) carries over ALL extra depth, so headings two or more levels below
  # the shallowest one are shifted too instead of being left untouched
  html.gsub(/^#{surplus}(#*) (.*)/, "#{prefix}\\1 \\2")
end
# Begin the logic
errors = {} # page title => wiki source, for pages whose conversion failed
puts "Processing #{dump_xml}. Output directory: #{path}"

# Create git repo
FileUtils.mkdir_p path
Process.wait Kernel.spawn('git init .', chdir: path) if history

# Open and process MediaWiki dump.
# File.open with a block closes the handle once the document is parsed and,
# unlike Kernel#open, never treats a leading "|" in the path as a command.
mw = File.open(dump_xml) { |mw_xml| Nokogiri::XML(mw_xml) }
# Authors
# Maps a lower-cased wiki username (CSV column 0) to { name:, email: }
# taken from columns 1 and 2; a blank name falls back to the username.
wiki_author = {}
CSV.foreach(authors_csv) do |col|
  wiki_name = col[0]
  # Blank lines parse as empty rows; skip them instead of crashing on nil
  next if wiki_name.nil?

  display_name = col[1].to_s.strip == '' ? wiki_name : col[1]
  wiki_author[wiki_name.downcase] = { name: display_name, email: col[2] }
end
# Discover all redirects
# :wiki_redirects maps the title of a redirecting page to its target title;
# :map is filled during conversion with title => output path.
redirect = { wiki_redirects: {}, map: {} }
# Compile the optional skip pattern once instead of once per page
skip_pattern = config['skip'] && Regexp.new(config['skip'], Regexp::IGNORECASE)
mw.css('page').each do |page|
  # at_css returns nil when there is no <redirect> element, whereas css
  # returns a node set that is truthy even when empty
  redirect_node = page.at_css('redirect')
  next unless redirect_node

  title = page.css('title').text
  next if title.match(/^File:/)
  next if skip_pattern && title.match(skip_pattern)

  redir = redirect_node['title'].to_s
  # puts "Redirect! #{title} => #{redir}"
  redirect[:wiki_redirects][title] = redir unless redir.strip == ''
end
# Break all revisions out from being grouped into pages
# Each entry of `revision` describes one revision plus page-level data
# (title, authors, final revision, revision count, last update).
revision = []
# Compile the optional skip pattern once instead of once per page
skip_pattern = config['skip'] && Regexp.new(config['skip'], Regexp::IGNORECASE)
mw.css('page').each do |page|
  title = page.css('title').text.strip
  next if title.match(/^File:/)
  next if skip_pattern && title.match(skip_pattern)

  page_revisions = page.css('revision')
  # A page without revisions has nothing to convert (and would otherwise
  # produce a nil revision when history is disabled)
  next if page_revisions.empty?

  authors = page.css('username').map { |u| u.text.downcase.strip }.sort.uniq
  final_revision = page_revisions.sort_by { |r| r.css('timestamp').text }.last
  revision_count = page_revisions.count
  # Pick the greatest timestamp by its text, the same way final_revision is
  # chosen; sorting the nodes themselves orders them by document position
  last_updated = page.css('timestamp').map(&:text).max
  # Without history only the final state of each page is converted
  page_revisions = [final_revision] unless history

  page_revisions.each do |rev|
    revision.push page: page,
                  revision: rev,
                  final_revision: final_revision,
                  title: title,
                  authors: authors,
                  timestamp: rev.css('timestamp').text,
                  revision_count: revision_count,
                  last_updated: last_updated
  end
end
# Sort all revisions by time, process, and commit
number_of_pages = revision.count
progress = ProgressBar.create format: '%a |%e |%b>%i| %p%% %t',
                              smoothing: 0.7,
                              throttle_rate: 1.0,
                              total: number_of_pages

revision.sort_by { |r| r[:timestamp] }.each do |rev_info|
  rev = rev_info[:revision]
  title = rev_info[:title]
  authors = rev_info[:authors]
  wikitext = rev.css('text').text
  wikitext_final = rev_info[:final_revision].css('text').text
  id = rev.css('id').text # only referenced by the commented-out metadata below
  timestamp = rev.css('timestamp').text
  username = rev.css('username').text
  # Strip the "/*" and "*/" markers from the edit comment
  comment = rev.css('comment').text.gsub(/\/\*|\*\//, '').strip

  # Derive directory parts and file name from the title ("A:B/C" => a/b + c)
  dirs = title.gsub(/&action=.*/, '').downcase.split(/[:\/]/)
  filename = dirs.pop.gsub(/&/, ' and ')
  dirs = dirs.join('/').strip
  dirs = nil if dirs.empty?

  # The category is always taken from the page's final revision
  category_pattern = /\[\[Category\:([^\]]*)\]\]/i
  category = wikitext_final.match(category_pattern)
  category = category[1].strip if category.class == MatchData
  category_dirs = category.downcase.strip.split(/[|\/]/).first if category

  # Wipe category text from the MediaWiki source so it's not converted
  # in-page (however, it's still preserved in metadata, thanks to above)
  # NOTE(review): this only changes the local copy of the final revision's
  # text; the text that gets converted is `wikitext` — confirm the intent.
  wikitext_final.gsub!(category_pattern, '') if category

  # First config 'catmatch' rule whose pattern matches the file name wins
  category_match = nil
  (config['catmatch'] || {}).each do |k, v|
    next if category_match
    category_match = v if filename.match(Regexp.new k)
  end

  (config['rewrite_file'] || {}).each do |k, v|
    filename.gsub!(Regexp.new(k), v)
  end

  dir = category_match || category_dirs || dirs || 'uncategorized'

  # In-place rewrite: `dir` is the same object as the category value that is
  # stored in the metadata below, so the metadata sees the rewritten value
  (config['rewrite_dir'] || {}).each do |k, v|
    dir.gsub!(Regexp.new(k), v)
  end

  # The wiki's front page becomes the site index
  if title.match(/^(home|main page)$/i)
    dir = ''
    filename = 'index'
  end

  dir.gsub!(/[_\s:]/, '-')
  dir.strip!

  # Rewrite some wiki constructs into HTML, to be processed into Markdown
  (config['rewrite_wiki'] || {}).each do |k, v|
    wikitext.gsub!(Regexp.new(k), v)
  end

  begin
    markdown = PandocRuby.convert(
      wikitext, :s, {
        from: :mediawiki,
        to: :markdown_github
      },
      'atx-headers'
    )
    conversion_error = false
  rescue
    begin
      # Fallback conversion, as pandoc bailed on us

      # Invoke WikiCloth
      wikicloth = WikiCloth::Parser.new(data: wikitext).to_html

      # Pass the WikiCloth HTML to Nokogiri for additional processing
      wiki_html = Nokogiri::HTML::DocumentFragment.parse(wikicloth)

      # Remove various MediaWiki-isms
      wiki_html.css('#toc').remove
      wiki_html.css('.editsection').remove
      wiki_html.css('a[name]').each { |n| n.remove if n.text.empty? }
      wiki_html.css('.mw-headline').each { |n| n.replace n.text }

      # Simplify tables (to increase the likelihood of conversion)
      wiki_html.css('table,tr,th,td').each do |n|
        n.keys.each { |key| n.delete(key) unless key.match(/span/) }
      end

      # Call upon Pandoc again, but this time with scrubbed HTML
      markdown = PandocRuby.convert(
        wiki_html, :s, {
          from: :html,
          to: :markdown_github
        },
        'atx-headers'
      )
    rescue
      puts "Error converting #{title}. Fallback even failed. #sadface"
      # Keep the wiki source so it can be saved under ./errors/ at the end
      errors[title.to_s] = wikitext
      next
    end

    conversion_error = true
  end

  next unless markdown

  # Demote headings if H1 exists
  markdown.gsub!(/^#/, '##') if markdown.match(/^# /)

  # Clean up generated Markdown
  output = markdown
           .gsub(/__TOC__/, "* ToC\n{:toc}\n\n") # Convert table of contents
           .gsub(/__NOTOC__/, '{:.no_toc}') # Handle explicit no-ToC
           .gsub(/\\([_#"'<>$])/, '\\1') # Unescape overly-escaped
           .gsub(/ "wikilink"\)/, ')') # Remove wikilink link classes
           .gsub(/^- /, '* ') # Change first item of bulleted lists
           .gsub(/^`(.*)`$/, ' \\1') # Use indents for blockquotes
           .gsub(/\[(\/\/[^ \]]*) ([^\]]*)\]/, '[\2](\1)') # handle // links
           .gsub(/(^\|+$)/, '') # Wipe out empty table rows

  # Custom markdown rewriting rules
  (config['rewrite_markdown'] || {}).each do |k, v|
    output.gsub!(Regexp.new(k), v)
  end

  # Last segment of the title, without namespace / parent pages
  title_pretty = title.split(/[:\/]/).pop

  metadata = {
    'title' => title_pretty,
    'category' => category_match || category_dirs,
    'authors' => authors.join(', '),
    'wiki_category' => category,
    'wiki_title' => title,
    'wiki_revision_count' => rev_info[:revision_count],
    'wiki_last_updated' => Date.parse(rev_info[:last_updated])
    # 'wiki_date' => Date.parse(timestamp)
    # "wiki_id" => id
  }

  # Wiki source with HTML comments removed (only comments without an inner
  # '-' or '>' are matched); computed once for all matchers below
  wikitext_uncommented = wikitext.gsub(/<!\-\-[^\-\->]*\-\->/m, '')

  # Add frontmatter based on matchers from config
  # (matchers apply to wiki source, which has data the conversion lacks)
  (config['frontmatter'] || {}).each do |k, v|
    matches = wikitext_uncommented.match(Regexp.new k)
    metadata[v] = matches.captures.join(', ').squeeze(' ').strip if matches
  end

  if conversion_error
    metadata['wiki_conversion_fallback'] = true
    metadata['wiki_warnings'] = 'conversion-fallback'
  end

  # Collect the label of every configured warning pattern found in the source
  (config['warnings'] || {}).each do |k, v|
    next unless wikitext_uncommented.match(Regexp.new k)

    warnz = metadata['wiki_warnings'].to_s.split(/, /)
    metadata['wiki_warnings'] = warnz.push(v).uniq.join(', ')
  end

  # Only non-empty metadata values make it into the YAML frontmatter
  frontmatter = metadata.select { |_, v| !v.nil? && !v.to_s.empty? }.to_yaml

  # Compare the first heading with the page title to decide whether a title
  # heading has to be added
  headings = output.match(/^#+ (.*)/)
  heading_diff = headings ? fuzzymatch.getDistance(title_pretty, headings[1]) : 0

  if heading_diff >= 0.75
    # The existing heading is similar enough to the page title
    complete = "#{frontmatter}---\n\n#{fix_headings output}"
  else
    # Add cleaned up title as a heading
    title_prettier = if title.match(/ /)
                       title_pretty
                     else
                       # Split CamelCase titles into separate words
                       title_pretty
                         .gsub(/([a-z])([A-Z0-9])/, '\1 \2')
                         .gsub(/([A-Z])([A-Z])([a-z])/, '\1 \2\3')
                     end

    complete = "#{frontmatter}---\n\n# #{title_prettier}\n\n#{fix_headings output, 1}"
  end

  ext = '.html.md'

  generated_path = "#{dir.strip}/#{filename.strip}#{ext}"
                   .downcase
                   .squeeze(' ')
                   .gsub(/[_\s:]/, '-')
                   .gsub(/-+/, '-')
                   .gsub(/["';]/, '')
                   .squeeze('-')

  # An explicit entry in new_urls overrides the generated path. Work on a
  # copy so the in-place rewrites below never alter the new_urls entry that
  # later revisions of the same page will read again.
  full_file = (new_urls[title] || generated_path).dup

  # puts "#{full_file}"

  (config['rewrite_full'] || {}).each do |k, v|
    full_file.gsub!(Regexp.new(k, Regexp::IGNORECASE), v)
  end
  # Always create the directory the file is really written to, even when no
  # rewrite rule ran (e.g. the path came from new_urls)
  dir = File.dirname full_file

  # Update progressbar
  progress.increment

  if wikitext.match(/^#REDIRECT/) || wikitext.strip.empty?
    # This revision is a redirect or empty: remove any earlier output
    # puts "REDIRECTED! #{title} => #{redirect[title]}"
    begin
      File.delete "#{path_sub}/#{full_file}"
    rescue
      # Best effort: the file may simply not exist yet
      # puts "Error deleting file: #{path_sub}/#{full_file}"
    end
  else
    begin
      FileUtils.mkdir_p "#{path_sub}/#{dir}"
    rescue
      puts "Error creating directory! #{path_sub}/#{dir}"
    end

    begin
      File.write "#{path_sub}/#{full_file}", complete
    rescue
      puts "Error writing file! #{path_sub}/#{full_file} — #{frontmatter.inspect}"
    end
  end

  # Add document path info to the redirect file, in mappings
  redirect[:map][title] = full_file.chomp(ext)

  # Add to git (when history is preserved)
  # NOTE(review): `redirect` only has the keys :wiki_redirects and :map, so
  # redirect[title] is always nil and this effectively reads "if history";
  # redirect[:wiki_redirects][title] may have been intended — confirm.
  unless comment.match(/^Created page with/) && redirect[title] || !history
    known_author = wiki_author[username.downcase]
    git_name = (known_author && known_author[:name]) || username.downcase
    git_email = known_author && known_author[:email].to_s
    # Unknown authors get a placeholder address derived from the username
    if git_email.nil? || git_email.empty?
      git_email = "#{username.downcase.gsub(/ /, '_')}@wiki.conversion"
    end

    git_comment = comment.strip.empty? ? "Updated #{title_pretty}" : comment
    git_comment = "Created #{title_pretty}" if comment.match(/^Created page with/)

    # Shell-escape strings before they hit the command line
    git_comment = Shellwords.escape git_comment
    git_author = Shellwords.escape "#{git_name} <#{git_email}>"
    git_date = Shellwords.escape timestamp

    # POSIX redirection: "&>" is a bashism that a plain sh parses as
    # "run in background", which would let successive commits race
    command = 'git add * && git commit -q -a ' \
              "--author=#{git_author} --date=#{git_date} -m #{git_comment}" \
              ' > /dev/null 2>&1'

    begin
      Process.wait Kernel.spawn(command, chdir: path)
    rescue
      puts 'Error committing!'
    end
  end
end
progress.finish

# Save the wiki source of every page that could not be converted.
# `errors` is a Hash (always truthy), so test for emptiness explicitly to
# avoid creating an empty errors/ directory on a clean run.
unless errors.empty?
  FileUtils.mkdir_p 'errors/'

  errors.each do |fname, text|
    filename_clean = fname.gsub(/[:\/<>&]/, '').squeeze(' ').gsub(/[: ]/, '_')
    File.write "errors/#{filename_clean}.html.md", text
  end
end

puts 'Conversion done!'

puts "#{errors.count} error#{errors.count != 1 ? 's' : ''} " \
     'found, and saved in ./errors/' if errors.count > 0

# Output redirect mappings
redirects_file = File.join(path_sub, 'redirects.yaml')
FileUtils.mkdir_p path_sub # may not exist yet when no page was written
File.write redirects_file, redirect.to_yaml

# Git is only used when history is preserved (the repo is only initialised
# in that case), so skip every git step otherwise
if history
  # Add redirect mapping file to git.
  # The file must be staged with `git add` before it can be committed; the
  # path is made absolute because the commands run with `path` as their
  # working directory. Arguments are passed as a list, so no shell quoting
  # is needed.
  staged = system('git', 'add', File.expand_path(redirects_file), chdir: path)
  system('git', 'commit', '-a', '-m', 'Added redirects', chdir: path) if staged

  # Clean up repo
  puts 'Re-packing repo:'
  Process.wait Kernel.spawn('git gc --aggressive', chdir: path)
end