-
Notifications
You must be signed in to change notification settings - Fork 0
/
wp-to-rest.py
executable file
·392 lines (338 loc) · 13.1 KB
/
wp-to-rest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
#!/usr/bin/python3.5
#
# Copyright (c) 2018, James C. McPherson. All Rights Reserved.
#
# Available under the terms of the MIT license:
#
# Permission is hereby granted, free of charge, to any
# person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the
# Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the
# Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice
# shall be included in all copies or substantial portions of
# the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import bs4
import getopt
import os
import re
import sys
import time
# Usage/help text.  It is stripped of surrounding whitespace and then also
# installed as the module docstring, so both names carry the same text.
__USAGE = """
Usage:
wp-to-rest <directory tree to process> <output directory>
This is a very simple script which aims to turn WordPress posts
into reST-formatted documents suitable for importing into a blog
engine such as Nikola or Pelican.
This tool operates on a BEST EFFORT basis - each translated file
must be checked for accuracy prior to use with a blog engine.
It works for me, but you might need to hack on it for your purposes.
"""
__USAGE = __USAGE.strip()
__doc__ = __USAGE
# We search the posts that we find for three specific <div..> attributes:
#
# post-headline, post-bodycopy, post-footer
#
# post-headline gives us the title of the post without any extra bits
# post-bodycopy gives the text of the post which we want to massage into reST
# post-footer gives us the categories which the post appeared in.
#
# Rather than using the post-footer to tell us the publication date, we
# construct that from the post directory name and assert that all posts
# occurred at 10:00 in (time.strftime("%z")) - your utc offset.
#
# Once we have the information required for the post metadata, we start
# constructing the output file.
# Metadata header written at the top of every generated file.
# Fields: {0} title, {1} publication date, {2} UTC offset,
# {3} comma-separated tag list, {4} category.
METADATA_TMPL = """
.. title: {0}
.. date: {1}T10:00:00 {2}
.. tags: {3}
.. category: {4}\n
"""
# One hyperlink target line: {0} link label, {1} target.
URL_TMPL = ".. _{0}: {1}\n"
# Opening line of an image directive: {0} image source.
IMG_TMPL = "\n.. image:: {0}\n"
# Opening of a code-block directive; the block's text is appended after it.
PRE_TMPL = "\n.. code-block::\n "
def urlreplace(instr):
    """Return *instr* with the old blog's site-specific URL prefixes rewritten.

    The substitutions are applied one after another, in the order listed:
    the ".../rollerhttp" prefix collapses to "http", and the three upload /
    include prefixes are removed altogether.
    """
    substitutions = (
        ("http://www.jmcpdotcom.com/rollerhttp", "http"),
        ("http://www.jmcpdotcom.com/blog/wp-content/uploads", ""),
        ("http://www.jmcpdotcom.com/blog/wp-includes", ""),
        ("http://www.jmcpdotcom.com/wordpress/3.3/wp-content/uploads", ""),
    )
    rewritten = instr
    for prefix, replacement in substitutions:
        rewritten = rewritten.replace(prefix, replacement)
    return rewritten
def handle_img(tag):
    """Render an ``<img>`` element as a reST ``image`` directive.

    The directive line is built from the ``src`` attribute and passed
    through ``urlreplace()``.  ``height``, ``width`` and ``alt`` each add an
    option line when the attribute is present and non-empty.  A ``title``
    attribute is not converted; it only produces a diagnostic on stdout.

    Returns the directive text followed by a terminating newline.
    """
    option_templates = (
        ("height", " :height: {0}\n"),
        ("width", " :width: {0}\n"),
        ("alt", " :alt: {0}\n"),
    )
    source = tag.get("src")
    pieces = [urlreplace(IMG_TMPL.format(source))]
    for attribute, template in option_templates:
        value = tag.get(attribute)
        if value:
            pieces.append(template.format(value))
    if tag.get("title"):
        print("\timage ref {0} has 'title' when it should just have 'alt'".
              format(source))
    pieces.append("\n")
    return "".join(pieces)
def handle_thtd(tdata):
    """Return the list of cell texts for one table row.

    Iterates over the children of *tdata*.  Child elements are converted
    with ``tags_r()``; bare strings are kept as they are, except that a
    string consisting of exactly one newline is dropped.
    """
    row = []
    for child in tdata:
        if not isinstance(child, bs4.element.NavigableString):
            row.append(tags_r(child))
        elif child.string != '\n':
            row.append(child)
    return row
def format_thtd(cells, celltype):
    """Format one table row as grid-table text.

    Called on a per-row basis.

    cells    -- list of cell strings for this row.
    celltype -- character used for the border drawn below the row.  Passing
                "=" marks the header row, which additionally gets a top
                border of "-" characters above it.

    Returns the text line(s) for the row, each terminated by a newline.
    Column widths are derived from this row's cells only.
    """
    # Rewrite URLs before measuring.  Previously the borders were sized from
    # the raw text and the row was rewritten afterwards, so any cell holding
    # a rewritten URL ended up narrower than its border.  The rewrite
    # patterns contain neither spaces nor "|", so doing it per cell produces
    # the same row text as doing it on the assembled line.
    cells = [urlreplace(col) for col in cells]
    pieces = []
    # Compare by value: 'is "="' tested object identity, which is not
    # guaranteed to hold for equal strings.
    if celltype == "=":
        pieces.append(
            "+" + "".join("-" * (len(col) + 2) + "+" for col in cells) + "\n")
    pieces.append("| " + "".join(col + " | " for col in cells) + "\n")
    pieces.append(
        "+" + "".join(celltype * (len(col) + 2) + "+" for col in cells) + "\n")
    return "".join(pieces)
def handle_table(tag):
    """Convert a ``<table>`` element into grid-table text.

    The first row found is formatted as the header row ("=" border); every
    following row gets a "-" border.  Returns an empty string when the table
    contains no rows.
    """
    rows = []
    for child in tag.children:
        # Whitespace and other bare strings between rows carry no cells.
        if not isinstance(child, bs4.element.Tag):
            continue
        # The original test was 'jk.name is "tbody"', an identity comparison
        # that is not reliable for strings, so a <tbody> wrapper was
        # normally flattened into a single bogus row.  Simply skipping the
        # wrapper would discard every row inside it, so section wrappers
        # are treated as transparent and their child rows collected.
        if child.name in ("thead", "tbody", "tfoot"):
            rows.extend(handle_thtd(row) for row in child.children
                        if isinstance(row, bs4.element.Tag))
        else:
            rows.append(handle_thtd(child))
    if not rows:
        # Nothing to format; indexing rows[0] would raise IndexError.
        return ""
    retstr = format_thtd(rows[0], "=")
    for row in rows[1:]:
        retstr += format_thtd(row, "-")
    return retstr
def handle_pre(tag):
    """Render a preformatted element as a ``code-block`` directive.

    Starts from ``PRE_TMPL``.  String children are appended with a single
    space inserted after every newline; child elements are converted with
    ``tags_r()`` unless they are empty elements, which are dropped.  The
    result ends with two newlines.
    """
    chunks = [PRE_TMPL]
    for node in tag.contents:
        if not isinstance(node, bs4.element.Tag):
            chunks.append(node.replace("\n", "\n "))
        elif not node.is_empty_element:
            chunks.append(tags_r(node))
    chunks.append("\n\n")
    return "".join(chunks)
def handle_blockquote(tag):
    """Render a ``<blockquote>`` element as a ``::`` literal block.

    Child elements are converted with ``tags_r()``; string children are
    appended followed by a newline and a single space.  The result ends
    with a newline.
    """
    parts = ["::\n "]
    for node in tag.contents:
        if isinstance(node, bs4.element.Tag):
            parts.append(tags_r(node))
        else:
            parts.append(node + "\n ")
    parts.append("\n")
    return "".join(parts)
def handle_a(tag):
    """Render an anchor as a reST hyperlink reference.

    *tag* may be a bare string, which is used as the label directly, or an
    element, whose contents are converted with ``tags_r()``.

    Returns the label wrapped as a named reference and followed by a
    backslash-space pair.  The matching link targets are not produced here.
    """
    if isinstance(tag, bs4.element.NavigableString):
        label = tag
    else:
        label = tags_r(tag)
    # "\ " is not a valid Python escape sequence and triggers a
    # DeprecationWarning / SyntaxWarning; spell the backslash explicitly.
    # The resulting text is unchanged.
    return "`{0}`_\\ ".format(label)
def tags_r(arg):
    """Recursively convert the children of *arg* into reST text.

    Walks ``arg.contents``.  Strings are emitted with leading and trailing
    newlines stripped.  Elements are dispatched on their tag name to the
    matching handler or inline markup.  ``<br>`` is dropped.  ``<div>`` is
    reported on stdout and its contents are NOT converted.  Any tag without
    a dedicated branch is reported on stdout and then converted as if it
    were transparent.

    Returns the accumulated text as a single string.
    """
    pieces = []
    for el in arg.contents:
        # Handle every string node here, including empty ones.  Previously
        # an empty string fell through to the tag-name lookups below.
        if isinstance(el, bs4.element.NavigableString):
            pieces.append(el.strip("\n"))
            continue
        name = el.name
        if name == "br":
            continue
        elif name == "blockquote":
            pieces.append(handle_blockquote(el))
        elif name.startswith("img"):
            pieces.append(handle_img(el))
        elif name == "a":
            # Exact match: startswith("a") also sent <abbr>, <address>,
            # <article>, ... through the hyperlink handler.
            pieces.append(handle_a(el))
        elif name == "tt":
            pieces.append("``" + tags_r(el) + "``")
        elif name == "it" or name == "i":
            pieces.append("*" + tags_r(el) + "*")
        elif name == "b" or name == "strong":
            pieces.append("**" + tags_r(el) + "**")
        elif name == "ul":
            pieces.append("\n" + tags_r(el))
        elif name == "li":
            pieces.append("\n - " + tags_r(el))
        elif name == "ol":
            pieces.append("\n" + tags_r(el))
        elif name == "hr":
            pieces.append("\n........ \n")
        elif name == "h1":
            # "".join(el.contents) raised TypeError as soon as the heading
            # contained a nested tag; convert it like the other headings.
            heading = tags_r(el)
            pieces.append("\n" + heading)
            pieces.append("\n" + '=' * len(heading))
        elif name == "h2" or name == "h3":
            heading = tags_r(el)
            pieces.append("\n" + heading)
            pieces.append("\n" + '-' * len(heading))
        elif name == "p":
            pieces.append("\n" + tags_r(el) + "\n")
        elif name == "div":
            # Only reported; nothing is added to the output for a <div>.
            print("\tGot a <div> with attrs {0}".format(el.attrs))
        elif name == "table":
            pieces.append(handle_table(el))
            pieces.append("\n")
        elif name.startswith("pre") or name.startswith("code"):
            pieces.append(handle_pre(el))
            pieces.append("\n")
        elif name == "font":
            # The original concatenated a str with el.contents (a list) and
            # so always raised TypeError.  Emit the element's inner markup
            # as an indented raw-HTML block instead.
            raw_html = el.decode_contents().replace("\n", "\n   ")
            pieces.append("\n.. raw:: html\n\n   " + raw_html + "\n\n")
        else:
            print("\tunknown tag name: {0}".format(name))
            print("\ttag contents:\n{0}".format(el.contents))
            pieces.append(tags_r(el))
    return "".join(pieces)
def get_other_meta(footer):
    """
    Obtain the category list, slug and publish date from the footer.

    Returns a tuple ``(categories, slug, pubdate)``.  ``categories`` is the
    list of texts of the footer's category links.  ``slug`` and ``pubdate``
    are taken from the href of the comments link, with "/" replaced by "-".
    Both are None when the footer has no comments link, or when its href is
    missing or does not have the expected ".../blog/YYYY/MM/DD/.../#..."
    form; the caller then has to derive them some other way.
    """
    catlist = [elem.text
               for elem in footer.findAll("a", {'rel': 'category tag'})]
    theREs = re.compile(r"(.*/blog/)((\d{4}/\d{2}/\d{2})(.*))/#.*")
    try:
        slugall = footer.find("a", {'class': 'comments-link'}).get('href')
    except AttributeError:
        # find() returned None: no comments link in this footer.
        return catlist, None, None
    # A link without href, or one in an unexpected format, used to crash
    # here (match(None) / None.group).  Use the same fallback as above.
    matched = theREs.match(slugall) if slugall else None
    if matched is None:
        return catlist, None, None
    slugfn = matched.group(2).replace('/', '-')
    pubdate = matched.group(3).replace('/', '-')
    return catlist, slugfn, pubdate
def get_post_title(post_head):
    """Return the post title as a string with newlines and tabs removed.

    The text of the first ``h1`` found in *post_head* is used; if that
    lookup fails, the first ``h2`` is tried.  When neither yields a title,
    an "ERROR: ..." placeholder naming *post_head* is returned instead.
    """
    for heading in ("h1", "h2"):
        try:
            title = post_head.find(heading).text
        except AttributeError:
            continue
        break
    else:
        title = "ERROR: NO TITLE DECLARATION FOUND IN {0}".format(
            post_head)
    without_newlines = title.replace("\n", "")
    return without_newlines.replace("\t", "")
def get_list_of_posts(startdir, strippath):
    """
    Walk *startdir* and return a dict mapping a slug to a file path.

    The slug is the path of a directory that directly contains at least one
    file, with every occurrence of *strippath* removed (when *strippath* is
    truthy) and every "/" replaced by "-".  There is one entry per such
    directory: when a directory holds several files, the last one listed by
    os.walk() is the one recorded.
    """
    posts = {}
    for dirname, _subdirs, filenames in os.walk(startdir):
        if not filenames:
            continue
        slug = dirname.replace(strippath, "") if strippath else dirname
        slug = slug.replace("/", "-")
        # All files of a directory share one key, so only the final one in
        # the listing survives.
        posts[slug] = os.path.join(dirname, filenames[-1])
    return posts
if __name__ == "__main__":
    # Script entry point: find the files below <startdir>, convert every
    # post found in each of them and write one .rst file per post into
    # <outfdir>.
    if len(sys.argv) < 3:
        # Missing arguments used to surface as a bare IndexError; show the
        # usage text instead.
        sys.exit(__doc__)
    startdir = sys.argv[1]
    outfdir = sys.argv[2]
    # get the list of files to process
    allposts = get_list_of_posts(startdir, None)
    utcoff = time.strftime("%z")
    if not os.path.exists(outfdir):
        os.makedirs(outfdir)
    for slug in allposts:
        fname = allposts[slug]
        print("opening {0}".format(fname))
        # Close the input file as soon as it has been parsed.
        with open(fname, "r") as infile:
            soup = bs4.BeautifulSoup(infile, "html.parser")
        # How many posts do we have in this file? I really wish that
        # findNext() was an iterator. Instead, find all the headers,
        # bodycopy and footer elements, and make the assumption that
        # they've been returned in associated order. We rely on the
        # human to check this.
        post_heads = soup.findAll(name="div", attrs={'class': 'post-headline'})
        post_bodies = soup.findAll(name="div", attrs={'class': 'post-bodycopy'})
        post_footers = soup.findAll(name="div", attrs={'class': 'post-footer'})
        if not (len(post_heads) == len(post_bodies) == len(post_footers)):
            # urg
            print("Differing numbers of headers, bodies and footers in {0}.\n"
                  "Manual pre-processing required, sorry\n".format(fname))
            # findAll() has to be called on each footer; calling it on the
            # result list itself raised AttributeError, so this diagnostic
            # path crashed instead of reporting.
            comment_links = [
                link
                for footer in post_footers
                for link in footer.findAll("a", {"class": "comments-link"})]
            print("headers: {0}\n{1}\n----\n # bodies {2}\n----\nfooters: {3}\n{4}".format(
                len(post_heads), post_heads, len(post_bodies),
                len(post_footers), comment_links))
            continue
        for idx, (head, body, footer) in enumerate(
                zip(post_heads, post_bodies, post_footers)):
            title = get_post_title(head)
            print("\tidx {0} has title :: {1}".format(idx, title))
            # Get our list of hrefs
            post_hrefs = {}
            for ref in body.findAll("a"):
                if ref.has_attr('id'):
                    post_hrefs[ref['id']] = ref.text
                elif ref.has_attr('href'):
                    post_hrefs[ref.text] = ref['href']
                # Anchors with neither id nor href have no target to
                # record; ref['href'] used to raise KeyError for them.
            outstr = tags_r(body)
            categories, slugfn, pubdate = get_other_meta(footer)
            if slugfn is None:
                # new-style footer, have to use a different approach
                pubdate = slug[0:10]
                slugfn = slug
            outfn = os.path.join(outfdir, slugfn) + ".rst"
            # A post without any category used to die on categories[0].
            category = categories[0] if categories else ""
            metablock = METADATA_TMPL.format(title,
                                             pubdate, utcoff,
                                             ", ".join(categories),
                                             category)
            urlblock = "".join(URL_TMPL.format(label, post_hrefs[label])
                               for label in post_hrefs)
            # Use the shared helper rather than a second copy of its
            # rules.  The copy here lacked the wordpress/3.3 uploads
            # prefix, so link targets now also get that rewrite, matching
            # the body text.
            urlblock = urlreplace(urlblock)
            with open(outfn, "w") as outf:
                outf.write(metablock)
                outf.write(outstr)
                outf.write("\n")
                outf.write(urlblock)
                outf.write("\n")
        print("Finished processing {0}\n".format(fname))