-
Notifications
You must be signed in to change notification settings - Fork 0
/
gdocs2html5.py
289 lines (261 loc) · 11.9 KB
/
gdocs2html5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
# -*- coding: utf8 -*-
"""This module converts Google Docs HTML to structured HTML5"""
import sys
import os
import re
import urllib2
import subprocess
import libxml2
import libxslt
from tidylib import tidy_document
from xhtmlpremailer import xhtmlPremailer
from lxml import etree
import magic
from functools import partial
import html2text
# Directory containing this module; the XSLT stylesheets, the XML catalog and
# the debug output directory are all resolved relative to it.
CURRENT_DIR = os.path.dirname(__file__)
# Path of the XML catalog that init_libxml2() hands to libxml2.loadCatalog().
XHTML_ENTITIES = os.path.join(
    CURRENT_DIR, 'xslt_gdocs_structured_html', 'catalog_xhtml', 'catalog.xml')
# Module-level switch read by image_puller(); gdocs_to_html5() overwrites it
# on every call with its bDownloadImages argument.
download_files_from_google = False
# LaTeX strings read by gmath2mathml() by position; gdocs_to_html5() resets
# or refills this list on every call.
gmath_latex = []
def tidy2xhtml(html):
    """Clean up Google Docs HTML with HTML Tidy and return it as XHTML.

    Returns a tuple ``(xhtml, {})``.  The second element is always an empty
    dict so that the function fits the transform pipeline signature; the
    messages reported by Tidy are currently discarded.
    """
    tidy_options = {
        'output-xhtml': 1,  # XHTML instead of HTML4
        'indent': 0,  # no indenting: the extra whitespace/linefeeds cause big problems later
        'tidy-mark': 0,  # no tidy meta tag in output
        'wrap': 0,  # no wrapping
        'alt-text': '',  # help ensure validation
        'doctype': 'strict',  # little sense in transitional for tool-generated markup
        'force-output': 1,  # may not get what you expect, but you will get something
        'numeric-entities': 1,  # remove HTML entities like e.g. nbsp
        'clean': 1,
        'bare': 1,
        'word-2000': 1,
        'drop-proprietary-attributes': 1,
        'enclose-text': 1,  # always enclose text in body with <p>...</p>
        'logical-emphasis': 1,  # transform <i> and <b> to <em> and <strong>
    }
    # TODO: parse errors from tidy process
    tidied, _tidy_messages = tidy_document(html, options=tidy_options)
    return tidied, {}
def premail(xhtml):
    """Inline the stylesheet rules into the tags they apply to.

    Premailers usually do this for old e-mail clients.  A special XHTML
    premailer is used here because it does not destroy the XML structure.

    Returns a tuple ``(premailed_xhtml, {})``; the empty dict only satisfies
    the transform pipeline signature.
    """
    return xhtmlPremailer(xhtml).transform(), {}
def _blahtex_to_mathml(tex):
    """Run Blahtex on one TeX string and return its parsed XML output.

    The TeX source is appended to the returned element as an
    ``<annotation encoding="math/tex">`` child.

    Args:
        tex: TeX source, either a byte string (taken to be UTF-8) or a
            unicode string.

    Returns:
        The lxml element parsed from the Blahtex output.

    Raises:
        OSError: if the ``blahtexml`` binary cannot be started.
        UnicodeDecodeError: if a byte string is not valid UTF-8.
        lxml.etree.XMLSyntaxError: if the Blahtex output is not XML.
    """
    # The pipe to Blahtex needs bytes, while lxml only accepts unicode (or
    # pure ASCII) as element text, so keep one form of each.
    if isinstance(tex, bytes):
        tex_bytes = tex
        tex_text = tex.decode('utf-8')
    else:
        tex_bytes = tex.encode('utf-8')
        tex_text = tex
    # TODO: Ubuntu has 'blahtexml', when compiled by yourself the binary name
    # will be 'blahtex'. This needs to be more dynamic!
    command = ['blahtexml', '--mathml']
    # Run the program and pipe its input and output through variables.
    process = subprocess.Popen(
        command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    # TODO: Catch blahtex processing errors!
    # communicate() feeds STDIN, collects STDOUT and waits for the exit.
    output, _ = process.communicate(tex_bytes)
    mathml = etree.fromstring(output)
    annotation = etree.Element("annotation", encoding="math/tex")
    annotation.text = tex_text
    mathml.append(annotation)
    # What the resulting element looks like. Needs further processing
    # (semantics, enclose all math into one tag, move annotation to the
    # right position):
    # <blahtex>
    #   <mathml>
    #     <markup>
    #       <mrow><mi>x</mi><mo lspace="0.278em" rspace="0.278em">=</mo><msup><mi>d</mi><mn>2</mn></msup></mrow>
    #     </markup>
    #   </mathml>
    #   <annotation encoding="math/tex">x={d}^{2}</annotation>
    # </blahtex>
    return mathml


def tex2mathml(xml):
    """Use Blahtex transformation from TeX to XML.
    http://gva.noekeon.org/blahtexml/

    Every ``nohtml:tex`` element carrying a ``tex`` attribute gets the
    Blahtex result for its URL-unquoted attribute value appended as a child.

    Args:
        xml: lxml element tree to modify in place.

    Returns:
        The same tree object.
    """
    # Do not run blahtex if we are not on a POSIX system (Linux or Mac)!
    if os.name != 'posix':
        print('Error: Math will not be converted! Blahtex is only available on Linux!')
        return xml
    find_formulas = etree.XPath(
        '//nohtml:tex[@tex]', namespaces={'nohtml': 'http://nohtml'})
    for formular in find_formulas(xml):
        tex_source = urllib2.unquote(formular.get('tex'))
        formular.append(_blahtex_to_mathml(tex_source))
    return xml


def gmath2mathml(xml):
    """Convert the gmath formulas of the document to XML with Blahtex.

    The n-th ``nohtml:gmath`` element is paired with the n-th entry of the
    module-level ``gmath_latex`` list; when the list is too short a
    placeholder string is sent to Blahtex instead.

    Args:
        xml: lxml element tree to modify in place.

    Returns:
        The same tree object.
    """
    # Do not run blahtex if we are not on a POSIX system (Linux or Mac)!
    if os.name != 'posix':
        print('Error: Math will not be converted! Blahtex is only available on Linux!')
        return xml
    find_formulas = etree.XPath(
        '//nohtml:gmath', namespaces={'nohtml': 'http://nohtml'})
    for position, formular in enumerate(find_formulas(xml)):
        try:
            tex_source = gmath_latex[position]
        except IndexError:
            # No LaTeX was extracted for this formula.
            tex_source = 'KixGdocsEerror'
        formular.append(_blahtex_to_mathml(tex_source))
    return xml
# Get the filename without extension from a URL
# TODO: This does not work reliably
# def getNameFromUrl(s):
# return os.path.splitext(urllib2.unquote(os.path.basename(urlparse(s).path)))[0]
# Downloads images from Google Docs and sets metadata for further processing
def download_images(xml):
    """Download the images referenced by ``nohtml:image`` elements.

    For every image that could be fetched and whose detected MIME type is
    PNG, JPEG or GIF, the element gets a ``mime-type`` attribute and its text
    is set to a generated file name (``gd-0001.jpg``, ``gd-0002.png``, ...;
    the number is the 1-based position of the element in the document).
    Images without a ``src`` attribute, images that cannot be downloaded and
    images of any other type are left untouched.

    Args:
        xml: lxml element tree to modify in place.

    Returns:
        A tuple ``(xml, objects)`` where ``objects`` maps each generated file
        name to the downloaded image content.
    """
    # Only these three image formats are allowed.
    # Note: SVG is currently (2012-03-08) not supported by GDocs.
    extensions = {
        'image/jpeg': '.jpg',
        'image/png': '.png',
        'image/gif': '.gif',
    }
    objects = {}  # image contents will be saved here
    xpathImages = etree.XPath(
        '//nohtml:image', namespaces={'nohtml': 'http://nohtml'})
    for position, image in enumerate(xpathImages(xml)):
        strImageUrl = image.get('src')
        if not strImageUrl:
            # Nothing to download for an image without a source URL.
            continue
        print("Download GDoc Image: " + strImageUrl)  # Debugging output
        try:
            response = urllib2.urlopen(strImageUrl)
            try:
                strImageContent = response.read()
            finally:
                response.close()
        except (IOError, ValueError) as error:
            # IOError covers urllib2.URLError and socket errors (e.g. no
            # network); ValueError is raised for URLs of unknown type.
            # One broken image must not abort the whole conversion.
            print("Error: Could not download GDoc Image: %s (%s)"
                  % (strImageUrl, error))
            continue
        # get Mime type from image
        strImageMime = magic.whatis(strImageContent)
        extension = extensions.get(strImageMime)
        if extension is None:
            continue
        image.set('mime-type', strImageMime)
        strImageName = "gd-%04d%s" % (position + 1, extension)
        image.text = strImageName
        # add contents of image to object
        objects[strImageName] = strImageContent
    return xml, objects
# Initialize libxml2, e.g. transforming XHTML entities to valid XML
def init_libxml2(xml):
    """Set up libxml2, e.g. for transforming XHTML entities to valid XML.

    Loads the XHTML entity catalog and turns on the line-number and
    entity-substitution defaults.  The input is handed back unchanged as
    ``(xml, {})`` so the function can sit in the transform pipeline.
    """
    libxml2.loadCatalog(XHTML_ENTITIES)
    for enable_default in (libxml2.lineNumbersDefault,
                           libxml2.substituteEntitiesDefault):
        enable_default(1)
    return xml, {}
def xslt(xsl, xml):
    """Apply one XSLT stylesheet to an XML string with libxml2/libxslt.

    Args:
        xsl: file name of the stylesheet inside the
            ``xslt_gdocs_structured_html`` directory next to this module.
        xml: the XML document to transform, as a string.

    Returns:
        A tuple ``(xml_result, {})``: the serialized transformation result
        and an empty dict to satisfy the transform pipeline signature.
    """
    # TODO: Needs a cleaner solution
    xsl = os.path.join(CURRENT_DIR, 'xslt_gdocs_structured_html', xsl)
    style_doc = libxml2.parseFile(xsl)
    style = libxslt.parseStylesheetDoc(style_doc)
    # The libxml2/libxslt objects are not garbage collected, so every one of
    # them is released in a finally clause, even when a later step raises.
    try:
        # doc = libxml2.parseFile(afile)  # another way, just for debugging
        doc = libxml2.parseDoc(xml)
        try:
            result = style.applyStylesheet(doc, None)
            try:
                # style.saveResultToFilename(os.path.join('output', docFilename + '_xyz.xml'), result, 1)  # another way, just for debugging
                xml_result = style.saveResultToString(result)
            finally:
                result.freeDoc()
        finally:
            doc.freeDoc()
    finally:
        # NOTE(review): style_doc is not freed separately, as in the original
        # code; presumably freeStylesheet() releases it as well - confirm.
        style.freeStylesheet()
    return xml_result, {}
def tex2mathml_transform(xml):
    """Pipeline step: convert the TeX and gmath formulas with Blahtex.

    The XML string is parsed with lxml, passed through tex2mathml() and then
    gmath2mathml(), and serialized again.

    Returns a tuple ``(xml_string, {})``.
    """
    tree = etree.fromstring(xml)
    for convert in (tex2mathml, gmath2mathml):
        tree = convert(tree)
    return etree.tostring(tree), {}
# Download Google Docs Images
def image_puller(xml):
    """Pipeline step: download the Google Docs images if that is enabled.

    When the module-level ``download_files_from_google`` flag is false the
    input is returned unchanged together with an empty dict.  Otherwise the
    XML string is parsed, handed to download_images() and serialized again.

    Returns a tuple ``(xml_string, image_objects)``.
    """
    if not download_files_from_google:
        return xml, {}
    tree, fetched_images = download_images(etree.fromstring(xml))
    return etree.tostring(tree), fetched_images
def extract_math_from_kix(kix_content):
    """Extract the LaTeX source of all gmath formulas from Kix content.

    A formula is recognized by its ``api.gmath.guru`` URL; the query string
    up to the last form feed character on the same line is taken as the
    URL-encoded LaTeX.

    Args:
        kix_content: the Kix content as a string.

    Returns:
        A list of unicode LaTeX strings, in order of appearance, with any
        ``\\dpi{...}`` setting removed.
    """
    # Raw string: the backslashes belong to the regular expression, not to
    # Python string escapes ("\f" is matched as a form feed by re itself).
    gmath_url = r"https?://api\.gmath\.guru/cgi-bin/gmath\?(.*)\f.*"
    # NOTE(review): "(.*)" is greedy, so several formulas on one line would
    # be captured as a single match - confirm against real Kix content.
    encoded_math_list = re.findall(gmath_url, kix_content)
    latex_list = []
    for encoded_math in encoded_math_list:
        # decode url encoded math to a unicode string
        decoded_math = urllib2.unquote(encoded_math).decode('utf8')
        # remove dpi settings for blahtex
        latex_list.append(re.sub(r'\\dpi{\d+}', '', decoded_math))
    return latex_list
# Ordered list of transformation steps run by gdocs_to_html5().
# Every step takes the XML string of the previous step and returns a tuple of
# a string (xml) + object {...}.
# The trailing numbers are the 1-based pass numbers that gdocs_to_html5()
# prints and uses for the passNN.xml debug files.
# partial() binds the stylesheet name as first argument of xslt(), so the
# resulting callable takes only the XML string.
# explanation of "partial" : http://stackoverflow.com/q/10547659/756056
TRANSFORM_PIPELINE = [
    tidy2xhtml, # 1
    premail, # 2
    init_libxml2, # 3
    partial(xslt, 'pass0_gdocs_headers.xsl'), # 4
    partial(xslt, 'pass1_new_min_header_level.xsl'), # 5
    partial(xslt, 'pass2_xhtml_gdocs_headers.xsl'), # 6
    partial(xslt, 'pass3_gdocs_listings.xsl'), # 7
    partial(xslt, 'pass4_gdocs_listings.xsl'), # 8
    partial(xslt, 'pass5_gdocs_listings.xsl'), # 9
    partial(xslt, 'pass6_gdocs2html5.xsl'), # 10
    tex2mathml_transform, # 11
    image_puller, # 12
    partial(xslt, 'pass7_nohtml_postprocessing.xsl'), # 13
]
# the function which is called from outside to start transformation
def _write_debug_file(name, data, mode='w'):
    """Write *data* to the file *name* inside the ``gdocs_debug`` directory."""
    # TODO: needs a timestamp or something
    path = os.path.join(CURRENT_DIR, 'gdocs_debug', name)
    # "with" closes the file even when the write fails.
    with open(path, mode) as debug_file:
        debug_file.write(data)


def gdocs_to_html5(content, kixcontent=None, bDownloadImages=False, debug=False):
    """Convert Google Docs HTML to structured HTML5.

    This is the function which is called from outside to start the
    transformation.  It runs every step of TRANSFORM_PIPELINE in order and
    prints one status line per step.

    Args:
        content: the Google Docs HTML as a string.
        kixcontent: optional Kix content from which the LaTeX source of the
            gmath formulas is extracted; ``None`` means no gmath LaTeX.
        bDownloadImages: if true, the image step downloads the images.
        debug: if true, the input, the output of every pass and all
            collected objects are written to the ``gdocs_debug`` directory
            next to this module.

    Returns:
        A tuple ``(xml, objects)``: the output of the last step and a dict
        with all objects (e.g. image name -> image content) the steps
        produced.
    """
    # The pipeline steps take only the XML string, so the per-call settings
    # are handed to them through module-level globals.
    global gmath_latex
    global download_files_from_google
    if kixcontent is None:
        gmath_latex = []
    else:
        gmath_latex = extract_math_from_kix(kixcontent)
    download_files_from_google = bDownloadImages

    objects = {}
    xml = content
    # write input file to debug dir
    if debug:
        _write_debug_file('input.htm', xml)

    for i, transform in enumerate(TRANSFORM_PIPELINE):
        xml, newobjects = transform(xml)
        if newobjects:
            objects.update(newobjects)  # copy newobjects into objects dict
        print("== Pass: %02d | Function: %s | Objects: %s ==" % (i+1, transform, list(objects)))
        if debug:  # create for each pass an output file
            _write_debug_file('pass%02d.xml' % (i+1), xml)

    # write objects to debug dir
    if debug:
        for image_filename, image in objects.items():
            _write_debug_file(image_filename, image, 'wb')  # write binary, important!
    return xml, objects
def html5_to_markdown(html):
    """Convert an HTML string to Markdown with html2text.

    The converter is configured to keep links (including internal ones), to
    prefer unicode characters and to bypass tables.

    Returns the Markdown text produced by html2text.
    """
    converter = html2text.HTML2Text()
    settings = (
        ('ignore_links', False),
        ('skip_internal_links', False),
        ('unicode_snob', True),
        ('bypass_tables', True),
    )
    for option, value in settings:
        setattr(converter, option, value)
    return converter.handle(html)
if __name__ == "__main__":
    # Command line usage: convert the Google Docs HTML file given as first
    # argument, downloading its images and writing all debug files.
    with open(sys.argv[1]) as f:
        content = f.read()
    # The entry point of this module is gdocs_to_html5(); the previously
    # called gdocs_to_cnxml() does not exist here.
    gdocs_to_html5(content, bDownloadImages=True, debug=True)