-
Notifications
You must be signed in to change notification settings - Fork 0
/
testbed_gdocs_simple_http.py
167 lines (140 loc) · 6.41 KB
/
testbed_gdocs_simple_http.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
'''
Testing and development of GDocs to CNXML transformation.
Transforms all GDocs URLs in testbed folder to CNXML.
Validates the result with Jing Relax NG.
Input is every URL listed in TESTBED_INPUT_URLS_FILE.
The results are saved in a directory named after the GDocs ID as:
.xml - the CNXML result
.htm - the raw HTML GDocs input format before transformation
.png/.jpg/.gif - including all images
.log - Jing Relax NG validation results
If there is no error during validation the .log file has zero bytes.
Created on 14.09.2011
@author: Marvin Reimer
'''
import sys
import os
import subprocess
import re
import shutil
import httplib2
from gdocs2html5 import gdocs_to_cnxml
# Folder that holds the testbed input; the URL list file is looked up in here.
TESTBED_INPUT_DIR = "testbed_gdocs" # the testbed folder
# Name of the file (inside TESTBED_INPUT_DIR) that is read line by line for
# Google Docs URLs; lines starting with '#' are ignored.
TESTBED_INPUT_URLS_FILE = "testbed_gdocs_urls.cfg"
# Folder that receives one sub directory per document ID.  Its contents are
# deleted at the start of every run.
TESTBED_OUTPUT_DIR = "testbed_gdocs_output"
def java_installed():
    '''Return True if ``java`` can be started from the command line.

    Runs ``java -version`` and reports success only when the process could be
    launched and exited with status 0.  Whatever the process prints is
    captured and discarded.
    '''
    try:
        # No shell in between: a missing executable raises OSError here
        # instead of being hidden behind a successfully started shell.
        process = subprocess.Popen(['java', '-version'],
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return process.returncode == 0
def delete_all_contents_of_folder(folder):
    '''Remove everything inside ``folder`` while keeping ``folder`` itself.

    Be careful with this command: the deletion is recursive and cannot be
    undone.  Does nothing when ``folder`` is not an existing directory.
    Symbolic links found directly inside ``folder`` are removed as links;
    whatever they point to is left untouched.
    '''
    if not os.path.isdir(folder):
        return
    for entry in os.listdir(folder):
        entry_path = os.path.join(folder, entry)
        # shutil.rmtree() refuses to operate on a symbolic link, so only real
        # directories go through it; files and links are simply unlinked.
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.unlink(entry_path)
def print_status(status_message):
    '''Print ``status_message`` framed by a line of 79 '=' characters above and below.'''
    separator = '=' * 79
    for line in (separator, status_message, separator):
        print(line)
def jing_validate_file(xml_filename, log_filename):
    '''Validate ``xml_filename`` with Jing and save the report in ``log_filename``.

    Runs ``java -jar jing/jing.jar jing/cnxml-jing.rng <xml_filename>``; the
    ``jing`` paths are relative to the current working directory.  Everything
    the process writes to standard output, followed by everything it writes
    to standard error, is stored in the log file.  If Java cannot be started
    at all, a one-line error message is logged instead.
    '''
    jing_jar_filename = os.path.join('jing', 'jing.jar')
    jing_rng_filename = os.path.join('jing', 'cnxml-jing.rng')
    # An argument list (no shell) keeps file names containing spaces or shell
    # metacharacters intact.
    java_cmd = ['java', '-jar', jing_jar_filename, jing_rng_filename, xml_filename]
    try:
        # stderr has to be piped as well, otherwise communicate() always
        # returns None for it and Java's own errors never reach the log.
        process = subprocess.Popen(java_cmd,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        jing_log, error_data = process.communicate()
    except OSError as error:
        jing_log = b''
        # %r keeps the message ASCII-safe regardless of the system locale.
        error_data = ('ERROR: could not run java: %r\n' % (error,)).encode(
            'ascii', 'backslashreplace')
    # Binary mode: the process output is written exactly as received.
    with open(log_filename, 'wb') as jing_log_file:
        jing_log_file.write(jing_log)
        jing_log_file.write(error_data)
# Matches a Google Docs document URL and captures the document ID.  The ID
# ends at the next '/', '?', '#' or whitespace character.
_GDOCS_URL_PATTERN = re.compile(r'^.*docs\.google\.com/document/d/([^/?#\s]+).*$')


# Fetches one URL and returns the response body, or None if the request failed.
def _download(http, url, description):
    print_status('URL: ' + url)
    try:
        # NOTE(review): the HTTP status is not inspected, so a redirect or an
        # error page is handed on as if it were the document - confirm
        # whether that is intended.
        return http.request(url)[1]
    except (httplib2.HttpLib2Error, EnvironmentError):
        # HttpLib2Error is httplib2's base exception; EnvironmentError
        # additionally covers socket-level failures.
        print("Error: Failed to download Google Docs " + description)
        return None


# Writes data to filename in the given mode and closes the file again.
def _write_file(filename, data, mode='w'):
    with open(filename, mode) as out_file:
        out_file.write(data)


def main():
    '''Convert every Google Docs URL of the testbed input file to CNXML.

    For each URL containing a document ID, the HTML and Kix exports are
    downloaded, transformed with gdocs_to_cnxml() and stored, together with
    the returned images, in TESTBED_OUTPUT_DIR/<document ID>/.  The CNXML is
    then validated with Jing unless the first command line argument is
    ``-noval``.  Documents whose download fails are reported and skipped.

    Exits with status 1 when validation is requested but Java is missing.
    '''
    skip_validation = len(sys.argv) > 1 and sys.argv[1] == '-noval'

    # Java is only needed for the Jing validation step.
    if not skip_validation and not java_installed():
        print("ERROR: Could not find Java. Please keep sure that Java is installed and available.")
        sys.exit(1)

    # delete the contents of the testbed output folder
    delete_all_contents_of_folder(TESTBED_OUTPUT_DIR)

    # fetch the exports directly, without following redirects
    http = httplib2.Http()
    http.follow_redirects = False

    # file with the URLs of public GDocs documents (<- the testbed for GDocs)
    url_list_filename = os.path.join(TESTBED_INPUT_DIR, TESTBED_INPUT_URLS_FILE)
    with open(url_list_filename) as url_file:
        for line in url_file:
            # Strip the line ending first, otherwise it can end up as part of
            # the document ID.
            url = line.strip()
            if not url or url.startswith('#'):  # ignore blank lines and comments
                continue

            # only lines that contain a GDocs document ID are processed
            match_doc_id = _GDOCS_URL_PATTERN.match(url)
            if not match_doc_id:
                continue
            doc_id = match_doc_id.group(1)

            # create a sub directory named like the ID (and the output folder
            # itself, should it be missing)
            doc_output_dir = os.path.join(TESTBED_OUTPUT_DIR, doc_id)
            try:
                os.makedirs(doc_output_dir)
            except OSError:
                # an already existing directory is fine, anything else is not
                if not os.path.isdir(doc_output_dir):
                    raise

            doc_key = 'document:' + doc_id
            print_status('Getting ' + doc_key)

            html = _download(
                http,
                'https://docs.google.com/document/d/%s/export?format=html&confirm=no_antivirus' % doc_id,
                'HTML')
            if html is None:
                continue
            kix = _download(
                http,
                'https://docs.google.com/feeds/download/documents/export/Export?id=%s&exportFormat=kix' % doc_id,
                'Kix')
            if kix is None:
                continue

            # write testbed source html output
            _write_file(os.path.join(doc_output_dir, doc_id + '.htm'), html)

            # transformation and get images
            print_status('Transforming and get images from %s' % doc_key)
            cnxml, objects = gdocs_to_cnxml(html, kixcontent=kix, bDownloadImages=True)

            # write testbed images
            for image_filename, image in objects.items():
                # write binary, important!
                _write_file(os.path.join(doc_output_dir, image_filename), image, 'wb')

            # write testbed CNXML output
            cnxml_filename = os.path.join(doc_output_dir, doc_id + '.xml')
            _write_file(cnxml_filename, cnxml)

            # validate CNXML output with Jing Relax NG
            if skip_validation:
                print_status('Validation skipped')
            else:
                print_status('Validating %s' % doc_key)
                jing_log_filename = os.path.join(doc_output_dir, doc_id + '.log')
                jing_validate_file(cnxml_filename, jing_log_filename)

    print_status('Finished!!!')
# Run the testbed conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()