-
Notifications
You must be signed in to change notification settings - Fork 0
/
gutenb
executable file
·205 lines (191 loc) · 7.88 KB
/
gutenb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/usr/bin/env python
# Copyright (c) 2016, Max Fillinger <[email protected]>
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
# AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
import argparse
import getebook
import getebook.epub
import html
import html.parser
import requests
import urllib.parse
import warnings
# Since all the books on gutenberg.spiegel.de are in German it might be
# a bit silly to write everything here in English, but that way it can
# also work as an extended example for how to use the getebook package.
class GutenbEbookParser(getebook.EbookParser):
    'EbookParser initialized for gutenberg.spiegel.de.'

    def __init__(self, builder):
        '''Initialize the parser instance. Adds some quirks specific to
        gutenberg.spiegel.de.

        builder is handed to getebook.EbookParser unchanged.'''
        super().__init__(builder,
                link_next='^Kapitel [0-9]* >>$',
                root_tag='div',
                root_id='gutenb'
                )
        # quirks.skip is used to tell the parser that some elements are
        # not supposed to appear in the output. In this case, headings
        # with class "author", "title" and "subtitle" should be skipped
        # since the EpuBuilder makes its own titlepage.
        self.quirks.skip(tag='h*',
                class_val=['author', 'title', 'subtitle'],
                id_val=None
                )
        self.quirks.skip('h*', class_val=None, id_val=None, text_re='^Roman$')
        # quirks.par_heading informs the parser that some html inside
        # <p> tags is (part of) a chapter heading. Below, we tell the
        # parser that, e.g., <p class="centerbig">1. Kapitel</p> should
        # be treated as a heading.
        # The pattern is a raw string so that "\." reaches the regex
        # engine without being an invalid string escape.
        self.quirks.par_heading('centerbig', None, r'^[0-9]*\. Kapitel$')
        # There are probably many more quirks, but these are the ones I
        # found so far.

    # Since all books we parse here are from Projekt Gutenberg-DE, we
    # can simplify the arguments to getebook().
    def getebook(self, url):
        '''Parse the html from url, and keep following the link to the
        next part of the book.

        url is either a full http://gutenberg.spiegel.de address or a
        path on that site starting with "/". Anything else raises
        ValueError.'''
        base = 'http://gutenberg.spiegel.de'
        if url.startswith(base):
            path = url[len(base):]
        elif url.startswith('/'):
            # Already a site-relative path; use it as is.
            path = url
        else:
            # Previously this case fell through with path unbound.
            raise ValueError(
                'Not a gutenberg.spiegel.de url or path: %r' % (url,))
        super().getebook(base, path)
class MetadataError(Exception):
    'Raised when the title or the author of a book cannot be found.'
# The GutenbMetaParser has no relation with the EbookParser class.
class GutenbMetaParser(html.parser.HTMLParser):
    '''Extract author and title. This information is usually given on the
    first page of the book, but sometimes it is missing.

    Results are collected in the dict self.meta under the keys 'author',
    'title' and 'subtitle'; a key is absent if nothing was found for it.'''
    # Class-level defaults; the parser itself stores what it finds in
    # self.meta and does not assign these.
    title = None
    main_title = None
    subtitle = None
    author = None

    # Heading tag -> metadata key. A heading only counts when its class
    # attribute equals the key, e.g. <h2 class="title">.
    _HEADINGS = {'h2': 'title', 'h3': 'author', 'h4': 'subtitle'}

    def __init__(self, author, main_title, subtitle):
        '''Initialize the parser. If author, title, or subtitle are
        already known, they can be supplied here, otherwise the
        arguments should be set to None.'''
        self.meta = {}
        if author:
            self.meta['author'] = author
        if main_title:
            self.meta['title'] = main_title
        if subtitle:
            self.meta['subtitle'] = subtitle
        # Metadata key currently being collected, or None outside of a
        # relevant heading.
        self.key = None
        super().__init__(convert_charrefs=True)

    def handle_starttag(self, tag, attr):
        '''Start collecting when a title, author or subtitle heading
        opens; turn a <br> inside such a heading into '. '.'''
        key = self._HEADINGS.get(tag)
        if key is not None and ('class', key) in attr:
            # If we already know title, subtitle or author, ignore the
            # heading. This check is done only when a heading opens, so
            # that a <br> or nested tag inside the heading does not stop
            # the collection halfway through.
            self.key = None if key in self.meta else key
        elif tag == 'br' and self.key and self.meta.get(self.key):
            # Only separate lines once there is text to separate; a
            # leading <br> is dropped instead of raising KeyError.
            self.meta[self.key] += '. '

    def handle_data(self, data):
        'Append text to the metadata entry that is being collected.'
        if not self.key:
            return
        # Strip whitespace around lines and drop blank lines.
        text = ' '.join(l.strip() for l in data.splitlines() if l.strip())
        if not text:
            # Whitespace-only data must not create an empty entry, which
            # would look like the value had been found.
            return
        self.meta[self.key] = self.meta.get(self.key, '') + text

    def handle_endtag(self, tag):
        'Stop collecting when the heading that is being read closes.'
        if self.key is not None and self._HEADINGS.get(tag) == self.key:
            self.key = None
# Command line handling is left to argparse, which also produces the
# usage and help output.
argp = argparse.ArgumentParser(
    description=(
        'Download a book from Projekt Gutenberg-DE and convert it to an\n'
        'epub file. The first argument is the url to the book, and the second\n'
        'one is the name of the output file.\n'
    ),
    epilog=(
        'If no author, title, and/or subtitle are given, the program tries to\n'
        'extract that information from the book. Currently, this only works if\n'
        'this information appears in the main text.'
    ),
)
# Optional metadata overrides, in the order they are shown in the help.
for flags, help_text in (
        (('-a', '--author'), 'Name of the author'),
        (('-t', '--title'), 'Title of the book'),
        (('-s', '--subtitle'), 'Subtitle of the book'),
):
    argp.add_argument(*flags, help=help_text)
# Required positionals: the book url, then the output file name.
for positional in ('url', 'filename'):
    argp.add_argument(positional)
args = argp.parse_args()
# A url that does not already start with the site address is resolved
# against it.
if not args.url.startswith('http://gutenberg.spiegel.de'):
    args.url = urllib.parse.urljoin('http://gutenberg.spiegel.de', args.url)
# Get metadata. Values given on the command line are handed to the
# parser up front, so it will not replace them.
meta_p = GutenbMetaParser(args.author, args.title, args.subtitle)
if not (args.author and args.title):
    # We need to use GutenbMetaParser to look for metadata in the book.
    r = requests.get(args.url)
    if not r:
        # status_code holds the numeric HTTP status of the response.
        raise getebook.PageNotFound('Got error code %03d' % r.status_code)
    first_page = r.text
    meta_p.feed(first_page)
    meta_p.close()
# Title and author are mandatory; "from None" keeps the internal
# KeyError out of the traceback shown to the user.
try:
    main_title = meta_p.meta['title']
except KeyError:
    raise MetadataError('Failed to find the book title.') from None
title = main_title
try:
    author = meta_p.meta['author']
except KeyError:
    raise MetadataError('Failed to find the author.') from None
# The subtitle is optional: it is only bound, and appended to the full
# title, when the book has one.
if 'subtitle' in meta_p.meta:
    subtitle = meta_p.meta['subtitle']
    title += '. ' + subtitle
# Build the epub. Everything that writes to the book happens while the
# builder is open.
with getebook.epub.EpubBuilder(args.filename) as bld:
    # Add css for some classes that appear in the html.
    bld.style_css += (
        '.center, .motto, .abstract {\n'
        '  text-align: center;\n'
        '}\n'
        '.centerbig {\n'
        '  text-align: center;\n'
        '  font-size: 120%;\n'
        '}\n'
    )
    bld.title = title
    # Assign an UID for the EPUB. The EPUB specification requires that
    # every book is assigned a unique identifier. If this is skipped,
    # the builder creates a pseudo-random UID that is extremely unlikely
    # to collide with any existing book.
    # The UID is derived from the lower-cased title, with spaces turned
    # into dashes and full stops removed.
    tr_table = {ord(' '): '-', ord('.'): None}
    bld.uid = 'getebook-gutenb-' + title.lower().translate(tr_table)
    bld.lang = 'de'
    bld.author = author
    # Check for the subtitle explicitly instead of catching NameError,
    # which would also hide a NameError raised inside titlepage() itself.
    if 'subtitle' in meta_p.meta:
        bld.titlepage(main_title, meta_p.meta['subtitle'])
    else:
        # There is no subtitle.
        # Without arguments, titlepage() takes the builder's title
        # attribute for the main title, and no subtitle.
        bld.titlepage()
    # bld.insert_file() includes an external file in the epub. We use it
    # here to add a page about the copyright; in_spine tells the builder
    # that the page is part of the reading order.
    bld.insert_file('gutenb-copyright.html', in_spine=True)
    p = GutenbEbookParser(bld)
    p.getebook(args.url)
    p.close()