# gutenb

#!/usr/bin/env python

# Copyright (c) 2016, Max Fillinger <max@max-fillinger.net>
# 
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
# 
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
# AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.

import argparse
import getebook
import getebook.epub
import html
import html.parser
import requests
import urllib.parse
import warnings

# Since all the books on gutenberg.spiegel.de are in German it might be
# a bit silly to write everything here in English, but that way it can
# also work as an extended example for how to use the getebook package.

class GutenbEbookParser(getebook.EbookParser):
    'EbookParser initialized for gutenberg.spiegel.de.'

    def __init__(self, builder):
        '''Initialize the parser instance. Adds some quirks specific to
        gutenberg.spiegel.de.

        builder is passed on unchanged to the EbookParser constructor.'''
        super().__init__(builder,
                         link_next='^Kapitel [0-9]* >>$',
                         root_tag='div',
                         root_id='gutenb'
                         )
        # quirks.skip is used to tell the parser that some elements are
        # not supposed to appear in the output. In this case, headings
        # with class "author", "title" and "subtitle" should be skipped
        # since the EpubBuilder makes its own titlepage.
        self.quirks.skip(tag='h*',
                         class_val=['author', 'title', 'subtitle'],
                         id_val=None
                         )
        # Headings whose whole text is "Roman" are skipped as well.
        self.quirks.skip('h*', class_val=None, id_val=None, text_re='^Roman$')
        # quirks.par_heading informs the parser that some html inside
        # <p> tags is (part of) a chapter heading. Below, we tell the
        # parser that, e.g., <p class="centerbig">1. Kapitel</p> should
        # be treated as a heading.
        # The pattern is a raw string so that "\." reaches the regex
        # engine as written instead of being an invalid string escape.
        self.quirks.par_heading('centerbig', None, r'^[0-9]*\. Kapitel$')
        # There are probably many more quirks, but these are the ones I
        # found so far.

    # Since all books we parse here are from Projekt Gutenberg-DE, we
    # can simplify the arguments to getebook().
    def getebook(self, url):
        '''Parse the html from url, and keep following the link to the
        next part of the book.

        url may be a full http://gutenberg.spiegel.de/... address, in
        which case the site prefix is stripped off, or anything else,
        in which case it is handed on unchanged as the path.'''
        base = 'http://gutenberg.spiegel.de'
        # Previously path was only assigned when url started with base,
        # so any other url failed with an UnboundLocalError.
        # NOTE(review): a url without the site prefix is assumed to
        # already be a path on the site -- confirm against
        # EbookParser.getebook().
        path = url[len(base):] if url.startswith(base) else url
        super().getebook(base, path)

class MetadataError(Exception):
    'Signals that required book metadata could not be determined.'

# GutenbMetaParser is unrelated to the EbookParser class; it is a plain
# html.parser.HTMLParser subclass.
class GutenbMetaParser(html.parser.HTMLParser):
    '''Extract author and title. This information is usually given on the
    first page of the book, but sometimes it is missing.

    After feeding html to the parser, the results are in the dict
    self.meta under the keys 'title', 'author' and 'subtitle'. A key is
    absent if the value was neither supplied nor found.'''
    title = None
    main_title = None
    subtitle = None
    author = None

    # Maps a heading tag to the metadata key it carries. A heading only
    # counts if it has a class attribute equal to that key, e.g.
    # <h2 class="title">.
    _HEADING_KEYS = {'h2': 'title', 'h3': 'author', 'h4': 'subtitle'}

    def __init__(self, author, main_title, subtitle):
        '''Initialize the parser. If author, title, or subtitle are
        already known, they can be supplied here, otherwise the
        arguments should be set to None.'''
        self.meta = {}
        if author:
            self.meta['author'] = author
        if main_title:
            self.meta['title'] = main_title
        if subtitle:
            self.meta['subtitle'] = subtitle
        # Key whose heading is currently being read, or None when the
        # parser is outside of any metadata heading.
        self.key = None
        super().__init__(convert_charrefs=True)

    def handle_starttag(self, tag, attr):
        '''Start collecting text when a metadata heading opens, and turn
        a <br> inside such a heading into the separator ". ".'''
        key = self._HEADING_KEYS.get(tag)
        if key is not None and ('class', key) in attr:
            # Values that are already known (supplied to __init__ or
            # taken from an earlier heading) are never overwritten.
            # This check must only happen when a heading opens: doing it
            # on every start tag would stop collecting at the first <br>
            # or nested tag inside the heading.
            self.key = None if key in self.meta else key
        elif tag == 'br' and self.key in self.meta:
            # Only add a separator if there is text to separate; a <br>
            # before any text is ignored instead of raising KeyError.
            self.meta[self.key] += '. '

    def handle_data(self, data):
        '''Append data to the metadata value that is being collected.
        Does nothing outside of a metadata heading.'''
        if not self.key:
            return
        # Strip whitespace around lines and drop blank lines.
        text = ' '.join(line.strip() for line in data.splitlines()
                        if line.strip())
        if text:
            self.meta[self.key] = self.meta.get(self.key, '') + text

    def handle_endtag(self, tag):
        'Stop collecting text when the current metadata heading closes.'
        if self.key is not None and self._HEADING_KEYS.get(tag) == self.key:
            self.key = None

# Command line interface: argparse handles the options and positional
# arguments and also produces the usage/help text.
argp = argparse.ArgumentParser(
    description=(
        'Download a book from Projekt Gutenberg-DE and convert it to an\n'
        'epub file. The first argument is the url to the book, and the  second\n'
        'one is the name of the output file.\n'
    ),
    epilog=(
        'If no author, title, and/or subtitle are given, the program tries to\n'
        'extract that information from the book. Currently, this only works if\n'
        'this information appears in the main text.'
    ),
)
# Optional metadata overrides.
argp.add_argument('-a', '--author', help='Name of the author')
argp.add_argument('-t', '--title', help='Title of the book')
argp.add_argument('-s', '--subtitle', help='Subtitle of the book')
# Required positionals: where the book is, and where the epub goes.
argp.add_argument('url')
argp.add_argument('filename')
args = argp.parse_args()

# A url that does not already start with the site address is resolved
# against it.
args.url = (
    args.url
    if args.url.startswith('http://gutenberg.spiegel.de')
    else urllib.parse.urljoin('http://gutenberg.spiegel.de', args.url)
)

# Get metadata. Values given on the command line take precedence; the
# first page of the book is only downloaded if author or title is
# missing.
meta_p = GutenbMetaParser(args.author, args.title, args.subtitle)
if not (args.author and args.title):
    # We need to use GutenbMetaParser to look for metadata in the book.
    r = requests.get(args.url)
    if not r:
        # The status code is what belongs in the message. The previous
        # code formatted r.get (a bound method) with %03d, which raised
        # TypeError instead of PageNotFound.
        raise getebook.PageNotFound('Got error code %03d' % r.status_code)
    first_page = r.text
    meta_p.feed(first_page)
    meta_p.close()

# Title and author are mandatory; the title is checked first.
if 'title' not in meta_p.meta:
    raise MetadataError('Failed to find the book title.')
if 'author' not in meta_p.meta:
    raise MetadataError('Failed to find the author.')
main_title = meta_p.meta['title']
title = main_title
author = meta_p.meta['author']
# The subtitle is optional. The name subtitle is only bound when one is
# known; the full title is then "<main title>. <subtitle>".
if 'subtitle' in meta_p.meta:
    subtitle = meta_p.meta['subtitle']
    title += '. ' + subtitle

# Build the EPUB: set up styling and metadata, add the title page and
# the copyright page, then let the parser download the book.
with getebook.epub.EpubBuilder(args.filename) as bld:
    # Add css for some classes that appear in the html.
    bld.style_css += (
      '.center, .motto, .abstract {\n'
      '  text-align: center;\n'
      '}\n'
      '.centerbig {\n'
      '  text-align: center;\n'
      '  font-size: 120%;\n'
      '}\n'
    )
    bld.title = title
    # Assign an UID for the EPUB. The EPUB specification requires that
    # every book is assigned a unique identifier. If this is skipped,
    # the builder creates a pseudo-random UID that is extremely unlikely
    # to collide with any existing book.
    # The UID is derived from the lower-cased title with spaces turned
    # into dashes and periods removed.
    tr_table = {ord(' '): '-', ord('.'): None}
    bld.uid = 'getebook-gutenb-' + title.lower().translate(tr_table)
    bld.lang = 'de'
    bld.author = author
    # Look the subtitle up explicitly instead of catching NameError on
    # an unbound variable: that handler would also have hidden any
    # NameError raised inside titlepage() itself.
    book_subtitle = meta_p.meta.get('subtitle')
    if book_subtitle is not None:
        bld.titlepage(main_title, book_subtitle)
    else:
        # There is no subtitle.
        # Without arguments, titlepage() takes the builder's title
        # attribute for the main title, and no subtitle.
        bld.titlepage()
    # bld.insert_file() includes an external file in the epub. We use it
    # here to add a page about the copyright; in_spine tells the builder
    # that the page is part of the reading order.
    bld.insert_file('gutenb-copyright.html', in_spine=True)
    p = GutenbEbookParser(bld)
    p.getebook(args.url)
    p.close()