Initial commit

jacobbudin · Apr 30, 2019 · 5d8e152 · 5d8e152
commit 5d8e152
Show file tree

Hide file tree

Showing 21 changed files with 575 additions and 0 deletions.
diff --git a/.editorconfig b/.editorconfig
@@ -0,0 +1,20 @@
+root = true
+
+[*]
+end_of_line = lf
+insert_final_newline = true
+
+[*.{py}]
+charset = utf-8
+
+[*.py]
+indent_style = space
+indent_size = 4
+
+[*.css]
+indent_style = space
+indent_size = 2
+
+[{.travis.yml}]
+indent_style = space
+indent_size = 2
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,14 @@
+# packages
+venv
+
+# python
+.eggs
+*.egg-info
+
+# cache
+__pycache__
+*.pyc
+tmp
+
+# build
+*.epub
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) 2019 Jacob Budin
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,34 @@
+# Portable Wisdom
+
+Portable Wisdom is a tool to generate EPUB files from [Instapaper](https://www.instapaper.com/). You can then choose to sync these files to your ereader.
+
+## Technologies
+
+- Python 3
+
+## Features
+
+- Retrieves unread articles from Instapaper
+- Embeds web images, downsizes them, and converts them to greyscale
+- Caches articles and images
+- Creates well-formatted EPUB files, tailored for your ereader
+
+## Quick Start
+
+1. Download and install Portable Wisdom from PyPI:
+
+	$ pip install portable_wisdom
+
+2. [Request an Instapaper API key.](https://www.instapaper.com/main/request_oauth_consumer_token) (Or copy one from a friend.)
+3. Run Portable Wisdom from the command line:
+
+	$ portable-wisdom --instapaper-key KEY \
+		--instapaper-secret SECRET \
+		--instapaper-login USER \
+		--instapaper-password PASS
+
+On success, the script will print the output filename. For all the options, run `$ portable-wisdom -h`.
+
+## License
+
+MIT License
diff --git a/portable_wisdom/__init__.py b/portable_wisdom/__init__.py
diff --git a/portable_wisdom/article.py b/portable_wisdom/article.py
@@ -0,0 +1,10 @@
+class Article:
+    def __init__(self, title, content):
+        self.title = title
+        self.content = content
+
+    def __repr__(self):
+        return 'Article("%s", "%s")' % (self.title, self.content)
+
+    def __str__(self):
+        return '"%s"' % self.title
diff --git a/portable_wisdom/cache.py b/portable_wisdom/cache.py
@@ -0,0 +1,36 @@
+from diskcache import Cache as DiskCache
+import logging
+import os
+
+# Directory of the on-disk cache: a `tmp` folder next to this module
+CACHE_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'tmp')
+# Size limit handed to the disk cache, in bytes (64 * 10^6)
+CACHE_SIZE = 64 * 1000000 #64MB
+
+class Cache:
+    def __init__(self):
+        self.cache = DiskCache(CACHE_PATH, size_limit=CACHE_SIZE)
+
+    def get(self, key):
+        value = self.cache.get(key)
+
+        if value:
+            logging.debug('Hit cache key %s' % key)
+
+        return value
+
+    def clear(self):
+        return self.cache.clear()
+
+    def set(self, key, value):
+        return self.cache.set(key, value)
+
+    def get_or(self, key, _or):
+        """Get a key's value, or use function's return value to set"""
+        if key in self.cache:
+            logging.debug('Hit cache key %s' % key)
+            return self.cache[key]
+
+        value = _or()
+        self.cache.set(key, value)
+        return value
+
+# Shared module-level instance; other modules use `from .cache import cache`
+cache = Cache()
diff --git a/portable_wisdom/config.py b/portable_wisdom/config.py
@@ -0,0 +1,26 @@
+# HTML elements that are preserved
+ALLOWED_TAGS = ('p', 'b', 'i', 'blockquote', 'strong', 'em', 'figure', 'figcaption', 'img')
+
+# HTML element attributes that are preserved
+ALLOWED_ATTRIBUTES = ('src', )
+
+# Instapaper configuration: API key/secret and account login/password
+# (all empty by default)
+INSTAPAPER_KEY = ''
+INSTAPAPER_SECRET = ''
+INSTAPAPER_LOGIN = ''
+INSTAPAPER_PASSWORD = ''
+
+# Maximum number of articles to include
+# (the Instapaper source never requests more than 25, and falls back to 25
+# when this value is falsy)
+ARTICLE_LIMIT = 25
+
+# Maximum dimensions of embedded images; images are shrunk to fit within
+# this size using Pillow's `thumbnail` before being embedded
+IMAGE_MAX_SIZE = (600, 600)
+
+# Name of stylesheet to use: a file in the package's `styles` directory,
+# given with or without the `.css` suffix
+STYLE = 'nook-glowlight-3'
+
+# Debug mode
+DEBUG = False
+
+# Verbose mode
+VERBOSE = False
diff --git a/portable_wisdom/epub.py b/portable_wisdom/epub.py
@@ -0,0 +1,91 @@
+from bs4 import BeautifulSoup
+from .cache import cache
+from urllib.parse import urlparse
+import os
+from ebooklib import epub
+import io
+import logging
+import requests
+from PIL import Image
+from .config import *
+
+def embed_images(book):
+    """Embeds remote images in EPUB HTML chapters"""
+    image_names = set()
+
+    for item in book.items:
+        if type(item) is not epub.EpubHtml:
+            continue
+
+        # Parse HTML, find `img` elements
+        soup = BeautifulSoup('<html><body>%s</body></html>' % item.content, 'html5lib')
+
+        for img in soup.find_all('img'):
+            src = img.get('src')
+
+            # Remove junk images
+            if not src:
+                img.decompose()
+                continue
+            if src.startswith('denied:'):
+                img.decompose()
+                continue
+            if src.startswith('data:'):
+                img.decompose()
+                continue
+
+            src_parts = urlparse(src)
+            ext = os.path.splitext(src_parts.path)[1]
+            name = str(hash(src)) + ext
+
+            if name not in image_names:
+                # Create `EpubImage` wrapper object
+                image = epub.EpubImage()
+                image.id = str(hash(src))
+                image.file_name = name
+
+                thumbnail_hash = src + str(IMAGE_MAX_SIZE)
+                thumbnail_bytes = cache.get(thumbnail_hash)
+
+                # Download the image
+                if thumbnail_bytes:
+                    thumbnail = io.BytesIO(thumbnail_bytes)
+                else:
+                    thumbnail = io.BytesIO()
+
+                    try:
+                        logging.info('Downloading image %s', img['src'])
+                        content = requests.get(img['src']).content
+                    except requests.exceptions.ContentDecodingError as e:
+                        logging.error('Skipping image %s (%s)' % (img['src'], e))
+                        continue
+                    except requests.exceptions.ConnectionError as e:
+                        logging.error('Skipping image %s (%s)' % (img['src'], e))
+                        continue
+
+                    original = io.BytesIO()
+                    original.write(content)
+
+                    try:
+                        # Create smaller, greyscale image from source image
+                        im = Image.open(original).convert('RGBA') # convert to `RGBA` before `L` or Pillow will complain
+                        im.thumbnail(IMAGE_MAX_SIZE)
+                        im = im.convert('L')
+                        im.save(thumbnail, 'png' if ext == '.png' else 'jpeg')
+
+                    except OSError as e:
+                        logging.error('Skipping image %s (%s)' % (img['src'], e))
+                        continue
+
+                    cache.set(thumbnail_hash, thumbnail.getvalue())
+
+                thumbnail.seek(0)
+
+                image.content = thumbnail.read()
+                book.add_item(image)
+                image_names.add(name)
+
+            img['style'] = 'max-width: 100%'
+            img['src'] = name
+
+        item.content = str(soup.body)
diff --git a/portable_wisdom/source.py b/portable_wisdom/source.py
@@ -0,0 +1,45 @@
+import os.path
+from .config import *
+from ebooklib import epub
+import logging
+
+# Directory containing the bundled stylesheets: `styles`, next to this module
+STYLES_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'styles')
+
+class Source:
+    def to_epub(self, style=None):
+        """Generate `EpubBook` from result of `get_articles`"""
+        if not style:
+            style = STYLE
+
+        logging.info('Creating book using %s style' % style)
+        articles = self.get_articles()
+
+        book = epub.EpubBook()
+        book.set_title(self.__class__.name)
+
+        # Create HTML file for each article
+        chapters = []
+        for i, article in enumerate(articles):
+            chapter = epub.EpubHtml(uid=str(i), title=article.title, file_name=('%d.xhtml' % i))
+            chapter.content = '<html><head><link rel="stylesheet" href="style/default.css" /></head><body>' + ('<h1>%s</h1>' % article.title) + article.content + '</body></html>'
+            chapters.append(chapter)
+            book.add_item(chapter)
+
+        # Add generic book metadata
+        book.toc = map(lambda c: epub.Link(c.get_name(), c.title, str(c.get_id())), chapters)
+        book.add_item(epub.EpubNcx())
+        book.add_item(epub.EpubNav())
+
+        # Add stylesheet
+        if not style.endswith('.css'):
+            style = style + '.css'
+
+        style_path = os.path.join(STYLES_PATH, style)
+        with open(style_path) as f:
+            nav_css = epub.EpubItem(uid="style_nav", file_name="style/default.css", media_type="text/css", content=f.read())
+
+        book.add_item(nav_css)
+
+        book.spine = ['nav'] + chapters
+
+        return book
diff --git a/portable_wisdom/sources/__init__.py b/portable_wisdom/sources/__init__.py
@@ -0,0 +1 @@
+from .instapaper import Instapaper
diff --git a/portable_wisdom/sources/instapaper.py b/portable_wisdom/sources/instapaper.py
@@ -0,0 +1,33 @@
+import logging
+import os.path
+from ..cache import cache
+from ..article import Article
+from ..config import *
+from ..source import Source
+from pyinstapaper.instapaper import Instapaper as PInstapaper
+
+class Instapaper(Source):
+    name = 'Instapaper'
+
+    def get_articles(self):
+        """Produce a list of Articles"""
+        instapaper = PInstapaper(INSTAPAPER_KEY, INSTAPAPER_SECRET)
+        instapaper.login(INSTAPAPER_LOGIN, INSTAPAPER_PASSWORD)
+
+        # Enforce 25 article maximum
+        limit = 25
+
+        if ARTICLE_LIMIT:
+            limit = min(limit, ARTICLE_LIMIT)
+
+        bookmarks = instapaper.get_bookmarks('unread', limit)
+        articles = []
+
+        for bookmark in bookmarks:
+            content = cache.get_or(bookmark.hash,
+                    lambda: bookmark.get_text()['data'].decode())
+            article = Article(title=bookmark.title, content=content)
+            articles.append(article)
+
+        logging.info('Retrieved %d articles' % len(articles))
+        return articles