diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..88ff976 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,20 @@ +root = true + +[*] +end_of_line = lf +insert_final_newline = true + +[*.{py}] +charset = utf-8 + +[*.py] +indent_style = space +indent_size = 4 + +[*.css] +indent_style = space +indent_size = 2 + +[{.travis.yml}] +indent_style = space +indent_size = 2 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d2b24e8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +# packages +venv + +# python +.eggs +*.egg-info + +# cache +__pycache__ +*.pyc +tmp + +# build +*.epub diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f45c27e --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2019 Jacob Budin + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..56f2fba --- /dev/null +++ b/README.md @@ -0,0 +1,34 @@ +# Portable Wisdom + +Portable Wisdom is a tool to generate EPUB files from [Instapaper](https://www.instapaper.com/). You can then choose to sync these files to your ereader. + +## Technologies + +- Python 3 + +## Features + +- Retrieves unread articles from Instapaper +- Embeds web images, downsizes them, and converts them to greyscale +- Caches articles and images +- Creates well-formatted EPUB files, tailored for your ereader + +## Quick Start + +1. Download and install Portable Wisdom from PyPI: + + $ pip install portable_wisdom + +2. [Request an Instapaper API key.](https://www.instapaper.com/main/request_oauth_consumer_token) (Or copy one from a friend.) +3. Run Portable Wisdom from the command line: + + $ portable-wisdom --instapaper-key KEY \ + --instapaper-secret SECRET \ + --instapaper-login USER \ + --instapaper-password PASS + +On success, the script will print the output filename. For all the options, run `$ portable-wisdom -h`. 
class Article:
    """A titled piece of content that becomes one chapter of the book."""

    def __init__(self, title, content):
        """Store the article's `title` and its body `content`."""
        self.title = title
        self.content = content

    def __repr__(self):
        # `%r` (rather than `"%s"`) escapes embedded quotes and newlines, so
        # the representation stays unambiguous for any title or content
        return 'Article(%r, %r)' % (self.title, self.content)

    def __str__(self):
        # Human-readable form: the quoted title only, never the content
        return '"%s"' % self.title
def _make_thumbnail(src, image_format):
    """Download `src` and return it as downsized, greyscale image bytes.

    The image is shrunk to fit `IMAGE_MAX_SIZE` and encoded as
    `image_format` (`'png'` or `'jpeg'`).

    Returns `None`, after logging the reason, when the image cannot be
    downloaded or decoded, so that one bad image never aborts the book.
    """
    try:
        logging.info('Downloading image %s', src)
        # A timeout keeps one unresponsive host from hanging the whole run
        response = requests.get(src, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Base class of every `requests` error: connection failures,
        # timeouts, bad HTTP statuses, and malformed/relative URLs
        logging.error('Skipping image %s (%s)' % (src, e))
        return None

    thumbnail = io.BytesIO()

    try:
        # Create smaller, greyscale image from source image
        # (convert to `RGBA` before `L` or Pillow will complain)
        im = Image.open(io.BytesIO(response.content)).convert('RGBA')
        im.thumbnail(IMAGE_MAX_SIZE)
        im = im.convert('L')
        im.save(thumbnail, image_format)
    except OSError as e:
        logging.error('Skipping image %s (%s)' % (src, e))
        return None

    return thumbnail.getvalue()


def embed_images(book):
    """Embeds remote images in EPUB HTML chapters

    For every `img` element in the book's HTML chapters, the image is
    downloaded (or read from the cache), downsized, converted to greyscale,
    added to `book` as an `EpubImage`, and the element's `src` is pointed at
    the embedded copy. Images without a usable `src` are removed; images that
    fail to download or decode are logged and left untouched.

    `book` is modified in place; nothing is returned.
    """
    # Local import: only this function needs it
    import hashlib

    image_names = set()

    for item in book.items:
        # Exact type match on purpose (not `isinstance`): ebooklib's
        # navigation document subclasses `EpubHtml` and must be left alone
        if type(item) is not epub.EpubHtml:
            continue

        # Parse HTML, find `img` elements
        soup = BeautifulSoup('%s' % item.content, 'html5lib')

        for img in soup.find_all('img'):
            src = img.get('src')

            # Remove junk images
            if not src or src.startswith(('denied:', 'data:')):
                img.decompose()
                continue

            ext = os.path.splitext(urlparse(src).path)[1]
            image_format = 'png' if ext == '.png' else 'jpeg'

            # Built-in `hash()` is salted per process and may be negative,
            # which gives unstable names and ids that are not valid XML ids;
            # a SHA-1 digest is stable, and the `img-` prefix guarantees the
            # id starts with a letter
            digest = hashlib.sha1(src.encode('utf-8')).hexdigest()

            # Name the file after the format actually written, so the media
            # type guessed from the extension matches the bytes
            name = digest + ('.png' if image_format == 'png' else '.jpg')

            if name not in image_names:
                thumbnail_hash = src + str(IMAGE_MAX_SIZE)
                thumbnail_bytes = cache.get(thumbnail_hash)

                # Download the image
                if not thumbnail_bytes:
                    thumbnail_bytes = _make_thumbnail(src, image_format)
                    if thumbnail_bytes is None:
                        continue

                    cache.set(thumbnail_hash, thumbnail_bytes)

                # Create `EpubImage` wrapper object
                image = epub.EpubImage()
                image.id = 'img-' + digest
                image.file_name = name
                image.content = thumbnail_bytes
                book.add_item(image)
                image_names.add(name)

            img['style'] = 'max-width: 100%'
            img['src'] = name

        item.content = str(soup.body)

%s

class Instapaper(Source):
    """Article source backed by an Instapaper account's unread bookmarks."""

    name = 'Instapaper'

    def get_articles(self):
        """Log in to Instapaper and return unread bookmarks as `Article`s"""
        client = PInstapaper(INSTAPAPER_KEY, INSTAPAPER_SECRET)
        client.login(INSTAPAPER_LOGIN, INSTAPAPER_PASSWORD)

        # Never request more than 25 articles; a configured limit can only
        # lower that ceiling
        max_articles = min(25, ARTICLE_LIMIT) if ARTICLE_LIMIT else 25

        def text_of(bookmark):
            # Bookmark text is cached under the bookmark's hash, so it is
            # only fetched when no cached copy exists
            return cache.get_or(
                bookmark.hash,
                lambda: bookmark.get_text()['data'].decode())

        articles = [
            Article(title=bookmark.title, content=text_of(bookmark))
            for bookmark in client.get_bookmarks('unread', max_articles)
        ]

        logging.info('Retrieved %d articles' % len(articles))
        return articles
a/portable_wisdom/styles/nook-glowlight-3.css b/portable_wisdom/styles/nook-glowlight-3.css new file mode 100644 index 0000000..bf20832 --- /dev/null +++ b/portable_wisdom/styles/nook-glowlight-3.css @@ -0,0 +1,65 @@ +/* General */ +p { + margin: 1em 0; + text-indent: 1.5em; +} + +/* Headers */ +h1, h2, h3, h4, h5, h6 { + hyphens: none; +} +h1 { + font-size: 1.4em; + margin: 0 2.5em 2.5em 2.5em; + text-align: center; + font-weight: bold; +} +h2 { + font-size: 1.25em; + font-weight: bold; +} +h3 { + font-size: 1.1em; + font-weight: bold; +} +h4 { + font-size: 1em; + font-weight: bold; +} +h5 { + font-size: 1em; + font-weight: normal; +} +h6 { + font-size: 1em; + font-weight: normal; +} + +/* Quotes */ +blockquote, aside { + margin: 1em 1.5em; + font-style: italic; +} +blockquote p { + text-indent: 0; +} +blockquote p:first-child { + margin-top: 0; +} +blockquote p:last-child { + margin-bottom: 0; +} + +/* Images */ +figcaption { + margin: 0 0 1em 0; +} +img { + text-align: center; +} + +/* Inline */ +a { + color: black !important; + text-decoration: underline; +} diff --git a/portable_wisdom/tests/__init__.py b/portable_wisdom/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/portable_wisdom/tests/test_article.py b/portable_wisdom/tests/test_article.py new file mode 100644 index 0000000..940ac58 --- /dev/null +++ b/portable_wisdom/tests/test_article.py @@ -0,0 +1,17 @@ +from unittest import TestCase +from portable_wisdom.article import Article + +title = 'Bank Robber Strikes Again' +content = '$1m was stolen from First Federal Bank yesterday.' 
class TestCache(TestCase):
    """Round-trip tests for the `Cache` wrapper (`get`, `set`, `get_or`)."""

    def setUp(self):
        # NOTE: `Cache()` opens the package's real on-disk cache directory,
        # so these tests clear whatever was cached there.
        self.cache = Cache()
        self.cache.clear()
        # Registered as a cleanup so the key is removed even when an
        # assertion fails; otherwise a stale `some_key` would leak into the
        # next test through the shared on-disk cache
        self.addCleanup(self.cache.clear)

    def test_get(self):
        """A stored value is returned by `get`"""
        v = choice(range(1, 255))
        self.cache.set('some_key', v)
        self.assertEqual(self.cache.get('some_key'), v)

    def test_get_empty(self):
        """`get` returns `None` for a key that was never set"""
        self.assertIsNone(self.cache.get('some_absent_key'))

    def test_get_or(self):
        """`get_or` stores and returns the fallback's value on a miss"""
        v = choice(range(1, 255))
        self.assertEqual(self.cache.get_or('some_key', lambda: v), v)
        self.assertEqual(self.cache.get('some_key'), v)

    def test_get_or2(self):
        """`get_or` keeps the existing value and ignores the fallback on a hit"""
        v = choice(range(1, 255))
        self.cache.set('some_key', v)
        self.assertEqual(self.cache.get_or('some_key', lambda: v + 1), v)
        self.assertEqual(self.cache.get('some_key'), v)

    def test_set(self):
        """`set` makes the value available to `get`"""
        v = choice(range(1, 255))
        self.cache.set('some_key', v)
        self.assertEqual(self.cache.get('some_key'), v)
def main():
    """Generate EPUB from Instapaper

    Parses the command line, overrides the matching `config` values, builds
    the book from the Instapaper source, embeds its images, writes the EPUB
    to the current directory, and prints the output filename.

    Exits with a usage error when any Instapaper credential is missing from
    both the command line and `config`.
    """
    # Support CLI
    parser = argparse.ArgumentParser(description='Generate EPUB from Instapaper')
    parser.add_argument('--instapaper-key', help='Instapaper API key')
    parser.add_argument('--instapaper-secret', help='Instapaper API secret')
    parser.add_argument('--instapaper-login', help='Instapaper account username or email address')
    parser.add_argument('--instapaper-password', help='Instapaper account password')
    parser.add_argument('-s', '--style', default=config.STYLE, help='stylesheet to use')
    parser.add_argument('-l', '--article-limit', '--limit', default=config.ARTICLE_LIMIT, metavar='LIMIT', type=int, help='number of articles to include')
    parser.add_argument('-v', '--verbose', default=False, action='store_true', help='verbose mode')
    parser.add_argument('-d', '--debug', default=False, action='store_true', help='debug mode')

    args = parser.parse_args()

    # Where an option is provided, override its configuration value
    for option, value in vars(args).items():
        if value:
            setattr(config, option.upper(), value)

    # Fail early, with a usage message, instead of deep inside the API client
    credentials = (
        ('--instapaper-key', 'INSTAPAPER_KEY'),
        ('--instapaper-secret', 'INSTAPAPER_SECRET'),
        ('--instapaper-login', 'INSTAPAPER_LOGIN'),
        ('--instapaper-password', 'INSTAPAPER_PASSWORD'),
    )
    missing = [flag for flag, name in credentials if not getattr(config, name)]
    if missing:
        parser.error('missing Instapaper credentials: %s' % ', '.join(missing))

    # Debug shows everything; verbose shows progress (`INFO`) messages only;
    # otherwise stay silent unless something critical happens
    if config.DEBUG:
        logging_level = logging.DEBUG
    elif config.VERBOSE:
        logging_level = logging.INFO
    else:
        logging_level = logging.CRITICAL

    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging_level)

    # Import after configuration is set: these modules copy `config` values
    # into their own namespace at import time
    from .epub import embed_images
    from .sources import Instapaper

    # Create EPUB and save to disk
    source = Instapaper
    book = source().to_epub()
    embed_images(book)

    today = datetime.datetime.today()
    filename = '%s - %s-%s-%s.epub' % (source.name, today.year, today.month, today.day)
    epub.write_epub(filename, book, {})
    print(filename)


if __name__ == '__main__':
    main()
+ zip_safe=False)