Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
jacobbudin committed Apr 30, 2019
0 parents commit 5d8e152
Show file tree
Hide file tree
Showing 21 changed files with 575 additions and 0 deletions.
20 changes: 20 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
root = true

[*]
end_of_line = lf
insert_final_newline = true

[*.py]
charset = utf-8

[*.py]
indent_style = space
indent_size = 4

[*.css]
indent_style = space
indent_size = 2

[.travis.yml]
indent_style = space
indent_size = 2
14 changes: 14 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# packages
venv

# python
.eggs
*.egg-info

# cache
__pycache__
*.pyc
tmp

# build
*.epub
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
The MIT License

Copyright (c) 2019 Jacob Budin

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
34 changes: 34 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Portable Wisdom

Portable Wisdom is a tool to generate EPUB files from [Instapaper](https://www.instapaper.com/). You can then choose to sync these files to your ereader.

## Technologies

- Python 3

## Features

- Retrieves unread articles from Instapaper
- Embeds web images, downsizes them, and converts them to greyscale
- Caches articles and images
- Creates well-formatted EPUB files, tailored for your ereader

## Quick Start

1. Download and install Portable Wisdom from PyPI:

$ pip install portable_wisdom

2. [Request an Instapaper API key.](https://www.instapaper.com/main/request_oauth_consumer_token) (Or copy one from a friend.)
3. Run Portable Wisdom from the command line:

$ portable-wisdom --instapaper-key KEY \
--instapaper-secret SECRET \
--instapaper-login USER \
--instapaper-password PASS

On success, the script will print the output filename. For all the options, run `$ portable-wisdom -h`.

## License

MIT License
Empty file added portable_wisdom/__init__.py
Empty file.
10 changes: 10 additions & 0 deletions portable_wisdom/article.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
class Article:
    """Value object pairing an article's title with its content."""

    def __init__(self, title, content):
        self.title, self.content = title, content

    def __repr__(self):
        """Return a constructor-like representation showing both fields."""
        fields = (self.title, self.content)
        return 'Article("%s", "%s")' % fields

    def __str__(self):
        """Return the title wrapped in double quotes."""
        label = '%s' % self.title
        return '"' + label + '"'
36 changes: 36 additions & 0 deletions portable_wisdom/cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from diskcache import Cache as DiskCache
import logging
import os

# On-disk cache directory: a `tmp` folder located next to this module
CACHE_PATH = os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    'tmp',
)
# Size limit handed to the disk cache: 64 MB, counted in decimal megabytes
CACHE_SIZE = 64 * 1000 * 1000

class Cache:
    """Thin wrapper around a `diskcache` cache stored at `CACHE_PATH`."""

    # Sentinel that tells "key is absent" apart from a cached falsy value
    _MISSING = object()

    def __init__(self):
        self.cache = DiskCache(CACHE_PATH, size_limit=CACHE_SIZE)

    def get(self, key):
        """Return the value stored under `key`, or None when it is absent"""
        value = self.cache.get(key)

        # Compare against None so falsy cached values (b'', 0) still log a hit
        if value is not None:
            logging.debug('Hit cache key %s', key)

        return value

    def clear(self):
        """Remove every entry; returns whatever the underlying cache returns"""
        return self.cache.clear()

    def set(self, key, value):
        """Store `value` under `key`; returns whatever the underlying cache returns"""
        return self.cache.set(key, value)

    def get_or(self, key, _or):
        """Get a key's value, or use function's return value to set"""
        # One lookup instead of `key in cache` followed by `cache[key]`: an
        # entry removed between those two calls would raise KeyError.
        value = self.cache.get(key, default=self._MISSING)

        if value is not self._MISSING:
            logging.debug('Hit cache key %s', key)
            return value

        value = _or()
        self.cache.set(key, value)
        return value

# Module-level instance, created once when this module is first imported
cache = Cache()
26 changes: 26 additions & 0 deletions portable_wisdom/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# HTML elements that are preserved
ALLOWED_TAGS = (
    'p',
    'b',
    'i',
    'blockquote',
    'strong',
    'em',
    'figure',
    'figcaption',
    'img',
)

# HTML element attributes that are preserved
ALLOWED_ATTRIBUTES = ('src',)

# Instapaper configuration (all empty by default)
INSTAPAPER_KEY = ''
INSTAPAPER_SECRET = ''
INSTAPAPER_LOGIN = ''
INSTAPAPER_PASSWORD = ''

# Maximum number of articles to include
ARTICLE_LIMIT = 25

# Maximum dimensions of embedded images, as a 2-tuple
IMAGE_MAX_SIZE = (600, 600)

# Name of stylesheet to use
STYLE = 'nook-glowlight-3'

# Debug mode (off by default)
DEBUG = False

# Verbose mode (off by default)
VERBOSE = False
91 changes: 91 additions & 0 deletions portable_wisdom/epub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
from bs4 import BeautifulSoup
from .cache import cache
from urllib.parse import urlparse
import os
from ebooklib import epub
import io
import logging
import requests
from PIL import Image
from .config import *

# Seconds to wait on a remote image before giving up on it
_DOWNLOAD_TIMEOUT = 30


def _image_digest(src):
    """Return a hex digest of `src` that is identical across interpreter runs"""
    # Function-scope import keeps the module's import block untouched
    import hashlib

    return hashlib.sha1(src.encode('utf-8', 'replace')).hexdigest()


def _make_thumbnail(src, image_format):
    """Download `src`; return greyscale thumbnail bytes, or None on failure"""
    try:
        logging.info('Downloading image %s', src)
        response = requests.get(src, timeout=_DOWNLOAD_TIMEOUT)
        response.raise_for_status()
        content = response.content
    except requests.exceptions.RequestException as e:
        # Base class of all `requests` errors: also covers relative or
        # malformed URLs, timeouts and HTTP error statuses, so one bad image
        # cannot abort the whole book.
        logging.error('Skipping image %s (%s)', src, e)
        return None

    thumbnail = io.BytesIO()

    try:
        # Create smaller, greyscale image from source image
        # convert to `RGBA` before `L` or Pillow will complain
        im = Image.open(io.BytesIO(content)).convert('RGBA')
        im.thumbnail(IMAGE_MAX_SIZE)
        im = im.convert('L')
        im.save(thumbnail, image_format)
    except OSError as e:
        logging.error('Skipping image %s (%s)', src, e)
        return None

    return thumbnail.getvalue()


def embed_images(book):
    """Embeds remote images in EPUB HTML chapters

    Every `img` in each `EpubHtml` item of `book` is either removed (missing,
    `denied:` or `data:` source) or downloaded, shrunk to `IMAGE_MAX_SIZE`,
    converted to greyscale, added to `book` and re-pointed at the embedded
    copy. Thumbnails are cached by source URL and maximum size. An image that
    cannot be downloaded or decoded is logged and its `img` is left unchanged.
    The book is modified in place; nothing is returned.
    """
    image_names = set()

    # Iterate over a snapshot: `book.add_item` below is called while looping
    for item in list(book.items):
        # Exact type check kept from the original (presumably so that
        # `EpubHtml` subclasses are left alone -- confirm before relaxing)
        if type(item) is not epub.EpubHtml:
            continue

        # Parse HTML, find `img` elements
        soup = BeautifulSoup('<html><body>%s</body></html>' % item.content, 'html5lib')

        for img in soup.find_all('img'):
            src = img.get('src')

            # Remove junk images
            if not src or src.startswith(('denied:', 'data:')):
                img.decompose()
                continue

            ext = os.path.splitext(urlparse(src).path)[1]
            image_format = 'png' if ext == '.png' else 'jpeg'

            # Built-in `hash()` is randomised per process and may be negative,
            # so names and ids are derived from a stable digest instead. The
            # extension follows the format actually written, not the URL.
            digest = _image_digest(src)
            name = digest + ('.png' if image_format == 'png' else '.jpg')

            if name not in image_names:
                thumbnail_hash = src + str(IMAGE_MAX_SIZE)
                thumbnail_bytes = cache.get(thumbnail_hash)

                if not thumbnail_bytes:
                    thumbnail_bytes = _make_thumbnail(src, image_format)
                    if thumbnail_bytes is None:
                        continue
                    cache.set(thumbnail_hash, thumbnail_bytes)

                # Create `EpubImage` wrapper object
                image = epub.EpubImage()
                image.id = 'img_' + digest
                image.file_name = name
                image.content = thumbnail_bytes
                book.add_item(image)
                image_names.add(name)

            img['style'] = 'max-width: 100%'
            img['src'] = name

        item.content = str(soup.body)
45 changes: 45 additions & 0 deletions portable_wisdom/source.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import os.path
from .config import *
from ebooklib import epub
import logging

# Directory named `styles` located next to this module
STYLES_PATH = os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    'styles',
)

class Source:
    """Base class for article sources that can be rendered as an EPUB

    `to_epub` calls `self.get_articles()` and reads the `name` attribute of
    the instance's class, so subclasses are expected to provide both.
    """

    def to_epub(self, style=None):
        """Generate `EpubBook` from result of `get_articles`

        `style` is the name of a stylesheet inside `STYLES_PATH`, with or
        without the `.css` suffix; when falsy, `STYLE` is used. Returns the
        assembled `EpubBook`.
        """
        if not style:
            style = STYLE

        logging.info('Creating book using %s style', style)
        articles = self.get_articles()

        book = epub.EpubBook()
        book.set_title(self.__class__.name)

        # Create HTML file for each article
        chapters = []
        for i, article in enumerate(articles):
            chapter = epub.EpubHtml(uid=str(i), title=article.title, file_name=('%d.xhtml' % i))
            chapter.content = (
                '<html><head><link rel="stylesheet" href="style/default.css" /></head><body>'
                + ('<h1>%s</h1>' % article.title)
                + article.content
                + '</body></html>'
            )
            chapters.append(chapter)
            book.add_item(chapter)

        # Add generic book metadata
        # A real list, not `map`: a map object is a one-shot iterator in
        # Python 3, so any second traversal of the TOC would see it empty.
        book.toc = [epub.Link(c.get_name(), c.title, str(c.get_id())) for c in chapters]
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())

        # Add stylesheet
        if not style.endswith('.css'):
            style = style + '.css'

        style_path = os.path.join(STYLES_PATH, style)
        # Explicit encoding so the read does not depend on the platform default
        with open(style_path, encoding='utf-8') as f:
            nav_css = epub.EpubItem(uid="style_nav", file_name="style/default.css", media_type="text/css", content=f.read())

        book.add_item(nav_css)

        book.spine = ['nav'] + chapters

        return book
1 change: 1 addition & 0 deletions portable_wisdom/sources/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .instapaper import Instapaper
33 changes: 33 additions & 0 deletions portable_wisdom/sources/instapaper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import logging
import os.path
from ..cache import cache
from ..article import Article
from ..config import *
from ..source import Source
from pyinstapaper.instapaper import Instapaper as PInstapaper

class Instapaper(Source):
    """Source backed by the unread bookmarks of an Instapaper account"""

    name = 'Instapaper'

    def get_articles(self):
        """Log in, fetch unread bookmarks and return them as a list of Articles"""
        client = PInstapaper(INSTAPAPER_KEY, INSTAPAPER_SECRET)
        client.login(INSTAPAPER_LOGIN, INSTAPAPER_PASSWORD)

        # Never ask for more than 25 bookmarks; a smaller, truthy
        # ARTICLE_LIMIT lowers the number further
        hard_cap = 25
        limit = min(hard_cap, ARTICLE_LIMIT) if ARTICLE_LIMIT else hard_cap

        bookmarks = client.get_bookmarks('unread', limit)
        articles = []

        for bookmark in bookmarks:
            def fetch_text(bookmark=bookmark):
                # Only invoked by the cache when the bookmark's text is not stored yet
                return bookmark.get_text()['data'].decode()

            body = cache.get_or(bookmark.hash, fetch_text)
            articles.append(Article(title=bookmark.title, content=body))

        logging.info('Retrieved %d articles', len(articles))
        return articles
Loading

0 comments on commit 5d8e152

Please sign in to comment.