multy_feed.py
from calibre import unicode_path, __appname__
from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.recipes import AutomaticNewsRecipe, BasicNewsRecipe
from calibre.web.fetch.simple import RecursiveFetcher
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
from calibre.utils.filenames import ascii_filename
import subprocess
import traceback
import tempfile
import urlparse
import urllib2
import shutil
import time
import os
import re
from cStringIO import StringIO
from BeautifulSoup import BeautifulSoup
from PIL import Image
#======= helper functions ===========
# Image paths can't be changed during HTML postprocessing, so this work is done with regexps
xkcd_title_re = re.compile(r'(<img.*title=")([^"]+)(".*>)')
def refactor_xkcd_image(tag):
    """ xkcd has every image titled with a funny phrase. Add these phrases below images. """
    if not isinstance(tag, basestring):
        tag = tag.group(0)
    m = xkcd_title_re.match(tag)
    result = '<center>%s%s<br /><i>%s</i></center>' % (m.group(1), m.group(3), m.group(2))
    # This is a bugfix for the "what if" feed
    return result.replace('src="//', 'src="http://')
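# Usage note (illustrative, not part of the original code): refactor_xkcd_image
# is meant to be used as a re.sub callback over the raw article HTML, e.g.:
#   html = xkcd_title_re.sub(refactor_xkcd_image, html)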
def substitute_latex(match):
    """ Replace a URL-quoted LaTeX fragment with an image (or HTML) rendered by texvc. """
    # FIXME wipe temporary files after work
    TEMP_PATH = tempfile.gettempdir()  # tempfile.tempdir may still be None at this point
    if match.group(0)[1] == '[':
        # Display math: center it
        prefix = "<center>"
        postfix = "</center>"
    else:
        prefix = ""
        postfix = ""
    latex = urllib2.unquote(match.group(1))
    transform_result = subprocess.check_output(["texvc", TEMP_PATH, TEMP_PATH, latex])
    # texvc prints a status character, then a 32-character hash and optional HTML
    if transform_result[0] == "+":
        return prefix + '<img src="file://{0}/{1}.png" alt="{2}">'.format(TEMP_PATH, transform_result[1:], match.group(1)) + postfix
    if transform_result[0] in "cmlCML":
        return prefix + transform_result[33:] + postfix
    return match.group(0)
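# Usage note (illustrative; the actual pattern lives outside this file):
# substitute_latex is designed as a re.sub callback over article HTML in which
# the LaTeX body is URL-quoted, e.g.:
#   latex_re = re.compile(r'\\[\[\(]([^\\]+)\\[\]\)]')
#   html = latex_re.sub(substitute_latex, html)
# It requires the MediaWiki `texvc` binary to be available on PATH.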
#================ Patched RecursiveFetcher ====================
class ImageFormat(object):
    """ Base class for image formats. Subclasses define `extension` and `magic()`. """
    def save(self, imgpath, data):
        """ Saves the image to imgpath. Returns True on success,
        False if the image must be skipped. """
        with open(imgpath, 'wb') as x:
            x.write(data)
        return True
class PngFormat(ImageFormat):
    extension = "png"
    def magic(self, data):
        return data[1:4] == "PNG"
class GifFormat(ImageFormat):
    extension = "gif"
    def magic(self, data):
        return data[0:3] == "GIF"
    def save(self, imgpath, data):
        if data == 'GIF89a\x01':
            # Skip empty GIF files as PIL errors on them anyway
            return False
        else:
            return super(GifFormat, self).save(imgpath, data)
class JpegFormat(ImageFormat):
    extension = "jpg"
    def magic(self, data):
        # JPEG streams start with the SOI marker 0xFF 0xD8
        return data[:2] == "\xff\xd8"
    def save(self, imgpath, data):
        """ This is the fallback format too, so we will accept any image and save it as JPEG. """
        # JPEG has no alpha channel, so convert to RGB before saving
        im = Image.open(StringIO(data)).convert('RGB')
        # FIXME Use white background for transparent images
        with open(imgpath, 'wb') as x:
            im.save(x, 'JPEG')
        return True
class RichRecursiveFetcher(RecursiveFetcher):
    """ Nearly the same as RecursiveFetcher, but can be configured
    not to repack png's and gif's into jpeg. API is compatible. """
    def __init__(self, *args, **kwargs):
        self._image_formats = kwargs.pop("image_formats", [JpegFormat()])
        super(RichRecursiveFetcher, self).__init__(*args, **kwargs)
    def process_images(self, soup, baseurl):
        diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        c = 0
        for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):
            iurl = tag['src']
            if callable(self.image_url_processor):
                iurl = self.image_url_processor(baseurl, iurl)
            if not urlparse.urlsplit(iurl).scheme:
                iurl = urlparse.urljoin(baseurl, iurl, False)
            with self.imagemap_lock:
                if self.imagemap.has_key(iurl):
                    tag['src'] = self.imagemap[iurl]
                    continue
            #==== Changes begin here ====
            try:
                data = self.fetch_url(iurl)
            except Exception:
                self.log.exception('Could not fetch image ', iurl)
                continue
            c += 1
            fname = ascii_filename('img' + str(c))
            # Hm. Does ascii_filename return unicode names? Not touching.
            if isinstance(fname, unicode):
                fname = fname.encode('ascii', 'replace')
            for image_format in self._image_formats:
                # Use the last format as a fallback
                if image_format.magic(data) or image_format == self._image_formats[-1]:
                    imgpath = os.path.join(diskpath, fname + "." + image_format.extension)
                    try:
                        with self.imagemap_lock:
                            self.imagemap[iurl] = imgpath
                        if not image_format.save(imgpath, data):
                            break
                    except:
                        traceback.print_exc()
                        break
                    tag['src'] = imgpath
                    break
#================ end of patched RecursiveFetcher ====================
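# Usage note (illustrative, mirroring the call in _fetch_article below):
# RichRecursiveFetcher takes the same arguments as RecursiveFetcher plus an
# optional `image_formats` keyword; formats are tried in declaration order and
# the last one also acts as the catch-all fallback, e.g.:
#   fetcher = RichRecursiveFetcher(web2disk_options, log, image_map, css_map,
#                                  job_info,
#                                  image_formats=[PngFormat(), GifFormat(), JpegFormat()])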
class DownloadedArticlesList(object):
    """ Remembers which articles were downloaded so as not to add them
    to the export a second time. """
    LIST_SIZE_LIMIT = 5000
    def __init__(self, file):
        self._file = file
        self._url_list = []
        self._url_set = set()
        try:
            with open(self._file, "r") as f:
                for line in f:
                    line = line.strip()
                    self._url_list.append(line)
                    self._url_set.add(line)
        except IOError:
            pass
    def __contains__(self, url):
        return url in self._url_set
    def add(self, url):
        self._url_list.append(url)
        self._url_set.add(url)
    def close(self):
        # Persist only the most recent LIST_SIZE_LIMIT URLs
        with open(self._file, "w") as f:
            self._url_list = self._url_list[-DownloadedArticlesList.LIST_SIZE_LIMIT:]
            for url in self._url_list:
                print >>f, url
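# Usage note (illustrative): the history file is a plain-text list of URLs,
# one per line. Typical lifecycle, as used in build_index() below:
#   history = DownloadedArticlesList('/path/to/history.txt')   # path is hypothetical
#   if url not in history:
#       history.add(url)
#   history.close()   # rewrites the (truncated) list back to disk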
class MultiFeedRecipe(AutomaticNewsRecipe):
    title = u'RSS ' + time.strftime("%m %d")
    oldest_article = 7
    max_articles_per_feed = 200
    auto_cleanup = False
    ignore_duplicate_articles = None
    remove_empty_feeds = True
    # Do not rescale images
    scale_news_images_to_device = False
    compress_news_images = False
    compress_news_images_auto_size = None
    def build_index(self):
        #========== added =========
        downloaded_list = DownloadedArticlesList(self.download_history_file)
        #==========================
        self.report_progress(0, _('Fetching feeds...'))
        try:
            feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
                                     max_articles_per_feed=self.max_articles_per_feed,
                                     log=self.log)
            self.report_progress(0, _('Got feeds from index page'))
        except NotImplementedError:
            feeds = self.parse_feeds()
        #========== reworked =========
        # Drop articles that were already downloaded on a previous run
        for feed in feeds:
            feed.articles = filter(lambda article: article.url not in downloaded_list, feed.articles)
        if self.ignore_duplicate_articles is not None:
            feeds = self.remove_duplicate_articles(feeds)
        # Filter out empty feeds
        feeds = filter(lambda feed: len(feed.articles), feeds)
        if not feeds:
            raise ValueError('No articles found, aborting')
        #=============================
        #feeds = FeedCollection(feeds)
        self.has_single_feed = len(feeds) == 1
        index = os.path.join(self.output_dir, 'index.html')
        html = self.feeds2index(feeds)
        with open(index, 'wb') as fi:
            fi.write(html)
        self.jobs = []
        if self.reverse_article_order:
            for feed in feeds:
                if hasattr(feed, 'reverse'):
                    feed.reverse()
        self.feed_objects = feeds
        for f, feed in enumerate(feeds):
            feed_dir = os.path.join(self.output_dir, 'feed_%d' % f)
            if not os.path.isdir(feed_dir):
                os.makedirs(feed_dir)
            for a, article in enumerate(feed):
                #========== refactored =========
                art_dir = os.path.join(feed_dir, 'article_%d' % a)
                if not os.path.isdir(art_dir):
                    os.makedirs(art_dir)
                downloaded_list.add(article.url)
                url = self.feed_settings[feed.title].print_version_url(article.url)
                req = WorkRequest(
                    self.feed_settings[feed.title].fetch,
                    (self, article, url, art_dir, f, a, len(feed)),
                    {}, (f, a), self.article_downloaded,
                    self.error_in_article_download)
                #===============================
                req.feed = feed
                req.article = article
                req.feed_dir = feed_dir
                self.jobs.append(req)
        self.jobs_done = 0
        tp = ThreadPool(self.simultaneous_downloads)
        for req in self.jobs:
            tp.putRequest(req, block=True, timeout=0)
        self.report_progress(0, _('Starting download [%d thread(s)]...') % self.simultaneous_downloads)
        while True:
            try:
                tp.poll()
                time.sleep(0.1)
            except NoResultsPending:
                break
        for f, feed in enumerate(feeds):
            html = self.feed2index(f, feeds)
            feed_dir = os.path.join(self.output_dir, 'feed_%d' % f)
            with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
                fi.write(html)
        self.create_opf(feeds)
        self.report_progress(1, _('Feeds downloaded to %s') % index)
        #========== added =========
        downloaded_list.close()
        #==========================
        return index
    def _fetch_article(self, url, dir_, f, a, num_of_feeds):
        br = self.browser
        if self.get_browser.im_func is BasicNewsRecipe.get_browser.im_func:
            # We are using the default get_browser, which means no need to
            # clone
            br = BasicNewsRecipe.get_browser(self)
        else:
            br = self.clone_browser(self.browser)
        self.web2disk_options.browser = br
        # ============== Here is the only change =================
        fetcher = RichRecursiveFetcher(self.web2disk_options, self.log,
                                       self.image_map, self.css_map,
                                       (url, f, a, num_of_feeds),
                                       image_formats=[PngFormat(), GifFormat(), JpegFormat()])
        # ========================================================
        fetcher.browser = br
        fetcher.base_dir = dir_
        fetcher.current_dir = dir_
        fetcher.show_progress = False
        fetcher.image_url_processor = self.image_url_processor
        res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
        if not res or not os.path.exists(res):
            msg = _('Could not fetch article.') + ' '
            if self.debug:
                msg += _('The debug traceback is available earlier in this log')
            else:
                msg += _('Run with -vv to see the reason')
            raise Exception(msg)
        return res, path, failures
    def _postprocess_html(self, soup, first_fetch, job_info):
        if self.no_stylesheets:
            for link in list(soup.findAll('link', type=re.compile('css'))) + list(soup.findAll('style')):
                link.extract()
        head = soup.find('head')
        if not head:
            head = soup.find('body')
        if not head:
            head = soup.find(True)
        style = BeautifulSoup(u'<style type="text/css" title="override_css">%s</style>' % (
            self.template_css + '\n\n' + (self.extra_css if self.extra_css else ''))).find('style')
        head.insert(len(head.contents), style)
        if first_fetch and job_info:
            url, f, a, feed_len = job_info
            body = soup.find('body')
            if body is not None:
                templ = self.navbar.generate(False, f, a, feed_len,
                                             not self.has_single_feed,
                                             url, __appname__,
                                             center=self.center_navbar,
                                             extra_css=self.extra_css)
                elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                body.insert(0, elem)
        if self.remove_javascript:
            for script in list(soup.findAll('script')):
                script.extract()
            for o in soup.findAll(onload=True):
                del o['onload']
        for script in list(soup.findAll('noscript')):
            script.extract()
        for attr in self.remove_attributes:
            for x in soup.findAll(attrs={attr: True}):
                del x[attr]
        for base in list(soup.findAll(['base', 'iframe', 'canvas', 'embed',
                                       'command', 'datalist', 'video', 'audio'])):
            base.extract()
        # ============== Here is the only change =================
        # Soup seems to be rotten. Don't know why. Recook it.
        soup = BeautifulSoup(str(soup))
        ans = self.postprocess_html(soup, first_fetch)
        if first_fetch and job_info:
            postprocessor = self.feed_settings[self.feed_objects[f].title].postprocess_html
            ans = postprocessor(soup, first_fetch)
        # ========================================================
        # Nuke HTML5 tags
        for x in ans.findAll(['article', 'aside', 'header', 'footer', 'nav',
                              'figcaption', 'figure', 'section']):
            x.name = 'div'
        if job_info:
            url, f, a, feed_len = job_info
            try:
                article = self.feed_objects[f].articles[a]
            except:
                self.log.exception('Failed to get article object for postprocessing')
            else:
                self.populate_article_metadata(article, ans, first_fetch)
        return ans
    def default_cover(self, cover_file):
        # The default cover weighs too much
        pass
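# ---------------------------------------------------------------------------
# Note on external dependencies of MultiFeedRecipe (not defined in this file,
# presumably supplied by a subclass or by whatever script builds the recipe):
#   * self.download_history_file -- path of the plain-text URL history that
#     DownloadedArticlesList reads and rewrites;
#   * self.feed_settings -- a mapping from feed title to a per-feed settings
#     object exposing print_version_url(url), fetch(recipe, article, url,
#     art_dir, f, a, feed_len) and postprocess_html(soup, first_fetch), as
#     used in build_index() and _postprocess_html() above.
# A minimal, purely hypothetical sketch of such a settings object (the names
# below are illustrative and not part of the original code):
#
#   class PlainFeedSettings(object):
#       def print_version_url(self, url):
#           return url
#       def fetch(self, recipe, article, url, art_dir, f, a, feed_len):
#           return recipe.fetch_article(url, art_dir, f, a, feed_len)
#       def postprocess_html(self, soup, first_fetch):
#           return soup
# ---------------------------------------------------------------------------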