From d4900b902beb5085691df4846b3d1b60e8d60c84 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 13 Aug 2023 11:43:42 +0530 Subject: [PATCH 1/4] The Hindu update --- recipes/business_standard.recipe | 11 ++++++----- recipes/hindu.recipe | 23 +++++++++++++++++++---- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/recipes/business_standard.recipe b/recipes/business_standard.recipe index 52005da36d5f..c6b171d1bab6 100644 --- a/recipes/business_standard.recipe +++ b/recipes/business_standard.recipe @@ -95,11 +95,12 @@ class BusinessStandard(BasicNewsRecipe): if 'multiple_authors_name' in data: auth = '
' + data['multiple_authors_name'] + ' | ' + data['placeName'] + ' | ' + date + '
'.format(img_url) - else: - lede = '
'.format(data['featuredImageObj']['url']) + if 'featuredImageObj' in data: + if 'url' in data['featuredImageObj']: + if img_url is not None: + lede = '
'.format(img_url) + else: + lede = '
'.format(data['featuredImageObj']['url']) if 'alt_text' in data['featuredImageObj']: caption = '' + data['featuredImageObj']['alt_text'] + '
' diff --git a/recipes/hindu.recipe b/recipes/hindu.recipe index ed8ade80e542..35650d11c9b0 100644 --- a/recipes/hindu.recipe +++ b/recipes/hindu.recipe @@ -4,7 +4,6 @@ from collections import defaultdict from datetime import date from calibre.web.feeds.news import BasicNewsRecipe, classes - def absurl(url): if url.startswith('/'): url = 'https://www.thehindu.com' + url @@ -16,11 +15,16 @@ local_edition = None # For past editions, set date to, for example, '2023-01-28' past_edition = None +is_monday = date.today().weekday() == 0 +is_friday = date.today().weekday() == 4 +is_saturday = date.today().weekday() == 5 is_sunday = date.today().weekday() == 6 if past_edition: year, month, day = (int(x) for x in past_edition.split('-')) dt = date(year, month, day) + is_monday = dt.weekday() == 0 + is_saturday = dt.weekday() == 5 is_sunday = dt.weekday() == 6 class TheHindu(BasicNewsRecipe): @@ -75,13 +79,23 @@ class TheHindu(BasicNewsRecipe): today = past_edition self.log('Downloading past edition of', local_edition + ' from ' + today) url = absurl('/todays-paper/' + today + '/' + local_edition + '/') + if is_monday: + mag_url = url + '?supplement=' + local_edition + '-epbs' + if is_saturday: + mag_url = url + '?supplement=' + local_edition + '-mp' if is_sunday: mag_url = url + '?supplement=' + local_edition + '-sm' else: url = 'https://www.thehindu.com/todays-paper/' + if is_monday: + mag_url = url + '?supplement=th_chennai-epbs' + if is_friday: + mag_url = url + '?supplement=th_chennai-fr' + if is_saturday: + mag_url = url + '?supplement=th_chennai-mp' if is_sunday: mag_url = url + '?supplement=th_chennai-sm' - + raw = self.index_to_soup(url, raw=True) soup = self.index_to_soup(raw) ans = self.hindu_parse_index(soup) @@ -93,17 +107,18 @@ class TheHindu(BasicNewsRecipe): 'The Hindu Newspaper is not published Today.' ) if mag_url: - self.log('\nFetching Sunday Magazine') + self.log('\nFetching Magazine') soup = self.index_to_soup(mag_url) ans2 = self.hindu_parse_index(soup) if ans2: return ans + ans2 + self.log('\tMagazine not Found') return ans return ans def hindu_parse_index(self, soup): for script in soup.findAll('script'): - if not self.tag_to_string(script).strip().startswith('let grouped_articles = {}'): + if not self.tag_to_string(script).__contains__('grouped_articles = {"'): continue if script is not None: art = re.search(r'grouped_articles = ({\".*)', self.tag_to_string(script)) From b3b8d274a842a623963600b62f811737c99d4b51 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 13 Aug 2023 12:42:08 +0530 Subject: [PATCH 2/4] Update epoch_times.recipe --- recipes/epoch_times.recipe | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/recipes/epoch_times.recipe b/recipes/epoch_times.recipe index 785f76242d22..f4eea11d3d99 100644 --- a/recipes/epoch_times.recipe +++ b/recipes/epoch_times.recipe @@ -18,19 +18,16 @@ class EpochTimes(BasicNewsRecipe): masthead_url = 'https://epochtimes-ny.newsmemory.com/eeLayout/epochtimes/1.0.a/images/webapp/banner.png' extra_css = """ body{font-family: Arial,sans-serif } - .featured_caption{font-size: small} - .author,.date{font-size: small} """ keep_only_tags = [ - classes('post-main'), + dict(name='article') ] remove_tags = [ - classes('print:hidden h-header'), + classes('print:hidden h-header shortcode aspect-square'), dict(name='button'), ] - # - # feeds can be found at https://www.theepochtimes.com/rssfeeds + # feeds can be found at https://www.theepochtimes.com/rssfeeds feeds = [ ('Special Series', 'https://feed.theepochtimes.com/health/special-series/feed'), ('US', 'https://feed.theepochtimes.com/us/feed'), @@ -48,11 +45,4 @@ class EpochTimes(BasicNewsRecipe): def preprocess_html(self, soup): for img in soup.findAll('img', attrs={'data-src': True}): img['src'] = img['data-src'] - title = soup.find(attrs={'class': 'post_title'}) - fi = soup.find(attrs={'class': 'featured_image'}) - if title is not None and fi is not None: - title.extract() - fi.insert_before(title) - for div in soup.findAll(**classes('post-main'))[1:]: - div.extract() return soup From 474417cf451eb4146cbb34abc3f049ea4feeaa5e Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 13 Aug 2023 12:45:16 +0530 Subject: [PATCH 3/4] Update focus_de.recipe --- recipes/focus_de.recipe | 58 ++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/recipes/focus_de.recipe b/recipes/focus_de.recipe index b96992372748..e111f8cc1ce2 100644 --- a/recipes/focus_de.recipe +++ b/recipes/focus_de.recipe @@ -1,27 +1,35 @@ -#!/usr/bin/env python -# vim:fileencoding=utf-8 -from __future__ import unicode_literals, division, absolute_import, print_function - ''' focus.de ''' -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe, classes class AdvancedUserRecipe1305567197(BasicNewsRecipe): title = 'Focus (DE)' - __author__ = 'Anonymous' - description = 'RSS-Feeds von Focus.de' + __author__ = 'unkn0wn' + description = 'RSS-Feeds von Focus.de, best downloaded at the end of the week.' language = 'de' oldest_article = 7 - max_articles_per_feed = 100 + max_articles_per_feed = 25 no_stylesheets = True remove_javascript = True use_embedded_content = False remove_empty_feeds = True ignore_duplicate_articles = {'title', 'url'} + remove_attributes = ['width', 'height', 'style'] + masthead_url = 'https://www.focus-magazin.de/img/Focus_Logo.jpg' + extra_css = ''' + .posMarker_oh { font-size:small; color:#404040; } + .posMarker_he { font-size:large; font-weight:bold; } + .leadIn { font-style:italic; color:#202020; } + .caption { text-align:center; font-size:small; } + .authorMeta, .displayDate { font-size:small; } + ''' + + def get_cover_url(self): + soup = self.index_to_soup('https://www.focus-magazin.de/') + return soup.find('img', attrs={'class':'main-cover'})['src'] feeds = [ ('Politik', 'http://rss.focus.de/politik/'), @@ -29,33 +37,25 @@ class AdvancedUserRecipe1305567197(BasicNewsRecipe): ('Gesundheit', 'http://rss.focus.de/gesundheit/'), ('Panorama', 'http://rss.focus.de/panorama/'), ('Digital', 'http://rss.focus.de/digital/'), - ('Reisen', 'http://rss.focus.de/reisen/') + ('Reisen', 'http://rss.focus.de/reisen/'), + ('Andere', 'http://rss.focus.de') ] keep_only_tags = [ - dict(name='div', attrs={'id': 'article'}) + classes('articleHead articleContent') ] remove_tags = [ - dict(name='div', attrs={'class': ['inimagebuttons', - 'kolumneHead clearfix']}) + dict(name=['svg', 'script']), + classes('socFbLikeShare video social_frame'), + dict(attrs={'id': 'article-social-holder'}) ] - remove_attributes = ['width', 'height'] - - extra_css = 'h1 {font-size: 1.6em; text-align: left; margin-top: 0em} \ - h2 {font-size: 1em; text-align: left} \ - .overhead {margin-bottom: 0em} \ - .caption {font-size: 0.6em}' - - def print_version(self, url): - return url + '?drucken=1' - def preprocess_html(self, soup): - # remove useless references to videos - for item in soup.findAll('h2'): - if item.string: - txt = item.string.upper() - if txt.startswith('IM VIDEO:') or txt.startswith('VIDEO:'): - item.extract() + if h1 := soup.find(attrs={'class':'articleIdentH1'}): + h1.name = 'h1' + if he := soup.find(**classes('posMarker_he')): + he.name = 'div' + for img in soup.findAll('img', attrs={'data-src':True}): + img['src'] = img['data-src'] return soup From 25948f59f0f33b681432cea8ad71dbbb2f253cdf Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 13 Aug 2023 13:04:23 +0530 Subject: [PATCH 4/4] ... --- recipes/epoch_times.recipe | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/recipes/epoch_times.recipe b/recipes/epoch_times.recipe index f4eea11d3d99..79030de0fc4e 100644 --- a/recipes/epoch_times.recipe +++ b/recipes/epoch_times.recipe @@ -12,13 +12,11 @@ class EpochTimes(BasicNewsRecipe): oldest_article = 1.2 max_articles_per_feed = 20 ignore_duplicate_articles = {'url'} - remove_attributes = ['height', 'width'] + remove_attributes = ['height', 'width', 'style'] remove_empty_feeds = True resolve_internal_links = True masthead_url = 'https://epochtimes-ny.newsmemory.com/eeLayout/epochtimes/1.0.a/images/webapp/banner.png' - extra_css = """ - body{font-family: Arial,sans-serif } - """ + keep_only_tags = [ dict(name='article') ]