From 1e6a633d09d0e5f6a5a44f24b12216e464f3f8b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Tue, 1 Aug 2017 08:42:26 +0000 Subject: [PATCH 01/23] Create model for Podcast update results --- mygpo/data/models.py | 45 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 mygpo/data/models.py diff --git a/mygpo/data/models.py b/mygpo/data/models.py new file mode 100644 index 000000000..879a3b045 --- /dev/null +++ b/mygpo/data/models.py @@ -0,0 +1,45 @@ +from datetime import datetime + +from django.db import models + +from mygpo.podcasts.models import Podcast + + +class PodcastUpdateResult(models.Model): + """ Results of a podcast update + + Once an instance is stored, the update is assumed to be finished. """ + + # The podcast that was updated + podcast = models.ForeignKey(Podcast, on_delete=models.CASCADE) + + # The timestamp at which the updated started to be executed + start = models.DateTimeField(default=datetime.utcnow) + + # The duration of the update + duration = models.DurationField() + + # A flad indicating whether the update was successful + successful = models.BooleanField() + + # An error message. Should be empty if the update was successful + error_message = models.TextField() + + # A flag indicating whether the update created the podcast + podcast_created = models.BooleanField() + + # The number of episodes that were created by the update + episodes_added = models.IntegerField() + + class Meta(object): + + get_latest_by = 'start' + + order_with_respect_to = 'podcast' + + ordering = ['-start'] + + indexes = [ + models.Index(fields=['podcast', 'start']) + ] + From 4a7e5fc50cceba82df60deb2133ff74d1cb2ef4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sun, 6 Aug 2017 15:26:05 +0200 Subject: [PATCH 02/23] Remove order_with_respect_to from PodcastUpdateResult It cannot be used together with 'ordering' (models.E021) --- mygpo/data/models.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mygpo/data/models.py b/mygpo/data/models.py index 879a3b045..d74a74a50 100644 --- a/mygpo/data/models.py +++ b/mygpo/data/models.py @@ -35,8 +35,6 @@ class Meta(object): get_latest_by = 'start' - order_with_respect_to = 'podcast' - ordering = ['-start'] indexes = [ From e948fa0c24ebfe83d2df81f729b5bcc9b4b971b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sun, 6 Aug 2017 16:54:17 +0200 Subject: [PATCH 03/23] Use UUID as primary key of PodcastUpdateResult --- mygpo/data/models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mygpo/data/models.py b/mygpo/data/models.py index d74a74a50..7958f2b5c 100644 --- a/mygpo/data/models.py +++ b/mygpo/data/models.py @@ -2,10 +2,11 @@ from django.db import models +from mygpo.core.models import UUIDModel from mygpo.podcasts.models import Podcast -class PodcastUpdateResult(models.Model): +class PodcastUpdateResult(UUIDModel): """ Results of a podcast update Once an instance is stored, the update is assumed to be finished. """ From e7159ebea49afa0ad4bbfb162a7825ec51b90e51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sun, 6 Aug 2017 16:54:28 +0200 Subject: [PATCH 04/23] Create migration for PodcastUpdateResult --- mygpo/data/migrations/0001_initial.py | 48 +++++++++++++++++++++++++++ mygpo/data/migrations/__init__.py | 0 2 files changed, 48 insertions(+) create mode 100644 mygpo/data/migrations/0001_initial.py create mode 100644 mygpo/data/migrations/__init__.py diff --git a/mygpo/data/migrations/0001_initial.py b/mygpo/data/migrations/0001_initial.py new file mode 100644 index 000000000..4a4399168 --- /dev/null +++ b/mygpo/data/migrations/0001_initial.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.4 on 2017-08-06 14:46 +from __future__ import unicode_literals + +import datetime +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ('podcasts', '0037_index_podcast_lastupdate'), + ] + + operations = [ + migrations.CreateModel( + name='PodcastUpdateResult', + fields=[ + ('id', models.UUIDField(primary_key=True, serialize=False)), + ('start', models.DateTimeField( + default=datetime.datetime.utcnow, + )), + ('duration', models.DurationField()), + ('successful', models.BooleanField()), + ('error_message', models.TextField()), + ('podcast_created', models.BooleanField()), + ('episodes_added', models.IntegerField()), + ('podcast', models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to='podcasts.Podcast', + )), + ], + options={ + 'get_latest_by': 'start', + 'ordering': ['-start'], + }, + ), + migrations.AddIndex( + model_name='podcastupdateresult', + index=models.Index( + fields=['podcast', 'start'], + name='data_podcas_podcast_cf4cc1_idx', + ), + ), + ] diff --git a/mygpo/data/migrations/__init__.py b/mygpo/data/migrations/__init__.py new file mode 100644 index 000000000..e69de29bb From 0662ae39d7b13663981851d94c87637ab5cd62e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sat, 2 Dec 2017 23:10:47 +0100 Subject: [PATCH 05/23] Refactor podcast updates, create PodcastUpdater class --- mygpo/data/feeddownloader.py | 778 ++++++++++-------- .../management/commands/feed-downloader.py | 3 +- mygpo/directory/search.py | 6 +- mygpo/maintenance/management/podcastcmd.py | 14 +- mygpo/share/views.py | 5 +- 5 files changed, 432 insertions(+), 374 deletions(-) diff --git a/mygpo/data/feeddownloader.py b/mygpo/data/feeddownloader.py index 9572eaa61..42b5898a3 100755 --- a/mygpo/data/feeddownloader.py +++ b/mygpo/data/feeddownloader.py @@ -2,7 +2,9 @@ # -*- coding: utf-8 -*- import os.path -import urllib.request, urllib.error, urllib.parse +import urllib.request +import urllib.error +import urllib.parse from urllib.parse import urljoin import http.client import hashlib @@ -54,7 +56,8 @@ def update_podcasts(queue): continue try: - yield update_podcast(podcast_url) + updater = PodcastUpdater(podcast_url) + yield updater.update_podcast() except NoPodcastCreated as npc: logger.info('No podcast created: %s', npc) @@ -65,444 +68,493 @@ def update_podcasts(queue): raise -def update_podcast(podcast_url): - """ Update the podcast for the supplied URL """ +class PodcastUpdater(object): + """ Updates the podcast specified by the podcast_url """ - try: - parsed = _fetch_feed(podcast_url) - _validate_parsed(parsed) + def __init__(self, podcast_url): + self.podcast_url = podcast_url - except requests.exceptions.RequestException as re: - logging.exception('Error while fetching response from feedservice') + def update_podcast(self): + """ Update the podcast """ - # if we fail to parse the URL, we don't even create the - # podcast object - try: - p = Podcast.objects.get(urls__url=podcast_url) - # if it exists already, we mark it as outdated - _mark_outdated(p, 'error while fetching feed: %s' % str(re)) - p.last_update = datetime.utcnow() - p.save() - return p + parsed = self.parse_feed() + if not parsed: + return + + podcast = Podcast.objects.get_or_create_for_url(self.podcast_url) + + episode_updater = MultiEpisodeUpdater(podcast) + episode_updater.update_episodes(parsed.get('episodes', [])) + + podcast.refresh_from_db() + podcast.episode_count = Episode.objects.filter(podcast=podcast).count() + podcast.save() + + episode_updater.order_episodes() - except Podcast.DoesNotExist: - raise NoPodcastCreated(re) + self._update_podcast(podcast, parsed, episode_updater) - except NoEpisodesException as nee: - logging.warn('No episode found while parsing podcast') + return podcast - # if we fail to parse the URL, we don't even create the - # podcast object + def parse_feed(self): try: - p = Podcast.objects.get(urls__url=podcast_url) - # if it exists already, we mark it as outdated - _mark_outdated(p, 'error while fetching feed: %s' % str(nee)) - return p - - except Podcast.DoesNotExist: - raise NoPodcastCreated(nee) - - assert parsed, 'fetch_feed must return something' - p = Podcast.objects.get_or_create_for_url(podcast_url) - episodes = _update_episodes(p, parsed.get('episodes', [])) - p.refresh_from_db() - p.episode_count = Episode.objects.filter(podcast=p).count() - p.save() - max_episode_order = _order_episodes(p) - _update_podcast(p, parsed, episodes, max_episode_order) - return p - - -def verify_podcast_url(podcast_url): - parsed = _fetch_feed(podcast_url) - _validate_parsed(parsed) - return True + parsed = self._fetch_feed() + self._validate_parsed(parsed) + except requests.exceptions.RequestException as re: + logging.exception('Error while fetching response from feedservice') -def _fetch_feed(podcast_url): - params = { - 'url': podcast_url, - 'process_text': 'markdown', - } - headers = { - 'Accept': 'application/json', - } - url = urljoin(settings.FEEDSERVICE_URL, 'parse') - r = requests.get(url, params=params, headers=headers, timeout=30) - - if r.status_code != 200: - logger.error('Feed-service status code for "%s" was %s', podcast_url, - r.status_code) - return None + # if we fail to parse the URL, we don't even create the + # podcast object + try: + p = Podcast.objects.get(urls__url=podcast_url) + # if it exists already, we mark it as outdated + _mark_outdated(p, 'error while fetching feed: %s' % str(re)) + p.last_update = datetime.utcnow() + p.save() + return None + + except Podcast.DoesNotExist as pdne: + raise NoPodcastCreated(re) from pdne + + except NoEpisodesException as nee: + logging.warn('No episode found while parsing podcast') + + # if we fail to parse the URL, we don't even create the + # podcast object + try: + p = Podcast.objects.get(urls__url=podcast_url) + # if it exists already, we mark it as outdated + self._mark_outdated(p, 'error while fetching feed: {}'.format( + str(nee))) + return None + + except Podcast.DoesNotExist as pdne: + raise NoPodcastCreated(nee) from pdne + + return parsed + + def _fetch_feed(self): + params = { + 'url': self.podcast_url, + 'process_text': 'markdown', + } + headers = { + 'Accept': 'application/json', + } + url = urljoin(settings.FEEDSERVICE_URL, 'parse') + r = requests.get(url, params=params, headers=headers, timeout=30) + + if r.status_code != 200: + logger.error('Feed-service status code for "{}" was {}'.format( + podcast_url, r.status_code)) + return None - try: - return r.json()[0] - except ValueError: - logger.exception( - 'Feed-service error while parsing response for url "%s": %s', - podcast_url, r.text, - ) - raise - - -def _validate_parsed(parsed): - """ validates the parsed results and raises an exception if invalid - - feedparser parses pretty much everything. We reject anything that - doesn't look like a feed""" - - if not parsed or not parsed.get('episodes', []): - raise NoEpisodesException('no episodes found') - - -def _update_podcast(podcast, parsed, episodes, max_episode_order): - """ updates a podcast according to new parser results """ - - # we need that later to decide if we can "bump" a category - prev_latest_episode_timestamp = podcast.latest_episode_timestamp - - # will later be used to see whether the index is outdated - old_index_fields = get_index_fields(podcast) - - podcast.title = parsed.get('title') or podcast.title - podcast.description = parsed.get('description') or podcast.description - podcast.subtitle = parsed.get('subtitle') or podcast.subtitle - podcast.link = parsed.get('link') or podcast.link - podcast.logo_url = parsed.get('logo') or podcast.logo_url - podcast.author = to_maxlength(Podcast, 'author', parsed.get('author') or - podcast.author) - podcast.language = to_maxlength(Podcast, 'language', - parsed.get('language') or podcast.language) - podcast.content_types = ','.join(parsed.get('content_types')) or \ - podcast.content_types - #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', []) - podcast.common_episode_title = to_maxlength( - Podcast, - 'common_episode_title', - parsed.get('common_title') or podcast.common_episode_title) - podcast.new_location = parsed.get('new_location') or podcast.new_location - podcast.flattr_url = to_maxlength(Podcast, 'flattr_url', - parsed.get('flattr') or - podcast.flattr_url) - podcast.hub = parsed.get('hub') or podcast.hub - podcast.license = parsed.get('license') or podcast.license - podcast.max_episode_order = max_episode_order - - podcast.add_missing_urls(parsed.get('urls', [])) - - if podcast.new_location: try: - new_podcast = Podcast.objects.get(urls__url=podcast.new_location) - if new_podcast != podcast: - _mark_outdated(podcast, 'redirected to different podcast') - return - except Podcast.DoesNotExist: - podcast.set_url(podcast.new_location) + return r.json()[0] + except ValueError: + logger.exception( + 'Feed-service error while parsing response for url "%s": %s', + podcast_url, r.text, + ) + raise - # latest episode timestamp - episodes = Episode.objects.filter(podcast=podcast, - released__isnull=False)\ - .order_by('released') + def _validate_parsed(self, parsed): + """ validates the parsed results and raises an exception if invalid - podcast.update_interval = get_update_interval(episodes) + feedparser parses pretty much everything. We reject anything that + doesn't look like a feed""" - latest_episode = episodes.last() - if latest_episode: - podcast.latest_episode_timestamp = latest_episode.released + if not parsed or not parsed.get('episodes', []): + raise NoEpisodesException('no episodes found') - # podcast.episode_count is not update here on purpose. It is, instead, - # continuously updated when creating new episodes in - # EpisodeManager.get_or_create_for_url + def _update_podcast(self, podcast, parsed, episode_updater): + """ updates a podcast according to new parser results """ - _update_categories(podcast, prev_latest_episode_timestamp) + # we need that later to decide if we can "bump" a category + prev_latest_episode_timestamp = podcast.latest_episode_timestamp - # try to download the logo and reset logo_url to None on http errors - found = _save_podcast_logo(podcast.logo_url) - if not found: - podcast.logo_url = None + # will later be used to see whether the index is outdated + old_index_fields = get_index_fields(podcast) + podcast.title = parsed.get('title') or podcast.title + podcast.description = parsed.get('description') or podcast.description + podcast.subtitle = parsed.get('subtitle') or podcast.subtitle + podcast.link = parsed.get('link') or podcast.link + podcast.logo_url = parsed.get('logo') or podcast.logo_url - # check if search index should be considered out of date - new_index_fields = get_index_fields(podcast) - if list(old_index_fields.items()) != list(new_index_fields.items()): - podcast.search_index_uptodate = False + podcast.author = to_maxlength( + Podcast, 'author', + parsed.get('author') or podcast.author) - # The podcast is always saved (not just when there are changes) because - # we need to record the last update - logger.info('Saving podcast.') - podcast.last_update = datetime.utcnow() - podcast.save() + podcast.language = to_maxlength( + Podcast, 'language', + parsed.get('language') or podcast.language) - try: - subscribe_at_hub(podcast) - except SubscriptionError as se: - logger.warn('subscribing to hub failed: %s', str(se)) + podcast.content_types = (','.join(parsed.get('content_types')) or + podcast.content_types) - assign_slug(podcast) - assign_missing_episode_slugs(podcast) - update_related_podcasts.delay(podcast.pk) + # podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', []) + podcast.common_episode_title = to_maxlength( + Podcast, + 'common_episode_title', + parsed.get('common_title') or podcast.common_episode_title) -def assign_slug(podcast): - if podcast.slug: - return + podcast.new_location = (parsed.get('new_location') or + podcast.new_location) + podcast.flattr_url = to_maxlength(Podcast, 'flattr_url', + parsed.get('flattr') or + podcast.flattr_url) + podcast.hub = parsed.get('hub') or podcast.hub + podcast.license = parsed.get('license') or podcast.license + podcast.max_episode_order = episode_updater.max_episode_order - for slug in PodcastSlugs(podcast): - try: - with transaction.atomic(): - podcast.add_slug(slug) - break + podcast.add_missing_urls(parsed.get('urls', [])) - except: - continue + if podcast.new_location: + try: + new_podcast = Podcast.objects.get( + urls__url=podcast.new_location + ) + + if new_podcast != podcast: + _mark_outdated(podcast, 'redirected to different podcast') + return + except Podcast.DoesNotExist: + podcast.set_url(podcast.new_location) + # latest episode timestamp + episodes = Episode.objects.filter(podcast=podcast, + released__isnull=False)\ + .order_by('released') -def assign_missing_episode_slugs(podcast): - common_title = podcast.get_common_episode_title() + podcast.update_interval = episode_updater.get_update_interval(episodes) - episodes = Episode.objects.filter(podcast=podcast, slugs__isnull=True) + latest_episode = episodes.last() + if latest_episode: + podcast.latest_episode_timestamp = latest_episode.released - for episode in episodes: + # podcast.episode_count is not update here on purpose. It is, instead, + # continuously updated when creating new episodes in + # EpisodeManager.get_or_create_for_url - for slug in EpisodeSlugs(episode, common_title): + self._update_categories(podcast, prev_latest_episode_timestamp) + + # try to download the logo and reset logo_url to None on http errors + found = self._save_podcast_logo(podcast.logo_url) + if not found: + podcast.logo_url = None + + # check if search index should be considered out of date + new_index_fields = get_index_fields(podcast) + if list(old_index_fields.items()) != list(new_index_fields.items()): + podcast.search_index_uptodate = False + + # The podcast is always saved (not just when there are changes) because + # we need to record the last update + logger.info('Saving podcast.') + podcast.last_update = datetime.utcnow() + podcast.save() + + try: + subscribe_at_hub(podcast) + except SubscriptionError as se: + logger.warn('subscribing to hub failed: %s', str(se)) + + self.assign_slug(podcast) + episode_updater.assign_missing_episode_slugs() + update_related_podcasts.delay(podcast.pk) + + def assign_slug(self, podcast): + if podcast.slug: + return + + for slug in PodcastSlugs(podcast): try: with transaction.atomic(): - episode.set_slug(slug) + podcast.add_slug(slug) break except: continue + def _update_categories(self, podcast, prev_timestamp): + """ checks some practical requirements and updates a category """ -def _update_categories(podcast, prev_timestamp): - """ checks some practical requirements and updates a category """ + max_timestamp = datetime.utcnow() + timedelta(days=1) - max_timestamp = datetime.utcnow() + timedelta(days=1) + # no episodes at all + if not podcast.latest_episode_timestamp: + return - # no episodes at all - if not podcast.latest_episode_timestamp: - return + # no new episode + if prev_timestamp and \ + (podcast.latest_episode_timestamp <= prev_timestamp): + return - # no new episode - if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp: - return + # too far in the future + if podcast.latest_episode_timestamp > max_timestamp: + return - # too far in the future - if podcast.latest_episode_timestamp > max_timestamp: - return + # not enough subscribers + if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY: + return - # not enough subscribers - if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY: - return + update_category(podcast) - update_category(podcast) + def _save_podcast_logo(self, cover_art): + if not cover_art: + return + try: + image_sha1 = hashlib.sha1(cover_art.encode('utf-8')).hexdigest() + prefix = CoverArt.get_prefix(image_sha1) -def _update_episodes(podcast, parsed_episodes): + filename = CoverArt.get_original(prefix, image_sha1) + dirname = CoverArt.get_dir(filename) - pid = podcast.get_id() + # get hash of existing file + if os.path.exists(filename): + with open(filename, 'rb') as f: + old_hash = file_hash(f).digest() + else: + old_hash = '' - # list of (obj, fun) where fun is the function to update obj - updated_episodes = [] - episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE)) - logger.info('Parsed %d (%d) episodes', len(parsed_episodes), - len(episodes_to_update)) + logger.info('Logo %s', cover_art) - logger.info('Updating %d episodes', len(episodes_to_update)) - for n, parsed in enumerate(episodes_to_update, 1): + # save new cover art + with open(filename, 'wb') as fp: + fp.write(urllib.request.urlopen(cover_art).read()) - url = get_episode_url(parsed) - if not url: - logger.info('Skipping episode %d for missing URL', n) - continue + # get hash of new file + with open(filename, 'rb') as f: + new_hash = file_hash(f).digest() - logger.info('Updating episode %d / %d', n, len(parsed_episodes)) + # remove thumbnails if cover changed + if old_hash != new_hash: + thumbnails = CoverArt.get_existing_thumbnails(prefix, filename) + logger.info('Removing %d thumbnails', len(thumbnails)) + for f in thumbnails: + os.unlink(f) - episode = Episode.objects.get_or_create_for_url(podcast, url) + return cover_art - update_episode(parsed, episode, podcast) - updated_episodes.append(episode) + except (urllib.error.HTTPError, urllib.error.URLError, ValueError, + http.client.HTTPException, socket.error, IOError) as e: + logger.warn('Exception while updating podcast logo: %s', str(e)) - # and mark the remaining ones outdated - current_episodes = Episode.objects.filter(podcast=podcast, - outdated=False)[:500] - outdated_episodes = set(current_episodes) - set(updated_episodes) + def _mark_outdated(podcast, msg=''): + logger.info('marking podcast outdated: %s', msg) + podcast.outdated = True + podcast.last_update = datetime.utcnow() + podcast.save() + _update_episodes(podcast, []) - logger.info('Marking %d episodes as outdated', len(outdated_episodes)) - for episode in outdated_episodes: - mark_outdated(episode) +class MultiEpisodeUpdater(object): -@transaction.atomic -def _order_episodes(podcast): - """ Reorder the podcast's episode according to release timestamp + def __init__(self, podcast): + self.podcast = podcast + self.updated_episodes = [] + self.max_episode_order = None - Returns the highest order value (corresponding to the most recent - episode) """ + def update_episodes(self, parsed_episodes): - num_episodes = podcast.episode_count - if not num_episodes: - return 0 + pid = self.podcast.get_id() - episodes = podcast.episode_set.all().extra(select={ - 'has_released': 'released IS NOT NULL', - })\ - .order_by('-has_released', '-released', 'pk')\ - .only('pk') + episodes_to_update = list(islice(parsed_episodes, 0, + MAX_EPISODES_UPDATE)) + logger.info('Parsed %d (%d) episodes', len(parsed_episodes), + len(episodes_to_update)) - for n, episode in enumerate(episodes.iterator(), 1): - # assign ``order`` from higher (most recent) to 0 (oldest) - # None means "unknown" - new_order = num_episodes - n + logger.info('Updating %d episodes', len(episodes_to_update)) + for n, parsed in enumerate(episodes_to_update, 1): - # optimize for new episodes that are newer than all existing - if episode.order == new_order: - continue + url = self.get_episode_url(parsed) + if not url: + logger.info('Skipping episode %d for missing URL', n) + continue - logger.info('Updating order from {} to {}'.format(episode.order, - new_order)) - episode.order = new_order - episode.save() + logger.info('Updating episode %d / %d', n, len(parsed_episodes)) - return num_episodes - 1 + episode = Episode.objects.get_or_create_for_url(self.podcast, url) + updater = EpisodeUpdater(episode, self.podcast) + updater.update_episode(parsed) -def _save_podcast_logo(cover_art): - if not cover_art: - return + self.updated_episodes.append(episode) - try: - image_sha1 = hashlib.sha1(cover_art.encode('utf-8')).hexdigest() - prefix = CoverArt.get_prefix(image_sha1) + # and mark the remaining ones outdated + current_episodes = Episode.objects.filter(podcast=self.podcast, + outdated=False)[:500] + outdated_episodes = set(current_episodes) - set(self.updated_episodes) - filename = CoverArt.get_original(prefix, image_sha1) - dirname = CoverArt.get_dir(filename) + logger.info('Marking %d episodes as outdated', len(outdated_episodes)) + for episode in outdated_episodes: + updater = EpisodeUpdater(episode, self.podcast) + updater.mark_outdated() - # get hash of existing file - if os.path.exists(filename): - with open(filename, 'rb') as f: - old_hash = file_hash(f).digest() - else: - old_hash = '' - - logger.info('Logo %s', cover_art) - - # save new cover art - with open(filename, 'wb') as fp: - fp.write(urllib.request.urlopen(cover_art).read()) - - # get hash of new file - with open(filename, 'rb') as f: - new_hash = file_hash(f).digest() - - # remove thumbnails if cover changed - if old_hash != new_hash: - thumbnails = CoverArt.get_existing_thumbnails(prefix, filename) - logger.info('Removing %d thumbnails', len(thumbnails)) - for f in thumbnails: - os.unlink(f) - - return cover_art - - except (urllib.error.HTTPError, urllib.error.URLError, ValueError, - http.client.HTTPException, socket.error, IOError) as e: - logger.warn('Exception while updating podcast logo: %s', str(e)) - - -def _mark_outdated(podcast, msg=''): - logger.info('marking podcast outdated: %s', msg) - podcast.outdated = True - podcast.last_update = datetime.utcnow() - podcast.save() - _update_episodes(podcast, []) - - -def get_episode_url(parsed_episode): - """ returns the URL of a parsed episode """ - for f in parsed_episode.get('files', []): - if f.get('urls', []): - return f['urls'][0] - return None - - -def update_episode(parsed_episode, episode, podcast): - """ updates "episode" with the data from "parsed_episode" """ - - # TODO: check if there have been any changes, to avoid unnecessary updates - episode.guid = to_maxlength(Episode, 'guid', parsed_episode.get('guid') or - episode.guid) - episode.description = parsed_episode.get('description') or \ - episode.description - episode.subtitle = parsed_episode.get('subtitle') or episode.subtitle - episode.content = parsed_episode.get('content') or \ - parsed_episode.get('description') or episode.content - episode.link = to_maxlength(Episode, 'link', - parsed_episode.get('link') or episode.link) - episode.released = datetime.utcfromtimestamp( - parsed_episode.get('released')) if parsed_episode.get('released') \ - else episode.released - episode.author = to_maxlength(Episode, 'author', - parsed_episode.get('author') or - episode.author) - episode.duration = parsed_episode.get('duration') or episode.duration - episode.filesize = parsed_episode['files'][0]['filesize'] - episode.language = parsed_episode.get('language') or \ - episode.language or podcast.language - episode.mimetypes = ','.join(list(set( - filter(None, [f['mimetype'] for f in parsed_episode.get('files', [])]) - ))) - episode.flattr_url = to_maxlength(Episode, 'flattr_url', - parsed_episode.get('flattr') or - episode.flattr_url) - episode.license = parsed_episode.get('license') or episode.license - - episode.title = to_maxlength(Episode, 'title', - parsed_episode.get('title') or - episode.title or - file_basename_no_extension(episode.url)) - - episode.last_update = datetime.utcnow() - episode.save() - - parsed_urls = list(chain.from_iterable( - f.get('urls', []) for f in parsed_episode.get('files', []))) - episode.add_missing_urls(parsed_urls) - - -def mark_outdated(obj): - """ marks obj outdated if its not already """ - if obj.outdated: + @transaction.atomic + def order_episodes(self): + """ Reorder the podcast's episode according to release timestamp + + Returns the highest order value (corresponding to the most recent + episode) """ + + num_episodes = self.podcast.episode_count + if not num_episodes: + return 0 + + episodes = self.podcast.episode_set.all().extra(select={ + 'has_released': 'released IS NOT NULL', + })\ + .order_by('-has_released', '-released', 'pk')\ + .only('pk') + + for n, episode in enumerate(episodes.iterator(), 1): + # assign ``order`` from higher (most recent) to 0 (oldest) + # None means "unknown" + new_order = num_episodes - n + + # optimize for new episodes that are newer than all existing + if episode.order == new_order: + continue + + logger.info('Updating order from {} to {}'.format(episode.order, + new_order)) + episode.order = new_order + episode.save() + + self.max_episode_order = num_episodes - 1 + + def get_episode_url(self, parsed_episode): + """ returns the URL of a parsed episode """ + for f in parsed_episode.get('files', []): + if f.get('urls', []): + return f['urls'][0] return None - obj.outdated = True - obj.last_update = datetime.utcnow() - obj.save() + def get_update_interval(self, episodes): + """ calculates the avg interval between new episodes """ + + count = episodes.count() + if not count: + logger.info('no episodes, using default interval of %dh', + DEFAULT_UPDATE_INTERVAL) + return DEFAULT_UPDATE_INTERVAL + + earliest = episodes.first() + now = datetime.utcnow() + + timespan_s = (now - earliest.released).total_seconds() + timespan_h = timespan_s / 60 / 60 + + interval = int(timespan_h / count) + logger.info('%d episodes in %d days => %dh interval', count, + timespan_h / 24, interval) + + # place interval between {MIN,MAX}_UPDATE_INTERVAL + interval = max(interval, MIN_UPDATE_INTERVAL) + interval = min(interval, MAX_UPDATE_INTERVAL) + + return interval + + def assign_missing_episode_slugs(self): + common_title = self.podcast.get_common_episode_title() + + episodes = Episode.objects.filter(podcast=self.podcast, + slugs__isnull=True) + + for episode in episodes: + + for slug in EpisodeSlugs(episode, common_title): + try: + with transaction.atomic(): + episode.set_slug(slug) + break + + except: + continue -def get_update_interval(episodes): - """ calculates the avg interval between new episodes """ +class EpisodeUpdater(object): + """ Updates an individual episode """ - count = episodes.count() - if not count: - logger.info('no episodes, using default interval of %dh', - DEFAULT_UPDATE_INTERVAL) - return DEFAULT_UPDATE_INTERVAL + def __init__(self, episode, podcast): + self.episode = episode + self.podcast = podcast - earliest = episodes.first() - now = datetime.utcnow() + def update_episode(self, parsed_episode): + """ updates "episode" with the data from "parsed_episode" """ - timespan_s = (now - earliest.released).total_seconds() - timespan_h = timespan_s / 60 / 60 + # TODO: check if there have been any changes, to + # avoid unnecessary updates + self.episode.guid = to_maxlength( + Episode, 'guid', + parsed_episode.get('guid') or self.episode.guid) - interval = int(timespan_h / count) - logger.info('%d episodes in %d days => %dh interval', count, - timespan_h / 24, interval) + self.episode.description = (parsed_episode.get('description') or + self.episode.description) - # place interval between {MIN,MAX}_UPDATE_INTERVAL - interval = max(interval, MIN_UPDATE_INTERVAL) - interval = min(interval, MAX_UPDATE_INTERVAL) + self.episode.subtitle = (parsed_episode.get('subtitle') or + self.episode.subtitle) - return interval + self.episode.content = (parsed_episode.get('content') or + parsed_episode.get('description') or + self.episode.content) + + self.episode.link = to_maxlength( + Episode, 'link', + parsed_episode.get('link') or self.episode.link) + + self.episode.released = (datetime.utcfromtimestamp( + parsed_episode.get('released')) if parsed_episode.get('released') + else self.episode.released) + + self.episode.author = to_maxlength( + Episode, 'author', + parsed_episode.get('author') or self.episode.author) + + self.episode.duration = (parsed_episode.get('duration') or + self.episode.duration) + + self.episode.filesize = parsed_episode['files'][0]['filesize'] + + self.episode.language = (parsed_episode.get('language') or + self.episode.language or + self.podcast.language) + + mimetypes = [f['mimetype'] for f in parsed_episode.get('files', [])] + self.episode.mimetypes = ','.join(list(set(filter(None, mimetypes)))) + + self.episode.flattr_url = to_maxlength( + Episode, 'flattr_url', + parsed_episode.get('flattr') or self.episode.flattr_url) + + self.episode.license = (parsed_episode.get('license') or + self.episode.license) + + self.episode.title = to_maxlength( + Episode, 'title', + parsed_episode.get('title') or self.episode.title or + file_basename_no_extension(self.episode.url)) + + self.episode.last_update = datetime.utcnow() + self.episode.save() + + parsed_urls = list(chain.from_iterable( + f.get('urls', []) for f in parsed_episode.get('files', []))) + self.episode.add_missing_urls(parsed_urls) + + def mark_outdated(self): + """ marks the episode outdated if its not already """ + if self.episode.outdated: + return None + + self.episode.outdated = True + self.episode.last_update = datetime.utcnow() + self.episode.save() def file_basename_no_extension(filename): @@ -517,3 +569,9 @@ def file_basename_no_extension(filename): base = os.path.basename(filename) name, extension = os.path.splitext(base) return name + + +def verify_podcast_url(self): + parsed = _fetch_feed(self.podcast_url) + self._validate_parsed(parsed) + return True diff --git a/mygpo/data/management/commands/feed-downloader.py b/mygpo/data/management/commands/feed-downloader.py index a9a05cb75..3283c7251 100644 --- a/mygpo/data/management/commands/feed-downloader.py +++ b/mygpo/data/management/commands/feed-downloader.py @@ -16,8 +16,7 @@ class Command(PodcastCommand): def add_arguments(self, parser): - parser.add_argument('urls', nargs='+', type=str) - + super().add_arguments(parser) parser.add_argument('--list-only', action='store_true', dest='list', default=False, help="Don't update anything, just list podcasts "), diff --git a/mygpo/directory/search.py b/mygpo/directory/search.py index 483619a36..48d01c15c 100644 --- a/mygpo/directory/search.py +++ b/mygpo/directory/search.py @@ -1,6 +1,6 @@ from mygpo.podcasts.models import Podcast from mygpo.utils import is_url, normalize_feed_url -from mygpo.data.feeddownloader import update_podcast, NoPodcastCreated +from mygpo.data.feeddownloader import PodcastUpdater, NoPodcastCreated from mygpo.search.index import search_podcasts as search @@ -14,9 +14,11 @@ def search_podcasts(q): except Podcast.DoesNotExist: podcast = None + updater = PodcastUpdater(url) + if not podcast or not podcast.title: try: - update_podcast(url) + updater.update_podcast() except NoPodcastCreated as npc: return [] diff --git a/mygpo/maintenance/management/podcastcmd.py b/mygpo/maintenance/management/podcastcmd.py index f59113204..8d68f134d 100644 --- a/mygpo/maintenance/management/podcastcmd.py +++ b/mygpo/maintenance/management/podcastcmd.py @@ -18,7 +18,7 @@ def add_arguments(self, parser): parser.add_argument('--update-new', action='store_true', dest='new', default=False, help="Update all podcasts with new Episodes"), - parser.add_argument('--max', action='store', dest='max', type='int', + parser.add_argument('--max', action='store', dest='max', type=int, default=0, help="Set how many feeds should be updated at maximum"), parser.add_argument('--random', action='store_true', dest='random', @@ -27,6 +27,7 @@ def add_arguments(self, parser): parser.add_argument('--next', action='store_true', dest='next', default=False, help="Podcasts that are due to be updated next"), + parser.add_argument('urls', nargs='+', type=str) def get_podcasts(self, *args, **options): return chain.from_iterable(self._get_podcasts(*args, **options)) @@ -54,17 +55,14 @@ def _get_podcasts(self, *args, **options): podcasts = Podcast.objects.all().order_by_next_update()[:max_podcasts] yield (p.url for p in podcasts) - - if args: - yield args - if options.get('urls'): yield options.get('urls') - if not args and not options.get('toplist') and not options.get('new') \ - and not options.get('random') and not options.get('next'): + if not options.get('urls') and not options.get('toplist') and \ + not options.get('new') and not options.get('random') and \ + not options.get('next'): query = Podcast.objects.order_by('last_update') - podcasts = query.select_related('urls')[:max_podcasts] + podcasts = query[:max_podcasts] yield (p.url for p in podcasts) diff --git a/mygpo/share/views.py b/mygpo/share/views.py index 64b86cbd6..faf37245a 100644 --- a/mygpo/share/views.py +++ b/mygpo/share/views.py @@ -11,7 +11,7 @@ from mygpo.podcasts.models import Podcast from mygpo.publisher.models import PublishedPodcast from mygpo.userfeeds.feeds import FavoriteFeed -from mygpo.data.feeddownloader import update_podcast +from mygpo.data.feeddownloader import PodcastUpdater import logging logger = logging.getLogger(__name__) @@ -100,7 +100,8 @@ def post(self, request): publisher=user, ) - update_podcast(feed_url) + updater = PodcastUpdater(feed_url) + updater.update_podcast() return HttpResponseRedirect(reverse('share-favorites')) From 4cbc611aa9479903ebbde28292242bd157e31ea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sun, 3 Dec 2017 18:42:52 +0100 Subject: [PATCH 06/23] Return created flag from get_or_create_for_url --- mygpo/administration/tests.py | 12 +++++------ mygpo/api/advanced/__init__.py | 4 ++-- mygpo/api/advanced/episode.py | 4 ++-- mygpo/api/advanced/lists.py | 4 ++-- mygpo/api/legacy.py | 4 ++-- mygpo/api/simple.py | 2 +- mygpo/api/subscriptions.py | 2 +- mygpo/api/tests.py | 4 ++-- mygpo/maintenance/tests.py | 28 +++++++++++++------------- mygpo/podcasts/models.py | 35 +++++++++++++++++++++------------ mygpo/podcasts/tests.py | 6 +++--- mygpo/podcasts/views/podcast.py | 2 +- mygpo/share/views.py | 2 +- mygpo/subscriptions/tests.py | 2 +- mygpo/users/tests.py | 7 +++++-- mygpo/usersettings/tests.py | 7 +++++-- mygpo/web/tests.py | 2 +- 17 files changed, 71 insertions(+), 56 deletions(-) diff --git a/mygpo/administration/tests.py b/mygpo/administration/tests.py index a8124dc74..3a181c868 100644 --- a/mygpo/administration/tests.py +++ b/mygpo/administration/tests.py @@ -24,22 +24,22 @@ class SimpleTest(TestCase): def test_merge(self): - p1 = Podcast.objects.get_or_create_for_url('http://example.com/podcast1.rss') - p2 = Podcast.objects.get_or_create_for_url('http://example.com/podcast2.rss') + p1 = Podcast.objects.get_or_create_for_url('http://example.com/podcast1.rss').object + p2 = Podcast.objects.get_or_create_for_url('http://example.com/podcast2.rss').object - e1 = Episode.objects.get_or_create_for_url(p1, 'http://example.com/podcast1/e1.mp3') + e1 = Episode.objects.get_or_create_for_url(p1, 'http://example.com/podcast1/e1.mp3').object e1.title = 'Episode 1' e1.save() - e2 = Episode.objects.get_or_create_for_url(p2, 'http://example.com/podcast1/e2.mp3') + e2 = Episode.objects.get_or_create_for_url(p2, 'http://example.com/podcast1/e2.mp3').object e2.title = 'Episode 2' e2.save() - e3 = Episode.objects.get_or_create_for_url(p2, 'http://example.com/podcast2/e2.mp3') + e3 = Episode.objects.get_or_create_for_url(p2, 'http://example.com/podcast2/e2.mp3').object e3.title = 'Episode 3' e3.save() - e4 = Episode.objects.get_or_create_for_url(p2, 'http://example.com/podcast2/e3.mp3') + e4 = Episode.objects.get_or_create_for_url(p2, 'http://example.com/podcast2/e3.mp3').object e4.title = 'Episode 4' e4.save() diff --git a/mygpo/api/advanced/__init__.py b/mygpo/api/advanced/__init__.py index 9f3cddc90..2687db147 100644 --- a/mygpo/api/advanced/__init__.py +++ b/mygpo/api/advanced/__init__.py @@ -200,8 +200,8 @@ def update_episodes(user, actions, now, ua_string): if not episode_url: continue - podcast = Podcast.objects.get_or_create_for_url(podcast_url) - episode = Episode.objects.get_or_create_for_url(podcast, episode_url) + podcast = Podcast.objects.get_or_create_for_url(podcast_url).object + episode = Episode.objects.get_or_create_for_url(podcast, episode_url).object # parse_episode_action returns a EpisodeHistoryEntry obj history = parse_episode_action(action, user, update_urls, now, diff --git a/mygpo/api/advanced/episode.py b/mygpo/api/advanced/episode.py index 2b12704f2..2741945f2 100644 --- a/mygpo/api/advanced/episode.py +++ b/mygpo/api/advanced/episode.py @@ -54,8 +54,8 @@ def get(self, request, username): def update_chapters(self, req, user): """ Add / remove chapters according to the client's request """ - podcast = Podcast.objects.get_or_create_for_url(podcast_url) - episode = Episode.objects.get_or_create_for_url(podcast, episode_url) + podcast = Podcast.objects.get_or_create_for_url(podcast_url).object + episode = Episode.objects.get_or_create_for_url(podcast, episode_url).object # add chapters for chapter_data in req.get('chapters_add', []): diff --git a/mygpo/api/advanced/lists.py b/mygpo/api/advanced/lists.py index cb586945c..85ac72cf2 100644 --- a/mygpo/api/advanced/lists.py +++ b/mygpo/api/advanced/lists.py @@ -61,7 +61,7 @@ def create(request, username, format): return HttpResponse('List already exists', status=409) urls = parse_subscription(request.body.decode('utf-8'), format) - podcasts = [Podcast.objects.get_or_create_for_url(url) for url in urls] + podcasts = [Podcast.objects.get_or_create_for_url(url).object for url in urls] for podcast in podcasts: plist.add_entry(podcast) @@ -148,7 +148,7 @@ def get_list(request, plist, owner, format): def update_list(request, plist, owner, format): """ Replaces the podcasts in the list and returns 204 No Content """ urls = parse_subscription(request.body.decode('utf-8'), format) - podcasts = [Podcast.objects.get_or_create_for_url(url) for url in urls] + podcasts = [Podcast.objects.get_or_create_for_url(url).object for url in urls] plist.set_entries(podcasts) return HttpResponse(status=204) diff --git a/mygpo/api/legacy.py b/mygpo/api/legacy.py index f7aee4126..d00da2416 100644 --- a/mygpo/api/legacy.py +++ b/mygpo/api/legacy.py @@ -55,11 +55,11 @@ def upload(request): rem = list(set(rem)) for n in new: - p = Podcast.objects.get_or_create_for_url(n) + p = Podcast.objects.get_or_create_for_url(n).object subscribe(p, user, dev) for r in rem: - p = Podcast.objects.get_or_create_for_url(r) + p = Podcast.objects.get_or_create_for_url(r).object unsubscribe(p, user, dev) return HttpResponse('@SUCCESS', content_type='text/plain') diff --git a/mygpo/api/simple.py b/mygpo/api/simple.py index 00970e35a..7105f6d07 100644 --- a/mygpo/api/simple.py +++ b/mygpo/api/simple.py @@ -200,7 +200,7 @@ def set_subscriptions(urls, user, device_uid, user_agent): unsubscribe(podcast, user, device) for url in new: - podcast = Podcast.objects.get_or_create_for_url(url) + podcast = Podcast.objects.get_or_create_for_url(url).object subscribe(podcast, user, device, url) # Only an empty response is a successful response diff --git a/mygpo/api/subscriptions.py b/mygpo/api/subscriptions.py index 5f38984f5..50ac7cf92 100644 --- a/mygpo/api/subscriptions.py +++ b/mygpo/api/subscriptions.py @@ -81,7 +81,7 @@ def update_subscriptions(self, user, device, add, remove): rem_s = filter(lambda x: x not in add_s, rem_s) for add_url in add_s: - podcast = Podcast.objects.get_or_create_for_url(add_url) + podcast = Podcast.objects.get_or_create_for_url(add_url).object subscribe(podcast, user, device, add_url) remove_podcasts = Podcast.objects.filter(urls__url__in=rem_s) diff --git a/mygpo/api/tests.py b/mygpo/api/tests.py index 86504a59d..653126825 100644 --- a/mygpo/api/tests.py +++ b/mygpo/api/tests.py @@ -168,14 +168,14 @@ def setUp(self): defaults = { 'title': 'My Podcast', }, - ) + ).object self.episode = Episode.objects.get_or_create_for_url( self.podcast, 'http://example.com/directory-podcast/1.mp3', defaults = { 'title': 'My Episode', }, - ) + ).object self.client = Client() def test_episode_info(self): diff --git a/mygpo/maintenance/tests.py b/mygpo/maintenance/tests.py index defa14d45..544d7d351 100644 --- a/mygpo/maintenance/tests.py +++ b/mygpo/maintenance/tests.py @@ -22,22 +22,22 @@ def setUp(self): self.podcast1 = Podcast.objects.get_or_create_for_url( 'http://example.com/simple-merge-test-feed.rss', defaults={'title': 'Podcast 1'}, - ) + ).object self.podcast2 = Podcast.objects.get_or_create_for_url( 'http://simple-merge-test.org/podcast/', defaults={'title': 'Podcast 2'}, - ) + ).object self.episode1 = Episode.objects.get_or_create_for_url( self.podcast1, 'http://example.com/simple-merge-test-episode1.mp3', defaults={ 'title': 'Episode 1 A', - }) + }).object self.episode2 = Episode.objects.get_or_create_for_url( self.podcast2, 'http://example.com/simple-merge-test-episode1.mp3', defaults={ 'title': 'Episode 1 B', - }) + }).object def test_merge_podcasts(self): # decide which episodes to merge @@ -55,22 +55,22 @@ def setUp(self): self.podcast1 = Podcast.objects.get_or_create_for_url( 'http://example.com/merge-test-feed.rss', defaults={'title': 'Podcast 1'}, - ) + ).object self.podcast2 = Podcast.objects.get_or_create_for_url( 'http://merge-test.org/podcast/', defaults={'title': 'Podcast 2'}, - ) + ).object self.episode1 = Episode.objects.get_or_create_for_url( self.podcast1, 'http://example.com/merge-test-episode1.mp3', defaults={ 'title': 'Episode 1 A', - }) + }).object self.episode2 = Episode.objects.get_or_create_for_url( self.podcast2, 'http://example.com/merge-test-episode1.mp3', defaults={ 'title': 'Episode 1 B', - }) + }).object User = get_user_model() self.user = User(username='test-merge') @@ -123,38 +123,38 @@ def setUp(self): defaults={ 'title': 'Podcast 1', }, - ) + ).object self.podcast2 = Podcast.objects.get_or_create_for_url( 'http://test.org/group-merge-podcast/', defaults={ 'title': 'Podcast 2', }, - ) + ).object self.podcast3 = Podcast.objects.get_or_create_for_url( 'http://group-test.org/feed/', defaults={ 'title': 'Podcast 3', }, - ) + ).object self.episode1 = Episode.objects.get_or_create_for_url( self.podcast1, 'http://example.com/group-merge-episode1.mp3', defaults={ 'title': 'Episode 1 A', }, - ) + ).object self.episode2 = Episode.objects.get_or_create_for_url( self.podcast2, 'http://example.com/group-merge-episode1.mp3', defaults={ 'title': 'Episode 1 B', }, - ) + ).object self.episode3 = Episode.objects.get_or_create_for_url( self.podcast3, 'http://example.com/group-merge-media.mp3', defaults={ 'title': 'Episode 2', }, - ) + ).object self.podcast2.group_with(self.podcast3, 'My Group', 'Feed1', 'Feed2') diff --git a/mygpo/podcasts/models.py b/mygpo/podcasts/models.py index d5e911516..54fe96c42 100644 --- a/mygpo/podcasts/models.py +++ b/mygpo/podcasts/models.py @@ -1,5 +1,5 @@ - +import collections import uuid import re from datetime import timedelta @@ -22,6 +22,9 @@ logger = logging.getLogger(__name__) +GetCreateResult = collections.namedtuple('GetCreateResult', 'object created') + + # default podcast update interval in hours DEFAULT_UPDATE_INTERVAL = 7 * 24 @@ -395,9 +398,11 @@ def get_or_create_for_url(self, url, defaults={}): url = utils.to_maxlength(URL, 'url', url) try: # try to fetch the podcast - return Podcast.objects.get(urls__url=url, - urls__scope='', - ) + podcast = Podcast.objects.get(urls__url=url, + urls__scope='', + ) + return GetCreateResult(podcast, False) + except Podcast.DoesNotExist: # episode did not exist, try to create it try: @@ -408,13 +413,14 @@ def get_or_create_for_url(self, url, defaults={}): scope='', content_object=podcast, ) - return podcast + return GetCreateResult(podcast, True) # URL could not be created, so it was created since the first get except IntegrityError: - return Podcast.objects.get(urls__url=url, - urls__scope='', - ) + podcast = Podcast.objects.get(urls__url=url, + urls__scope='', + ) + return GetCreateResult(podcast, False) class URL(OrderedModel, ScopedModel): @@ -723,6 +729,7 @@ def get_or_create_for_url(self, podcast, url, defaults={}): try: url = URL.objects.get(url=url, scope=podcast.as_scope) + created = False episode = url.content_object if episode is None: @@ -734,8 +741,9 @@ def get_or_create_for_url(self, podcast, url, defaults={}): url.content_object = episode url.save() + created = True - return episode + return GetCreateResult(episode, created) except URL.DoesNotExist: @@ -758,13 +766,14 @@ def get_or_create_for_url(self, podcast, url, defaults={}): Podcast.objects.filter(pk=podcast.pk)\ .update(episode_count=F('episode_count')+1) - return episode + return GetCreateResult(episode, True) # URL could not be created, so it was created since the first get except IntegrityError: - return Episode.objects.get(urls__url=url, - urls__scope=podcast.as_scope, - ) + episode = Episode.objects.get(urls__url=url, + urls__scope=podcast.as_scope, + ) + return GetCreateResult(episode, False) class Episode(UUIDModel, TitleModel, DescriptionModel, LinkModel, diff --git a/mygpo/podcasts/tests.py b/mygpo/podcasts/tests.py index f0b58307c..9dd3be800 100644 --- a/mygpo/podcasts/tests.py +++ b/mygpo/podcasts/tests.py @@ -33,8 +33,8 @@ def test_next_update(self): def test_get_or_create_for_url(self): """ Test that get_or_create_for_url returns existing Podcast """ URL = 'http://example.com/get_or_create.rss' - p1 = Podcast.objects.get_or_create_for_url(URL) - p2 = Podcast.objects.get_or_create_for_url(URL) + p1 = Podcast.objects.get_or_create_for_url(URL).object + p2 = Podcast.objects.get_or_create_for_url(URL).object self.assertEqual(p1.pk, p2.pk) def test_episode_count(self): @@ -43,7 +43,7 @@ def test_episode_count(self): EPISODE_URL = 'http://example.com/episode%d.mp3' NUM_EPISODES=3 - p = Podcast.objects.get_or_create_for_url(PODCAST_URL) + p = Podcast.objects.get_or_create_for_url(PODCAST_URL).object for n in range(NUM_EPISODES): Episode.objects.get_or_create_for_url(p, EPISODE_URL % (n, )) diff --git a/mygpo/podcasts/views/podcast.py b/mygpo/podcasts/views/podcast.py index ace98826e..14a74854e 100644 --- a/mygpo/podcasts/views/podcast.py +++ b/mygpo/podcasts/views/podcast.py @@ -326,7 +326,7 @@ def subscribe_url(request): if not url: raise Http404('Please specify a valid url') - podcast = Podcast.objects.get_or_create_for_url(url) + podcast = Podcast.objects.get_or_create_for_url(url).object return HttpResponseRedirect(get_podcast_link_target(podcast, 'subscribe')) diff --git a/mygpo/share/views.py b/mygpo/share/views.py index faf37245a..b2a4f8be9 100644 --- a/mygpo/share/views.py +++ b/mygpo/share/views.py @@ -93,7 +93,7 @@ def post(self, request): site = RequestSite(request) feed_url = feed.get_public_url(site.domain) - podcast = Podcast.objects.get_or_create_for_url(feed_url) + podcast = Podcast.objects.get_or_create_for_url(feed_url).object PublishedPodcast.objects.get_or_create( podcast=podcast, diff --git a/mygpo/subscriptions/tests.py b/mygpo/subscriptions/tests.py index 528cf1761..72d37d548 100644 --- a/mygpo/subscriptions/tests.py +++ b/mygpo/subscriptions/tests.py @@ -25,7 +25,7 @@ def setUp(self): user=self.user, uid='dev1', id=uuid.uuid1()) self.url = 'http://www.example.com/pdocast.rss' - self.podcast = Podcast.objects.get_or_create_for_url(self.url) + self.podcast = Podcast.objects.get_or_create_for_url(self.url).object def test_duplicate_subscribe(self): """ Test that a duplicate subscription is skipped """ diff --git a/mygpo/users/tests.py b/mygpo/users/tests.py index 5c6800020..6d5f137c4 100644 --- a/mygpo/users/tests.py +++ b/mygpo/users/tests.py @@ -66,8 +66,11 @@ class UnsubscribeMergeTests(TestCase): P2_URL = 'http://test.org/podcast/' def setUp(self): - self.podcast1 = Podcast.objects.get_or_create_for_url('http://example.com/feed.rss') - self.podcast2 = Podcast.objects.get_or_create_for_url(self.P2_URL) + self.podcast1 = Podcast.objects.get_or_create_for_url( + 'http://example.com/feed.rss').object + + self.podcast2 = Podcast.objects.get_or_create_for_url( + self.P2_URL).object User = get_user_model() self.user = User(username='test-merge') diff --git a/mygpo/usersettings/tests.py b/mygpo/usersettings/tests.py index 0ed32e749..9de6321a0 100644 --- a/mygpo/usersettings/tests.py +++ b/mygpo/usersettings/tests.py @@ -22,11 +22,14 @@ def setUp(self): self.podcast_url = 'http://example.com/podcast.rss' self.episode_url = 'http://example.com/podcast/episode-1.mp3' self.uid = 'client-uid' - self.podcast = Podcast.objects.get_or_create_for_url(self.podcast_url) + self.podcast = Podcast.objects.get_or_create_for_url( + self.podcast_url).object + self.episode = Episode.objects.get_or_create_for_url( self.podcast, self.episode_url, - ) + ).object + self.user_client = Client.objects.create( id = uuid.uuid1(), user = self.user, diff --git a/mygpo/web/tests.py b/mygpo/web/tests.py index 9dd7c2d4a..cca2c226b 100644 --- a/mygpo/web/tests.py +++ b/mygpo/web/tests.py @@ -74,7 +74,7 @@ def setUp(self): episode = Episode.objects.get_or_create_for_url( podcast, 'http://www.example.com/episode%d.mp3' % (n, ), - ) + ).object # we only need (the last) one self.episode_slug = Slug.objects.create(content_object=episode, From b0227c0a975340c3cae9b93fbf9efc4a45b6fa68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sun, 3 Dec 2017 18:44:30 +0100 Subject: [PATCH 07/23] Update Procfile --- Procfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Procfile b/Procfile index cbbcbbb6e..4dd801905 100644 --- a/Procfile +++ b/Procfile @@ -1,2 +1,3 @@ -web: gunicorn mygpo.wsgi:application -c gunicorn.conf.py -beat: python manage.py celery beat -S django --pidfile /var/run/mygpo/celerybeat.pid +web: gunicorn mygpo.wsgi:application -c conf/gunicorn.conf.py +beat: celery -A mygpo beat --pidfile /tmp/celerybeat.pid -S django +celery: celery -A mygpo worker --concurrency=3 -l info -Ofair From 43238d0de9e4d6d4909b4d67c17449a9599e5dac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sun, 3 Dec 2017 18:45:55 +0100 Subject: [PATCH 08/23] Format short durations without "0 hours" --- mygpo/web/templatetags/time.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mygpo/web/templatetags/time.py b/mygpo/web/templatetags/time.py index 760f09ef9..3a3d72c34 100644 --- a/mygpo/web/templatetags/time.py +++ b/mygpo/web/templatetags/time.py @@ -29,10 +29,16 @@ def format_duration(sec): """ Converts seconds into a duration string >>> format_duration(1000) - '0h 16m 40s' + '16m 40s' + >>> format_duration(10009) + '2h 46m 49s' """ hours = int(sec / 60 / 60) minutes = int((sec / 60) % 60) seconds = int(sec % 60) - return _('{h}h {m}m {s}s').format(h=hours, m=minutes, s=seconds) + + if hours: + return _('{h}h {m}m {s}s').format(h=hours, m=minutes, s=seconds) + else: + return _('{m}m {s}s').format(m=minutes, s=seconds) From 0309dc17399b6a55e6c7c03838cdcb0af1c58ff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sun, 3 Dec 2017 18:46:36 +0100 Subject: [PATCH 09/23] Fix variable name in update_related_podcasts() --- mygpo/data/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mygpo/data/tasks.py b/mygpo/data/tasks.py index 09d241c69..0ee31043e 100644 --- a/mygpo/data/tasks.py +++ b/mygpo/data/tasks.py @@ -25,7 +25,7 @@ def update_podcasts(podcast_urls): def update_related_podcasts(podcast_pk, max_related=20): get_podcast = itemgetter(0) - podcast = Podcast.objects.get(pk=pk) + podcast = Podcast.objects.get(pk=podcast_pk) related = calc_similar_podcasts(podcast)[:max_related] related = map(get_podcast, related) From e8fe4fde046144236df679bb6519e8dadf06aa93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sun, 3 Dec 2017 18:47:09 +0100 Subject: [PATCH 10/23] Collect results when updating podcasts --- htdocs/media/screen.css | 19 +++++++ mygpo/data/admin.py | 18 +++++++ mygpo/data/feeddownloader.py | 47 +++++++++++------ .../migrations/0002_result_podcast_null.py | 21 ++++++++ .../0003_podcastupdateresult_podcast_url.py | 21 ++++++++ mygpo/data/models.py | 34 ++++++++++++- mygpo/podcasts/models.py | 4 ++ .../templates/publisher/podcast.html | 50 ++++++++++++++++--- mygpo/publisher/views.py | 7 +++ 9 files changed, 200 insertions(+), 21 deletions(-) create mode 100644 mygpo/data/admin.py create mode 100644 mygpo/data/migrations/0002_result_podcast_null.py create mode 100644 mygpo/data/migrations/0003_podcastupdateresult_podcast_url.py diff --git a/htdocs/media/screen.css b/htdocs/media/screen.css index 6a1950933..b2d396193 100644 --- a/htdocs/media/screen.css +++ b/htdocs/media/screen.css @@ -1,3 +1,9 @@ +:root { + --color-status-success: 90; + --color-status-error: 0; + --color-status-neutral: 247; +} + /* Landscape phones and down */ @media (min-width: 980px) @@ -628,3 +634,16 @@ div.podcasts div.podcast:hover div.actions button.btn .hosting { text-align: center; } + +.status-success { + background-color: hsla(var(--color-status-success), 100%, 75%, 1); +} + + +.status-error { + background-color: hsla(var(--color-status-error), 100%, 75%, 1); +} + +.status-neutral { + background-color: hsla(var(--color-status-neutral), 16%, 85%, 1); +} diff --git a/mygpo/data/admin.py b/mygpo/data/admin.py new file mode 100644 index 000000000..cc94ec441 --- /dev/null +++ b/mygpo/data/admin.py @@ -0,0 +1,18 @@ +from django.contrib import admin + +from . import models + + +@admin.register(models.PodcastUpdateResult) +class PodcastUpdateResultAdmin(admin.ModelAdmin): + model = models.PodcastUpdateResult + + list_display = ['title', 'start', 'duration', 'successful', + 'episodes_added'] + + readonly_fields = ['id', 'podcast_url', 'podcast', 'start', 'duration', + 'successful', 'error_message', 'podcast_created', + 'episodes_added'] + + def title(self, res): + return res.podcast or res.podcast_url diff --git a/mygpo/data/feeddownloader.py b/mygpo/data/feeddownloader.py index 42b5898a3..19a5a448d 100755 --- a/mygpo/data/feeddownloader.py +++ b/mygpo/data/feeddownloader.py @@ -28,6 +28,8 @@ from mygpo.directory.tags import update_category from mygpo.search import get_index_fields +from . import models + import logging logger = logging.getLogger(__name__) @@ -77,22 +79,31 @@ def __init__(self, podcast_url): def update_podcast(self): """ Update the podcast """ - parsed = self.parse_feed() - if not parsed: - return + with models.PodcastUpdateResult(podcast_url=self.podcast_url) as res: - podcast = Podcast.objects.get_or_create_for_url(self.podcast_url) + parsed = self.parse_feed() + if not parsed: + res.podcast_created = False + res.error_message = '"{}" could not be parsed'.format( + self.podcast_url) + return - episode_updater = MultiEpisodeUpdater(podcast) - episode_updater.update_episodes(parsed.get('episodes', [])) + podcast, created = Podcast.objects.get_or_create_for_url( + self.podcast_url) + res.podcast = podcast + res.podcast_created = created - podcast.refresh_from_db() - podcast.episode_count = Episode.objects.filter(podcast=podcast).count() - podcast.save() + res.episodes_added = 0 + episode_updater = MultiEpisodeUpdater(podcast, res) + episode_updater.update_episodes(parsed.get('episodes', [])) + + podcast.refresh_from_db() + podcast.episode_count = episode_updater.count_episodes() + podcast.save() - episode_updater.order_episodes() + episode_updater.order_episodes() - self._update_podcast(podcast, parsed, episode_updater) + self._update_podcast(podcast, parsed, episode_updater) return podcast @@ -123,7 +134,7 @@ def parse_feed(self): # if we fail to parse the URL, we don't even create the # podcast object try: - p = Podcast.objects.get(urls__url=podcast_url) + p = Podcast.objects.get(urls__url=self.podcast_url) # if it exists already, we mark it as outdated self._mark_outdated(p, 'error while fetching feed: {}'.format( str(nee))) @@ -354,8 +365,9 @@ def _mark_outdated(podcast, msg=''): class MultiEpisodeUpdater(object): - def __init__(self, podcast): + def __init__(self, podcast, update_result): self.podcast = podcast + self.update_result = update_result self.updated_episodes = [] self.max_episode_order = None @@ -378,7 +390,11 @@ def update_episodes(self, parsed_episodes): logger.info('Updating episode %d / %d', n, len(parsed_episodes)) - episode = Episode.objects.get_or_create_for_url(self.podcast, url) + episode, created = Episode.objects.get_or_create_for_url( + self.podcast, url) + + if created: + self.update_result.episodes_added += 1 updater = EpisodeUpdater(episode, self.podcast) updater.update_episode(parsed) @@ -435,6 +451,9 @@ def get_episode_url(self, parsed_episode): return f['urls'][0] return None + def count_episodes(self): + return Episode.objects.filter(podcast=self.podcast).count() + def get_update_interval(self, episodes): """ calculates the avg interval between new episodes """ diff --git a/mygpo/data/migrations/0002_result_podcast_null.py b/mygpo/data/migrations/0002_result_podcast_null.py new file mode 100644 index 000000000..4149dfcf4 --- /dev/null +++ b/mygpo/data/migrations/0002_result_podcast_null.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.5 on 2017-12-03 17:22 +from __future__ import unicode_literals + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('data', '0001_initial'), + ] + + operations = [ + migrations.AlterField( + model_name='podcastupdateresult', + name='podcast', + field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, to='podcasts.Podcast'), + ), + ] diff --git a/mygpo/data/migrations/0003_podcastupdateresult_podcast_url.py b/mygpo/data/migrations/0003_podcastupdateresult_podcast_url.py new file mode 100644 index 000000000..89b0d97c5 --- /dev/null +++ b/mygpo/data/migrations/0003_podcastupdateresult_podcast_url.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.5 on 2017-12-03 17:28 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('data', '0002_result_podcast_null'), + ] + + operations = [ + migrations.AddField( + model_name='podcastupdateresult', + name='podcast_url', + field=models.URLField(default='unknown', max_length=2048), + preserve_default=False, + ), + ] diff --git a/mygpo/data/models.py b/mygpo/data/models.py index 7958f2b5c..02b0bc11e 100644 --- a/mygpo/data/models.py +++ b/mygpo/data/models.py @@ -1,3 +1,5 @@ +import uuid + from datetime import datetime from django.db import models @@ -11,8 +13,11 @@ class PodcastUpdateResult(UUIDModel): Once an instance is stored, the update is assumed to be finished. """ + # URL of the podcast to be updated + podcast_url = models.URLField(max_length=2048) + # The podcast that was updated - podcast = models.ForeignKey(Podcast, on_delete=models.CASCADE) + podcast = models.ForeignKey(Podcast, on_delete=models.CASCADE, null=True) # The timestamp at which the updated started to be executed start = models.DateTimeField(default=datetime.utcnow) @@ -42,3 +47,30 @@ class Meta(object): models.Index(fields=['podcast', 'start']) ] + def __str__(self): + return 'Update Result for "{}" @ {:%Y-%m-%d %H:%M}'.format( + self.podcast, self.start) + + # Use as context manager + + def __enter__(self): + self.id = uuid.uuid4() + self.start = datetime.utcnow() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.duration = datetime.utcnow() - self.start + + success = (exc_type, exc_value, traceback) == (None, None, None) + self.successful = success + + if not success: + self.error_message = str(exc_value) + + if self.podcast_created is None: + self.podcast_created = False + + if self.episodes_added is None: + self.episodes_added = 0 + + self.save() diff --git a/mygpo/podcasts/models.py b/mygpo/podcasts/models.py index 54fe96c42..6b5d1f967 100644 --- a/mygpo/podcasts/models.py +++ b/mygpo/podcasts/models.py @@ -696,6 +696,10 @@ def display_title(self): return _('Unknown Podcast from {domain}'.format( domain=utils.get_domain(self.url))) + @property + def next_update(self): + return self.last_update + timedelta(hours=self.update_interval) + class EpisodeQuerySet(MergedUUIDQuerySet): """ QuerySet for Episodes """ diff --git a/mygpo/publisher/templates/publisher/podcast.html b/mygpo/publisher/templates/publisher/podcast.html index 93c03a38a..314ab388e 100644 --- a/mygpo/publisher/templates/publisher/podcast.html +++ b/mygpo/publisher/templates/publisher/podcast.html @@ -4,6 +4,7 @@ {% load podcasts %} {% load charts %} {% load pcharts %} +{% load time %} {% load static %} {% load menu %} {% load utils %} @@ -56,12 +57,49 @@

Podcast Data

{% trans "The podcast information is regularly retrieved from the podcast feed" %}

{{ podcast.url }}
-

{% trans "Timing" %}

-
    -
  • {% trans "Last update:" %} {{ podcast.last_update|naturaltime }}
  • -
  • {% trans "Update interval:" %} {{ podcast.update_interval|hours_to_str }}
  • -
  • {% trans "Next update:" %} {{ podcast.next_update|naturaltime }}
  • -
+

{% trans "Updates" %}

+ + {% trans "Update interval:" %} {{ podcast.update_interval|hours_to_str }} + + + + + + + + + + + + + + + + + + {% for result in update_results %} + + + + + + + {% empty %} + + + + + + + {% endfor %} + +
StartDurationStatusEpisodes Added
{{ podcast.next_update|naturaltime }}Next
{{ result.start|naturaltime }}{{ result.duration.total_seconds|format_duration }} + {% if result.successful %} + {% trans "Successful" %} + {% else %} + {% trans "Error" %} {{ result.error_message }} + {% endif %} + {{ result.episodes_added }}
{{ podcast.last_update|naturaltime }}
{% csrf_token %} diff --git a/mygpo/publisher/views.py b/mygpo/publisher/views.py index 9e62e98b7..5038cdf8f 100644 --- a/mygpo/publisher/views.py +++ b/mygpo/publisher/views.py @@ -31,6 +31,7 @@ get_episode_link_target from django.contrib.sites.requests import RequestSite from mygpo.data.tasks import update_podcasts +from mygpo.data.models import PodcastUpdateResult from mygpo.decorators import requires_token, allowed_methods from mygpo.pubsub.models import HubSubscription @@ -92,6 +93,11 @@ def podcast(request, podcast): except HubSubscription.DoesNotExist: pubsubscription = None + MAX_UPDATE_RESULTS=10 + + update_results = PodcastUpdateResult.objects.filter(podcast=podcast) + update_results = update_results[:MAX_UPDATE_RESULTS] + site = RequestSite(request) feedurl_quoted = urllib.parse.quote(podcast.url.encode('ascii')) @@ -105,6 +111,7 @@ def podcast(request, podcast): 'update_token': update_token, 'feedurl_quoted': feedurl_quoted, 'pubsubscription': pubsubscription, + 'update_results': update_results, }) From 4460405a640e3fcb400bc096b69a2b646ea61d80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sun, 3 Dec 2017 18:54:01 +0100 Subject: [PATCH 11/23] Fix Podcast's order_by_next_update() --- mygpo/podcasts/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mygpo/podcasts/models.py b/mygpo/podcasts/models.py index 6b5d1f967..7e7ea0561 100644 --- a/mygpo/podcasts/models.py +++ b/mygpo/podcasts/models.py @@ -342,8 +342,8 @@ def license(self, license_url=None): def order_by_next_update(self): """ Sort podcasts by next scheduled update """ NEXTUPDATE = "last_update + (update_interval || ' hours')::INTERVAL" - q = self.extra(select={'next_update': NEXTUPDATE}) - return q.order_by('next_update') + q = self.extra(select={'_next_update': NEXTUPDATE}) + return q.order_by('_next_update') @property def next_update(self): From 35490229c1e35eb695cc269b510f823f5a808b80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sun, 3 Dec 2017 19:40:12 +0100 Subject: [PATCH 12/23] Fix error when marking podcasts as outdated --- mygpo/data/feeddownloader.py | 59 ++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/mygpo/data/feeddownloader.py b/mygpo/data/feeddownloader.py index 19a5a448d..a6aae1050 100755 --- a/mygpo/data/feeddownloader.py +++ b/mygpo/data/feeddownloader.py @@ -81,20 +81,29 @@ def update_podcast(self): with models.PodcastUpdateResult(podcast_url=self.podcast_url) as res: - parsed = self.parse_feed() - if not parsed: + parsed, podcast, created = self.parse_feed() + + if not podcast: res.podcast_created = False res.error_message = '"{}" could not be parsed'.format( self.podcast_url) + return - podcast, created = Podcast.objects.get_or_create_for_url( - self.podcast_url) res.podcast = podcast res.podcast_created = created res.episodes_added = 0 episode_updater = MultiEpisodeUpdater(podcast, res) + + if not parsed: + # if it exists already, we mark it as outdated + self._mark_outdated( + podcast, + 'error while fetching feed', + episode_updater) + return + episode_updater.update_episodes(parsed.get('episodes', [])) podcast.refresh_from_db() @@ -112,38 +121,24 @@ def parse_feed(self): parsed = self._fetch_feed() self._validate_parsed(parsed) - except requests.exceptions.RequestException as re: - logging.exception('Error while fetching response from feedservice') + except (requests.exceptions.RequestException, + NoEpisodesException) as ex: + logging.exception('Error while fetching/parsing feed') # if we fail to parse the URL, we don't even create the # podcast object try: p = Podcast.objects.get(urls__url=podcast_url) - # if it exists already, we mark it as outdated - _mark_outdated(p, 'error while fetching feed: %s' % str(re)) - p.last_update = datetime.utcnow() - p.save() - return None + return (None, p, False) except Podcast.DoesNotExist as pdne: - raise NoPodcastCreated(re) from pdne - - except NoEpisodesException as nee: - logging.warn('No episode found while parsing podcast') - - # if we fail to parse the URL, we don't even create the - # podcast object - try: - p = Podcast.objects.get(urls__url=self.podcast_url) - # if it exists already, we mark it as outdated - self._mark_outdated(p, 'error while fetching feed: {}'.format( - str(nee))) - return None + raise NoPodcastCreated(ex) from pdne - except Podcast.DoesNotExist as pdne: - raise NoPodcastCreated(nee) from pdne + # Parsing went well, get podcast + podcast, created = Podcast.objects.get_or_create_for_url( + self.podcast_url) - return parsed + return (parsed, podcast, created) def _fetch_feed(self): params = { @@ -230,7 +225,11 @@ def _update_podcast(self, podcast, parsed, episode_updater): ) if new_podcast != podcast: - _mark_outdated(podcast, 'redirected to different podcast') + self._mark_outdated( + podcast, + 'redirected to different podcast', + episode_updater, + ) return except Podcast.DoesNotExist: podcast.set_url(podcast.new_location) @@ -355,12 +354,12 @@ def _save_podcast_logo(self, cover_art): http.client.HTTPException, socket.error, IOError) as e: logger.warn('Exception while updating podcast logo: %s', str(e)) - def _mark_outdated(podcast, msg=''): + def _mark_outdated(self, podcast, msg, episode_updater): logger.info('marking podcast outdated: %s', msg) podcast.outdated = True podcast.last_update = datetime.utcnow() podcast.save() - _update_episodes(podcast, []) + episode_updater.update_episodes([]) class MultiEpisodeUpdater(object): From 601bc439d976e3aa7f3e04d7af1076f481cc00c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sun, 3 Dec 2017 19:43:17 +0100 Subject: [PATCH 13/23] Fix scheduling of podcast updates --- mygpo/data/tasks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mygpo/data/tasks.py b/mygpo/data/tasks.py index 0ee31043e..5235fd28c 100644 --- a/mygpo/data/tasks.py +++ b/mygpo/data/tasks.py @@ -78,5 +78,6 @@ def _schedule_updates(podcasts): for podcast in podcasts: # update_podcasts.delay() seems to block other task execution, # therefore celery.send_task() is used instead + urls = [podcast.url] celery.send_task('mygpo.data.tasks.update_podcasts', - args=[podcast.url]) + args=[urls]) From 294ac46ed502779136395b25141b21e27cb7f2e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sun, 3 Dec 2017 20:58:26 +0100 Subject: [PATCH 14/23] Fix variable in podcast updater --- mygpo/data/feeddownloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mygpo/data/feeddownloader.py b/mygpo/data/feeddownloader.py index a6aae1050..9e7b3178c 100755 --- a/mygpo/data/feeddownloader.py +++ b/mygpo/data/feeddownloader.py @@ -128,7 +128,7 @@ def parse_feed(self): # if we fail to parse the URL, we don't even create the # podcast object try: - p = Podcast.objects.get(urls__url=podcast_url) + p = Podcast.objects.get(urls__url=self.podcast_url) return (None, p, False) except Podcast.DoesNotExist as pdne: From cd31eaa62c689c979c06900cab77749ef245d32b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sun, 3 Dec 2017 20:59:06 +0100 Subject: [PATCH 15/23] Modify Podcast.update_interval with factor When no new episodes are found, the factor increases the interval between podcast updates. --- mygpo/data/feeddownloader.py | 11 ++++++++++ .../0040_podcast_update_interval_factor.py | 20 +++++++++++++++++++ mygpo/podcasts/models.py | 20 +++++++++++++++---- 3 files changed, 47 insertions(+), 4 deletions(-) create mode 100644 mygpo/podcasts/migrations/0040_podcast_update_interval_factor.py diff --git a/mygpo/data/feeddownloader.py b/mygpo/data/feeddownloader.py index 9e7b3178c..79dcbab0f 100755 --- a/mygpo/data/feeddownloader.py +++ b/mygpo/data/feeddownloader.py @@ -239,8 +239,19 @@ def _update_podcast(self, podcast, parsed, episode_updater): released__isnull=False)\ .order_by('released') + # Determine update interval + + # Update interval is based on intervals between episodes podcast.update_interval = episode_updater.get_update_interval(episodes) + # factor is increased / decreased depending on whether the latest + # update has returned episodes + if episode_updater.episodes_added == 0: # no episodes, incr factor + podcast.update_interval_factor *= 1.2 + elif episode_updater.episodes_added > 1: # new episodes, decr factor + newfactor = podcast.update_interval_factor / 1.2 + podcast.update_interval_factor = max(1, newfactor) # never below 1 + latest_episode = episodes.last() if latest_episode: podcast.latest_episode_timestamp = latest_episode.released diff --git a/mygpo/podcasts/migrations/0040_podcast_update_interval_factor.py b/mygpo/podcasts/migrations/0040_podcast_update_interval_factor.py new file mode 100644 index 000000000..57447b2dc --- /dev/null +++ b/mygpo/podcasts/migrations/0040_podcast_update_interval_factor.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.5 on 2017-12-03 19:53 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('podcasts', '0039_podcast_search_index_uptodate'), + ] + + operations = [ + migrations.AddField( + model_name='podcast', + name='update_interval_factor', + field=models.FloatField(default=1), + ), + ] diff --git a/mygpo/podcasts/models.py b/mygpo/podcasts/models.py index 7e7ea0561..f4596181f 100644 --- a/mygpo/podcasts/models.py +++ b/mygpo/podcasts/models.py @@ -341,16 +341,20 @@ def license(self, license_url=None): def order_by_next_update(self): """ Sort podcasts by next scheduled update """ - NEXTUPDATE = "last_update + (update_interval || ' hours')::INTERVAL" + NEXTUPDATE = ("last_update + (update_interval * " + "update_interval_factor || ' hours')::INTERVAL") q = self.extra(select={'_next_update': NEXTUPDATE}) return q.order_by('_next_update') @property def next_update(self): - return self.last_update + timedelta(hours=self.update_interval) + interval = (timedelta(hours=self.update_interval) * + self.update_interval_factor) + return self.last_update + interval def next_update_between(self, start, end): - NEXTUPDATE_BETWEEN = ("(last_update + (update_interval || " + NEXTUPDATE_BETWEEN = ("(last_update + (update_interval * " + " update_interval_factor || " "' hours')::INTERVAL) BETWEEN %s AND %s") return self.extra( where=[NEXTUPDATE_BETWEEN], params=[start, end] @@ -570,9 +574,15 @@ class Podcast(UUIDModel, TitleModel, DescriptionModel, LinkModel, latest_episode_timestamp = models.DateTimeField(null=True) episode_count = models.PositiveIntegerField(default=0) hub = models.URLField(null=True) + + # Interval between episodes, within a specified range update_interval = models.PositiveSmallIntegerField(null=False, default=DEFAULT_UPDATE_INTERVAL) + # factor to increase update_interval if an update does not find any + # new episodes + update_interval_factor = models.FloatField(default=1) + # "order" value of the most recent episode (will be the highest of all) max_episode_order = models.PositiveIntegerField(null=True, default=None) @@ -698,7 +708,9 @@ def display_title(self): @property def next_update(self): - return self.last_update + timedelta(hours=self.update_interval) + interval = (timedelta(hours=self.update_interval) * + self.update_interval_factor) + return self.last_update + interval class EpisodeQuerySet(MergedUUIDQuerySet): From 32b14c11d6ca14df777d6376d68d00001b1a2cdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sun, 3 Dec 2017 22:15:41 +0100 Subject: [PATCH 16/23] Show waiting celery tasks in admin interface --- mygpo/administration/views.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/mygpo/administration/views.py b/mygpo/administration/views.py index 0c9b5c689..d351421c0 100644 --- a/mygpo/administration/views.py +++ b/mygpo/administration/views.py @@ -4,6 +4,8 @@ from collections import Counter from datetime import datetime +import redis + import django from django.shortcuts import render from django.contrib import messages @@ -57,13 +59,6 @@ def get(self, request): hostname = socket.gethostname() django_version = django.VERSION - i = celery.control.inspect() - scheduled = i.scheduled() - if not scheduled: - num_celery_tasks = None - else: - num_celery_tasks = sum(len(node) for node in scheduled.values()) - feed_queue_status = self._get_feed_queue_status() num_index_outdated = self._get_num_outdated_search_index() @@ -73,11 +68,22 @@ def get(self, request): 'base_dir': base_dir, 'hostname': hostname, 'django_version': django_version, - 'num_celery_tasks': num_celery_tasks, + 'num_celery_tasks': self._get_waiting_celery_tasks(), 'feed_queue_status': feed_queue_status, 'num_index_outdated': num_index_outdated, }) + def _get_waiting_celery_tasks(self): + con = celery.broker_connection() + + args = {'host': con.hostname} + if con.port: + args['port'] = con.port + + r = redis.StrictRedis(**args) + return r.llen('celery') + + def _get_feed_queue_status(self): now = datetime.utcnow() next_podcast = Podcast.objects.all().order_by_next_update().first() From 514556a491abef7e5b55ec92245ad263d365a6c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sun, 3 Dec 2017 22:23:34 +0100 Subject: [PATCH 17/23] Show avg podcast update duration in admin area --- mygpo/administration/templates/admin/hostinfo.html | 10 ++++++++++ mygpo/administration/views.py | 7 +++++++ 2 files changed, 17 insertions(+) diff --git a/mygpo/administration/templates/admin/hostinfo.html b/mygpo/administration/templates/admin/hostinfo.html index e09c45ea3..ad273f682 100644 --- a/mygpo/administration/templates/admin/hostinfo.html +++ b/mygpo/administration/templates/admin/hostinfo.html @@ -1,5 +1,6 @@ {% extends "base.html" %} {% load i18n %} +{% load time %} {% load podcasts %} {% load menu %} @@ -62,6 +63,15 @@

{% trans "Host Information" %}

{{ num_index_outdated }} + + + + {% trans "Average podcast update duration" %} + + + {{ avg_podcast_update_duration.total_seconds|format_duration}} + + {% trans "Scheduled Celery Tasks" %} {{ num_celery_tasks }} diff --git a/mygpo/administration/views.py b/mygpo/administration/views.py index d351421c0..16353feea 100644 --- a/mygpo/administration/views.py +++ b/mygpo/administration/views.py @@ -7,6 +7,7 @@ import redis import django +from django.db.models import Avg from django.shortcuts import render from django.contrib import messages from django.urls import reverse @@ -28,6 +29,7 @@ from mygpo.administration.clients import UserAgentStats, ClientStats from mygpo.administration.tasks import merge_podcasts from mygpo.utils import get_git_head +from mygpo.data.models import PodcastUpdateResult from mygpo.users.models import UserProxy from mygpo.publisher.models import PublishedPodcast from mygpo.api.httpresponse import JsonResponse @@ -61,6 +63,7 @@ def get(self, request): feed_queue_status = self._get_feed_queue_status() num_index_outdated = self._get_num_outdated_search_index() + avg_podcast_update_duration = self._get_avg_podcast_update_duration() return self.render_to_response({ 'git_commit': commit, @@ -69,6 +72,7 @@ def get(self, request): 'hostname': hostname, 'django_version': django_version, 'num_celery_tasks': self._get_waiting_celery_tasks(), + 'avg_podcast_update_duration': avg_podcast_update_duration, 'feed_queue_status': feed_queue_status, 'num_index_outdated': num_index_outdated, }) @@ -83,6 +87,9 @@ def _get_waiting_celery_tasks(self): r = redis.StrictRedis(**args) return r.llen('celery') + def _get_avg_podcast_update_duration(self): + queryset = PodcastUpdateResult.objects.filter(successful=True) + return queryset.aggregate(avg_duration=Avg('duration'))['avg_duration'] def _get_feed_queue_status(self): now = datetime.utcnow() From e08b59c91c0ebc1e295a2c0ac7d3868145575038 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Fri, 15 Dec 2017 20:06:00 +0100 Subject: [PATCH 18/23] Remove Opbeat Opbeat is being sunset on May 24, 2018 --- mygpo/settings.py | 10 ---------- requirements-setup.txt | 1 - 2 files changed, 11 deletions(-) diff --git a/mygpo/settings.py b/mygpo/settings.py index 336835ea5..3227c50a7 100644 --- a/mygpo/settings.py +++ b/mygpo/settings.py @@ -179,16 +179,6 @@ def get_intOrNone(name, default): pass -try: - import opbeat - - if not DEBUG: - INSTALLED_APPS += ['opbeat.contrib.django'] - -except ImportError: - pass - - ACCOUNT_ACTIVATION_DAYS = int(os.getenv('ACCOUNT_ACTIVATION_DAYS', 7)) AUTHENTICATION_BACKENDS = ( diff --git a/requirements-setup.txt b/requirements-setup.txt index 3831a5081..fa834b459 100644 --- a/requirements-setup.txt +++ b/requirements-setup.txt @@ -1,2 +1 @@ envdir -opbeat==3.5.2 From 3def98bc06df9f3ec50786fd4a58d07d5757fd1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Fri, 15 Dec 2017 20:30:40 +0100 Subject: [PATCH 19/23] Update psycopg2cffi to fix Travis install errors psycopg2cffi 2.7.7 change log: Support installation under Postgres 10 by jimbattin (#90) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6bfe2ef9d..feb5452dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ gunicorn==19.7.1 html2text==2016.9.19 markdown2==2.3.4 oauth2client==4.1.2 -psycopg2cffi==2.7.6 +psycopg2cffi==2.7.7 python-dateutil==2.6.1 redis==2.10.6 django-celery-results==1.0.1 From 5f623b238f1d432dbcb9de2639e3e4f0f20a1a23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sat, 30 Dec 2017 16:07:46 +0000 Subject: [PATCH 20/23] Fix podcast updater --- mygpo/data/feeddownloader.py | 23 ++++++++++++++--------- mygpo/data/tasks.py | 1 + 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/mygpo/data/feeddownloader.py b/mygpo/data/feeddownloader.py index 79dcbab0f..c05e0c41c 100755 --- a/mygpo/data/feeddownloader.py +++ b/mygpo/data/feeddownloader.py @@ -64,6 +64,9 @@ def update_podcasts(queue): except NoPodcastCreated as npc: logger.info('No podcast created: %s', npc) + except GeneratorExit: + pass + except: logger.exception('Error while updating podcast "%s"', podcast_url) @@ -112,7 +115,7 @@ def update_podcast(self): episode_updater.order_episodes() - self._update_podcast(podcast, parsed, episode_updater) + self._update_podcast(podcast, parsed, episode_updater, res) return podcast @@ -153,7 +156,7 @@ def _fetch_feed(self): if r.status_code != 200: logger.error('Feed-service status code for "{}" was {}'.format( - podcast_url, r.status_code)) + url, r.status_code)) return None try: @@ -174,7 +177,7 @@ def _validate_parsed(self, parsed): if not parsed or not parsed.get('episodes', []): raise NoEpisodesException('no episodes found') - def _update_podcast(self, podcast, parsed, episode_updater): + def _update_podcast(self, podcast, parsed, episode_updater, update_result): """ updates a podcast according to new parser results """ # we need that later to decide if we can "bump" a category @@ -246,9 +249,10 @@ def _update_podcast(self, podcast, parsed, episode_updater): # factor is increased / decreased depending on whether the latest # update has returned episodes - if episode_updater.episodes_added == 0: # no episodes, incr factor - podcast.update_interval_factor *= 1.2 - elif episode_updater.episodes_added > 1: # new episodes, decr factor + if update_result.episodes_added == 0: # no episodes, incr factor + newfactor = podcast.update_interval_factor * 1.2 + podcast.update_interval_factor = min(1000, newfactor) # never above 1000 + elif update_result.episodes_added > 1: # new episodes, decr factor newfactor = podcast.update_interval_factor / 1.2 podcast.update_interval_factor = max(1, newfactor) # never below 1 @@ -600,7 +604,8 @@ def file_basename_no_extension(filename): return name -def verify_podcast_url(self): - parsed = _fetch_feed(self.podcast_url) - self._validate_parsed(parsed) +def verify_podcast_url(url): + updater = PodcastUpdater(url) + parsed = updater._fetch_feed() + updater._validate_parsed(parsed) return True diff --git a/mygpo/data/tasks.py b/mygpo/data/tasks.py index 5235fd28c..a4e577b12 100644 --- a/mygpo/data/tasks.py +++ b/mygpo/data/tasks.py @@ -18,6 +18,7 @@ def update_podcasts(podcast_urls): """ Task to update a podcast """ from mygpo.data.feeddownloader import update_podcasts as update podcasts = update(podcast_urls) + podcasts = filter(None, podcasts) return [podcast.pk for podcast in podcasts] From 4ad55fb3c3b52a848bbd447959b50714e6ecedcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sat, 30 Dec 2017 18:08:46 +0100 Subject: [PATCH 21/23] Update installation instructions to (fixes #81) --- doc/dev/installation.rst | 61 ++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/doc/dev/installation.rst b/doc/dev/installation.rst index 0cbe2b31b..b1bb8d95a 100644 --- a/doc/dev/installation.rst +++ b/doc/dev/installation.rst @@ -43,20 +43,7 @@ Now install additional dependencies locally: pip install -r requirements-test.txt # for running tests -That's it for the setup. Now to initialize the DB: - -First run the commands from :ref:`db-setup`. Then - -.. code-block:: bash - - cd mygpo - python manage.py migrate - -..and here we go: - -.. code-block:: bash - - python manage.py runserver +That's it for the setup. Configuration @@ -75,6 +62,26 @@ For a development configuration you will probably want to use the following See :ref:`configuration` for further information. +Database Initialization +----------------------- + +Now to initialize the DB: + +First run the commands from :ref:`db-setup`. Then + +.. code-block:: bash + + cd mygpo + envdir envs/local python manage.py migrate + +..and here we go: + +.. code-block:: bash + + envdir envs/local python manage.py runserver + + + Accessing the dev server from other devices ------------------------------------------- @@ -84,7 +91,7 @@ runserver command of manage.py, like this: .. code-block:: bash - python manage.py runserver 0.0.0.0:8000 + envdir envs/local python manage.py runserver 0.0.0.0:8000 Beware, though, that this will expose the web service to your all networks that your machine is connected to. Apply common sense and ideally use only @@ -101,22 +108,22 @@ commands regularly on your development machine: .. code-block:: bash - python manage.py update-categories - python manage.py update-toplist - python manage.py update-episode-toplist + envdir envs/local python manage.py update-categories + envdir envs/local python manage.py update-toplist + envdir envs/local python manage.py update-episode-toplist - python manage.py feed-downloader - python manage.py feed-downloader [...] - python manage.py feed-downloader --max - python manage.py feed-downloader --random --max - python manage.py feed-downloader --toplist --max - python manage.py feed-downloader --update-new --max + envdir envs/local python manage.py feed-downloader + envdir envs/local python manage.py feed-downloader [...] + envdir envs/local python manage.py feed-downloader --max + envdir envs/local python manage.py feed-downloader --random --max + envdir envs/local python manage.py feed-downloader --toplist --max + envdir envs/local python manage.py feed-downloader --update-new --max or to only do a dry run (this won't do any web requests for feeds): .. code-block:: bash - python manage.py feed-downloader --list-only [other parameters] + envdir envs/local python manage.py feed-downloader --list-only [other parameters] Maintaining publisher relationships with user accounts @@ -127,7 +134,7 @@ To set a user as publisher for a given feed URL, use: .. code-block:: bash cd mygpo - python manage.py make-publisher [...] + envdir envs/local python manage.py make-publisher [...] Web-Server @@ -138,7 +145,7 @@ directory with .. code-block:: bash - python manage.py runserver + envdir envs/local python manage.py runserver If you want to run a production server, check out `Deploying Django `_. From 64b1ef4daa5363801da173ee36255379d1642b4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sat, 30 Dec 2017 21:28:48 +0100 Subject: [PATCH 22/23] Fix error handling in user check (fixes #81) --- mygpo/users/checks.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mygpo/users/checks.py b/mygpo/users/checks.py index 61d730723..2810d1994 100644 --- a/mygpo/users/checks.py +++ b/mygpo/users/checks.py @@ -1,6 +1,6 @@ from django.core.checks import register, Warning from django.db import connection -from django.db.utils import OperationalError +from django.db.utils import OperationalError, ProgrammingError SQL = """ @@ -37,4 +37,12 @@ def check_case_insensitive_users(app_configs=None, **kwargs): else: raise + except ProgrammingError as pe: + if 'relation "auth_user" does not exist' in str(pe): + # Ignore if the table does not yet exist, eg when initally + # running ``manage.py migrate`` + pass + else: + raise + return errors From 02185513947d8c807600bb01f1ea5ad0d7af5279 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20K=C3=B6gl?= Date: Sat, 30 Dec 2017 21:47:53 +0100 Subject: [PATCH 23/23] Avoid exceptions at slug assignment during podcast updates --- mygpo/core/slugs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mygpo/core/slugs.py b/mygpo/core/slugs.py index 230fb15ab..f4ec4826d 100644 --- a/mygpo/core/slugs.py +++ b/mygpo/core/slugs.py @@ -7,10 +7,6 @@ class SlugGenerator(object): """ Generates a unique slug for an object """ def __init__(self, obj): - if obj.slug: - raise ValueError('%(obj)s already has slug %(slug)s' % - dict(obj=obj, slug=obj.slug)) - self.obj = obj self.base_slug = self._get_base_slug(obj) @@ -26,6 +22,10 @@ def __iter__(self): The consumer can can consume until it get's an unused one """ + if obj.slug: + # The object already has a slug + raise StopIteration + if not self.base_slug: raise StopIteration