From 79b3e6055fbeff48f781afe4a4762df1f607dd5e Mon Sep 17 00:00:00 2001
From: Cebtenzzre
Date: Fri, 25 Sep 2020 20:55:56 -0400
Subject: [PATCH] tumblr_backup: Support backup of over 1000 likes

Based on PR #114 by @aggroskater
---
 tumblr_backup.py | 50 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 37 insertions(+), 13 deletions(-)

diff --git a/tumblr_backup.py b/tumblr_backup.py
index cf93c54..7250939 100755
--- a/tumblr_backup.py
+++ b/tumblr_backup.py
@@ -19,7 +19,7 @@
 import threading
 import time
 from collections import defaultdict
-from datetime import datetime
+from datetime import datetime, timedelta
 from glob import glob
 from os.path import join, split, splitext
 from posixpath import basename as urlbasename, join as urlpathjoin, splitext as urlsplitext
@@ -327,7 +327,7 @@ def apiparse(base, prev_resps, count, start=0, before=None):
     params = {'api_key': API_KEY, 'limit': count, 'reblog_info': 'true'}
     if before:
         params['before'] = before
-    if start > 0:
+    if start > 0 and not options.likes:
         params['offset'] = start
     url = base + '?' + urlencode(params)
 
@@ -714,6 +714,16 @@ def footer(base, previous_page, next_page):
         f += '\n'
         return f
 
+    @staticmethod
+    def get_post_timestamps(posts):
+        for post in posts:
+            with io.open(post, encoding=FILE_ENCODING) as pf:
+                soup = BeautifulSoup(pf, 'lxml')
+            postdate = soup.find('time')['datetime']
+            del soup
+            # No datetime.fromisoformat or datetime.timestamp on Python 2
+            yield (datetime.strptime(postdate, '%Y-%m-%dT%H:%M:%SZ') - datetime(1970, 1, 1)) // timedelta(seconds=1)
+
     def backup(self, account, prev_archive):
         """makes single files and an index for every post on a public Tumblr
         blog account"""
@@ -744,16 +754,22 @@ def backup(self, account, prev_archive):
         # get the highest post id already saved
         ident_max = None
         if options.incremental:
-            try:
-                ident_max = max(
-                    long(splitext(split(f)[1])[0])
-                    for f in glob(path_to(post_dir, '*' + post_ext))
-                )
-                log.status('Backing up posts after {}\r'.format(ident_max))
-            except ValueError:  # max() arg is an empty sequence
-                pass
-        else:
-            log.status('Getting basic information\r')
+            filter_ = join('*', dir_index) if options.dirs else '*' + post_ext
+            post_glob = glob(path_to(post_dir, filter_))
+            if not post_glob:
+                pass  # No posts to read
+            elif options.likes:
+                # Read every post to find the newest timestamp we've saved.
+                if BeautifulSoup is None:
+                    raise RuntimeError("Incremental likes backup: module 'bs4' is not installed")
+                log('Finding newest liked post (may take a while)\n', account=True)
+                ident_max = max(self.get_post_timestamps(post_glob))
+            else:
+                ident_max = max(long(splitext(split(f)[1])[0]) for f in post_glob)
+            if ident_max is not None:
+                log('Backing up posts after {}\n'.format(ident_max), account=True)
+
+        log.status('Getting basic information\r')
 
         prev_resps, resp = initial_apiparse(base, prev_archive)
         if not resp:
@@ -762,6 +778,10 @@ def backup(self, account, prev_archive):
 
         # collect all the meta information
         if options.likes:
+            if not resp.get('blog', {}).get('share_likes', True):
+                print('{} does not have public likes\n'.format(account))
+                self.errors = True
+                return
             posts_key = 'liked_posts'
             blog = {}
             count_estimate = resp['liked_count']
@@ -785,7 +805,9 @@ def _backup(posts, post_respfiles):
                 key=lambda x: x[0]['id'], reverse=True)
             for p, prf in sorted_posts:
                 post = post_class(p, account, prf, prev_archive)
-                if ident_max and long(post.ident) <= ident_max:
+                if ident_max is None:
+                    pass  # No limit
+                elif (p['timestamp'] if options.likes else long(post.ident)) <= ident_max:
                     return False
                 if options.count and self.post_count >= options.count:
                     return False
@@ -845,6 +867,8 @@ def _backup(posts, post_respfiles):
                     log.status('Backing up posts found empty set of posts, finishing\r')
                     break
 
+                if options.likes:
+                    before = resp['_links']['next']['query_params']['before']
                 i += MAX_POSTS
         except:
             # ensure proper thread pool termination
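
Note on the pagination change: the offset parameter of the /likes endpoint is
unusable past 1000 likes (the limitation this patch works around), so
apiparse() now omits offset for likes and the fetch loop instead follows the
'before' timestamp that the API returns in _links.next.query_params. Below is
a minimal standalone sketch of that loop, for illustration only: it uses
Python 3 and plain urllib rather than tumblr_backup's own request path, and
BLOG and API_KEY are placeholders, not values taken from the patch.

    import json
    from urllib.parse import urlencode
    from urllib.request import urlopen

    API_KEY = '...'              # placeholder: an OAuth consumer key
    BLOG = 'example.tumblr.com'  # placeholder: a blog with public likes
    base = 'https://api.tumblr.com/v2/blog/{}/likes'.format(BLOG)

    before = None
    while True:
        params = {'api_key': API_KEY, 'limit': 50}
        if before is not None:
            params['before'] = before  # timestamp-based paging, not offset
        with urlopen(base + '?' + urlencode(params)) as f:
            resp = json.load(f)['response']
        posts = resp['liked_posts']
        if not posts:
            break  # empty page: no more likes
        for p in posts:
            print(p['id'], p['timestamp'])
        # As in the patch, the next page is named by a 'before' timestamp
        # read from resp['_links']['next']['query_params']['before'].
        links = resp.get('_links', {})
        if 'next' not in links:
            break
        before = links['next']['query_params']['before']

The incremental path follows from the same design: a timestamp, not a post
id, is the natural cursor for likes, so get_post_timestamps() recovers each
saved post's <time datetime="..."> value and the backup stops at the first
like whose p['timestamp'] is at or below the newest one already on disk.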