
Hacky fixes to archive likes
aspensmonster committed Dec 7, 2018
1 parent f906939 commit 9f2409c
Showing 1 changed file with 89 additions and 21 deletions.
110 changes: 89 additions & 21 deletions tumblr_backup.py
@@ -193,6 +193,33 @@ def set_period():
    tm[i] += 1
    options.p_stop = time.mktime(tm)

def apiparse_likes(base, count, before=0):
    # Likes are paged backwards with a 'before' timestamp rather than an offset.
    params = {'api_key': API_KEY, 'limit': count, 'reblog_info': 'true'}
    if before > 0:
        params['before'] = before
    url = base + '?' + urllib.urlencode(params)
    for _ in range(10):
        try:
            resp = urlopen(url)
            data = resp.read()
        except (EnvironmentError, HTTPException) as e:
            sys.stderr.write("%s getting %s\n" % (e, url))
            continue
        if resp.info().gettype() == 'application/json':
            break
        sys.stderr.write("Unexpected Content-Type: '%s'\n" % resp.info().gettype())
        return None
    else:
        return None
    try:
        doc = json.loads(data)
    except ValueError as e:
        sys.stderr.write('%s: %s\n%d %s %s\n%r\n' % (
            e.__class__.__name__, e, resp.getcode(), resp.msg, resp.info().gettype(), data
        ))
        return None
    return doc if doc.get('meta', {}).get('status', 0) == 200 else None
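
# Note: the JSON returned above (and consumed by the likes loop below) is assumed
# to look roughly like this; only the fields this code actually reads are shown:
#
#   {"meta": {"status": 200},
#    "response": {"liked_posts": [...],
#                 "_links": {"next": {"query_params": {"before": 1485673434}}}}}
#
# The 'before' value from _links.next is fed back into apiparse_likes() to fetch
# the next (older) page of likes; when 'next' is absent and 'liked_posts' is empty,
# the archive is complete.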


def apiparse(base, count, start=0):
    params = {'api_key': API_KEY, 'limit': count, 'reblog_info': 'true'}
@@ -581,6 +608,28 @@ def _backup(posts):
        # start the thread pool
        backup_pool = ThreadPool()
        try:
            # Get the JSON entries from the API, which we can only do for max 50 posts at once.
            # Posts "arrive" in reverse chronological order. Post #0 is the most recent one.
            last_batch = MAX_POSTS
            i = options.skip
            while i < last_post:
                # find the upper bound
                j = min(i + MAX_POSTS, last_post)
                log(account, "Getting posts %d to %d of %d\r" % (i, j - 1, last_post))

                soup = apiparse(base, j - i, i)
                if soup is None:
                    i += last_batch  # try the next batch
                    self.errors = True
                    continue

                posts = _get_content(soup)
                # posts can be empty if we don't backup reblogged posts
                if not posts or not _backup(posts):
                    break

                last_batch = len(posts)
                i += last_batch
            if not options.likes:
                # Get the JSON entries from the API, which we can only do for max 50 posts at once.
                # Posts "arrive" in reverse chronological order. Post #0 is the most recent one.
                last_batch = MAX_POSTS
                i = options.skip
                while i < last_post:
                    # find the upper bound
                    j = min(i + MAX_POSTS, last_post)
                    log(account, "Getting posts %d to %d of %d\r" % (i, j - 1, last_post))

                    soup = apiparse(base, j - i, i)
                    if soup is None:
                        i += last_batch  # try the next batch
                        self.errors = True
                        continue

                    posts = _get_content(soup)
                    # posts can be empty if we don't backup reblogged posts
                    if not posts or not _backup(posts):
                        break

                    last_batch = len(posts)
                    i += last_batch
            else:
                # Get the JSON entries from the API, which we can only do for max 20 likes at once.
                # Likes "arrive" in reverse chronological order. Post #0 is the most recent one.
                i = options.skip
                finished_with_likes = False
                before_timestamp = 0
                #before_timestamp = 1485673434
                #before_timestamp = 1488326400
                #before_timestamp = 1326153600
                while not finished_with_likes:
                    # find the upper bound
                    j = min(i + MAX_LIKES, last_post)
                    log(account, "Getting likes %d to %d of %d\r" % (i, j - 1, last_post))

                    soup = apiparse_likes(base, MAX_LIKES, before_timestamp)
                    if soup is None:
                        i += MAX_LIKES  # try the next batch
                        self.errors = True
                        break
                    else:
                        try:
                            before_timestamp = soup['response']['_links']['next']['query_params']['before']
                        except KeyError:
                            if soup['meta']['status'] == 200 and not soup['response']['liked_posts']:
                                finished_with_likes = True
                                continue
                            else:
                                raise

                    posts = _get_content(soup)
                    # posts can be empty if we don't backup reblogged posts
                    if not posts or not _backup(posts):
                        finished_with_likes = True

                    # Don't want to blow through hourly or daily quota.
                    time.sleep(10)

                    i += MAX_LIKES


        except:
            # ensure proper thread pool termination
            backup_pool.cancel()