From c60454e4c90b8f15714034b60e9fdbbe6ead4d35 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sun, 24 Dec 2023 00:06:29 +0000 Subject: [PATCH] my.twitter.android: refactor into a proper module for now only extracting bookmarks, will use it for some time and see how it goes --- my/config.py | 2 + my/core/common.py | 1 + my/twitter/android.py | 125 ++++++++++++++++++++++++++++++++---------- 3 files changed, 99 insertions(+), 29 deletions(-) diff --git a/my/config.py b/my/config.py index ac44f415..e9b0ec85 100644 --- a/my/config.py +++ b/my/config.py @@ -177,6 +177,8 @@ class twitter_archive: class twitter: class talon: export_path: Paths + class android: + export_path: Paths class twint: diff --git a/my/core/common.py b/my/core/common.py index f1441a94..c429c8ce 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -686,6 +686,7 @@ def unique_everseen( if key is None: # todo check key return type as well? but it's more likely to be hashable if os.environ.get('HPI_CHECK_UNIQUE_EVERSEEN') is not None: + # TODO return better error here, e.g. 
if there is no return type it crashes _check_all_hashable(fun) return more_itertools.unique_everseen(iterable=iterable, key=key) diff --git a/my/twitter/android.py b/my/twitter/android.py index dbb4946c..be411e39 100644 --- a/my/twitter/android.py +++ b/my/twitter/android.py @@ -1,13 +1,49 @@ """ -Data from offficial app for Android +Twitter data from official app for Android """ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path import re -from struct import unpack_from, calcsize +from struct import unpack_from +from typing import Iterator, Sequence +from my.core import datetime_aware, get_files, LazyLogger, Paths, Res +from my.core.common import unique_everseen from my.core.sqlite import sqlite_connect_immutable +import my.config + +from .common import permalink + +logger = LazyLogger(__name__) + + +@dataclass +class config(my.config.twitter.android): + # path[s]/glob to the exported sqlite databases + export_path: Paths + + +def inputs() -> Sequence[Path]: + return get_files(config.export_path) + -def _parse_content(data: bytes): +@dataclass(unsafe_hash=True) +class Tweet: + id_str: str + created_at: datetime_aware + screen_name: str + text: str + + @property + def permalink(self) -> str: + return permalink(screen_name=self.screen_name, id=self.id_str) + + +def _parse_content(data: bytes) -> str: pos = 0 def skip(count: int) -> None: @@ -107,29 +143,60 @@ def getstring(slen: int) -> str: text = text.replace(k, v) assert 'https://t.co/' not in text # make sure we detected all links - print(text) - - - -PATH_TO_DB = '/path/to/db' - - -with sqlite_connect_immutable(PATH_TO_DB) as db: - # TODO use statuses table instead? - # has r_ent_content?? - # TODO hmm r_ent_content contains expanded urls? - # but they are still ellipsized? e.g. 
you can check 1692905005479580039 - # TODO also I think content table has mappings from short urls to full, need to extract - for (tid, blob, blob2) in db.execute( - f'SELECT statuses_status_id, CAST(statuses_content AS BLOB), CAST(statuses_r_ent_content AS BLOB) FROM timeline_view WHERE statuses_bookmarked = 1', - ): - if blob is None: # TODO exclude in sql query? - continue - print("----") - try: - print("PARSING", tid) - _parse_content(blob) - # _parse_content(blob2) - except UnicodeDecodeError as ue: - raise ue - # print("DECODING ERROR FOR ", tid, ue.object) + return text + + +def _process_one(f: Path) -> Iterator[Res[Tweet]]: + with sqlite_connect_immutable(f) as db: + # NOTE: + # - it also has statuses_r_ent_content which has entities' links replaced + # but they are still ellipsized (e.g. check 1692905005479580039) + # so let's just use statuses_content + # - there is also timeline_created_at, but they look like made up timestamps + # don't think they represent bookmarking time + # - not sure what's timeline_type? + # seems like 30 means bookmarks? + # there is one tweet with timeline type 18, but it has timeline_is_preview=1 + for ( + tweet_id, + user_name, + user_username, + created_ms, + blob, + ) in db.execute( + ''' + SELECT + statuses_status_id, + users_name, + users_username, + statuses_created, + CAST(statuses_content AS BLOB) + FROM timeline_view + WHERE statuses_bookmarked = 1 + ORDER BY timeline_sort_index DESC + ''', + ): + if blob is None: # TODO exclude in sql query? + continue + yield Tweet( + id_str=tweet_id, + # TODO double check it's utc? + created_at=datetime.fromtimestamp(created_ms / 1000, tz=timezone.utc), + screen_name=user_username, + text=_parse_content(blob), + ) + + +def bookmarks() -> Iterator[Res[Tweet]]: + # TODO might need to sort by timeline_sort_index again? + # not sure if each database contains full history of bookmarks (likely not!) 
+ def it() -> Iterator[Res[Tweet]]: + paths = inputs() + total = len(paths) + width = len(str(total)) + for idx, path in enumerate(paths): + logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}') + yield from _process_one(path) + + # TODO hmm maybe unique_everseen should be a decorator? + return unique_everseen(it)