my.twitter.android: refactor into a proper module
for now only extracting bookmarks, will use it for some time and see how it goes
karlicoss committed Dec 24, 2023
1 parent 3919c10 commit c60454e
Showing 3 changed files with 99 additions and 29 deletions.
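Since the module's public entry point is bookmarks(), a minimal usage sketch could look like the following (illustrative only; it assumes the config stub below is filled in with a real export path, and relies on bookmarks() yielding Res[Tweet], i.e. errors arrive as values):

    from my.twitter.android import bookmarks

    for b in bookmarks():
        if isinstance(b, Exception):
            # Res[Tweet] means errors are yielded rather than raised
            continue
        print(b.created_at, b.permalink)
        print(b.text)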
2 changes: 2 additions & 0 deletions my/config.py
@@ -177,6 +177,8 @@ class twitter_archive:
class twitter:
class talon:
export_path: Paths
class android:
export_path: Paths


class twint:
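To actually use the new stub, the user-side config mirrors it; a hypothetical example (the path/glob is a placeholder, not something from this commit):

    class twitter:
        class android:
            # sqlite databases exported from the official Android app (e.g. pulled via a device backup)
            export_path = '/path/to/twitter-android/*.db'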
1 change: 1 addition & 0 deletions my/core/common.py
@@ -686,6 +686,7 @@ def unique_everseen(
if key is None:
# todo check key return type as well? but it's more likely to be hashable
if os.environ.get('HPI_CHECK_UNIQUE_EVERSEEN') is not None:
# TODO return better error here, e.g. if there is no return type it crashes
_check_all_hashable(fun)

return more_itertools.unique_everseen(iterable=iterable, key=key)
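For context on why this hashability check exists at all: more_itertools.unique_everseen deduplicates through a set when items are hashable and silently falls back to a much slower list scan otherwise. A rough, self-contained illustration (the classes are invented, not from this codebase):

    from dataclasses import dataclass
    import more_itertools

    @dataclass(unsafe_hash=True)  # hashable: deduplicated via a 'seen' set
    class A:
        x: int

    @dataclass  # default eq=True without a hash makes instances unhashable: falls back to a 'seen' list
    class B:
        x: int

    print(list(more_itertools.unique_everseen([A(1), A(1), A(2)])))  # [A(x=1), A(x=2)]
    print(list(more_itertools.unique_everseen([B(1), B(1), B(2)])))  # same result, just slower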
125 changes: 96 additions & 29 deletions my/twitter/android.py
@@ -1,13 +1,49 @@
"""
Data from offficial app for Android
Twitter data from official app for Android
"""
from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
import re
from struct import unpack_from, calcsize
from struct import unpack_from
from typing import Iterator, Sequence

from my.core import datetime_aware, get_files, LazyLogger, Paths, Res
from my.core.common import unique_everseen
from my.core.sqlite import sqlite_connect_immutable

import my.config

from .common import permalink

logger = LazyLogger(__name__)


@dataclass
class config(my.config.twitter.android):
# path[s]/glob to the exported sqlite databases
export_path: Paths


def inputs() -> Sequence[Path]:
return get_files(config.export_path)


def _parse_content(data: bytes):
@dataclass(unsafe_hash=True)
class Tweet:
id_str: str
created_at: datetime_aware
screen_name: str
text: str

@property
def permalink(self) -> str:
return permalink(screen_name=self.screen_name, id=self.id_str)


def _parse_content(data: bytes) -> str:
pos = 0

def skip(count: int) -> None:
@@ -107,29 +143,60 @@ def getstring(slen: int) -> str:
text = text.replace(k, v)
assert 'https://t.co/' not in text # make sure we detected all links

print(text)



PATH_TO_DB = '/path/to/db'


with sqlite_connect_immutable(PATH_TO_DB) as db:
# TODO use statuses table instead?
# has r_ent_content??
# TODO hmm r_ent_content contains expanded urls?
# but they are still ellipsized? e.g. you can check 1692905005479580039
# TODO also I think content table has mappings from short urls to full, need to extract
for (tid, blob, blob2) in db.execute(
f'SELECT statuses_status_id, CAST(statuses_content AS BLOB), CAST(statuses_r_ent_content AS BLOB) FROM timeline_view WHERE statuses_bookmarked = 1',
):
if blob is None: # TODO exclude in sql query?
continue
print("----")
try:
print("PARSING", tid)
_parse_content(blob)
# _parse_content(blob2)
except UnicodeDecodeError as ue:
raise ue
# print("DECODING ERROR FOR ", tid, ue.object)
return text


def _process_one(f: Path) -> Iterator[Res[Tweet]]:
with sqlite_connect_immutable(f) as db:
# NOTE:
# - it also has statuses_r_ent_content which has entities' links replaced
# but they are still ellipsized (e.g. check 1692905005479580039)
# so let's just use statuses_content
# - there is also timeline_created_at, but they look like made up timestamps
# don't think they represent bookmarking time
# - not sure what's timeline_type?
# seems like 30 means bookmarks?
# there is one tweet with timeline type 18, but it has timeline_is_preview=1
for (
tweet_id,
user_name,
user_username,
created_ms,
blob,
) in db.execute(
'''
SELECT
statuses_status_id,
users_name,
users_username,
statuses_created,
CAST(statuses_content AS BLOB)
FROM timeline_view
WHERE statuses_bookmarked = 1
ORDER BY timeline_sort_index DESC
''',
):
if blob is None: # TODO exclude in sql query?
continue
yield Tweet(
id_str=tweet_id,
# TODO double check it's utc?
created_at=datetime.fromtimestamp(created_ms / 1000, tz=timezone.utc),
screen_name=user_username,
text=_parse_content(blob),
)


def bookmarks() -> Iterator[Res[Tweet]]:
# TODO might need to sort by timeline_sort_index again?
# not sure if each database contains full history of bookmarks (likely not!)
def it() -> Iterator[Res[Tweet]]:
paths = inputs()
total = len(paths)
width = len(str(total))
for idx, path in enumerate(paths):
logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
yield from _process_one(path)

# TODO hmm maybe unique_everseen should be a decorator?
return unique_everseen(it)
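Most of _parse_content is collapsed in the diff above; the technique it relies on is cursor-style parsing of the raw blob with struct.unpack_from plus small helpers like skip() and getstring(). The actual field layout of Twitter's serialized content isn't visible here, so the following is only an illustrative sketch of the pattern, using an invented layout (4-byte big-endian length prefix followed by UTF-8 bytes):

    from struct import calcsize, unpack_from
    from typing import Tuple

    def read_prefixed_string(data: bytes, pos: int) -> Tuple[str, int]:
        # read the length prefix, decode that many bytes, return the string and the new cursor position
        (slen,) = unpack_from('>I', data, pos)
        pos += calcsize('>I')
        return data[pos:pos + slen].decode('utf8'), pos + slen

    blob = bytes([0, 0, 0, 5]) + b'hello'
    text, end = read_prefixed_string(blob, 0)
    assert text == 'hello' and end == len(blob)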

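On the 'maybe unique_everseen should be a decorator' TODO: a hypothetical sketch of what that could look like (not part of this commit; it assumes unique_everseen keeps accepting a zero-argument callable, as in the bookmarks() call above):

    from functools import wraps

    from my.core.common import unique_everseen

    def unique_everseen_results(fun):
        # wrap an iterator-producing function so its output is deduplicated automatically
        @wraps(fun)
        def wrapper(*args, **kwargs):
            return unique_everseen(lambda: fun(*args, **kwargs))
        return wrapper

    # bookmarks() could then drop its inner it() helper:
    #
    # @unique_everseen_results
    # def bookmarks() -> Iterator[Res[Tweet]]:
    #     ...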