instagram.android: improve thread id extraction and add more comments

karlicoss · Feb 7, 2025 · 3d838e7 · 3d838e7
1 parent ba4b5b6
commit 3d838e7
Show file tree

Hide file tree

Showing 3 changed files with 37 additions and 6 deletions.
diff --git a/src/my/instagram/android.py b/src/my/instagram/android.py
@@ -100,10 +100,27 @@ def __eq__(self, other) -> bool:
         return self.rest == other.rest
 
 
-def _parse_message(j: Json) -> _Message | None:
+def _parse_message(j: Json, tid_map: dict[str, str]) -> _Message | None:
     id = j['item_id']
     t = j['item_type']
-    tid = j['thread_key']['thread_id']
+
+    local_tid = j['thread_key']['thread_id']
+
+    # NOTE: j['thread_key']['thread_v2_id'] also contains server thread id in most cases
+    # however sometimes it's missing (perhaps if we are offline?)
+    # it seems that using the thread_v2_id from 'threads' table results is more reliable
+
+    # NOTE: this is the same id as in gdpr export
+    # ... well kind of. For latest android databases it seems to match
+    # But seems like it actually changes throughout time (perhaps in 2023/2024 there was some sort of migration for all users??)
+    # Overall it doesn't seem like there is any obvious logic for it... so we still can't rely on thread id for merging..
+    thread_v2_id = tid_map.get(local_tid)
+    if thread_v2_id is None:
+        # it still is missing somehow (perhaps if we messaged a user while offline/no network?)
+        # in general it's not an issue, we'll get the same message from a later export
+        # todo not sure if we should emit exception or something instead..
+        return None
+
     uid = j['user_id']
     created: datetime_naive = datetime.fromtimestamp(int(j['timestamp']) / 1_000_000)
     text: str | None = None
@@ -126,7 +143,7 @@ def _parse_message(j: Json) -> _Message | None:
         id=id,
         created=created,
         text=text,
-        thread_id=tid,
+        thread_id=thread_v2_id,
         user_id=uid,
         # reply_to_id='FIXME',
     )
@@ -143,8 +160,15 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Res[User | _Message]]:
             username=config.full_name or 'USERS_OWN_USERNAME',
         )
 
+    # maps local tid to "server tid" (thread_v2_id)
+    tid_map: dict[str, str] = {}
+
     for (thread_json,) in select(('thread_info',), 'FROM threads', db=db):
         j = json.loads(thread_json)
+        thread_v2_id = j.get('thread_v2_id')
+        if thread_v2_id is not None:
+            # sometimes not present...
+            tid_map[j['thread_id']] = thread_v2_id
         # todo in principle should leave the thread attached to the message?
         # since thread is a group of users?
         pre_users = []
@@ -167,7 +191,7 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Res[User | _Message]]:
         # eh, seems to contain everything in json?
         j = json.loads(msg_json)
         try:
-            m = _parse_message(j)
+            m = _parse_message(j, tid_map=tid_map)
             if m is not None:
                 yield m
         except Exception as e:

diff --git a/src/my/instagram/common.py b/src/my/instagram/common.py
@@ -38,9 +38,15 @@ def key(r: Res[Message]):
         # using text as key is a bit crap.. but atm there are no better shared fields
         return (without_us, r.text)
 
-    # ugh. seems that GDPR thread ids are completely uncorrelated to any android ids (tried searching over all sqlite dump)
-    # so the only way to correlate is to try and match messages
+    # ugh
+    #  - in gdpr export, User objects are kinda garbage
+    #  - thread ids are inconsistent even within Android databases
+    #    maybe always take latest gdpr when merging??
+    #  - TODO maybe always grab message ids from android? there is nothing in gdpr
+
+    # so the only way to correlate is to try and match message bodies/timestamps
     # we also can't use unique_everseen here, otherwise will never get a chance to unify threads
+
     mmap: dict[str, Message] = {}
     thread_map = {}
     user_map = {}

diff --git a/src/my/instagram/gdpr.py b/src/my/instagram/gdpr.py
@@ -145,6 +145,7 @@ def iter_jsons() -> Iterator[Json]:
         # title should be the same across all files, so enough to extract only first
         conversation_title = _decode(first['title'])
 
+        # TODO older gdpr exports had 10 alnum characters?? with no relation to server id?
         m = re.fullmatch(r'(.*)_(\d+)', conversation)
         assert m is not None