diff --git a/src/my/instagram/android.py b/src/my/instagram/android.py index 6a19e01c..a8d00cc4 100644 --- a/src/my/instagram/android.py +++ b/src/my/instagram/android.py @@ -100,10 +100,27 @@ def __eq__(self, other) -> bool: return self.rest == other.rest -def _parse_message(j: Json) -> _Message | None: +def _parse_message(j: Json, tid_map: dict[str, str]) -> _Message | None: id = j['item_id'] t = j['item_type'] - tid = j['thread_key']['thread_id'] + + local_tid = j['thread_key']['thread_id'] + + # NOTE: j['thread_key']['thread_v2_id'] also contains server thread id in most cases + # however sometimes it's missing (perhaps if we are offline?) + # it seems that using the thread_v2_id from 'threads' table results is more reliable + + # NOTE: this is the same id as in gdpr export + # ... well kind of. For latest android databases it seems to match + # But seems like it actually changes throughout time (perhaps in 2023/2024 there was some sort of migration for all users??) + # Overall doesn't seem like there is any obvious logic for it... so we still can't rely on thread id for merging.. + thread_v2_id = tid_map.get(local_tid) + if thread_v2_id is None: + # it still is missing somehow (perhaps if we messaged a user while offline/no network?) + # in general it's not an issue, we'll get the same message from a later export + # todo not sure if we should emit exception or something instead.. 
+ return None + uid = j['user_id'] created: datetime_naive = datetime.fromtimestamp(int(j['timestamp']) / 1_000_000) text: str | None = None @@ -126,7 +143,7 @@ def _parse_message(j: Json) -> _Message | None: id=id, created=created, text=text, - thread_id=tid, + thread_id=thread_v2_id, user_id=uid, # reply_to_id='FIXME', ) @@ -143,8 +160,15 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Res[User | _Message]]: username=config.full_name or 'USERS_OWN_USERNAME', ) + # maps local tid to "server tid" (thread_v2_id) + tid_map: dict[str, str] = {} + for (thread_json,) in select(('thread_info',), 'FROM threads', db=db): j = json.loads(thread_json) + thread_v2_id = j.get('thread_v2_id') + if thread_v2_id is not None: + # sometimes not present... + tid_map[j['thread_id']] = thread_v2_id # todo in principle should leave the thread attached to the message? # since thread is a group of users? pre_users = [] @@ -167,7 +191,7 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Res[User | _Message]]: # eh, seems to contain everything in json? j = json.loads(msg_json) try: - m = _parse_message(j) + m = _parse_message(j, tid_map=tid_map) if m is not None: yield m except Exception as e: diff --git a/src/my/instagram/common.py b/src/my/instagram/common.py index 17d130fb..42a0211d 100644 --- a/src/my/instagram/common.py +++ b/src/my/instagram/common.py @@ -38,9 +38,15 @@ def key(r: Res[Message]): # using text as key is a bit crap.. but atm there are no better shared fields return (without_us, r.text) - # ugh. seems that GDPR thread ids are completely uncorrelated to any android ids (tried searching over all sqlite dump) - # so the only way to correlate is to try and match messages + # ugh + # - in gdpr export, User objects are kinda garbage + # - thread ids are inconsistent even within Android databases + # maybe always take latest gdpr when merging?? + # - TODO maybe always grab message ids from android? 
there is nothing in gdpr + + # so the only way to correlate is to try and match message bodies/timestamps # we also can't use unique_everseen here, otherwise will never get a chance to unify threads + mmap: dict[str, Message] = {} thread_map = {} user_map = {} diff --git a/src/my/instagram/gdpr.py b/src/my/instagram/gdpr.py index 263831f4..2dbaa8de 100644 --- a/src/my/instagram/gdpr.py +++ b/src/my/instagram/gdpr.py @@ -145,6 +145,7 @@ def iter_jsons() -> Iterator[Json]: # title should be the same across all files, so enough to extract only first conversation_title = _decode(first['title']) + # TODO older gdpr exports had 10 alnum characters?? with no relation to server id? m = re.fullmatch(r'(.*)_(\d+)', conversation) assert m is not None