Skip to content

Commit

Permalink
instagram.android: improve thread id extraction and add more comments
Browse files Browse the repository at this point in the history
  • Loading branch information
karlicoss committed Feb 7, 2025
1 parent ba4b5b6 commit 3d838e7
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 6 deletions.
32 changes: 28 additions & 4 deletions src/my/instagram/android.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,27 @@ def __eq__(self, other) -> bool:
return self.rest == other.rest


def _parse_message(j: Json) -> _Message | None:
def _parse_message(j: Json, tid_map: dict[str, str]) -> _Message | None:
id = j['item_id']
t = j['item_type']
tid = j['thread_key']['thread_id']

local_tid = j['thread_key']['thread_id']

# NOTE: j['thread_key']['thread_v2_id'] also contains server thread id in most cases
# however sometimes it's missing (perhaps if we are offline?)
# it seems that using the thread_v2_id from 'threads' table results is more reliable

# NOTE: this is the same id as in gdpr export
# ... well kind of. For latest android databases it seems to match
# But seems like it actually changes over time (perhaps in 2023/2024 there was some sort of migration for all users??)
# Overall it doesn't seem like there is any obvious logic for it... so we still can't rely on thread id for merging..
thread_v2_id = tid_map.get(local_tid)
if thread_v2_id is None:
# it still is missing somehow (perhaps if we messaged a user while offline/no network?)
# in general it's not an issue, we'll get the same message from a later export
# todo not sure if we should emit exception or something instead..
return None

uid = j['user_id']
created: datetime_naive = datetime.fromtimestamp(int(j['timestamp']) / 1_000_000)
text: str | None = None
Expand All @@ -126,7 +143,7 @@ def _parse_message(j: Json) -> _Message | None:
id=id,
created=created,
text=text,
thread_id=tid,
thread_id=thread_v2_id,
user_id=uid,
# reply_to_id='FIXME',
)
Expand All @@ -143,8 +160,15 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Res[User | _Message]]:
username=config.full_name or 'USERS_OWN_USERNAME',
)

# maps local tid to "server tid" (thread_v2_id)
tid_map: dict[str, str] = {}

for (thread_json,) in select(('thread_info',), 'FROM threads', db=db):
j = json.loads(thread_json)
thread_v2_id = j.get('thread_v2_id')
if thread_v2_id is not None:
# sometimes not present...
tid_map[j['thread_id']] = thread_v2_id
# todo in principle should leave the thread attached to the message?
# since thread is a group of users?
pre_users = []
Expand All @@ -167,7 +191,7 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Res[User | _Message]]:
# eh, seems to contain everything in json?
j = json.loads(msg_json)
try:
m = _parse_message(j)
m = _parse_message(j, tid_map=tid_map)
if m is not None:
yield m
except Exception as e:
Expand Down
10 changes: 8 additions & 2 deletions src/my/instagram/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,15 @@ def key(r: Res[Message]):
# using text as key is a bit crap.. but atm there are no better shared fields
return (without_us, r.text)

# ugh. seems that GDPR thread ids are completely uncorrelated to any android ids (tried searching over all sqlite dump)
# so the only way to correlate is to try and match messages
# ugh
# - in gdpr export, User objects are kinda garbage
# - thread ids are inconsistent even within Android databases
# maybe always take latest gdpr when merging??
# - TODO maybe always grab message ids from android? there is nothing in gdpr

# so the only way to correlate is to try and match message bodies/timestamps
# we also can't use unique_everseen here, otherwise will never get a chance to unify threads

mmap: dict[str, Message] = {}
thread_map = {}
user_map = {}
Expand Down
1 change: 1 addition & 0 deletions src/my/instagram/gdpr.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def iter_jsons() -> Iterator[Json]:
# title should be the same across all files, so enough to extract only first
conversation_title = _decode(first['title'])

# TODO older gdpr exports had 10 alnum characters?? with no relation to server id?
m = re.fullmatch(r'(.*)_(\d+)', conversation)
assert m is not None

Expand Down

0 comments on commit 3d838e7

Please sign in to comment.