my.twitter.android: refactor into a proper module
for now only extracting bookmarks, will use it for some time and see how it goes
karlicoss committed Dec 24, 2023
1 parent 3919c10 commit c60454e
Showing 3 changed files with 99 additions and 29 deletions.
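Since the module's public entry point is bookmarks(), a minimal usage sketch could look like the following (illustrative only; it assumes the config stub below is filled in with a real export path, and relies on bookmarks() yielding Res[Tweet], i.e. errors arrive as values):

    from my.twitter.android import bookmarks

    for b in bookmarks():
        if isinstance(b, Exception):
            # Res[Tweet] means errors are yielded rather than raised
            continue
        print(b.created_at, b.permalink)
        print(b.text)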
2 changes: 2 additions & 0 deletions my/config.py
@@ -177,6 +177,8 @@ class twitter_archive:
class twitter:
class talon:
export_path: Paths
class android:
export_path: Paths


class twint:
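To actually use the new stub, the user-side config mirrors it; a hypothetical example (the path/glob is a placeholder, not something from this commit):

    class twitter:
        class android:
            # sqlite databases exported from the official Android app (e.g. pulled via a device backup)
            export_path = '/path/to/twitter-android/*.db'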
1 change: 1 addition & 0 deletions my/core/common.py
@@ -686,6 +686,7 @@ def unique_everseen(
if key is None:
# todo check key return type as well? but it's more likely to be hashable
if os.environ.get('HPI_CHECK_UNIQUE_EVERSEEN') is not None:
# TODO return better error here, e.g. if there is no return type it crashes
_check_all_hashable(fun)

return more_itertools.unique_everseen(iterable=iterable, key=key)
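For context on why this hashability check exists at all: more_itertools.unique_everseen deduplicates through a set when items are hashable and silently falls back to a much slower list scan otherwise. A rough, self-contained illustration (the classes are invented, not from this codebase):

    from dataclasses import dataclass
    import more_itertools

    @dataclass(unsafe_hash=True)  # hashable: deduplicated via a 'seen' set
    class A:
        x: int

    @dataclass  # default eq=True without a hash makes instances unhashable: falls back to a 'seen' list
    class B:
        x: int

    print(list(more_itertools.unique_everseen([A(1), A(1), A(2)])))  # [A(x=1), A(x=2)]
    print(list(more_itertools.unique_everseen([B(1), B(1), B(2)])))  # same result, just slower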
125 changes: 96 additions & 29 deletions my/twitter/android.py
@@ -1,13 +1,49 @@
"""
Data from offficial app for Android
Twitter data from official app for Android
"""
from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
import re
from struct import unpack_from, calcsize
from struct import unpack_from
from typing import Iterator, Sequence

from my.core import datetime_aware, get_files, LazyLogger, Paths, Res
from my.core.common import unique_everseen
from my.core.sqlite import sqlite_connect_immutable

import my.config

from .common import permalink

logger = LazyLogger(__name__)


@dataclass
class config(my.config.twitter.android):
# path[s]/glob to the exported sqlite databases
export_path: Paths


def inputs() -> Sequence[Path]:
return get_files(config.export_path)


def _parse_content(data: bytes):
@dataclass(unsafe_hash=True)
class Tweet:
id_str: str
created_at: datetime_aware
screen_name: str
text: str

@property
def permalink(self) -> str:
return permalink(screen_name=self.screen_name, id=self.id_str)


def _parse_content(data: bytes) -> str:
pos = 0

def skip(count: int) -> None:
@@ -107,29 +143,60 @@ def getstring(slen: int) -> str:
text = text.replace(k, v)
assert 'https://t.co/' not in text # make sure we detected all links

print(text)



PATH_TO_DB = '/path/to/db'


with sqlite_connect_immutable(PATH_TO_DB) as db:
# TODO use statuses table instead?
# has r_ent_content??
# TODO hmm r_ent_content contains expanded urls?
# but they are still ellipsized? e.g. you can check 1692905005479580039
# TODO also I think content table has mappings from short urls to full, need to extract
for (tid, blob, blob2) in db.execute(
f'SELECT statuses_status_id, CAST(statuses_content AS BLOB), CAST(statuses_r_ent_content AS BLOB) FROM timeline_view WHERE statuses_bookmarked = 1',
):
if blob is None: # TODO exclude in sql query?
continue
print("----")
try:
print("PARSING", tid)
_parse_content(blob)
# _parse_content(blob2)
except UnicodeDecodeError as ue:
raise ue
# print("DECODING ERROR FOR ", tid, ue.object)
return text


def _process_one(f: Path) -> Iterator[Res[Tweet]]:
with sqlite_connect_immutable(f) as db:
# NOTE:
# - it also has statuses_r_ent_content which has entities' links replaced
# but they are still ellipsized (e.g. check 1692905005479580039)
# so let's just use statuses_content
# - there is also timeline_created_at, but they look like made up timestamps
# don't think they represent bookmarking time
# - not sure what's timeline_type?
# seems like 30 means bookmarks?
# there is one tweet with timeline type 18, but it has timeline_is_preview=1
for (
tweet_id,
user_name,
user_username,
created_ms,
blob,
) in db.execute(
'''
SELECT
statuses_status_id,
users_name,
users_username,
statuses_created,
CAST(statuses_content AS BLOB)
FROM timeline_view
WHERE statuses_bookmarked = 1
ORDER BY timeline_sort_index DESC
''',
):
if blob is None: # TODO exclude in sql query?
continue
yield Tweet(
id_str=tweet_id,
# TODO double check it's utc?
created_at=datetime.fromtimestamp(created_ms / 1000, tz=timezone.utc),
screen_name=user_username,
text=_parse_content(blob),
)


def bookmarks() -> Iterator[Res[Tweet]]:
# TODO might need to sort by timeline_sort_index again?
# not sure if each database contains full history of bookmarks (likely not!)
def it() -> Iterator[Res[Tweet]]:
paths = inputs()
total = len(paths)
width = len(str(total))
for idx, path in enumerate(paths):
logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
yield from _process_one(path)

# TODO hmm maybe unique_everseen should be a decorator?
return unique_everseen(it)
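Most of _parse_content is collapsed in the diff above; the technique it relies on is cursor-style parsing of the raw blob with struct.unpack_from plus small helpers like skip() and getstring(). The actual field layout of Twitter's serialized content isn't visible here, so the following is only an illustrative sketch of the pattern, using an invented layout (4-byte big-endian length prefix followed by UTF-8 bytes):

    from struct import calcsize, unpack_from
    from typing import Tuple

    def read_prefixed_string(data: bytes, pos: int) -> Tuple[str, int]:
        # read the length prefix, decode that many bytes, return the string and the new cursor position
        (slen,) = unpack_from('>I', data, pos)
        pos += calcsize('>I')
        return data[pos:pos + slen].decode('utf8'), pos + slen

    blob = bytes([0, 0, 0, 5]) + b'hello'
    text, end = read_prefixed_string(blob, 0)
    assert text == 'hello' and end == len(blob)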

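On the 'maybe unique_everseen should be a decorator' TODO: a hypothetical sketch of what that could look like (not part of this commit; it assumes unique_everseen keeps accepting a zero-argument callable, as in the bookmarks() call above):

    from functools import wraps

    from my.core.common import unique_everseen

    def unique_everseen_results(fun):
        # wrap an iterator-producing function so its output is deduplicated automatically
        @wraps(fun)
        def wrapper(*args, **kwargs):
            return unique_everseen(lambda: fun(*args, **kwargs))
        return wrapper

    # bookmarks() could then drop its inner it() helper:
    #
    # @unique_everseen_results
    # def bookmarks() -> Iterator[Res[Tweet]]:
    #     ...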