Skip to content

Commit

Permalink
general: add 'destructive parsing' (kinda what we were doing in my.co…
Browse files Browse the repository at this point in the history
…re.konsume) to my.experimental

also some cleanup for my.codeforces and my.topcoder
  • Loading branch information
karlicoss committed Aug 12, 2024
1 parent 4e13779 commit 26d627e
Show file tree
Hide file tree
Showing 4 changed files with 192 additions and 106 deletions.
130 changes: 62 additions & 68 deletions my/codeforces.py
Original file line number Diff line number Diff line change
@@ -1,86 +1,80 @@
from my.config import codeforces as config # type: ignore[attr-defined]


from dataclasses import dataclass
from datetime import datetime, timezone
from functools import cached_property
import json
from typing import NamedTuple, Dict, Iterator


from my.core import get_files, Res
from my.core.konsume import ignore, wrap
from pathlib import Path
from typing import Dict, Iterator, Sequence

from my.core import get_files, Res, datetime_aware
from my.core.common import assert_never

Cid = int

class Contest(NamedTuple):
cid: Cid
when: datetime
from my.config import codeforces as config # type: ignore[attr-defined]

@classmethod
def make(cls, j) -> 'Contest':
return cls(
cid=j['id'],
when=datetime.fromtimestamp(j['startTimeSeconds'], tz=timezone.utc),
)

Cmap = Dict[Cid, Contest]
def inputs() -> Sequence[Path]:
    """All codeforces export files (both 'allcontests' dumps and per-user 'codeforces' dumps)."""
    paths = get_files(config.export_path)
    return paths


def get_contests() -> Cmap:
last = max(get_files(config.export_path, 'allcontests*.json'))
j = json.loads(last.read_text())
d = {}
for c in j['result']:
cc = Contest.make(c)
d[cc.cid] = cc
return d
ContestId = int


class Competition(NamedTuple):
contest_id: Cid
contest: str
cmap: Cmap
@dataclass
class Contest:
contest_id: ContestId
when: datetime_aware
name: str

@cached_property
def uid(self) -> Cid:
return self.contest_id

def __hash__(self):
return hash(self.contest_id)

@cached_property
def when(self) -> datetime:
return self.cmap[self.uid].when
@dataclass
class Competition:
contest: Contest
old_rating: int
new_rating: int

@cached_property
def summary(self) -> str:
return f'participated in {self.contest}' # TODO

@classmethod
def make(cls, cmap, json) -> Iterator[Res['Competition']]:
# TODO try here??
contest_id = json['contestId'].zoom().value
contest = json['contestName'].zoom().value
yield cls(
contest_id=contest_id,
contest=contest,
cmap=cmap,
)
# TODO ytry???
ignore(json, 'rank', 'oldRating', 'newRating')
def when(self) -> datetime_aware:
return self.contest.when


# todo not sure if parser is the best name? hmm
class Parser:
    """
    Parses codeforces export files into Competition objects.

    Stateful: 'allcontests' files populate the contest-metadata map, which
    'codeforces' (user participation) files then look up. The inputs are
    processed in order, so metadata files must sort before participation files
    (which holds for the 'allcontests*' / 'codeforces*' naming).
    """

    def __init__(self, *, inputs: Sequence[Path]) -> None:
        self.inputs = inputs
        # contest metadata keyed by contest id; filled in from 'allcontests' files
        self.contests: Dict[ContestId, Contest] = {}

    def _parse_allcontests(self, p: Path) -> Iterator[Contest]:
        """Yield Contest metadata for every contest listed in an 'allcontests' dump."""
        j = json.loads(p.read_text())
        for c in j['result']:
            yield Contest(
                contest_id=c['id'],
                when=datetime.fromtimestamp(c['startTimeSeconds'], tz=timezone.utc),
                name=c['name'],
            )

    def _parse_competitions(self, p: Path) -> Iterator[Competition]:
        """
        Yield the user's Competitions from a 'codeforces' dump.

        Requires self.contests to already contain metadata for every contest
        referenced (raises KeyError otherwise).
        """
        j = json.loads(p.read_text())
        for c in j['result']:
            contest_id = c['contestId']
            contest = self.contests[contest_id]
            yield Competition(
                contest=contest,
                old_rating=c['oldRating'],
                new_rating=c['newRating'],
            )

    def parse(self) -> Iterator[Res[Competition]]:
        # NOTE: iterate over self.inputs -- previously this called the
        # module-level inputs(), silently ignoring the constructor argument.
        for path in self.inputs:
            if 'allcontests' in path.name:
                # these contain information about all CF contests along with useful metadata
                for contest in self._parse_allcontests(path):
                    # TODO some method to assert on mismatch if it exists? not sure
                    self.contests[contest.contest_id] = contest
            elif 'codeforces' in path.name:
                # these contain only contests the user participated in
                yield from self._parse_competitions(path)
            else:
                # include the offending path so unexpected inputs are diagnosable
                raise RuntimeError(f"Unexpected input file: {path}")  # TODO switch to compat.assert_never


def data() -> Iterator[Res[Competition]]:
cmap = get_contests()
last = max(get_files(config.export_path, 'codeforces*.json'))

with wrap(json.loads(last.read_text())) as j:
j['status'].ignore() # type: ignore[index]
res = j['result'].zoom() # type: ignore[index]

for c in list(res): # TODO maybe we want 'iter' method??
ignore(c, 'handle', 'ratingUpdateTimeSeconds')
yield from Competition.make(cmap=cmap, json=c)
c.consume()
# TODO maybe if they are all empty, no need to consume??
return Parser(inputs=inputs()).parse()
31 changes: 31 additions & 0 deletions my/core/konsume.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,3 +209,34 @@ def test_zoom() -> None:


# TODO type check this...

# TODO feels like the whole thing kind of unnecessarily complex
# - cons:
# - in most cases this is not even needed? who cares if we miss a few attributes?
# - pro: on the other hand it could be interesting to know about new attributes in data,
# and without this kind of processing we wouldn't even know
# alternatives
# - manually process data
# e.g. use asserts, dict.pop and dict.values() methods to unpack things
# - pros:
# - very simple, since uses built in syntax
# - very performant, as fast as it gets
# - very flexible, easy to adjust behaviour
# - cons:
# - can forget to assert about extra entities etc, so error prone
# - if we do something like =assert j.pop('status') == 200, j=, by the time assert happens we already popped item -- makes error handling harder
# - a bit verbose.. so probably requires some helper functions though (could be much leaner than current konsume though)
# - if we assert, then terminates parsing too early, if we're defensive then inflates the code a lot with if statements
# - TODO perhaps combine warnings somehow or at least only emit once per module?
# - hmm actually tbh if we carefully go through everything and don't make copies, then only requires one assert at the very end?
# - TODO this is kinda useful? https://discuss.python.org/t/syntax-for-dictionnary-unpacking-to-variables/18718
# operator.itemgetter?
# - TODO can use match operator in python for this? quite nice actually! and allows for dynamic behaviour
# only from 3.10 tho, and gonna be tricky to do dynamic defensive behaviour with this
# - TODO in a sense, blenser already would hint if some meaningful fields aren't being processed? only if they are changing though
# - define a "schema" for data, then just recursively match data against the schema?
# possibly pydantic already does something like that? not sure about performance though
# pros:
# - much simpler to extend and understand what's going on
# cons:
# - more rigid, so it becomes tricky to do dynamic stuff (e.g. if schema actually changes)
60 changes: 60 additions & 0 deletions my/experimental/destructive_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from dataclasses import dataclass
from typing import Any, Iterator, List, Tuple

from my.core import assert_never
from my.core.compat import NoneType


# TODO Popper? not sure
@dataclass
class Helper:
manager: 'Manager'
item: Any # todo realistically, list or dict? could at least type as indexable or something
path: Tuple[str, ...]

def pop_if_primitive(self, *keys: str) -> None:
"""
The idea that primitive TODO
"""
item = self.item
for k in keys:
v = item[k]
if isinstance(v, (str, bool, float, int, NoneType)):
item.pop(k) # todo kinda unfortunate to get dict item twice.. but not sure if can avoid?

def check(self, key: str, expected: Any) -> None:
actual = self.item.pop(key)
assert actual == expected, (key, actual, expected)

def zoom(self, key: str) -> 'Helper':
return self.manager.helper(item=self.item.pop(key), path=self.path + (key,))


def is_empty(x) -> bool:
if isinstance(x, dict):
return len(x) == 0
elif isinstance(x, list):
return all(map(is_empty, x))
else:
assert_never(x)


class Manager:
    """
    Owns Helper instances and, at the end of parsing, verifies that every
    tracked item was fully consumed.
    """

    def __init__(self) -> None:
        self.helpers: List[Helper] = []

    def helper(self, item: Any, *, path: Tuple[str, ...] = ()) -> Helper:
        """Create a Helper wrapping *item* and register it for the final check."""
        h = Helper(manager=self, item=item, path=path)
        self.helpers.append(h)
        return h

    def check(self) -> Iterator[Exception]:
        """Yield a single error if any tracked item still holds unconsumed data."""
        # TODO recursively check it's primitive?
        remaining = [(h.path, h.item) for h in self.helpers if not is_empty(h.item)]
        if remaining:
            yield RuntimeError(f'Unparsed items remaining: {remaining}')
77 changes: 39 additions & 38 deletions my/topcoder.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
from my.config import topcoder as config # type: ignore[attr-defined]


from dataclasses import dataclass
from functools import cached_property
import json
from pathlib import Path
from typing import Iterator, Sequence

from my.core import get_files, Res, datetime_aware
from my.core.compat import fromisoformat, NoneType
from my.core.compat import fromisoformat
from my.experimental.destructive_parsing import Manager

from my.config import topcoder as config # type: ignore[attr-defined]


def inputs() -> Sequence[Path]:
Expand All @@ -30,10 +30,6 @@ def uid(self) -> str:
def when(self) -> datetime_aware:
return fromisoformat(self.date_str)

@cached_property
def summary(self) -> str:
return f'participated in {self.contest}: {self.percentile:.0f}'

@classmethod
def make(cls, j) -> Iterator[Res['Competition']]:
assert isinstance(j.pop('rating'), float)
Expand All @@ -53,38 +49,43 @@ def make(cls, j) -> Iterator[Res['Competition']]:


def _parse_one(p: Path) -> Iterator[Res[Competition]]:
j = json.loads(p.read_text())

# this is kind of an experiment to parse it exhaustively, making sure we don't miss any data
assert isinstance(j.pop('version'), str)
assert isinstance(j.pop('id'), str)
[j] = j.values() # zoom in

assert j.pop('success') is True, j
assert j.pop('status') == 200, j
assert j.pop('metadata') is None, j
[j] = j.values() # zoom in

# todo hmm, potentially error handling could be nicer since .pop just reports key error
# also by the time error is reported, key is already removed?
for k in ['handle', 'handleLower', 'userId', 'createdAt', 'updatedAt', 'createdBy', 'updatedBy']:
# check it's primitive
assert isinstance(j.pop(k), (str, bool, float, int, NoneType)), k

j.pop('DEVELOP') # TODO how to handle it?
[j] = j.values() # zoom in, DATA_SCIENCE section

mm = j.pop('MARATHON_MATCH')
[mm] = mm.values() # zoom into history

srm = j.pop('SRM')
[srm] = srm.values() # zoom into history

assert len(j) == 0, j

for c in mm + srm:
d = json.loads(p.read_text())

# TODO manager should be a context manager?
m = Manager()

h = m.helper(d)
h.pop_if_primitive('version', 'id')

h = h.zoom('result')
h.check('success', True)
h.check('status', 200)
h.pop_if_primitive('metadata')

h = h.zoom('content')
h.pop_if_primitive('handle', 'handleLower', 'userId', 'createdAt', 'updatedAt', 'createdBy', 'updatedBy')

# NOTE at the moment it's empty for me, but it will result in an error later if there is some data here
h.zoom('DEVELOP').zoom('subTracks')

h = h.zoom('DATA_SCIENCE')
# TODO multi zoom? not sure which axis, e.g.
# zoom('SRM', 'history') or zoom('SRM', 'MARATHON_MATCH')
# or zoom(('SRM', 'history'), ('MARATHON_MATCH', 'history'))
srms = h.zoom('SRM').zoom('history')
mms = h.zoom('MARATHON_MATCH').zoom('history')

for c in srms.item + mms.item:
# NOTE: so here we are actually just using pure dicts in .make method
# this is kinda ok since it will be checked by parent Helper
# but also expects cooperation from .make method (e.g. popping items from the dict)
# could also wrap in helper and pass to .make .. not sure
# an argument could be made that .make isn't really a class method..
# it's pretty specific to this parser only
yield from Competition.make(j=c)

yield from m.check()


def data() -> Iterator[Res[Competition]]:
*_, last = inputs()
Expand Down

0 comments on commit 26d627e

Please sign in to comment.