diff --git a/my/codeforces.py b/my/codeforces.py index a97c3600..7b37ec68 100644 --- a/my/codeforces.py +++ b/my/codeforces.py @@ -1,86 +1,80 @@ -from my.config import codeforces as config # type: ignore[attr-defined] - - +from dataclasses import dataclass from datetime import datetime, timezone from functools import cached_property import json -from typing import NamedTuple, Dict, Iterator - - -from my.core import get_files, Res -from my.core.konsume import ignore, wrap +from pathlib import Path +from typing import Dict, Iterator, Sequence +from my.core import get_files, Res, datetime_aware +from my.core.common import assert_never -Cid = int - -class Contest(NamedTuple): - cid: Cid - when: datetime +from my.config import codeforces as config # type: ignore[attr-defined] - @classmethod - def make(cls, j) -> 'Contest': - return cls( - cid=j['id'], - when=datetime.fromtimestamp(j['startTimeSeconds'], tz=timezone.utc), - ) -Cmap = Dict[Cid, Contest] +def inputs() -> Sequence[Path]: + return get_files(config.export_path) -def get_contests() -> Cmap: - last = max(get_files(config.export_path, 'allcontests*.json')) - j = json.loads(last.read_text()) - d = {} - for c in j['result']: - cc = Contest.make(c) - d[cc.cid] = cc - return d +ContestId = int -class Competition(NamedTuple): - contest_id: Cid - contest: str - cmap: Cmap +@dataclass +class Contest: + contest_id: ContestId + when: datetime_aware + name: str - @cached_property - def uid(self) -> Cid: - return self.contest_id - def __hash__(self): - return hash(self.contest_id) - - @cached_property - def when(self) -> datetime: - return self.cmap[self.uid].when +@dataclass +class Competition: + contest: Contest + old_rating: int + new_rating: int @cached_property - def summary(self) -> str: - return f'participated in {self.contest}' # TODO - - @classmethod - def make(cls, cmap, json) -> Iterator[Res['Competition']]: - # TODO try here?? 
- contest_id = json['contestId'].zoom().value - contest = json['contestName'].zoom().value - yield cls( - contest_id=contest_id, - contest=contest, - cmap=cmap, - ) - # TODO ytry??? - ignore(json, 'rank', 'oldRating', 'newRating') + def when(self) -> datetime_aware: + return self.contest.when + + +# todo not sure if parser is the best name? hmm +class Parser: + def __init__(self, *, inputs: Sequence[Path]) -> None: + self.inputs = inputs + self.contests: Dict[ContestId, Contest] = {} + + def _parse_allcontests(self, p: Path) -> Iterator[Contest]: + j = json.loads(p.read_text()) + for c in j['result']: + yield Contest( + contest_id=c['id'], + when=datetime.fromtimestamp(c['startTimeSeconds'], tz=timezone.utc), + name=c['name'], + ) + + def _parse_competitions(self, p: Path) -> Iterator[Competition]: + j = json.loads(p.read_text()) + for c in j['result']: + contest_id = c['contestId'] + contest = self.contests[contest_id] + yield Competition( + contest=contest, + old_rating=c['oldRating'], + new_rating=c['newRating'], + ) + + def parse(self) -> Iterator[Res[Competition]]: + for path in inputs(): + if 'allcontests' in path.name: + # these contain information about all CF contests along with useful metadata + for contest in self._parse_allcontests(path): + # TODO some method to assert on mismatch if it exists? not sure + self.contests[contest.contest_id] = contest + elif 'codeforces' in path.name: + # these contain only contests the user participated in + yield from self._parse_competitions(path) + else: + raise RuntimeError("shouldn't happen") # TODO switch to compat.assert_never def data() -> Iterator[Res[Competition]]: - cmap = get_contests() - last = max(get_files(config.export_path, 'codeforces*.json')) - - with wrap(json.loads(last.read_text())) as j: - j['status'].ignore() # type: ignore[index] - res = j['result'].zoom() # type: ignore[index] - - for c in list(res): # TODO maybe we want 'iter' method?? 
- ignore(c, 'handle', 'ratingUpdateTimeSeconds') - yield from Competition.make(cmap=cmap, json=c) - c.consume() - # TODO maybe if they are all empty, no need to consume?? + return Parser(inputs=inputs()).parse() diff --git a/my/core/konsume.py b/my/core/konsume.py index 588bfe1e..10bea8d1 100644 --- a/my/core/konsume.py +++ b/my/core/konsume.py @@ -209,3 +209,34 @@ def test_zoom() -> None: # TODO type check this... + +# TODO feels like the whole thing kind of unnecessarily complex +# - cons: +# - in most cases this is not even needed? who cares if we miss a few attributes? +# - pro: on the other hand it could be interesting to know about new attributes in data, +# and without this kind of processing we wouldn't even know +# alternatives +# - manually process data +# e.g. use asserts, dict.pop and dict.values() methods to unpack things +# - pros: +# - very simple, since uses built in syntax +# - very performant, as fast as it gets +# - very flexible, easy to adjust behaviour +# - cons: +# - can forget to assert about extra entities etc, so error prone +# - if we do something like =assert j.pop('status') == 200, j=, by the time assert happens we already popped item -- makes error handling harder +# - a bit verbose.. so probably requires some helper functions though (could be much leaner than current konsume though) +# - if we assert, then terminates parsing too early, if we're defensive then inflates the code a lot with if statements +# - TODO perhaps combine warnings somehow or at least only emit once per module? +# - hmm actually tbh if we carefully go through everything and don't make copies, then only requires one assert at the very end? +# - TODO this is kinda useful? https://discuss.python.org/t/syntax-for-dictionnary-unpacking-to-variables/18718 +# operator.itemgetter? +# - TODO can use match operator in python for this? quite nice actually!
and allows for dynamic behaviour +# only from 3.10 tho, and gonna be tricky to do dynamic defensive behaviour with this +# - TODO in a sense, blenser already would hint if some meaningful fields aren't being processed? only if they are changing though +# - define a "schema" for data, then just recursively match data against the schema? +# possibly pydantic already does something like that? not sure about performance though +# pros: +# - much simpler to extend and understand what's going on +# cons: +# - more rigid, so it becomes tricky to do dynamic stuff (e.g. if schema actually changes) diff --git a/my/experimental/destructive_parsing.py b/my/experimental/destructive_parsing.py new file mode 100644 index 00000000..3fc739c2 --- /dev/null +++ b/my/experimental/destructive_parsing.py @@ -0,0 +1,60 @@ +from dataclasses import dataclass +from typing import Any, Iterator, List, Tuple + +from my.core import assert_never +from my.core.compat import NoneType + + +# TODO Popper? not sure +@dataclass +class Helper: + manager: 'Manager' + item: Any # todo realistically, list or dict? could at least type as indexable or something + path: Tuple[str, ...] + + def pop_if_primitive(self, *keys: str) -> None: + """ + The idea that primitive TODO + """ + item = self.item + for k in keys: + v = item[k] + if isinstance(v, (str, bool, float, int, NoneType)): + item.pop(k) # todo kinda unfortunate to get dict item twice.. but not sure if can avoid? 
+ + def check(self, key: str, expected: Any) -> None: + actual = self.item.pop(key) + assert actual == expected, (key, actual, expected) + + def zoom(self, key: str) -> 'Helper': + return self.manager.helper(item=self.item.pop(key), path=self.path + (key,)) + + +def is_empty(x) -> bool: + if isinstance(x, dict): + return len(x) == 0 + elif isinstance(x, list): + return all(map(is_empty, x)) + else: + assert_never(x) + + +class Manager: + def __init__(self) -> None: + self.helpers: List[Helper] = [] + + def helper(self, item: Any, *, path: Tuple[str, ...] = ()) -> Helper: + res = Helper(manager=self, item=item, path=path) + self.helpers.append(res) + return res + + def check(self) -> Iterator[Exception]: + remaining = [] + for h in self.helpers: + # TODO recursively check it's primitive? + if is_empty(h.item): + continue + remaining.append((h.path, h.item)) + if len(remaining) == 0: + return + yield RuntimeError(f'Unparsed items remaining: {remaining}') diff --git a/my/topcoder.py b/my/topcoder.py index d9631dc6..8e39252e 100644 --- a/my/topcoder.py +++ b/my/topcoder.py @@ -1,6 +1,3 @@ -from my.config import topcoder as config # type: ignore[attr-defined] - - from dataclasses import dataclass from functools import cached_property import json @@ -8,7 +5,10 @@ from typing import Iterator, Sequence from my.core import get_files, Res, datetime_aware -from my.core.compat import fromisoformat, NoneType +from my.core.compat import fromisoformat +from my.experimental.destructive_parsing import Manager + +from my.config import topcoder as config # type: ignore[attr-defined] def inputs() -> Sequence[Path]: @@ -30,10 +30,6 @@ def uid(self) -> str: def when(self) -> datetime_aware: return fromisoformat(self.date_str) - @cached_property - def summary(self) -> str: - return f'participated in {self.contest}: {self.percentile:.0f}' - @classmethod def make(cls, j) -> Iterator[Res['Competition']]: assert isinstance(j.pop('rating'), float) @@ -53,38 +49,43 @@ def make(cls, j) -> 
Iterator[Res['Competition']]: def _parse_one(p: Path) -> Iterator[Res[Competition]]: - j = json.loads(p.read_text()) - - # this is kind of an experiment to parse it exhaustively, making sure we don't miss any data - assert isinstance(j.pop('version'), str) - assert isinstance(j.pop('id'), str) - [j] = j.values() # zoom in - - assert j.pop('success') is True, j - assert j.pop('status') == 200, j - assert j.pop('metadata') is None, j - [j] = j.values() # zoom in - - # todo hmm, potentially error handling could be nicer since .pop just reports key error - # also by the time error is reported, key is already removed? - for k in ['handle', 'handleLower', 'userId', 'createdAt', 'updatedAt', 'createdBy', 'updatedBy']: - # check it's primitive - assert isinstance(j.pop(k), (str, bool, float, int, NoneType)), k - - j.pop('DEVELOP') # TODO how to handle it? - [j] = j.values() # zoom in, DATA_SCIENCE section - - mm = j.pop('MARATHON_MATCH') - [mm] = mm.values() # zoom into historu - - srm = j.pop('SRM') - [srm] = srm.values() # zoom into history - - assert len(j) == 0, j - - for c in mm + srm: + d = json.loads(p.read_text()) + + # TODO manager should be a context manager? + m = Manager() + + h = m.helper(d) + h.pop_if_primitive('version', 'id') + + h = h.zoom('result') + h.check('success', True) + h.check('status', 200) + h.pop_if_primitive('metadata') + + h = h.zoom('content') + h.pop_if_primitive('handle', 'handleLower', 'userId', 'createdAt', 'updatedAt', 'createdBy', 'updatedBy') + + # NOTE at the moment it's empty for me, but it will result in an error later if there is some data here + h.zoom('DEVELOP').zoom('subTracks') + + h = h.zoom('DATA_SCIENCE') + # TODO multi zoom? not sure which axis, e.g. 
+ # zoom('SRM', 'history') or zoom('SRM', 'MARATHON_MATCH') + # or zoom(('SRM', 'history'), ('MARATHON_MATCH', 'history')) + srms = h.zoom('SRM').zoom('history') + mms = h.zoom('MARATHON_MATCH').zoom('history') + + for c in srms.item + mms.item: + # NOTE: so here we are actually just using pure dicts in .make method + # this is kinda ok since it will be checked by parent Helper + # but also expects cooperation from .make method (e.g. popping items from the dict) + # could also wrap in helper and pass to .make .. not sure + # an argument could be made that .make isn't really a class method.. + # it's pretty specific to this parser only yield from Competition.make(j=c) + yield from m.check() + def data() -> Iterator[Res[Competition]]: *_, last = inputs()