Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add min- and max-created filters to lddb_json_shape.py #1312

Merged
merged 4 commits into from
Oct 2, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 47 additions & 13 deletions librisxl-tools/scripts/lddb_json_shape.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations
from datetime import datetime
import json
import os

Expand Down Expand Up @@ -68,33 +70,54 @@ def count_value(k, v, shape):
shape[k] = stats + 1


def isodatetime(s):
    """Parse an ISO 8601 timestamp string into a timezone-aware datetime.

    A trailing ``Z`` (UTC designator) is rewritten to ``+00:00`` before
    parsing, since ``datetime.fromisoformat`` only accepts ``Z`` natively
    on Python 3.11+.

    Timestamps that carry no UTC offset at all (e.g. ``2023-10-02``) are
    interpreted as UTC. This guarantees that every value returned here is
    timezone-aware, so results can always be compared with each other;
    comparing a naive datetime with an aware one raises ``TypeError``.

    Raises ``ValueError`` if ``s`` is not a timestamp that
    ``datetime.fromisoformat`` can parse.
    """
    # Local import: only `datetime` itself is imported at file level.
    from datetime import timezone

    # NOTE: fromisoformat with zulu time requires Python 3.11+
    if s.endswith('Z'):
        s = s[:-1] + '+00:00'

    parsed = datetime.fromisoformat(s)
    if parsed.tzinfo is None:
        # No offset given: assume UTC rather than returning a naive value.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


if __name__ == '__main__':
from pathlib import Path
from time import time
import argparse
import sys
from pathlib import Path

args = sys.argv[:]
cmd = args.pop(0)
if not args:
print(f'USAGE: {cmd} OUT_DIR', file=sys.stderr)
sys.exit(1)
argp = argparse.ArgumentParser()
argp.add_argument('-d', '--debug', action='store_true', default=False)
argp.add_argument('-c', '--min-created') # inclusive
argp.add_argument('-C', '--max-created') # exclusive
argp.add_argument('outdir', metavar='OUT_DIR')

args = argp.parse_args()

outpath = Path(args.pop(0))
SUFFIX = '.json'

outpath: Path|None = Path(args.outdir)
assert outpath

if outpath.suffix == SUFFIX:
outdir = outpath.parent
else:
outdir = outpath
outpath = None

if not outdir.is_dir():
outdir.mkdir(parents=True, exist_ok=True)

index = {}
work_by_type_index = {}
instance_index = {}
work_index = {}
min_inc_created: datetime | None = isodatetime(args.min_created) if args.min_created else None
max_ex_created: datetime | None = isodatetime(args.max_created) if args.max_created else None
if min_inc_created:
print(f"Filter - min created (inclusive): {min_inc_created}", file=sys.stderr)
if max_ex_created:
print(f"Filter - max created (exclusive): {max_ex_created}", file=sys.stderr)

index: dict = {}
work_by_type_index: dict = {}
instance_index: dict = {}
work_index: dict = {}

t_last = 0
t_last = 0.0
cr = '\r'
for i, l in enumerate(sys.stdin):
if not l.rstrip():
Expand All @@ -109,6 +132,17 @@ def count_value(k, v, shape):

try:
data = json.loads(l)

if (min_inc_created or max_ex_created) and '@graph' in data:
try:
created = isodatetime(data['@graph'][0]['created'])
if min_inc_created and created < min_inc_created:
continue
if max_ex_created and created >= max_ex_created:
continue
except (KeyError, ValueError):
pass

thing, work = reshape(data)
compute_shape(thing, index)
if work:
Expand All @@ -117,7 +151,7 @@ def count_value(k, v, shape):
compute_shape(work, work_index, type_key='Work')

except (ValueError, AttributeError) as e:
print(f'ERROR at: {i} in data:', file=sys.stderr)
print(f'ERROR at: {i + 1} in data:', file=sys.stderr)
print(l, file=sys.stderr)
print(e, file=sys.stderr)

Expand Down