Skip to content

Commit

Permalink
Merge pull request #1312 from libris/feature/update-shapes-script
Browse files Browse the repository at this point in the history
Add min- and max-created filters to lddb_json_shape.py
  • Loading branch information
niklasl authored Oct 2, 2023
2 parents 9e94ba0 + 61d5464 commit 0f72406
Showing 1 changed file with 47 additions and 13 deletions.
60 changes: 47 additions & 13 deletions librisxl-tools/scripts/lddb_json_shape.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations
from datetime import datetime
import json
import os

Expand Down Expand Up @@ -68,33 +70,54 @@ def count_value(k, v, shape):
shape[k] = stats + 1


def isodatetime(s: str) -> datetime:
    """Parse an ISO 8601 timestamp into a timezone-aware ``datetime``.

    A trailing ``Z`` (the UTC designator) is rewritten to ``+00:00`` before
    parsing, since ``datetime.fromisoformat`` only accepts it natively from
    Python 3.11.

    Input without a UTC offset (e.g. a plain ``2023-01-01``) is interpreted
    as UTC. This guarantees that any two values returned by this function
    can be compared with each other; Python raises ``TypeError`` when a
    naive datetime is compared with an aware one.

    Raises:
        ValueError: if ``s`` is not a valid ISO 8601 date or datetime.
    """
    # Local import: the module only imports ``datetime`` from ``datetime``.
    from datetime import timezone

    # NOTE: fromisoformat with zulu time requires Python 3.11+
    if s.endswith('Z'):
        s = s[:-1] + '+00:00'

    parsed = datetime.fromisoformat(s)
    if parsed.tzinfo is None:
        # No offset given: pin to UTC so the result is always aware.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


if __name__ == '__main__':
from pathlib import Path
from time import time
import argparse
import sys
from pathlib import Path

args = sys.argv[:]
cmd = args.pop(0)
if not args:
print(f'USAGE: {cmd} OUT_DIR', file=sys.stderr)
sys.exit(1)
argp = argparse.ArgumentParser()
argp.add_argument('-d', '--debug', action='store_true', default=False)
argp.add_argument('-c', '--min-created') # inclusive
argp.add_argument('-C', '--max-created') # exclusive
argp.add_argument('outdir', metavar='OUT_DIR')

args = argp.parse_args()

outpath = Path(args.pop(0))
SUFFIX = '.json'

outpath: Path|None = Path(args.outdir)
assert outpath

if outpath.suffix == SUFFIX:
outdir = outpath.parent
else:
outdir = outpath
outpath = None

if not outdir.is_dir():
outdir.mkdir(parents=True, exist_ok=True)

index = {}
work_by_type_index = {}
instance_index = {}
work_index = {}
min_inc_created: datetime | None = isodatetime(args.min_created) if args.min_created else None
max_ex_created: datetime | None = isodatetime(args.max_created) if args.max_created else None
if min_inc_created:
print(f"Filter - min created (inclusive): {min_inc_created}", file=sys.stderr)
if max_ex_created:
print(f"Filter - max created (exclusive): {max_ex_created}", file=sys.stderr)

index: dict = {}
work_by_type_index: dict = {}
instance_index: dict = {}
work_index: dict = {}

t_last = 0
t_last = 0.0
cr = '\r'
for i, l in enumerate(sys.stdin):
if not l.rstrip():
Expand All @@ -109,6 +132,17 @@ def count_value(k, v, shape):

try:
data = json.loads(l)

if (min_inc_created or max_ex_created) and '@graph' in data:
try:
created = isodatetime(data['@graph'][0]['created'])
if min_inc_created and created < min_inc_created:
continue
if max_ex_created and created >= max_ex_created:
continue
except (KeyError, ValueError):
pass

thing, work = reshape(data)
compute_shape(thing, index)
if work:
Expand All @@ -117,7 +151,7 @@ def count_value(k, v, shape):
compute_shape(work, work_index, type_key='Work')

except (ValueError, AttributeError) as e:
print(f'ERROR at: {i} in data:', file=sys.stderr)
print(f'ERROR at: {i + 1} in data:', file=sys.stderr)
print(l, file=sys.stderr)
print(e, file=sys.stderr)

Expand Down

0 comments on commit 0f72406

Please sign in to comment.