Skip to content

Commit

Permalink
Add min- and max-created filters to lddb_json_shape.py
Browse files Browse the repository at this point in the history
  • Loading branch information
niklasl committed Sep 27, 2023
1 parent cfb00ac commit 13c7b24
Showing 1 changed file with 34 additions and 12 deletions.
46 changes: 34 additions & 12 deletions librisxl-tools/scripts/lddb_json_shape.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from __future__ import annotations
import json
import os

Expand Down Expand Up @@ -69,32 +70,45 @@ def count_value(k, v, shape):


if __name__ == '__main__':
from datetime import datetime
from pathlib import Path
from time import time
import argparse
import sys
from pathlib import Path

args = sys.argv[:]
cmd = args.pop(0)
if not args:
print(f'USAGE: {cmd} OUT_DIR', file=sys.stderr)
sys.exit(1)
argp = argparse.ArgumentParser()
argp.add_argument('-d', '--debug', action='store_true', default=False)
argp.add_argument('-c', '--min-created') # inclusive
argp.add_argument('-C', '--max-created') # exclusive
argp.add_argument('outdir', metavar='OUT_DIR')

args = argp.parse_args()

outpath = Path(args.pop(0))
SUFFIX = '.json'

outpath: Path|None = Path(args.outdir)
assert outpath

if outpath.suffix == SUFFIX:
outdir = outpath.parent
else:
outdir = outpath
outpath = None

if not outdir.is_dir():
outdir.mkdir(parents=True, exist_ok=True)

index = {}
work_by_type_index = {}
instance_index = {}
work_index = {}
# NOTE: before Python 3.11, fromisoformat rejects a 'Z' suffix; only +HH:MM-style UTC offsets are parsed
min_inc_created: datetime|None = datetime.fromisoformat(args.min_created) if args.min_created else None
max_ex_created: datetime|None = datetime.fromisoformat(args.max_created) if args.max_created else None
print(min_inc_created, max_ex_created, file=sys.stderr)

index: dict = {}
work_by_type_index: dict = {}
instance_index: dict = {}
work_index: dict = {}

t_last = 0
t_last = 0.0
cr = '\r'
for i, l in enumerate(sys.stdin):
if not l.rstrip():
Expand All @@ -109,6 +123,14 @@ def count_value(k, v, shape):

try:
data = json.loads(l)

if '@graph' in data:
created = datetime.fromisoformat(data['@graph'][0]['created'])
if min_inc_created and created < min_inc_created:
continue
if max_ex_created and created >= max_ex_created:
continue

thing, work = reshape(data)
compute_shape(thing, index)
if work:
Expand Down

0 comments on commit 13c7b24

Please sign in to comment.