From 13c7b241efe9550f83ea46cebe175325890a6ff7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Lindstr=C3=B6m?= Date: Wed, 27 Sep 2023 17:53:59 +0200 Subject: [PATCH 1/4] Add min- and max-created filters to lddb_json_shape.py --- librisxl-tools/scripts/lddb_json_shape.py | 46 +++++++++++++++++------ 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/librisxl-tools/scripts/lddb_json_shape.py b/librisxl-tools/scripts/lddb_json_shape.py index 78ceeefa86..f809f02452 100644 --- a/librisxl-tools/scripts/lddb_json_shape.py +++ b/librisxl-tools/scripts/lddb_json_shape.py @@ -1,3 +1,4 @@ +from __future__ import annotations import json import os @@ -69,32 +70,45 @@ def count_value(k, v, shape): if __name__ == '__main__': + from datetime import datetime + from pathlib import Path from time import time + import argparse import sys - from pathlib import Path - args = sys.argv[:] - cmd = args.pop(0) - if not args: - print(f'USAGE: {cmd} OUT_DIR', file=sys.stderr) - sys.exit(1) + argp = argparse.ArgumentParser() + argp.add_argument('-d', '--debug', action='store_true', default=False) + argp.add_argument('-c', '--min-created') # inclusive + argp.add_argument('-C', '--max-created') # exclusive + argp.add_argument('outdir', metavar='OUT_DIR') + + args = argp.parse_args() - outpath = Path(args.pop(0)) SUFFIX = '.json' + + outpath: Path|None = Path(args.outdir) + assert outpath + if outpath.suffix == SUFFIX: outdir = outpath.parent else: outdir = outpath outpath = None + if not outdir.is_dir(): outdir.mkdir(parents=True, exist_ok=True) - index = {} - work_by_type_index = {} - instance_index = {} - work_index = {} + # NOTE: fromisoformat with timezones requires Python 3.11 + min_inc_created: datetime|None = datetime.fromisoformat(args.min_created) if args.min_created else None + max_ex_created: datetime|None = datetime.fromisoformat(args.max_created) if args.max_created else None + print(min_inc_created, max_ex_created, file=sys.stderr) + + index: dict = {} + work_by_type_index: dict = {} + instance_index: dict = {} + work_index: dict = {} - t_last = 0 + t_last = 0.0 cr = '\r' for i, l in enumerate(sys.stdin): if not l.rstrip(): @@ -109,6 +123,14 @@ def count_value(k, v, shape): try: data = json.loads(l) + + if '@graph' in data: + created = datetime.fromisoformat(data['@graph'][0]['created']) + if min_inc_created and created < min_inc_created: + continue + if max_ex_created and created >= max_ex_created: + continue + thing, work = reshape(data) compute_shape(thing, index) if work: From faf13dbb0e7da8f40867643f70c14604f603fed8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Lindstr=C3=B6m?= Date: Mon, 2 Oct 2023 15:02:57 +0200 Subject: [PATCH 2/4] Handle bad or missing created dates --- librisxl-tools/scripts/lddb_json_shape.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/librisxl-tools/scripts/lddb_json_shape.py b/librisxl-tools/scripts/lddb_json_shape.py index f809f02452..789ff653e0 100644 --- a/librisxl-tools/scripts/lddb_json_shape.py +++ b/librisxl-tools/scripts/lddb_json_shape.py @@ -101,7 +101,10 @@ def count_value(k, v, shape): # NOTE: fromisoformat with timezones requires Python 3.11 min_inc_created: datetime|None = datetime.fromisoformat(args.min_created) if args.min_created else None max_ex_created: datetime|None = datetime.fromisoformat(args.max_created) if args.max_created else None - print(min_inc_created, max_ex_created, file=sys.stderr) + if min_inc_created: + print(f"Filter - min created (inclusive): {min_inc_created}", file=sys.stderr) + if max_ex_created: + print(f"Filter - max created (exclusive): {max_ex_created}", file=sys.stderr) index: dict = {} work_by_type_index: dict = {} @@ -125,11 +128,14 @@ def count_value(k, v, shape): data = json.loads(l) if '@graph' in data: - created = datetime.fromisoformat(data['@graph'][0]['created']) - if min_inc_created and created < min_inc_created: - continue - if max_ex_created and created >= max_ex_created: - continue + try: + created = datetime.fromisoformat(data['@graph'][0]['created']) + if min_inc_created and created < min_inc_created: + continue + if max_ex_created and created >= max_ex_created: + continue + except (KeyError, ValueError): + pass thing, work = reshape(data) compute_shape(thing, index) @@ -139,7 +145,7 @@ def count_value(k, v, shape): compute_shape(work, work_index, type_key='Work') except (ValueError, AttributeError) as e: - print(f'ERROR at: {i} in data:', file=sys.stderr) + print(f'ERROR at: {i + 1} in data:', file=sys.stderr) print(l, file=sys.stderr) print(e, file=sys.stderr) From 6045f1f9375ebfd3fd5ea1bbe6d4155dda82423b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Lindstr=C3=B6m?= Date: Mon, 2 Oct 2023 17:48:05 +0200 Subject: [PATCH 3/4] Strip TZ info if fromisoformat fails --- librisxl-tools/scripts/lddb_json_shape.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/librisxl-tools/scripts/lddb_json_shape.py b/librisxl-tools/scripts/lddb_json_shape.py index 789ff653e0..61d6e6d996 100644 --- a/librisxl-tools/scripts/lddb_json_shape.py +++ b/librisxl-tools/scripts/lddb_json_shape.py @@ -1,6 +1,8 @@ from __future__ import annotations +from datetime import datetime import json import os +import re MAX_STATS = int(os.environ.get('MAX_STATS', '512')) @@ -69,8 +71,16 @@ def count_value(k, v, shape): shape[k] = stats + 1 +def isodatetime(s): + # NOTE: fromisoformat with timezones requires Python 3.11+ + try: + return datetime.fromisoformat(s) + except ValueError: + # Strip TZ info: + return datetime.fromisoformat(re.sub(r'(\+[0-9:]+|[A-Z]+)$', '', s)) + + if __name__ == '__main__': - from datetime import datetime from pathlib import Path from time import time import argparse @@ -98,9 +108,8 @@ def count_value(k, v, shape): if not outdir.is_dir(): outdir.mkdir(parents=True, exist_ok=True) - # NOTE: fromisoformat with timezones requires Python 3.11 - min_inc_created: datetime|None = datetime.fromisoformat(args.min_created) if args.min_created else None - max_ex_created: datetime|None = datetime.fromisoformat(args.max_created) if args.max_created else None + min_inc_created: datetime|None = isodatetime(args.min_created) if args.min_created else None + max_ex_created: datetime|None = isodatetime(args.max_created) if args.max_created else None if min_inc_created: print(f"Filter - min created (inclusive): {min_inc_created}", file=sys.stderr) if max_ex_created: @@ -129,7 +138,7 @@ def count_value(k, v, shape): if '@graph' in data: try: - created = datetime.fromisoformat(data['@graph'][0]['created']) + created = isodatetime(data['@graph'][0]['created']) if min_inc_created and created < min_inc_created: continue if max_ex_created and created >= max_ex_created: From 61d5464b5102667b439fd700232802b57884ddbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Lindstr=C3=B6m?= Date: Mon, 2 Oct 2023 18:00:04 +0200 Subject: [PATCH 4/4] Improve datetime usage --- librisxl-tools/scripts/lddb_json_shape.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/librisxl-tools/scripts/lddb_json_shape.py b/librisxl-tools/scripts/lddb_json_shape.py index 61d6e6d996..6508e5e8cf 100644 --- a/librisxl-tools/scripts/lddb_json_shape.py +++ b/librisxl-tools/scripts/lddb_json_shape.py @@ -2,7 +2,6 @@ from datetime import datetime import json import os -import re MAX_STATS = int(os.environ.get('MAX_STATS', '512')) @@ -72,12 +71,10 @@ def count_value(k, v, shape): def isodatetime(s): - # NOTE: fromisoformat with timezones requires Python 3.11+ - try: - return datetime.fromisoformat(s) - except ValueError: - # Strip TZ info: - return datetime.fromisoformat(re.sub(r'(\+[0-9:]+|[A-Z]+)$', '', s)) + # NOTE: fromisoformat with zulu time requires Python 3.11+ + if s.endswith('Z'): + s = s[:-1] + '+00:00' + return datetime.fromisoformat(s) if __name__ == '__main__': @@ -108,8 +105,8 @@ def isodatetime(s): if not outdir.is_dir(): outdir.mkdir(parents=True, exist_ok=True) - min_inc_created: datetime|None = isodatetime(args.min_created) if args.min_created else None - max_ex_created: datetime|None = isodatetime(args.max_created) if args.max_created else None + min_inc_created: datetime | None = isodatetime(args.min_created) if args.min_created else None + max_ex_created: datetime | None = isodatetime(args.max_created) if args.max_created else None if min_inc_created: print(f"Filter - min created (inclusive): {min_inc_created}", file=sys.stderr) if max_ex_created: @@ -136,7 +133,7 @@ def isodatetime(s): try: data = json.loads(l) - if '@graph' in data: + if (min_inc_created or max_ex_created) and '@graph' in data: try: created = isodatetime(data['@graph'][0]['created']) if min_inc_created and created < min_inc_created: