diff --git a/.pre-commit-hooks.yaml b/.pre-commit-hooks.yaml index 3c82fc9..964de5d 100644 --- a/.pre-commit-hooks.yaml +++ b/.pre-commit-hooks.yaml @@ -2,10 +2,10 @@ name: check LEGEND channel maps format entry: validate-legend-chmaps language: python - types: [json] + types: [yaml] - id: validate-legend-detdb name: check LEGEND detector database format entry: validate-legend-detdb language: python - types: [json] + types: [yaml] diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index f937009..08a5ed4 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -38,7 +38,7 @@ Let's consider the following database: │   └── file1.json ├── file2.json ├── file3.yaml - └── validity.jsonl + └── validity.yaml With: @@ -80,20 +80,47 @@ Metadata validity ----------------- Mappings of metadata to time periods, data taking systems etc. are specified -through JSONL files (`specification +through YAML files (`specification `_). -If a ``.jsonl`` file is present in a directory, ``TextDB`` +If a ``validity.yaml`` file is present in a directory, ``TextDB`` exposes the :meth:`~.textdb.textdb.on` interface to perform a query. Let's assume the ``legend-metadata`` directory from the example above contains the following file: -.. code-block:: +.. 
code-block:: yaml :linenos: - :caption: ``validity.jsonl`` - - {"valid_from": "20220628T000000Z", "select": "all", "apply": ["file2.json"]} - {"valid_from": "20220629T000000Z", "select": "all", "apply": ["file3.yaml"]} + :caption: ``validity.yaml`` + + - valid_from: 20230101T000000Z + category: all + apply: + - file3.yaml + + - valid_from: 20230102T000000Z + category: all + mode: append + apply: + - file2.yaml + + - valid_from: 20230103T000000Z + category: all + mode: remove + apply: + - file2.yaml + + - valid_from: 20230104T000000Z + category: all + mode: reset + apply: + - file2.yaml + + - valid_from: 20230105T000000Z + category: all + mode: replace + apply: + - file2.yaml + - file3.yaml From code, it's possible to obtain the metadata valid for a certain time point: diff --git a/src/legendmeta/catalog.py b/src/legendmeta/catalog.py index 6299310..fa610fa 100644 --- a/src/legendmeta/catalog.py +++ b/src/legendmeta/catalog.py @@ -17,13 +17,14 @@ import bisect import collections import copy -import json import types from collections import namedtuple from datetime import datetime from pathlib import Path from string import Template +import yaml + from . 
import utils @@ -33,6 +34,7 @@ def to_datetime(value): def unix_time(value): + """Convert a LEGEND timestamp or datetime object to Unix time value""" if isinstance(value, str): return datetime.timestamp(datetime.strptime(value, "%Y%m%dT%H%M%SZ")) @@ -44,6 +46,8 @@ def unix_time(value): class PropsStream: + """Simple class to control loading of validity.yaml files""" + @staticmethod def get(value): if isinstance(value, str): @@ -57,13 +61,14 @@ def get(value): @staticmethod def read_from(file_name): - with Path(file_name).open() as file: - for json_str in file: - yield json.loads(json_str) + with Path(file_name).open() as r: + file = yaml.safe_load(r) + file = sorted(file, key=lambda item: unix_time(item["valid_from"])) + yield from file class Catalog(namedtuple("Catalog", ["entries"])): - """Implementation of the `JSONL metadata validity specification `_.""" + """Implementation of the `YAML metadata validity specification `_.""" __slots__ = () @@ -83,15 +88,40 @@ def get(value): @staticmethod def read_from(file_name): + """Read from a validity YAML file and build a Catalog object""" entries = {} - for props in PropsStream.get(file_name): timestamp = props["valid_from"] system = "all" if props.get("category") is None else props["category"] file_key = props["apply"] if system not in entries: entries[system] = [] - entries[system].append(Catalog.Entry(unix_time(timestamp), file_key)) + mode = "append" if props.get("mode") is None else props["mode"] + mode = "reset" if len(entries[system]) == 0 else mode + if mode == "reset": + new = file_key + elif mode == "append": + new = entries[system][-1].file.copy() + file_key + elif mode == "remove": + new = entries[system][-1].file.copy() + for file in file_key: + new.remove(file) + elif mode == "replace": + new = entries[system][-1].file.copy() + if len(file_key) != 2: + msg = f"Invalid number of elements in replace mode: {len(file_key)}" + raise ValueError(msg) + new.remove(file_key[0]) + new += [file_key[1]] + + else: + msg 
= f"Unknown mode for {timestamp}" + raise ValueError(msg) + + if unix_time(timestamp) in [entry.valid_from for entry in entries[system]]: + msg = f"Duplicate timestamp: {timestamp}, use reset mode instead with a single entry" + raise ValueError(msg) + entries[system].append(Catalog.Entry(unix_time(timestamp), new)) for system in entries: entries[system] = sorted( @@ -100,6 +130,7 @@ def read_from(file_name): return Catalog(entries) def valid_for(self, timestamp, system="all", allow_none=False): + """Get the valid entries for a given timestamp and system""" if system in self.entries: valid_from = [entry.valid_from for entry in self.entries[system]] pos = bisect.bisect_right(valid_from, unix_time(timestamp)) @@ -126,11 +157,14 @@ def valid_for(self, timestamp, system="all", allow_none=False): @staticmethod def get_files(catalog_file, timestamp, category="all"): + """Helper function to get the files for a given timestamp and category""" catalog = Catalog.read_from(catalog_file) return Catalog.valid_for(catalog, timestamp, category) class Props: + """Class to handle overwriting of dictionaries in cascade order""" + @staticmethod def read_from(sources, subst_pathvar=False, trim_null=False): def read_impl(sources): diff --git a/src/legendmeta/police.py b/src/legendmeta/police.py index 57b10a0..9becbc6 100644 --- a/src/legendmeta/police.py +++ b/src/legendmeta/police.py @@ -16,12 +16,13 @@ from __future__ import annotations import argparse -import json import re import sys from importlib import resources from pathlib import Path +import yaml + from . 
import utils from .textdb import TextDB @@ -96,10 +97,11 @@ def validate_legend_channel_map() -> bool: db = TextDB(d) valid = True - with Path(f"{d}/validity.jsonl").open() as f: - for line in f.readlines(): - ts = json.loads(line)["valid_from"] - sy = json.loads(line)["select"] + with Path(f"{d}/validity.yaml").open() as f: + validity = yaml.safe_load(f) + for line in validity: + ts = line["valid_from"] + sy = line.get("category") or "all" chmap = db.on(ts, system=sy) for k, v in chmap.items(): diff --git a/src/legendmeta/textdb.py b/src/legendmeta/textdb.py index e68a6d6..bc2b20b 100644 --- a/src/legendmeta/textdb.py +++ b/src/legendmeta/textdb.py @@ -381,11 +381,11 @@ def on( ) -> AttrsDict | list: """Query database in `time[, file pattern, system]`. - A (only one) valid ``validity.jsonl`` file must exist in the directory + A (only one) valid ``validity.yaml`` file must exist in the directory to specify a validity mapping. This functionality relies on the :class:`.catalog.Catalog` class. - The JSONL specification is documented at `this link + The YAML specification is documented at `this link `_. The special ``$_`` string is expanded to the directory containing the @@ -401,12 +401,15 @@ def on( system: 'all', 'phy', 'cal', 'lar', ... query only a data taking "system". 
""" - jsonl = self.__path__ / "validity.jsonl" - if not jsonl.is_file(): - msg = f"no validity.jsonl file found in {self.__path__!s}" + for ext in utils.__file_extensions__["yaml"]: + yml = self.__path__ / f"validity{ext}" + if yml.is_file(): + break + if not yml.is_file(): + msg = f"no validity.yaml / validity.yml file found in {self.__path__!s}" raise RuntimeError(msg) - file_list = Catalog.get_files(str(jsonl), timestamp, system) + file_list = Catalog.get_files(str(yml), timestamp, system) # select only files matching pattern if specified if pattern is not None: c = re.compile(pattern) diff --git a/tests/test_jsondb.py b/tests/test_jsondb.py index 7b5ef9e..484e007 100644 --- a/tests/test_jsondb.py +++ b/tests/test_jsondb.py @@ -19,7 +19,7 @@ def test_props(): # test subst_vars Props.subst_vars(test_dict, var_values={"_": str(Path(__file__).parent / "testdb")}) assert test_dict["filepath"] == str( - Path(__file__).parent / "testdb/dir1/file3.json" + Path(__file__).parent / "testdb/dir1/file3.yaml" ) test_dict2 = Props.read_from(str(Path(__file__).parent / "testdb/file3.json")) @@ -43,7 +43,7 @@ def test_props(): ) assert test_dict["data"] == 3 assert test_dict["filepath"] == str( - Path(__file__).parent / "testdb/dir1/file3.json" + Path(__file__).parent / "testdb/dir1/file3.yaml" ) with pytest.raises(KeyError): test_dict["null_key"] @@ -55,12 +55,12 @@ def test_access(): assert isinstance(jdb["file2.yaml"], AttrsDict) assert isinstance(jdb["file1"], AttrsDict) assert isinstance(jdb["dir1"], TextDB) - assert isinstance(jdb["dir1"]["file3.json"], AttrsDict) + assert isinstance(jdb["dir1"]["file3.yaml"], AttrsDict) assert isinstance(jdb["dir1"]["file3"], AttrsDict) - assert isinstance(jdb["dir1/file3.json"], AttrsDict) + assert isinstance(jdb["dir1/file3.yaml"], AttrsDict) assert isinstance(jdb["dir1"]["dir2"], TextDB) - assert isinstance(jdb["dir1"]["dir2"]["file4.json"], AttrsDict) - assert isinstance(jdb["dir1/dir2/file4.json"], AttrsDict) + assert 
isinstance(jdb["dir1"]["dir2"]["file4.yaml"], AttrsDict) + assert isinstance(jdb["dir1/dir2/file4.yaml"], AttrsDict) assert jdb["file1.json"]["data"] == 1 assert isinstance(jdb["file1"]["group"], AttrsDict) @@ -82,7 +82,7 @@ def test_access(): assert jdb.arrays[1].array[0] == 1 assert jdb.arrays[1].array[1].data == 2 - assert jdb.file2.filepath == str(Path(__file__).parent / "testdb/dir1/file3.json") + assert jdb.file2.filepath == str(Path(__file__).parent / "testdb/dir1/file3.yaml") with pytest.raises(ValueError): TextDB("non-existent-db") @@ -98,7 +98,7 @@ def test_access(): def test_keys(): jdb = TextDB(testdb, lazy=False) assert sorted(jdb.keys()) == ["arrays", "dir1", "dir2", "file1", "file2", "file3"] - assert sorted(jdb.dir1.keys()) == ["dir2", "file3", "file5"] + assert sorted(jdb.dir1.keys()) == ["dir2", "file3", "file5", "file6", "validity"] assert "arrays" in jdb @@ -162,28 +162,33 @@ def test_scan(): def test_time_validity(): jdb = TextDB(testdb) - assert isinstance(jdb["dir1"].on("20220628T221955Z"), AttrsDict) + assert isinstance(jdb["dir1"].on("20230101T000001Z"), AttrsDict) - assert jdb["dir1"].on("20220628T221955Z")["data"] == 1 - assert jdb.dir1.on("20220629T221955Z").data == 2 + assert jdb["dir1"].on("20230101T000000Z")["data"] == 1 + assert jdb.dir1.on("20230102T000000Z").data == 2 # time point in between - assert jdb["dir1"].on("20220628T233500Z")["data"] == 1 + assert jdb["dir1"].on("20230101T120000Z")["data"] == 1 # time point after - assert jdb["dir1"].on("20220630T233500Z")["data"] == 2 + assert jdb["dir1"].on("20230102T120000Z")["data"] == 2 # time point before with pytest.raises(RuntimeError): - jdb["dir1"].on("20220627T233500Z")["data"] - - # directory with no .jsonl + jdb["dir1"].on("20210101T000000Z")["data"] + # test remove functionality + assert jdb["dir1"].on("20230103T120000Z")["data"] == 1 + # test reset functionality + assert jdb["dir1"].on("20230104T120000Z")["data"] == 3 + # test replace functionality + assert 
jdb["dir1"].on("20230105T120000Z")["data"] == 1 + # directory with no .yml with pytest.raises(RuntimeError): - jdb["dir1"]["dir2"].on("20220627T233500Z") + jdb["dir1"]["dir2"].on("20230101T000001Z") # invalid timestamp with pytest.raises(ValueError): - jdb.dir1.on("20220627T2335002Z") + jdb.dir1.on("20230627T2335002Z") # test usage of datetime object - tstamp = datetime(2022, 6, 28, 23, 35, 00, tzinfo=timezone.utc) + tstamp = datetime(2023, 6, 28, 23, 35, 00, tzinfo=timezone.utc) assert jdb.dir1.on(tstamp).data == 1 assert jdb.dir1.on(tstamp, r"^file3.*", "all").data == 1 @@ -241,7 +246,15 @@ def test_merging(): jdb = TextDB(testdb, lazy=False) j = jdb.dir1 | jdb.dir2 assert isinstance(j, AttrsDict) - assert sorted(j.keys()) == ["dir2", "file3", "file5", "file7", "file8"] + assert sorted(j.keys()) == [ + "dir2", + "file3", + "file5", + "file6", + "file7", + "file8", + "validity", + ] assert hasattr(j, "dir2") assert hasattr(j, "file8") diff --git a/tests/testdb/dir1/file3.json b/tests/testdb/dir1/file3.json deleted file mode 100644 index 858a13c..0000000 --- a/tests/testdb/dir1/file3.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "data": 1 -} diff --git a/tests/testdb/dir1/file3.yaml b/tests/testdb/dir1/file3.yaml new file mode 100644 index 0000000..937ac32 --- /dev/null +++ b/tests/testdb/dir1/file3.yaml @@ -0,0 +1 @@ +data: 1 diff --git a/tests/testdb/dir1/file5.json b/tests/testdb/dir1/file5.json deleted file mode 100644 index 087e30f..0000000 --- a/tests/testdb/dir1/file5.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "data": 2 -} diff --git a/tests/testdb/dir1/file5.yaml b/tests/testdb/dir1/file5.yaml new file mode 100644 index 0000000..4182ac4 --- /dev/null +++ b/tests/testdb/dir1/file5.yaml @@ -0,0 +1 @@ +data: 2 diff --git a/tests/testdb/dir1/file6.yaml b/tests/testdb/dir1/file6.yaml new file mode 100644 index 0000000..ebf3e8d --- /dev/null +++ b/tests/testdb/dir1/file6.yaml @@ -0,0 +1 @@ +data: 3 diff --git a/tests/testdb/dir1/validity.jsonl 
b/tests/testdb/dir1/validity.jsonl deleted file mode 100644 index 30c101f..0000000 --- a/tests/testdb/dir1/validity.jsonl +++ /dev/null @@ -1,2 +0,0 @@ -{"valid_from":"20220628T221955Z","select":"all","apply":["file3.json"]} -{"valid_from":"20220629T221955Z","select":"all","apply":["file5.json"]} diff --git a/tests/testdb/dir1/validity.yaml b/tests/testdb/dir1/validity.yaml new file mode 100644 index 0000000..3604dc7 --- /dev/null +++ b/tests/testdb/dir1/validity.yaml @@ -0,0 +1,29 @@ +- valid_from: 20230101T000000Z + category: all + apply: + - file3.yaml + +- valid_from: 20230102T000000Z + category: all + mode: append + apply: + - file5.yaml + +- valid_from: 20230103T000000Z + category: all + mode: remove + apply: + - file5.yaml + +- valid_from: 20230104T000000Z + category: all + mode: reset + apply: + - file6.yaml + +- valid_from: 20230105T000000Z + category: all + mode: replace + apply: + - file6.yaml + - file3.yaml diff --git a/tests/testdb/file2.yaml b/tests/testdb/file2.yaml index 70e68b8..e7c6d8a 100644 --- a/tests/testdb/file2.yaml +++ b/tests/testdb/file2.yaml @@ -1,5 +1,5 @@ data: 2 -filepath: $_/dir1/file3.json +filepath: $_/dir1/file3.yaml key: label: 3 label: 3