Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update validity management code to new format #67

Merged
merged 6 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .pre-commit-hooks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
name: check LEGEND channel maps format
entry: validate-legend-chmaps
language: python
types: [json]
types: [yaml]

- id: validate-legend-detdb
name: check LEGEND detector database format
entry: validate-legend-detdb
language: python
types: [json]
types: [yaml]
43 changes: 35 additions & 8 deletions docs/source/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ Let's consider the following database:
│   └── file1.json
├── file2.json
├── file3.yaml
└── validity.jsonl
└── validity.yaml

With:

Expand Down Expand Up @@ -80,20 +80,47 @@ Metadata validity
-----------------

Mappings of metadata to time periods, data taking systems etc. are specified
through JSONL files (`specification
through YAML files (`specification
<https://legend-exp.github.io/legend-data-format-specs/dev/metadata>`_).
If a ``.jsonl`` file is present in a directory, ``TextDB``
If a ``validity.yaml`` file is present in a directory, ``TextDB``
exposes the :meth:`~.textdb.textdb.on` interface to perform a query.

Let's assume the ``legend-metadata`` directory from the example above contains
the following file:

.. code-block::
.. code-block:: yaml
:linenos:
:caption: ``validity.jsonl``

{"valid_from": "20220628T000000Z", "select": "all", "apply": ["file2.json"]}
{"valid_from": "20220629T000000Z", "select": "all", "apply": ["file3.yaml"]}
:caption: ``validity.yaml``

- valid_from: 20230101T000000Z
category: all
apply:
- file3.yaml

- valid_from: 20230102T000000Z
category: all
mode: append
apply:
- file2.yaml

- valid_from: 20230103T000000Z
category: all
mode: remove
apply:
- file2.yaml

- valid_from: 20230104T000000Z
category: all
mode: reset
apply:
- file2.yaml

- valid_from: 20230105T000000Z
category: all
mode: replace
apply:
- file2.yaml
- file3.yaml

From code, it's possible to obtain the metadata valid for a certain time point:

Expand Down
48 changes: 41 additions & 7 deletions src/legendmeta/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@
import bisect
import collections
import copy
import json
import types
from collections import namedtuple
from datetime import datetime
from pathlib import Path
from string import Template

import yaml

from . import utils


Expand All @@ -33,6 +34,7 @@ def to_datetime(value):


def unix_time(value):
"""Convert a LEGEND timestamp or datetime object to Unix time value"""
if isinstance(value, str):
return datetime.timestamp(datetime.strptime(value, "%Y%m%dT%H%M%SZ"))

Expand All @@ -44,6 +46,8 @@ def unix_time(value):


class PropsStream:
"""Simple class to control loading of validity.yaml files"""

@staticmethod
def get(value):
if isinstance(value, str):
Expand All @@ -57,13 +61,14 @@ def get(value):

@staticmethod
def read_from(file_name):
with Path(file_name).open() as file:
for json_str in file:
yield json.loads(json_str)
with Path(file_name).open() as r:
file = yaml.safe_load(r)
file = sorted(file, key=lambda item: unix_time(item["valid_from"]))
yield from file


class Catalog(namedtuple("Catalog", ["entries"])):
"""Implementation of the `JSONL metadata validity specification <https://legend-exp.github.io/legend-data-format-specs/dev/metadata/#Specifying-metadata-validity-in-time-(and-system)>`_."""
"""Implementation of the `YAML metadata validity specification <https://legend-exp.github.io/legend-data-format-specs/dev/metadata/#Specifying-metadata-validity-in-time-(and-system)>`_."""

__slots__ = ()

Expand All @@ -83,15 +88,40 @@ def get(value):

@staticmethod
def read_from(file_name):
"""Read from a validity YAML file and build a Catalog object"""
entries = {}

for props in PropsStream.get(file_name):
timestamp = props["valid_from"]
system = "all" if props.get("category") is None else props["category"]
file_key = props["apply"]
if system not in entries:
entries[system] = []
entries[system].append(Catalog.Entry(unix_time(timestamp), file_key))
mode = "append" if props.get("mode") is None else props["mode"]
mode = "reset" if len(entries[system]) == 0 else mode
if mode == "reset":
new = file_key
elif mode == "append":
new = entries[system][-1].file.copy() + file_key
elif mode == "remove":
new = entries[system][-1].file.copy()
for file in file_key:
new.remove(file)
elif mode == "replace":
new = entries[system][-1].file.copy()
if len(file_key) != 2:
msg = f"Invalid number of elements in replace mode: {len(file_key)}"
raise ValueError(msg)
new.remove(file_key[0])
new += [file_key[1]]

else:
msg = f"Unknown mode for {timestamp}"
raise ValueError(msg)

if timestamp in [entry.valid_from for entry in entries[system]]:
msg = f"Duplicate timestamp: {timestamp}, use reset mode instead with a single entry"
raise ValueError(msg)
entries[system].append(Catalog.Entry(unix_time(timestamp), new))

for system in entries:
entries[system] = sorted(
Expand All @@ -100,6 +130,7 @@ def read_from(file_name):
return Catalog(entries)

def valid_for(self, timestamp, system="all", allow_none=False):
"""Get the valid entries for a given timestamp and system"""
if system in self.entries:
valid_from = [entry.valid_from for entry in self.entries[system]]
pos = bisect.bisect_right(valid_from, unix_time(timestamp))
Expand All @@ -126,11 +157,14 @@ def valid_for(self, timestamp, system="all", allow_none=False):

@staticmethod
def get_files(catalog_file, timestamp, category="all"):
"""Helper function to get the files for a given timestamp and category"""
catalog = Catalog.read_from(catalog_file)
return Catalog.valid_for(catalog, timestamp, category)


class Props:
"""Class to handle overwriting of dictionaries in cascade order"""

@staticmethod
def read_from(sources, subst_pathvar=False, trim_null=False):
def read_impl(sources):
Expand Down
12 changes: 7 additions & 5 deletions src/legendmeta/police.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@
from __future__ import annotations

import argparse
import json
import re
import sys
from importlib import resources
from pathlib import Path

import yaml

from . import utils
from .textdb import TextDB

Expand Down Expand Up @@ -96,10 +97,11 @@ def validate_legend_channel_map() -> bool:
db = TextDB(d)
valid = True

with Path(f"{d}/validity.jsonl").open() as f:
for line in f.readlines():
ts = json.loads(line)["valid_from"]
sy = json.loads(line)["select"]
with Path(f"{d}/validity.yaml").open() as f:
validity = yaml.safe_load(f)
for line in validity:
ts = line["valid_from"]
sy = line.get("category", "all")
chmap = db.on(ts, system=sy)

for k, v in chmap.items():
Expand Down
15 changes: 9 additions & 6 deletions src/legendmeta/textdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,11 +381,11 @@ def on(
) -> AttrsDict | list:
"""Query database in `time[, file pattern, system]`.

A (only one) valid ``validity.jsonl`` file must exist in the directory
A (only one) valid ``validity.yaml`` file must exist in the directory
to specify a validity mapping. This functionality relies on the
:class:`.catalog.Catalog` class.

The JSONL specification is documented at `this link
The YAML specification is documented at `this link
<https://legend-exp.github.io/legend-data-format-specs/dev/metadata/#Specifying-metadata-validity-in-time-(and-system)>`_.

The special ``$_`` string is expanded to the directory containing the
Expand All @@ -401,12 +401,15 @@ def on(
system: 'all', 'phy', 'cal', 'lar', ...
query only a data taking "system".
"""
jsonl = self.__path__ / "validity.jsonl"
if not jsonl.is_file():
msg = f"no validity.jsonl file found in {self.__path__!s}"
for ext in utils.__file_extensions__["yaml"]:
yml = self.__path__ / f"validity{ext}"
if yml.is_file():
break
if not yml.is_file():
msg = f"no validity.yaml / validity.yml file found in {self.__path__!s}"
raise RuntimeError(msg)

file_list = Catalog.get_files(str(jsonl), timestamp, system)
file_list = Catalog.get_files(str(yml), timestamp, system)
# select only files matching pattern if specified
if pattern is not None:
c = re.compile(pattern)
Expand Down
53 changes: 33 additions & 20 deletions tests/test_jsondb.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def test_props():
# test subst_vars
Props.subst_vars(test_dict, var_values={"_": str(Path(__file__).parent / "testdb")})
assert test_dict["filepath"] == str(
Path(__file__).parent / "testdb/dir1/file3.json"
Path(__file__).parent / "testdb/dir1/file3.yaml"
)

test_dict2 = Props.read_from(str(Path(__file__).parent / "testdb/file3.json"))
Expand All @@ -43,7 +43,7 @@ def test_props():
)
assert test_dict["data"] == 3
assert test_dict["filepath"] == str(
Path(__file__).parent / "testdb/dir1/file3.json"
Path(__file__).parent / "testdb/dir1/file3.yaml"
)
with pytest.raises(KeyError):
test_dict["null_key"]
Expand All @@ -55,12 +55,12 @@ def test_access():
assert isinstance(jdb["file2.yaml"], AttrsDict)
assert isinstance(jdb["file1"], AttrsDict)
assert isinstance(jdb["dir1"], TextDB)
assert isinstance(jdb["dir1"]["file3.json"], AttrsDict)
assert isinstance(jdb["dir1"]["file3.yaml"], AttrsDict)
assert isinstance(jdb["dir1"]["file3"], AttrsDict)
assert isinstance(jdb["dir1/file3.json"], AttrsDict)
assert isinstance(jdb["dir1/file3.yaml"], AttrsDict)
assert isinstance(jdb["dir1"]["dir2"], TextDB)
assert isinstance(jdb["dir1"]["dir2"]["file4.json"], AttrsDict)
assert isinstance(jdb["dir1/dir2/file4.json"], AttrsDict)
assert isinstance(jdb["dir1"]["dir2"]["file4.yaml"], AttrsDict)
assert isinstance(jdb["dir1/dir2/file4.yaml"], AttrsDict)
assert jdb["file1.json"]["data"] == 1
assert isinstance(jdb["file1"]["group"], AttrsDict)

Expand All @@ -82,7 +82,7 @@ def test_access():
assert jdb.arrays[1].array[0] == 1
assert jdb.arrays[1].array[1].data == 2

assert jdb.file2.filepath == str(Path(__file__).parent / "testdb/dir1/file3.json")
assert jdb.file2.filepath == str(Path(__file__).parent / "testdb/dir1/file3.yaml")

with pytest.raises(ValueError):
TextDB("non-existent-db")
Expand All @@ -98,7 +98,7 @@ def test_access():
def test_keys():
jdb = TextDB(testdb, lazy=False)
assert sorted(jdb.keys()) == ["arrays", "dir1", "dir2", "file1", "file2", "file3"]
assert sorted(jdb.dir1.keys()) == ["dir2", "file3", "file5"]
assert sorted(jdb.dir1.keys()) == ["dir2", "file3", "file5", "file6", "validity"]

assert "arrays" in jdb

Expand Down Expand Up @@ -162,28 +162,33 @@ def test_scan():

def test_time_validity():
jdb = TextDB(testdb)
assert isinstance(jdb["dir1"].on("20220628T221955Z"), AttrsDict)
assert isinstance(jdb["dir1"].on("20230101T000001Z"), AttrsDict)

assert jdb["dir1"].on("20220628T221955Z")["data"] == 1
assert jdb.dir1.on("20220629T221955Z").data == 2
assert jdb["dir1"].on("20230101T000000Z")["data"] == 1
assert jdb.dir1.on("20230102T000000Z").data == 2
# time point in between
assert jdb["dir1"].on("20220628T233500Z")["data"] == 1
assert jdb["dir1"].on("20230101T120000Z")["data"] == 1
# time point after
assert jdb["dir1"].on("20220630T233500Z")["data"] == 2
assert jdb["dir1"].on("20230102T120000Z")["data"] == 2
# time point before
with pytest.raises(RuntimeError):
jdb["dir1"].on("20220627T233500Z")["data"]

# directory with no .jsonl
jdb["dir1"].on("20210101T000000Z")["data"]
# test remove functionality
assert jdb["dir1"].on("20230103T120000Z")["data"] == 1
# test reset functionality
assert jdb["dir1"].on("20230104T120000Z")["data"] == 3
# test replace functionality
assert jdb["dir1"].on("20230105T120000Z")["data"] == 1
# directory with no .yml
with pytest.raises(RuntimeError):
jdb["dir1"]["dir2"].on("20220627T233500Z")
jdb["dir1"]["dir2"].on("20230101T000001Z")

# invalid timestamp
with pytest.raises(ValueError):
jdb.dir1.on("20220627T2335002Z")
jdb.dir1.on("20230627T2335002Z")

# test usage of datetime object
tstamp = datetime(2022, 6, 28, 23, 35, 00, tzinfo=timezone.utc)
tstamp = datetime(2023, 6, 28, 23, 35, 00, tzinfo=timezone.utc)
assert jdb.dir1.on(tstamp).data == 1
assert jdb.dir1.on(tstamp, r"^file3.*", "all").data == 1

Expand Down Expand Up @@ -241,7 +246,15 @@ def test_merging():
jdb = TextDB(testdb, lazy=False)
j = jdb.dir1 | jdb.dir2
assert isinstance(j, AttrsDict)
assert sorted(j.keys()) == ["dir2", "file3", "file5", "file7", "file8"]
assert sorted(j.keys()) == [
"dir2",
"file3",
"file5",
"file6",
"file7",
"file8",
"validity",
]
assert hasattr(j, "dir2")
assert hasattr(j, "file8")

Expand Down
3 changes: 0 additions & 3 deletions tests/testdb/dir1/file3.json

This file was deleted.

1 change: 1 addition & 0 deletions tests/testdb/dir1/file3.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data: 1
3 changes: 0 additions & 3 deletions tests/testdb/dir1/file5.json

This file was deleted.

1 change: 1 addition & 0 deletions tests/testdb/dir1/file5.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data: 2
1 change: 1 addition & 0 deletions tests/testdb/dir1/file6.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data: 3
2 changes: 0 additions & 2 deletions tests/testdb/dir1/validity.jsonl

This file was deleted.

Loading