From 27e5d80ca18e6891d9288dc9158f7463d9e3c269 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Fri, 12 Jul 2024 17:03:22 -0700 Subject: [PATCH 1/6] Fix missing queryDataIds argument validation Match the DirectButler behavior in RemoteButler queryDataIds by throwing an ArgumentError if collections is specified without datasets. --- python/lsst/daf/butler/remote_butler/_registry.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/lsst/daf/butler/remote_butler/_registry.py b/python/lsst/daf/butler/remote_butler/_registry.py index 168665992c..f7dee118bd 100644 --- a/python/lsst/daf/butler/remote_butler/_registry.py +++ b/python/lsst/daf/butler/remote_butler/_registry.py @@ -57,6 +57,7 @@ Registry, RegistryDefaults, ) +from ..registry._exceptions import ArgumentError from ..registry.queries import ( ChainedDatasetQueryResults, DataCoordinateQueryResults, @@ -448,6 +449,9 @@ def queryDataIds( check: bool = True, **kwargs: Any, ) -> DataCoordinateQueryResults: + if collections is not None and datasets is None: + raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.") + dimensions = self.dimensions.conform(dimensions) args = self._convert_common_query_arguments( dataId=dataId, where=where, bind=bind, kwargs=kwargs, datasets=datasets, collections=collections From 533718564cac9670244d036725ee2ca18b07ec9a Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Tue, 30 Jul 2024 11:43:24 -0700 Subject: [PATCH 2/6] Tweak test for RemoteButler behavior The query shims for RemoteButler are not able to determine whether there are any errors until they go to resolve the query, so resolve the query in a test to allow this instance of MissingDatasetTypeError to be raised lazily. --- python/lsst/daf/butler/registry/tests/_registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lsst/daf/butler/registry/tests/_registry.py b/python/lsst/daf/butler/registry/tests/_registry.py index 25c9bdee53..0247de4f32 100644 --- a/python/lsst/daf/butler/registry/tests/_registry.py +++ b/python/lsst/daf/butler/registry/tests/_registry.py @@ -3008,7 +3008,7 @@ def testQueryResultSummaries(self): ] with self.assertRaises(MissingDatasetTypeError): # Dataset type name doesn't match any existing dataset types. - registry.queryDataIds(["detector"], datasets=["nonexistent"], collections=...) + list(registry.queryDataIds(["detector"], datasets=["nonexistent"], collections=...)) with self.assertRaises(MissingDatasetTypeError): # Dataset type name doesn't match any existing dataset types. registry.queryDimensionRecords("detector", datasets=["nonexistent"], collections=...).any() From 26248696d6724c14180b8ae3531ed285999401c2 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Tue, 30 Jul 2024 11:57:26 -0700 Subject: [PATCH 3/6] Provide better error message for missing dimension Matching the DirectButler behavior, return a more user-friendly error when an empty string is passed to order_by. --- python/lsst/daf/butler/queries/convert_args.py | 3 +++ python/lsst/daf/butler/registry/tests/_registry.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/python/lsst/daf/butler/queries/convert_args.py b/python/lsst/daf/butler/queries/convert_args.py index 8c06f6b4fd..16d6a432eb 100644 --- a/python/lsst/daf/butler/queries/convert_args.py +++ b/python/lsst/daf/butler/queries/convert_args.py @@ -35,6 +35,7 @@ from collections.abc import Mapping, Set from typing import Any +from .._exceptions import InvalidQueryError from ..dimensions import DataCoordinate, DataId, DimensionGroup from ._expression_strings import convert_expression_string_to_predicate from ._identifiers import IdentifierContext, interpret_identifier @@ -147,6 +148,8 @@ def convert_order_by_args( if arg.startswith("-"): reverse = True arg = arg[1:] + if len(arg) == 0: + raise InvalidQueryError("Empty dimension name in ORDER BY") arg = interpret_identifier(context, arg) if reverse: arg = Reversed(operand=arg) diff --git a/python/lsst/daf/butler/registry/tests/_registry.py b/python/lsst/daf/butler/registry/tests/_registry.py index 0247de4f32..d39930ee3b 100644 --- a/python/lsst/daf/butler/registry/tests/_registry.py +++ b/python/lsst/daf/butler/registry/tests/_registry.py @@ -3231,7 +3231,7 @@ def do_query(dimensions=("visit", "tract"), datasets=None, collections=None): # errors in a name for order_by in ("", "-"): - with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"): + with self.assertRaisesRegex((ValueError, InvalidQueryError), "Empty dimension name in ORDER BY"): list(do_query().order_by(order_by)) for order_by in ("undimension.name", "-undimension.name"): From 56cdc0c306cca01624bbc7e465d09d9457d2cd90 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Tue, 30 Jul 2024 14:24:46 -0700 Subject: [PATCH 4/6] Update query tests for new query system diagnostics Update unit tests related to interpreting identifiers in queries to account for differences between the old and new query systems. --- .../daf/butler/registry/tests/_registry.py | 63 ++++++++++++++----- 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/python/lsst/daf/butler/registry/tests/_registry.py b/python/lsst/daf/butler/registry/tests/_registry.py index d39930ee3b..4095a58034 100644 --- a/python/lsst/daf/butler/registry/tests/_registry.py +++ b/python/lsst/daf/butler/registry/tests/_registry.py @@ -3138,12 +3138,20 @@ def testQueryDataIdsExpressionError(self): registry = self.makeRegistry() self.loadData(registry, "base.yaml") bind = {"time": astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai")} - with self.assertRaisesRegex(LookupError, r"No dimension element with name 'foo' in 'foo\.bar'\."): - registry.queryDataIds(["detector"], where="foo.bar = 12") + # The diagnostics raised are slightly different between the old query + # system (ValueError, first error string) and the new query system + # (InvalidQueryError, second error string). with self.assertRaisesRegex( - LookupError, "Dimension element name cannot be inferred in this context." + (LookupError, InvalidQueryError), + r"(No dimension element with name 'foo' in 'foo\.bar'\.)|(Unrecognized identifier 'foo.bar')", ): - registry.queryDataIds(["detector"], where="timespan.end < time", bind=bind) + list(registry.queryDataIds(["detector"], where="foo.bar = 12")) + with self.assertRaisesRegex( + (LookupError, InvalidQueryError), + "(Dimension element name cannot be inferred in this context.)" + "|(Unrecognized identifier 'timespan')", + ): + list(registry.queryDataIds(["detector"], where="timespan.end < time", bind=bind)) def testQueryDataIdsOrderBy(self): """Test order_by and limit on result returned by queryDataIds().""" @@ -3229,42 +3237,67 @@ def do_query(dimensions=("visit", "tract"), datasets=None, collections=None): with query.materialize(): pass - # errors in a name + # Test exceptions for errors in a name. + # Many of these raise slightly different diagnostics in the old query + # system (ValueError, first error string) than the new query system + # (InvalidQueryError, second error string). for order_by in ("", "-"): with self.assertRaisesRegex((ValueError, InvalidQueryError), "Empty dimension name in ORDER BY"): list(do_query().order_by(order_by)) for order_by in ("undimension.name", "-undimension.name"): - with self.assertRaisesRegex(ValueError, "Unknown dimension element 'undimension'"): + with self.assertRaisesRegex( + (ValueError, InvalidQueryError), + "(Unknown dimension element 'undimension')|(Unrecognized identifier 'undimension.name')", + ): list(do_query().order_by(order_by)) for order_by in ("attract", "-attract"): - with self.assertRaisesRegex(ValueError, "Metadata 'attract' cannot be found in any dimension"): + with self.assertRaisesRegex( + (ValueError, InvalidQueryError), + "(Metadata 'attract' cannot be found in any dimension)|(Unrecognized identifier 'attract')", + ): list(do_query().order_by(order_by)) - with self.assertRaisesRegex(ValueError, "Metadata 'exposure_time' exists in more than one dimension"): + with self.assertRaisesRegex( + (ValueError, InvalidQueryError), + "(Metadata 'exposure_time' exists in more than one dimension)" + "|(Ambiguous identifier 'exposure_time' matches multiple fields)", + ): list(do_query(("exposure", "visit")).order_by("exposure_time")) with self.assertRaisesRegex( - ValueError, - r"Timespan exists in more than one dimension element \(day_obs, exposure, visit\); " - r"qualify timespan with specific dimension name\.", + (ValueError, InvalidQueryError), + r"(Timespan exists in more than one dimension element \(day_obs, exposure, visit\); " + r"qualify timespan with specific dimension name\.)|" + r"(Ambiguous identifier 'timespan' matches multiple fields)", ): list(do_query(("exposure", "visit")).order_by("timespan.begin")) with self.assertRaisesRegex( - ValueError, "Cannot find any temporal dimension element for 'timespan.begin'" + (ValueError, InvalidQueryError), + "(Cannot find any temporal dimension element for 'timespan.begin')" + "|(Unrecognized identifier 'timespan')", ): list(do_query("tract").order_by("timespan.begin")) - with self.assertRaisesRegex(ValueError, "Cannot use 'timespan.begin' with non-temporal element"): + with self.assertRaisesRegex( + (ValueError, InvalidQueryError), + "(Cannot use 'timespan.begin' with non-temporal element)" + "|(Unrecognized field 'timespan' for tract)", + ): list(do_query("tract").order_by("tract.timespan.begin")) - with self.assertRaisesRegex(ValueError, "Field 'name' does not exist in 'tract'."): + with self.assertRaisesRegex( + (ValueError, InvalidQueryError), + "(Field 'name' does not exist in 'tract')" "|(Unrecognized field 'name' for tract.)", + ): list(do_query("tract").order_by("tract.name")) with self.assertRaisesRegex( - ValueError, r"Unknown dimension element 'timestamp'; perhaps you meant 'timespan.begin'\?" + (ValueError, InvalidQueryError), + r"(Unknown dimension element 'timestamp'; perhaps you meant 'timespan.begin'\?)" + r"|(Unrecognized identifier 'timestamp.begin')", ): list(do_query("visit").order_by("timestamp.begin")) From 5a406b3bd1454dfbd6cab7575742c2eb19d76c19 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Tue, 30 Jul 2024 14:58:52 -0700 Subject: [PATCH 5/6] Update test for minor query system differences Tweak the registry query unit tests to handle minor differences between the old and new query systems. --- .../daf/butler/registry/tests/_registry.py | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/python/lsst/daf/butler/registry/tests/_registry.py b/python/lsst/daf/butler/registry/tests/_registry.py index 4095a58034..f60e4fc151 100644 --- a/python/lsst/daf/butler/registry/tests/_registry.py +++ b/python/lsst/daf/butler/registry/tests/_registry.py @@ -1378,12 +1378,20 @@ def testSkyMapDimensions(self): self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 4, 6, 7)) self.assertCountEqual({dataId["band"] for dataId in rows}, ("i",)) - # Specifying non-existing skymap is an exception - with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"): - rows = registry.queryDataIds( + def do_query(): + return registry.queryDataIds( dimensions, datasets=[calexpType, mergeType], collections=run, where="skymap = 'Mars'" ).toSet() + if self.supportsQueryGovernorValidation: + # Specifying non-existing skymap is an exception + with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"): + do_query() + else: + # New query system returns zero rows instead of raising an + # exception. + self.assertEqual(len(do_query()), 0) + def testSpatialJoin(self): """Test queries that involve spatial overlap joins.""" registry = self.makeRegistry() @@ -2663,11 +2671,12 @@ def testSkipCalibs(self): ) with self.assertRaises(exception_type): list(registry.queryDatasets("bias", collections=coll_list, findFirst=True)) - # explicit list will raise if there are temporal dimensions - with self.assertRaises(NotImplementedError): - registry.queryDataIds( - ["instrument", "detector", "exposure"], datasets="bias", collections=coll_list - ).count() + if not self.supportsCalibrationCollectionInFindFirst: + # explicit list will raise if there are temporal dimensions + with self.assertRaises(NotImplementedError): + registry.queryDataIds( + ["instrument", "detector", "exposure"], datasets="bias", collections=coll_list + ).count() # chain will skip datasets = list(registry.queryDatasets("bias", collections=chain)) From 66e3efcf0345a33cdf41a3ff659bf4e1bc8759cf Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Tue, 30 Jul 2024 15:03:27 -0700 Subject: [PATCH 6/6] Add test cover for RemoteButler query exceptions Fixed an issue where the "Hybrid" query results object for queryDataIds was testing the DirectButler implementation in some cases where it should have been testing the RemoteButler implementation. --- .../butler/tests/hybrid_butler_registry.py | 52 +++++++++++-------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/python/lsst/daf/butler/tests/hybrid_butler_registry.py b/python/lsst/daf/butler/tests/hybrid_butler_registry.py index 450e831952..83effc6eca 100644 --- a/python/lsst/daf/butler/tests/hybrid_butler_registry.py +++ b/python/lsst/daf/butler/tests/hybrid_butler_registry.py @@ -28,7 +28,7 @@ from __future__ import annotations import contextlib -from collections.abc import Iterable, Iterator, Mapping, Sequence +from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence from typing import Any, cast from .._dataset_association import DatasetAssociation @@ -312,17 +312,6 @@ def queryDataIds( check: bool = True, **kwargs: Any, ) -> DataCoordinateQueryResults: - direct = self._direct.queryDataIds( - dimensions, - dataId=dataId, - datasets=datasets, - collections=collections, - where=where, - bind=bind, - check=check, - **kwargs, - ) - remote = self._remote.queryDataIds( dimensions, dataId=dataId, @@ -334,8 +323,25 @@ def queryDataIds( **kwargs, ) + # Defer creation of the DirectButler version until we really need the + # object for handling an unimplemented method. This avoids masking of + # missing exception handling in the RemoteButler side -- otherwise + # exceptions from DirectButler would cause tests to pass. + def create_direct_result() -> DataCoordinateQueryResults: + return self._direct.queryDataIds( + dimensions, + dataId=dataId, + datasets=datasets, + collections=collections, + where=where, + bind=bind, + check=check, + **kwargs, + ) + return cast( - DataCoordinateQueryResults, _HybridDataCoordinateQueryResults(direct=direct, remote=remote) + DataCoordinateQueryResults, + _HybridDataCoordinateQueryResults(direct=create_direct_result, remote=remote), ) def queryDimensionRecords( @@ -388,7 +394,9 @@ class _HybridDataCoordinateQueryResults: provide a few methods that aren't implemented yet. """ - def __init__(self, *, direct: DataCoordinateQueryResults, remote: DataCoordinateQueryResults) -> None: + def __init__( + self, *, direct: Callable[[], DataCoordinateQueryResults], remote: DataCoordinateQueryResults + ) -> None: self._direct = direct self._remote = remote @@ -401,20 +409,20 @@ def __iter__(self) -> Iterator[DataCoordinate]: def order_by(self, *args: str) -> _HybridDataCoordinateQueryResults: return _HybridDataCoordinateQueryResults( - direct=self._direct.order_by(*args), remote=self._remote.order_by(*args) + direct=lambda: self._direct().order_by(*args), remote=self._remote.order_by(*args) ) def limit(self, limit: int, offset: int | None = 0) -> _HybridDataCoordinateQueryResults: return _HybridDataCoordinateQueryResults( - direct=self._direct.limit(limit, offset), remote=self._remote.limit(limit, offset) + direct=lambda: self._direct().limit(limit, offset), remote=self._remote.limit(limit, offset) ) def materialize(self) -> contextlib.AbstractContextManager[DataCoordinateQueryResults]: - return self._direct.materialize() + return self._direct().materialize() def expanded(self) -> _HybridDataCoordinateQueryResults: return _HybridDataCoordinateQueryResults( - remote=self._remote.expanded(), direct=self._direct.expanded() + remote=self._remote.expanded(), direct=lambda: self._direct().expanded() ) def subset( @@ -424,7 +432,7 @@ def subset( unique: bool = False, ) -> _HybridDataCoordinateQueryResults: return _HybridDataCoordinateQueryResults( - direct=self._direct.subset(dimensions, unique=unique), + direct=lambda: self._direct().subset(dimensions, unique=unique), remote=self._remote.subset(dimensions, unique=unique), ) @@ -436,7 +444,9 @@ def findDatasets( findFirst: bool = True, components: bool = False, ) -> ParentDatasetQueryResults: - return self._direct.findDatasets(datasetType, collections, findFirst=findFirst, components=components) + return self._direct().findDatasets( + datasetType, collections, findFirst=findFirst, components=components + ) def findRelatedDatasets( self, @@ -446,6 +456,6 @@ def findRelatedDatasets( findFirst: bool = True, dimensions: DimensionGroup | Iterable[str] | None = None, ) -> Iterable[tuple[DataCoordinate, DatasetRef]]: - return self._direct.findRelatedDatasets( + return self._direct().findRelatedDatasets( datasetType, collections, findFirst=findFirst, dimensions=dimensions )