From d461ccb68b87d386102658f0c765f3f4d901c384 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 8 Aug 2024 11:29:17 -0700 Subject: [PATCH] Disallow re.Pattern in dataset type queries RFC-879. --- doc/lsst.daf.butler/queries.rst | 3 +-- python/lsst/daf/butler/registry/_registry.py | 6 ++---- python/lsst/daf/butler/registry/sql_registry.py | 6 ++---- python/lsst/daf/butler/registry/wildcards.py | 10 ++++++++-- tests/test_butler.py | 9 +++++++++ 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/doc/lsst.daf.butler/queries.rst b/doc/lsst.daf.butler/queries.rst index 98802b44a5..3a6d43c69e 100644 --- a/doc/lsst.daf.butler/queries.rst +++ b/doc/lsst.daf.butler/queries.rst @@ -26,11 +26,10 @@ Arguments that specify one or more dataset types can generally take any of the f - `DatasetType` instances; - `str` values (corresponding to `DatasetType.name`); - `str` values using glob wildcard syntax which will be converted to `re.Pattern`; - - `re.Pattern` values (matched to `DatasetType.name` strings, via `~re.Pattern.fullmatch`); - iterables of any of the above; - the special value "``...``", which matches all dataset types. -Wildcards (`re.Pattern` and ``...``) are not allowed in certain contexts, such as `Registry.queryDataIds` and `Registry.queryDimensionRecords`, particularly when datasets are used only as a constraint on what is returned. +Wildcards (globs and ``...``) are not allowed in certain contexts, such as `Registry.queryDataIds` and `Registry.queryDimensionRecords`, particularly when datasets are used only as a constraint on what is returned. `Registry.queryDatasetTypes` can be used to resolve patterns before calling these methods when desired. In these contexts, passing a dataset type or name that is not registered with the repository will result in `MissingDatasetTypeError` being raised, while contexts that do accept wildcards will typically ignore unregistered dataset types (for example, `Registry.queryDatasets` will return no datasets for these). diff --git a/python/lsst/daf/butler/registry/_registry.py b/python/lsst/daf/butler/registry/_registry.py index f9ee587441..30bd1a99fc 100644 --- a/python/lsst/daf/butler/registry/_registry.py +++ b/python/lsst/daf/butler/registry/_registry.py @@ -1241,10 +1241,8 @@ def queryDataIds( ``exposure``, ``detector``, and ``physical_filter`` values to only those for which at least one "raw" dataset exists in ``collections``. Allowed types include `DatasetType`, `str`, - and iterables thereof. Regular expression objects (i.e. - `re.Pattern`) are deprecated and will be removed after the v26 - release. See :ref:`daf_butler_dataset_type_expressions` for more - information. + and iterables thereof. See + :ref:`daf_butler_dataset_type_expressions` for more information. collections : collection expression, optional An expression that identifies the collections to search for datasets, such as a `str` (for full matches or partial matches diff --git a/python/lsst/daf/butler/registry/sql_registry.py b/python/lsst/daf/butler/registry/sql_registry.py index b11a614de8..23c6355f9b 100644 --- a/python/lsst/daf/butler/registry/sql_registry.py +++ b/python/lsst/daf/butler/registry/sql_registry.py @@ -2148,10 +2148,8 @@ def queryDataIds( ``exposure``, ``detector``, and ``physical_filter`` values to only those for which at least one "raw" dataset exists in ``collections``. Allowed types include `DatasetType`, `str`, - and iterables thereof. Regular expression objects (i.e. - `re.Pattern`) are deprecated and will be removed after the v26 - release. See :ref:`daf_butler_dataset_type_expressions` for more - information. + and iterables thereof. See + :ref:`daf_butler_dataset_type_expressions` for more information. collections : collection expression, optional An expression that identifies the collections to search for datasets, such as a `str` (for full matches or partial matches diff --git a/python/lsst/daf/butler/registry/wildcards.py b/python/lsst/daf/butler/registry/wildcards.py index c885af6f4c..666bb72f6e 100644 --- a/python/lsst/daf/butler/registry/wildcards.py +++ b/python/lsst/daf/butler/registry/wildcards.py @@ -434,7 +434,6 @@ def from_expression(cls, expression: Any) -> DatasetTypeWildcard: - a `str` dataset type name; - a `DatasetType` instance; - - a `re.Pattern` to match against dataset type names; - an iterable whose elements may be any of the above (any dataset type matching any element in the list is an overall match); - an existing `DatasetTypeWildcard` instance; @@ -455,9 +454,16 @@ def from_expression(cls, expression: Any) -> DatasetTypeWildcard: """ if isinstance(expression, cls): return expression + # CategorizedWildcard currently allows globs and regex as patterns + # but RFC-879 drops support for regex in dataset type specifications. + # Therefore check for their presence. + for exp in ensure_iterable(expression): + if isinstance(exp, re.Pattern): + raise DatasetTypeExpressionError("Regular expressions are not supported.") try: wildcard = CategorizedWildcard.fromExpression( - expression, coerceUnrecognized=lambda d: (d.name, d) + expression, + coerceUnrecognized=lambda d: (d.name, d), ) except TypeError as err: raise DatasetTypeExpressionError(f"Invalid dataset type expression: {expression!r}.") from err diff --git a/tests/test_butler.py b/tests/test_butler.py index 87e2105d5a..5dc0ae6967 100644 --- a/tests/test_butler.py +++ b/tests/test_butler.py @@ -36,6 +36,7 @@ import pickle import posixpath import random +import re import shutil import string import tempfile @@ -94,6 +95,7 @@ def mock_aws(*args: Any, **kwargs: Any) -> Any: # type: ignore[no-untyped-def] CollectionTypeError, ConflictingDefinitionError, DataIdValueError, + DatasetTypeExpressionError, MissingCollectionError, OrphanedRecordError, ) @@ -1142,6 +1144,13 @@ def testGetDatasetTypes(self) -> None: fromRegistry.update(parent_dataset_type.makeAllComponentDatasetTypes()) self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components) + # Query with wildcard. + dataset_types = butler.registry.queryDatasetTypes("metric*") + self.assertEqual(len(dataset_types), 4, f"Got: {dataset_types}") + # but not regex. + with self.assertRaises(DatasetTypeExpressionError): + butler.registry.queryDatasetTypes(["pvi", re.compile("metric.*")]) + # Now that we have some dataset types registered, validate them butler.validateConfiguration( ignore=[