From d8444aa59390e9448a8d75a04b9d506c37f0424c Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Wed, 14 Aug 2024 16:15:19 -0700 Subject: [PATCH 01/26] Fix test so that it is using a tuple not iterating over characters --- tests/test_cliCmdPruneDatasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cliCmdPruneDatasets.py b/tests/test_cliCmdPruneDatasets.py index e816deedee..137359d5d1 100644 --- a/tests/test_cliCmdPruneDatasets.py +++ b/tests/test_cliCmdPruneDatasets.py @@ -472,7 +472,7 @@ def test_purgeOnNonRunCollection(self, mockGetCollectionType): exPruneDatasetsCallArgs=None, exQueryDatasetsCallArgs=None, exGetTablesCalled=False, - exMsgs=(pruneDatasets_errPruneOnNotRun.format(collection=collectionName)), + exMsgs=(pruneDatasets_errPruneOnNotRun.format(collection=collectionName),), exPruneDatasetsExitCode=1, ) From 212d620e9fae083967f195d78d2e98b6164a3c64 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 15 Aug 2024 11:12:43 -0700 Subject: [PATCH 02/26] Provide explicit reason when client/server tests are skipped Sometimes it's not missing Safir or httpx but a circular import. 
--- tests/test_query_remote.py | 7 +++++-- tests/test_remote_butler.py | 26 ++++++++++++++++++++------ tests/test_server.py | 7 +++++-- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/tests/test_query_remote.py b/tests/test_query_remote.py index 2557243d27..27e90e2e42 100644 --- a/tests/test_query_remote.py +++ b/tests/test_query_remote.py @@ -38,13 +38,16 @@ try: from lsst.daf.butler.tests.server import create_test_server -except ImportError: + + reason_text = "" +except ImportError as e: create_test_server = None + reason_text = str(e) TESTDIR = os.path.abspath(os.path.dirname(__file__)) -@unittest.skipIf(create_test_server is None, "Server dependencies not installed.") +@unittest.skipIf(create_test_server is None, f"Server dependencies not installed: {reason_text}") class RemoteButlerQueryTests(ButlerQueryTests, unittest.TestCase): """Test query system using client/server butler.""" diff --git a/tests/test_remote_butler.py b/tests/test_remote_butler.py index 41b1a119ff..b9df899567 100644 --- a/tests/test_remote_butler.py +++ b/tests/test_remote_butler.py @@ -42,20 +42,28 @@ try: import httpx from lsst.daf.butler.remote_butler import ButlerServerError, RemoteButler -except ImportError: + + remote_butler_import_fail_message = "" +except ImportError as e: # httpx is not available in rubin-env yet, so skip these tests if it's not # available RemoteButler = None + remote_butler_import_fail_message = str(e) try: from lsst.daf.butler.tests.server import create_test_server -except ImportError: + + server_import_fail_message = "" +except ImportError as e: create_test_server = None + server_import_fail_message = str(e) TESTDIR = os.path.abspath(os.path.dirname(__file__)) -@unittest.skipIf(RemoteButler is None, "httpx is not installed") +@unittest.skipIf( + RemoteButler is None, f"Remote butler can not be imported: {remote_butler_import_fail_message}" +) class RemoteButlerConfigTests(unittest.TestCase): """Test construction of RemoteButler via Butler()""" 
@@ -64,7 +72,9 @@ def test_bad_config(self): Butler({"cls": "lsst.daf.butler.remote_butler.RemoteButler", "remote_butler": {"url": "!"}}) -@unittest.skipIf(create_test_server is None, "Server dependencies not installed") +@unittest.skipIf( + create_test_server is None, f"Server dependencies not installed: {server_import_fail_message}" +) class RemoteButlerErrorHandlingTests(unittest.TestCase): """Test RemoteButler error handling.""" @@ -200,7 +210,9 @@ def test_query_projection_drop_postprocessing(self): pass -@unittest.skipIf(create_test_server is None, "Server dependencies not installed.") +@unittest.skipIf( + create_test_server is None, f"Server dependencies not installed: {server_import_fail_message}" +) class RemoteButlerSqliteRegistryTests(RemoteButlerRegistryTests, unittest.TestCase): """Tests for RemoteButler's registry shim, with a SQLite DB backing the server. @@ -209,7 +221,9 @@ class RemoteButlerSqliteRegistryTests(RemoteButlerRegistryTests, unittest.TestCa postgres = None -@unittest.skipIf(create_test_server is None, "Server dependencies not installed.") +@unittest.skipIf( + create_test_server is None, f"Server dependencies not installed: {server_import_fail_message}" +) class RemoteButlerPostgresRegistryTests(RemoteButlerRegistryTests, unittest.TestCase): """Tests for RemoteButler's registry shim, with a Postgres DB backing the server. 
diff --git a/tests/test_server.py b/tests/test_server.py index c3fb6c31a3..2c6c846608 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -41,8 +41,11 @@ from lsst.daf.butler.remote_butler.server import create_app from lsst.daf.butler.remote_butler.server._dependencies import butler_factory_dependency from lsst.daf.butler.tests.server import TEST_REPOSITORY_NAME, UnhandledServerError, create_test_server -except ImportError: + + reason_text = "" +except ImportError as e: create_test_server = None + reason_text = str(e) from unittest.mock import NonCallableMock, patch @@ -66,7 +69,7 @@ TESTDIR = os.path.abspath(os.path.dirname(__file__)) -@unittest.skipIf(create_test_server is None, "Server dependencies not installed.") +@unittest.skipIf(create_test_server is None, f"Server dependencies not installed: {reason_text}") class ButlerClientServerTestCase(unittest.TestCase): """Test for Butler client/server.""" From ac61b4d274d207446133f114a19a8a9e591ba3ef Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 13 Aug 2024 14:53:48 -0700 Subject: [PATCH 03/26] Change butler.collections to be a ButlerCollections sequence --- python/lsst/daf/butler/_butler.py | 10 ++++--- python/lsst/daf/butler/_butler_collections.py | 29 +++++++++++++++++-- .../butler/direct_butler/_direct_butler.py | 15 +++------- .../_direct_butler_collections.py | 6 +++- .../butler/remote_butler/_remote_butler.py | 13 +++++---- python/lsst/daf/butler/tests/hybrid_butler.py | 8 ++--- tests/test_butler.py | 2 +- 7 files changed, 54 insertions(+), 29 deletions(-) diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index 2c16478aa9..621be17b17 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -1422,15 +1422,17 @@ def collection_chains(self) -> ButlerCollections: """Object with methods for modifying collection chains (`~lsst.daf.butler.ButlerCollections`). - Use of this object is preferred over `registry` wherever possible. 
+ Deprecated. Replaced with ``collections`` property. """ raise NotImplementedError() @property @abstractmethod - def collections(self) -> Sequence[str]: - """The collections to search by default, in order - (`~collections.abc.Sequence` [ `str` ]). + def collections(self) -> ButlerCollections: + """Object with methods for modifying and querying collections + (`~lsst.daf.butler.ButlerCollections`). + + Use of this object is preferred over `registry` wherever possible. """ raise NotImplementedError() diff --git a/python/lsst/daf/butler/_butler_collections.py b/python/lsst/daf/butler/_butler_collections.py index e6032b5f2d..da457d34f5 100644 --- a/python/lsst/daf/butler/_butler_collections.py +++ b/python/lsst/daf/butler/_butler_collections.py @@ -30,12 +30,37 @@ __all__ = ("ButlerCollections",) from abc import ABC, abstractmethod -from collections.abc import Iterable +from collections.abc import Iterable, Sequence +from typing import Any, overload -class ButlerCollections(ABC): +class ButlerCollections(ABC, Sequence): """Methods for working with collections stored in the Butler.""" + @overload + def __getitem__(self, index: int) -> str: ... + + @overload + def __getitem__(self, index: slice) -> Sequence[str]: ... + + def __getitem__(self, index: int | slice) -> str | Sequence[str]: + return self.defaults[index] + + def __len__(self) -> int: + return len(self.defaults) + + def __eq__(self, other: Any) -> bool: + # Do not try to compare registry instances. + if not isinstance(other, type(self)): + return False + return self.defaults == other.defaults + + @property + @abstractmethod + def defaults(self) -> Sequence[str]: + """Collection defaults associated with this butler.""" + raise NotImplementedError("Defaults must be implemented by a subclass") + @abstractmethod def extend_chain(self, parent_collection_name: str, child_collection_names: str | Iterable[str]) -> None: """Add children to the end of a CHAINED collection. 
diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler.py b/python/lsst/daf/butler/direct_butler/_direct_butler.py index e1a3a73fd6..71e22d045b 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler.py @@ -296,7 +296,7 @@ def __reduce__(self) -> tuple: DirectButler._unpickle, ( self._config, - self.collections, + self.collections.defaults, self.run, dict(self._registry.defaults.dataId.required), self._registry.isWriteable(), @@ -2154,16 +2154,9 @@ def collection_chains(self) -> DirectButlerCollections: return DirectButlerCollections(self._registry) @property - def collections(self) -> Sequence[str]: - """The collections to search by default, in order - (`~collections.abc.Sequence` [ `str` ]). - - This is an alias for ``self.registry.defaults.collections``. It cannot - be set directly in isolation, but all defaults may be changed together - by assigning a new `RegistryDefaults` instance to - ``self.registry.defaults``. 
- """ - return self._registry.defaults.collections + def collections(self) -> DirectButlerCollections: + """Object with methods for modifying and inspecting collections.""" + return DirectButlerCollections(self._registry) @property def run(self) -> str | None: diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py index 1f5ac270b4..224d734132 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py @@ -29,7 +29,7 @@ __all__ = ("DirectButlerCollections",) -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from lsst.utils.iteration import ensure_iterable @@ -49,6 +49,10 @@ class DirectButlerCollections(ButlerCollections): def __init__(self, registry: SqlRegistry): self._registry = registry + @property + def defaults(self) -> Sequence[str]: + return self._registry.defaults.collections + def extend_chain(self, parent_collection_name: str, child_collection_names: str | Iterable[str]) -> None: return self._registry._managers.collections.extend_collection_chain( parent_collection_name, list(ensure_iterable(child_collection_names)) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index 3392fc8791..7a9cf40dbe 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -61,6 +61,7 @@ from ._collection_args import convert_collection_arg_to_glob_string_list from ._query_driver import RemoteQueryDriver from ._ref_utils import apply_storage_class_override, normalize_dataset_type_name, simplify_dataId +from ._remote_butler_collections import RemoteButlerCollections from .server_models import ( CollectionList, FindDatasetRequestModel, @@ -145,7 +146,12 @@ def isWriteable(self) -> bool: @property def 
collection_chains(self) -> ButlerCollections: """Object with methods for modifying collection chains.""" - raise NotImplementedError() + return RemoteButlerCollections(self._registry) + + @property + def collections(self) -> ButlerCollections: + """Object with methods for modifying collection chains.""" + return RemoteButlerCollections(self._registry) @property def dimensions(self) -> DimensionUniverse: @@ -510,11 +516,6 @@ def validateConfiguration( # Docstring inherited. raise NotImplementedError() - @property - def collections(self) -> Sequence[str]: - # Docstring inherited. - return self._registry_defaults.collections - @property def run(self) -> str | None: # Docstring inherited. diff --git a/python/lsst/daf/butler/tests/hybrid_butler.py b/python/lsst/daf/butler/tests/hybrid_butler.py index c64d2a4ad3..9c66f1af1b 100644 --- a/python/lsst/daf/butler/tests/hybrid_butler.py +++ b/python/lsst/daf/butler/tests/hybrid_butler.py @@ -321,10 +321,6 @@ def validateConfiguration( ) -> None: return self._direct_butler.validateConfiguration(logFailures, datasetTypeNames, ignore) - @property - def collections(self) -> Sequence[str]: - return self._remote_butler.collections - @property def run(self) -> str | None: return self._remote_butler.run @@ -459,3 +455,7 @@ def _extract_all_dimension_records_from_data_ids( @property def collection_chains(self) -> ButlerCollections: return self._direct_butler.collection_chains + + @property + def collections(self) -> ButlerCollections: + return self._remote_butler.collection_chains diff --git a/tests/test_butler.py b/tests/test_butler.py index 5dc0ae6967..f49173dcd3 100644 --- a/tests/test_butler.py +++ b/tests/test_butler.py @@ -652,7 +652,7 @@ def testConstructor(self) -> None: self.assertEqual(collections, {special_run}) butler2 = Butler.from_config(butler=butler, collections=["other"]) - self.assertEqual(butler2.collections, ("other",)) + self.assertEqual(butler2.collections.defaults, ("other",)) self.assertIsNone(butler2.run) 
self.assertEqual(type(butler._datastore), type(butler2._datastore)) self.assertEqual(butler._datastore.config, butler2._datastore.config) From 44602597494381cdb10ae1d6cf9f4252033bf9f3 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 13 Aug 2024 15:10:36 -0700 Subject: [PATCH 04/26] Initial work on extending butler collections API --- python/lsst/daf/butler/_butler_collections.py | 84 +++++++++++++++- .../_direct_butler_collections.py | 36 ++++++- .../butler/remote_butler/_remote_butler.py | 8 +- .../_remote_butler_collections.py | 97 +++++++++++++++++++ .../daf/butler/script/queryCollections.py | 52 +++++----- tests/test_server.py | 2 +- 6 files changed, 242 insertions(+), 37 deletions(-) create mode 100644 python/lsst/daf/butler/remote_butler/_remote_butler_collections.py diff --git a/python/lsst/daf/butler/_butler_collections.py b/python/lsst/daf/butler/_butler_collections.py index da457d34f5..c2e1df1120 100644 --- a/python/lsst/daf/butler/_butler_collections.py +++ b/python/lsst/daf/butler/_butler_collections.py @@ -27,12 +27,31 @@ from __future__ import annotations -__all__ = ("ButlerCollections",) +__all__ = ("ButlerCollections", "CollectionInfo") from abc import ABC, abstractmethod -from collections.abc import Iterable, Sequence +from collections.abc import Iterable, Sequence, Set from typing import Any, overload +from pydantic import BaseModel + +from ._collection_type import CollectionType + + +class CollectionInfo(BaseModel): + """Information about a single Butler collection.""" + + name: str + """Name of the collection.""" + type: CollectionType + """Type of the collection.""" + doc: str = "" + """Documentation string associated with this collection.""" + children: tuple[str, ...] 
= tuple() + """Children of this collection (only if CHAINED).""" + parents: frozenset[str] = frozenset() + """Any parents of this collection.""" + class ButlerCollections(ABC, Sequence): """Methods for working with collections stored in the Butler.""" @@ -190,3 +209,64 @@ def remove_from_chain( transactions short. """ raise NotImplementedError() + + @abstractmethod + def query( + self, + expression: str | Iterable[str], + collection_types: Set[CollectionType] | CollectionType | None = None, + flatten_chains: bool = False, + include_chains: bool | None = None, + ) -> Sequence[str]: + """Query the butler for collections matching an expression. + + Parameters + ---------- + expression : `str` or `~collections.abc.Iterable` [ `str` ] + One or more collection names or globs to include in the search. + collection_types : `set` [`CollectionType`], `CollectionType` or `None` + Restrict the types of collections to be searched. If `None` all + collection types are searched. + flatten_chains : `bool`, optional + If `True` (`False` is default), recursively yield the child + collections of matching `~CollectionType.CHAINED` collections. + include_chains : `bool` or `None`, optional + If `True`, yield records for matching `~CollectionType.CHAINED` + collections. Default is the opposite of ``flatten_chains``: + include either CHAINED collections or their children, but not both. + + Returns + ------- + collections : `~collections.abc.Sequence` [ `str` ] + The names of collections that match ``expression``. + + Notes + ----- + The order in which collections are returned is unspecified, except that + the children of a `~CollectionType.CHAINED` collection are guaranteed + to be in the order in which they are searched. When multiple parent + `~CollectionType.CHAINED` collections match the same criteria, the + order in which the two lists appear is unspecified, and the lists of + children may be incomplete if a child has multiple parents. 
+ """ + raise NotImplementedError() + + @abstractmethod + def get_info(self, name: str, include_doc: bool = False, include_parents: bool = False) -> CollectionInfo: + """Obtain information for a specific collection. + + Parameters + ---------- + name : `str` + The name of the collection of interest. + include_doc : `bool`, optional + If `True` any documentation about this collection will be included. + include_parents : `bool`, optional + If `True` any parents of this collection will be included. + + Returns + ------- + info : `CollectionInfo` + Information on the requested collection. + """ + raise NotImplementedError() diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py index 224d734132..3705e2d3e8 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py @@ -29,11 +29,13 @@ __all__ = ("DirectButlerCollections",) -from collections.abc import Iterable, Sequence +from collections.abc import Iterable, Sequence, Set from lsst.utils.iteration import ensure_iterable -from .._butler_collections import ButlerCollections +from .._butler_collections import ButlerCollections, CollectionInfo +from .._collection_type import CollectionType +from ..registry.interfaces import ChainedCollectionRecord from ..registry.sql_registry import SqlRegistry @@ -76,3 +78,33 @@ def remove_from_chain( return self._registry._managers.collections.remove_from_collection_chain( parent_collection_name, list(ensure_iterable(child_collection_names)) ) + + def query( + self, + expression: str | Iterable[str], + collection_types: Set[CollectionType] | CollectionType | None = None, + flatten_chains: bool = False, + include_chains: bool | None = None, + ) -> Sequence[str]: + if collection_types is None: + collection_types = CollectionType.all() + return self._registry.queryCollections( + expression, + 
collectionTypes=collection_types, + flattenChains=flatten_chains, + includeChains=include_chains, + ) + + def get_info(self, name: str, include_doc: bool = False, include_parents: bool = False) -> CollectionInfo: + record = self._registry.get_collection_record(name) + doc = "" + if include_doc: + doc = self._registry.getCollectionDocumentation(name) or "" + children: tuple[str, ...] = tuple() + if record.type == CollectionType.CHAINED: + assert isinstance(record, ChainedCollectionRecord) + children = tuple(record.children) + parents: set[str] = set() + if include_parents: + parents = self._registry.getCollectionParentChains(name) + return CollectionInfo(name=name, type=record.type, doc=doc, parents=parents, children=children) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index 7a9cf40dbe..fbabfd0f02 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -146,12 +146,16 @@ def isWriteable(self) -> bool: @property def collection_chains(self) -> ButlerCollections: """Object with methods for modifying collection chains.""" - return RemoteButlerCollections(self._registry) + from ._registry import RemoteButlerRegistry + + return RemoteButlerCollections(cast(RemoteButlerRegistry, self._registry)) @property def collections(self) -> ButlerCollections: """Object with methods for modifying collection chains.""" - return RemoteButlerCollections(self._registry) + from ._registry import RemoteButlerRegistry + + return RemoteButlerCollections(cast(RemoteButlerRegistry, self._registry)) @property def dimensions(self) -> DimensionUniverse: diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py new file mode 100644 index 0000000000..ae572e701c --- /dev/null +++ b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py @@ -0,0 
+1,97 @@ +# This file is part of daf_butler. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This software is dual licensed under the GNU General Public License and also +# under a 3-clause BSD license. Recipients may choose which of these licenses +# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, +# respectively. If you choose the GPL option then the following text applies +# (but note that there is still no warranty even if you opt for BSD instead): +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from __future__ import annotations + +__all__ = ("RemoteButlerCollections",) + +from collections.abc import Iterable, Sequence, Set +from typing import TYPE_CHECKING + +from .._butler_collections import ButlerCollections, CollectionInfo +from .._collection_type import CollectionType + +if TYPE_CHECKING: + from ._registry import RemoteButlerRegistry + + +class RemoteButlerCollections(ButlerCollections): + """Implementation of ButlerCollections for RemoteButler. + + Parameters + ---------- + registry : `~lsst.daf.butler.registry.sql_registry.SqlRegistry` + Registry object used to work with the collections database. 
+ """ + + def __init__(self, registry: RemoteButlerRegistry): + self._registry = registry + + @property + def defaults(self) -> Sequence[str]: + return self._registry.defaults.collections + + def extend_chain(self, parent_collection_name: str, child_collection_names: str | Iterable[str]) -> None: + raise NotImplementedError("Not yet available") + + def prepend_chain(self, parent_collection_name: str, child_collection_names: str | Iterable[str]) -> None: + raise NotImplementedError("Not yet available") + + def redefine_chain( + self, parent_collection_name: str, child_collection_names: str | Iterable[str] + ) -> None: + raise NotImplementedError("Not yet available") + + def remove_from_chain( + self, parent_collection_name: str, child_collection_names: str | Iterable[str] + ) -> None: + raise NotImplementedError("Not yet available") + + def query( + self, + expression: str | Iterable[str], + collection_types: Set[CollectionType] | CollectionType | None = None, + flatten_chains: bool = False, + include_chains: bool | None = None, + ) -> Sequence[str]: + if collection_types is None: + collection_types = CollectionType.all() + return self._registry.queryCollections( + expression, + collectionTypes=collection_types, + flattenChains=flatten_chains, + includeChains=include_chains, + ) + + def get_info(self, name: str, include_doc: bool = False, include_parents: bool = False) -> CollectionInfo: + info = self._registry._get_collection_info( + name, include_doc=include_doc, include_parents=include_parents + ) + doc = info.doc or "" + children = info.children or () + parents = info.parents or set() + return CollectionInfo(name=name, type=info.type, doc=doc, parents=parents, children=children) diff --git a/python/lsst/daf/butler/script/queryCollections.py b/python/lsst/daf/butler/script/queryCollections.py index 2951080c98..82d2b6f1ce 100644 --- a/python/lsst/daf/butler/script/queryCollections.py +++ b/python/lsst/daf/butler/script/queryCollections.py @@ -71,38 +71,34 @@ def 
_getTable( dtype=(str, str, str), ) butler = Butler.from_config(repo) - names = sorted( - butler.registry.queryCollections(collectionTypes=frozenset(collection_type), expression=glob or ...) - ) + names = sorted(butler.collections.query(glob or "*", collection_types=frozenset(collection_type))) if inverse: for name in names: - type = butler.registry.getCollectionType(name) - parentNames = butler.registry.getCollectionParentChains(name) - if parentNames: + info = butler.collections.get_info(name, include_parents=True) + if info.parents: first = True - for parentName in sorted(parentNames): - table.add_row((name if first else "", type.name if first else "", parentName)) + for parentName in sorted(info.parents): + table.add_row((name if first else "", info.type.name if first else "", parentName)) first = False else: - table.add_row((name, type.name, "")) + table.add_row((name, info.type.name, "")) # If none of the datasets has a parent dataset then remove the # description column. if not any(c for c in table[descriptionCol]): del table[descriptionCol] else: for name in names: - type = butler.registry.getCollectionType(name) - if type == CollectionType.CHAINED: - children = butler.registry.getCollectionChain(name) - if children: + info = butler.collections.get_info(name) + if info.type == CollectionType.CHAINED: + if info.children: first = True - for child in children: - table.add_row((name if first else "", type.name if first else "", child)) + for child in info.children: + table.add_row((name if first else "", info.type.name if first else "", child)) first = False else: - table.add_row((name, type.name, "")) + table.add_row((name, info.type.name, "")) else: - table.add_row((name, type.name, "")) + table.add_row((name, info.type.name, "")) # If there aren't any CHAINED datasets in the results then remove the # description column. 
if not any(columnVal == CollectionType.CHAINED.name for columnVal in table[typeCol]): @@ -147,21 +143,17 @@ def _getTree( butler = Butler.from_config(repo, without_datastore=True) def addCollection(name: str, level: int = 0) -> None: - collectionType = butler.registry.getCollectionType(name) - table.add_row((" " * level + name, collectionType.name)) + info = butler.collections.get_info(name, include_parents=inverse) + table.add_row((" " * level + name, info.type.name)) if inverse: - parentNames = butler.registry.getCollectionParentChains(name) - for pname in sorted(parentNames): + for pname in sorted(info.parents): addCollection(pname, level + 1) else: - if collectionType == CollectionType.CHAINED: - childNames = butler.registry.getCollectionChain(name) - for name in childNames: + if info.type == CollectionType.CHAINED: + for name in info.children: addCollection(name, level + 1) - collections = butler.registry.queryCollections( - collectionTypes=frozenset(collection_type), expression=glob or ... - ) + collections = butler.collections.query(glob or "*", collection_types=frozenset(collection_type)) for collection in sorted(collections): addCollection(collection) return table @@ -174,12 +166,12 @@ def _getFlatten( ) -> Table: butler = Butler.from_config(repo) collectionNames = list( - butler.registry.queryCollections( - collectionTypes=frozenset(collection_type), flattenChains=True, expression=glob or ... 
+ butler.collections.query( + glob or "*", collection_types=frozenset(collection_type), flatten_chains=True ) ) - collectionTypes = [butler.registry.getCollectionType(c).name for c in collectionNames] + collectionTypes = [butler.collections.get_info(c).type.name for c in collectionNames] return Table((collectionNames, collectionTypes), names=("Name", "Type")) diff --git a/tests/test_server.py b/tests/test_server.py index 2c6c846608..90c0e8c14c 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -222,7 +222,7 @@ def override_read(http_resource_path): ) self.assertIsInstance(butler, RemoteButler) self.assertEqual(butler._connection.server_url, server_url) - self.assertEqual(butler.collections, ("collection1", "collection2")) + self.assertEqual(butler.collections.defaults, ("collection1", "collection2")) self.assertEqual(butler.run, "collection2") butler_factory = LabeledButlerFactory({"server": server_url}) From 1ad0ab1effe2bdb7d0f933309b3036b2cd026cc4 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Wed, 14 Aug 2024 16:16:49 -0700 Subject: [PATCH 05/26] Use new collections API in more scripts --- .../lsst/daf/butler/script/_pruneDatasets.py | 4 ++-- .../daf/butler/script/certifyCalibrations.py | 11 +++++----- .../lsst/daf/butler/script/collectionChain.py | 11 +++++----- python/lsst/daf/butler/script/exportCalibs.py | 16 +++++++------- python/lsst/daf/butler/script/queryDataIds.py | 4 ++-- .../lsst/daf/butler/script/queryDatasets.py | 5 ++--- .../butler/script/queryDimensionRecords.py | 4 ++-- .../daf/butler/script/removeCollections.py | 22 +++++-------------- python/lsst/daf/butler/script/removeRuns.py | 16 +++++--------- .../daf/butler/script/retrieveArtifacts.py | 5 ++--- .../daf/butler/script/transferDatasets.py | 5 ++--- tests/test_cliCmdPruneDatasets.py | 20 ++++++++--------- 12 files changed, 51 insertions(+), 72 deletions(-) diff --git a/python/lsst/daf/butler/script/_pruneDatasets.py b/python/lsst/daf/butler/script/_pruneDatasets.py index 
1c8caf2fd5..f3a7a155c8 100644 --- a/python/lsst/daf/butler/script/_pruneDatasets.py +++ b/python/lsst/daf/butler/script/_pruneDatasets.py @@ -223,8 +223,8 @@ def pruneDatasets( # If purging, verify that the collection to purge is RUN type collection. if purge_run: butler = Butler.from_config(repo, without_datastore=True) - collectionType = butler.registry.getCollectionType(purge_run) - if collectionType is not CollectionType.RUN: + collection_info = butler.collections.get_info(purge_run) + if collection_info.type is not CollectionType.RUN: return PruneDatasetsResult( state=PruneDatasetsResult.State.ERR_PRUNE_ON_NOT_RUN, errDict=dict(collection=purge_run) ) diff --git a/python/lsst/daf/butler/script/certifyCalibrations.py b/python/lsst/daf/butler/script/certifyCalibrations.py index cec8142aed..77759badf7 100644 --- a/python/lsst/daf/butler/script/certifyCalibrations.py +++ b/python/lsst/daf/butler/script/certifyCalibrations.py @@ -70,13 +70,14 @@ def certifyCalibrations( collection, instead of just the most recent one. """ butler = Butler.from_config(repo, writeable=True, without_datastore=True) - registry = butler.registry timespan = Timespan( begin=astropy.time.Time(begin_date, scale="tai") if begin_date is not None else None, end=astropy.time.Time(end_date, scale="tai") if end_date is not None else None, ) - if not search_all_inputs and registry.getCollectionType(input_collection) is CollectionType.CHAINED: - input_collection = next(iter(registry.getCollectionChain(input_collection))) + if not search_all_inputs: + collection_info = butler.collections.get_info(input_collection) + if collection_info.type is CollectionType.CHAINED: + input_collection = collection_info.children[0] with butler._query() as query: results = query.datasets(dataset_type_name, collections=input_collection) @@ -86,5 +87,5 @@ def certifyCalibrations( raise RuntimeError( f"No inputs found for dataset {dataset_type_name} in {input_collection}. 
{explanation}" ) - registry.registerCollection(output_collection, type=CollectionType.CALIBRATION) - registry.certify(output_collection, refs, timespan) + butler.registry.registerCollection(output_collection, type=CollectionType.CALIBRATION) + butler.registry.certify(output_collection, refs, timespan) diff --git a/python/lsst/daf/butler/script/collectionChain.py b/python/lsst/daf/butler/script/collectionChain.py index 36a7b2b138..03054a22bb 100644 --- a/python/lsst/daf/butler/script/collectionChain.py +++ b/python/lsst/daf/butler/script/collectionChain.py @@ -32,7 +32,6 @@ from .._butler import Butler from .._collection_type import CollectionType from ..registry import MissingCollectionError -from ..registry.wildcards import CollectionWildcard def collectionChain( @@ -80,7 +79,7 @@ def collectionChain( raise RuntimeError(f"Must provide children when defining a collection chain in mode {mode}.") try: - butler.registry.getCollectionType(parent) + butler.collections.get_info(parent) except MissingCollectionError: # Create it -- but only if mode can work with empty chain. if mode in ("redefine", "extend", "prepend"): @@ -96,12 +95,11 @@ def collectionChain( if flatten: if mode not in ("redefine", "prepend", "extend"): raise RuntimeError(f"'flatten' flag is not allowed for {mode}") - wildcard = CollectionWildcard.from_names(children) - children = butler.registry.queryCollections(wildcard, flattenChains=True) + children = butler.collections.query(children, flatten_chains=True) _modify_collection_chain(butler, mode, parent, children) - return tuple(butler.registry.getCollectionChain(parent)) + return butler.collections.get_info(parent).children def _modify_collection_chain(butler: Butler, mode: str, parent: str, children: Iterable[str]) -> None: @@ -125,7 +123,8 @@ def _find_children_to_pop(butler: Butler, parent: str, children: Iterable[str]) the given indexes. 
""" children = list(children) - current = butler.registry.getCollectionChain(parent) + collection_info = butler.collections.get_info(parent) + current = collection_info.children n_current = len(current) if children: diff --git a/python/lsst/daf/butler/script/exportCalibs.py b/python/lsst/daf/butler/script/exportCalibs.py index 25573f0c7b..6d2a9e8233 100644 --- a/python/lsst/daf/butler/script/exportCalibs.py +++ b/python/lsst/daf/butler/script/exportCalibs.py @@ -68,7 +68,7 @@ def find_calibration_datasets( RuntimeError Raised if the collection to search is not a CALIBRATION collection. """ - if butler.registry.getCollectionType(collection) != CollectionType.CALIBRATION: + if butler.collections.get_info(collection).type != CollectionType.CALIBRATION: raise RuntimeError(f"Collection {collection} is not a CALIBRATION collection.") exportDatasets = [] @@ -122,7 +122,7 @@ def exportCalibs( butler = Butler.from_config(repo, writeable=False) dataset_type_query = dataset_type or ... - collections_query = collections or ... + collections_query = collections or "*" calibTypes = [ datasetType @@ -133,18 +133,18 @@ def exportCalibs( collectionsToExport = [] datasetsToExport = [] - for collection in butler.registry.queryCollections( + for collection in butler.collections.query( collections_query, - flattenChains=True, - includeChains=True, - collectionTypes={CollectionType.CALIBRATION, CollectionType.CHAINED}, + flatten_chains=True, + include_chains=True, + collection_types={CollectionType.CALIBRATION, CollectionType.CHAINED}, ): log.info("Checking collection: %s", collection) # Get collection information. 
collectionsToExport.append(collection) - collectionType = butler.registry.getCollectionType(collection) - if collectionType == CollectionType.CALIBRATION: + info = butler.collections.get_info(collection) + if info.type == CollectionType.CALIBRATION: exportDatasets = find_calibration_datasets(butler, collection, calibTypes) datasetsToExport.extend(exportDatasets) diff --git a/python/lsst/daf/butler/script/queryDataIds.py b/python/lsst/daf/butler/script/queryDataIds.py index c6053b5127..dd75eb4824 100644 --- a/python/lsst/daf/butler/script/queryDataIds.py +++ b/python/lsst/daf/butler/script/queryDataIds.py @@ -168,9 +168,9 @@ def queryDataIds( with butler._query() as query: if datasets: # Need to constrain results based on dataset type and collection. - query_collections = collections or ... + query_collections = collections or "*" - expanded_collections = butler.registry.queryCollections(query_collections) + expanded_collections = butler.collections.query(query_collections) sub_query = query.join_dataset_search(dataset_types.pop(0), collections=expanded_collections) for dt in dataset_types: diff --git a/python/lsst/daf/butler/script/queryDatasets.py b/python/lsst/daf/butler/script/queryDatasets.py index 73dde6b96a..325d2bb8e4 100644 --- a/python/lsst/daf/butler/script/queryDatasets.py +++ b/python/lsst/daf/butler/script/queryDatasets.py @@ -29,7 +29,6 @@ import dataclasses from collections import defaultdict from collections.abc import Iterable, Iterator -from types import EllipsisType from typing import TYPE_CHECKING import numpy as np @@ -214,14 +213,14 @@ def getDatasets(self) -> Iterator[DatasetRef]: Dataset references matching the given query criteria. """ datasetTypes = self._dataset_type_glob or ... - query_collections: Iterable[str] | EllipsisType = self._collections_wildcard or ... 
+ query_collections: Iterable[str] = self._collections_wildcard or ["*"] # Currently need to use old interface to get all the matching # dataset types and loop over the dataset types executing a new # query each time. dataset_types = self.butler.registry.queryDatasetTypes(datasetTypes) with self.butler._query() as query: - query_collections = self.butler.registry.queryCollections(query_collections) + query_collections = self.butler.collections.query(query_collections) # Accumulate over dataset types. for dt in dataset_types: results = query.datasets(dt, collections=query_collections, find_first=self._find_first) diff --git a/python/lsst/daf/butler/script/queryDimensionRecords.py b/python/lsst/daf/butler/script/queryDimensionRecords.py index ee8ad09995..074efac915 100644 --- a/python/lsst/daf/butler/script/queryDimensionRecords.py +++ b/python/lsst/daf/butler/script/queryDimensionRecords.py @@ -79,8 +79,8 @@ def queryDimensionRecords( with butler._query() as query: if datasets: - query_collections = collections or ... 
- expanded_collections = butler.registry.queryCollections(query_collections) + query_collections = collections or "*" + expanded_collections = butler.collections.query(query_collections) dataset_types = list(butler.registry.queryDatasetTypes(datasets)) sub_query = query.join_dataset_search(dataset_types.pop(0), collections=expanded_collections) diff --git a/python/lsst/daf/butler/script/removeCollections.py b/python/lsst/daf/butler/script/removeCollections.py index 0b8a1ec87e..df3f41b106 100644 --- a/python/lsst/daf/butler/script/removeCollections.py +++ b/python/lsst/daf/butler/script/removeCollections.py @@ -85,30 +85,18 @@ def _getCollectionInfo( """ butler = Butler.from_config(repo, without_datastore=True) try: - names = sorted( - butler.registry.queryCollections( - collectionTypes=frozenset( - ( - CollectionType.RUN, - CollectionType.TAGGED, - CollectionType.CHAINED, - CollectionType.CALIBRATION, - ) - ), - expression=collection, - includeChains=True, - ) - ) + names = sorted(butler.collections.query(collection, include_chains=True)) except MissingCollectionError: + # Hide the error and act like no collections should be removed. 
names = [] collections = Table(names=("Collection", "Collection Type"), dtype=(str, str)) runCollections = Table(names=("Collection",), dtype=(str,)) for name in names: - collectionType = butler.registry.getCollectionType(name).name - if collectionType == "RUN": + collection_info = butler.collections.get_info(name) + if collection_info.type == CollectionType.RUN: runCollections.add_row((name,)) else: - collections.add_row((name, collectionType)) + collections.add_row((name, collection_info.type.name)) return CollectionInfo(collections, runCollections) diff --git a/python/lsst/daf/butler/script/removeRuns.py b/python/lsst/daf/butler/script/removeRuns.py index efd93e4a9d..50ad5b9b2b 100644 --- a/python/lsst/daf/butler/script/removeRuns.py +++ b/python/lsst/daf/butler/script/removeRuns.py @@ -87,23 +87,17 @@ def _getCollectionInfo( """ butler = Butler.from_config(repo) try: - collectionNames = list( - butler.registry.queryCollections( - collectionTypes=frozenset((CollectionType.RUN,)), - expression=collection, - includeChains=False, - ) - ) + collectionNames = butler.collections.query(collection, CollectionType.RUN, include_chains=False) except MissingCollectionError: + # Act as if no collections matched. collectionNames = [] - dataset_types = butler.registry.queryDatasetTypes(...) 
runs = [] datasets: dict[str, int] = defaultdict(int) for collectionName in collectionNames: - assert butler.registry.getCollectionType(collectionName).name == "RUN" - parents = butler.registry.getCollectionParentChains(collectionName) - runs.append(RemoveRun(collectionName, list(parents))) + collection_info = butler.collections.get_info(collectionName, include_parents=True) + assert collection_info.type == CollectionType.RUN + runs.append(RemoveRun(collectionName, list(collection_info.parents))) with butler._query() as query: for dt in dataset_types: results = query.datasets(dt, collections=collectionName) diff --git a/python/lsst/daf/butler/script/retrieveArtifacts.py b/python/lsst/daf/butler/script/retrieveArtifacts.py index bbfc74f95d..ca25999344 100644 --- a/python/lsst/daf/butler/script/retrieveArtifacts.py +++ b/python/lsst/daf/butler/script/retrieveArtifacts.py @@ -30,7 +30,6 @@ __all__ = ("retrieveArtifacts",) import logging -from types import EllipsisType from typing import TYPE_CHECKING from .._butler import Butler @@ -85,7 +84,7 @@ def retrieveArtifacts( The destination URIs of every transferred artifact. """ query_types = dataset_type or ... - query_collections: tuple[str, ...] | EllipsisType = collections or ... + query_collections: tuple[str, ...] 
= collections or ("*",) butler = Butler.from_config(repo, writeable=False) @@ -94,7 +93,7 @@ def retrieveArtifacts( dataset_types = butler.registry.queryDatasetTypes(query_types) refs: list[DatasetRef] = [] with butler._query() as query: - expanded_collections = butler.registry.queryCollections(query_collections) + expanded_collections = butler.collections.query(query_collections) for dt in dataset_types: results = query.datasets(dt, collections=expanded_collections, find_first=find_first) if where: diff --git a/python/lsst/daf/butler/script/transferDatasets.py b/python/lsst/daf/butler/script/transferDatasets.py index 5b9eeab9ec..e5e390bc88 100644 --- a/python/lsst/daf/butler/script/transferDatasets.py +++ b/python/lsst/daf/butler/script/transferDatasets.py @@ -29,7 +29,6 @@ __all__ = ("transferDatasets",) import logging -from types import EllipsisType from lsst.daf.butler import DatasetRef @@ -79,12 +78,12 @@ def transferDatasets( dest_butler = Butler.from_config(dest, writeable=True) dataset_type_expr = dataset_type or ... - collections_expr: tuple[str, ...] | EllipsisType = collections or ... + collections_expr: tuple[str, ...] = collections or ("*",) dataset_types = source_butler.registry.queryDatasetTypes(dataset_type_expr) source_refs: list[DatasetRef] = [] with source_butler._query() as query: - query_collections = source_butler.registry.queryCollections(collections_expr) + query_collections = source_butler.collections.query(collections_expr) # Loop over dataset types and accumulate. 
for dt in dataset_types: results = query.datasets(dt, collections=query_collections, find_first=find_first) diff --git a/tests/test_cliCmdPruneDatasets.py b/tests/test_cliCmdPruneDatasets.py index 137359d5d1..03f3e8ab38 100644 --- a/tests/test_cliCmdPruneDatasets.py +++ b/tests/test_cliCmdPruneDatasets.py @@ -35,7 +35,7 @@ import lsst.daf.butler.registry.sql_registry import lsst.daf.butler.script from astropy.table import Table -from lsst.daf.butler import CollectionType +from lsst.daf.butler import CollectionInfo, CollectionType from lsst.daf.butler.cli.butler import cli as butlerCli from lsst.daf.butler.cli.cmd.commands import ( pruneDatasets_askContinueMsg, @@ -400,9 +400,9 @@ def test_purgeNoOp(self): ) @patch.object( - lsst.daf.butler.registry.sql_registry.SqlRegistry, - "getCollectionType", - side_effect=lambda x: CollectionType.RUN, + lsst.daf.butler.direct_butler._direct_butler_collections.DirectButlerCollections, + "get_info", + side_effect=lambda x: CollectionInfo(name="run", type=CollectionType.RUN), ) def test_purgeImpliedArgs(self, mockGetCollectionType): """Verify the arguments implied by --purge. 
@@ -432,9 +432,9 @@ def test_purgeImpliedArgs(self, mockGetCollectionType): ) @patch.object( - lsst.daf.butler.registry.sql_registry.SqlRegistry, - "getCollectionType", - side_effect=lambda x: CollectionType.RUN, + lsst.daf.butler.direct_butler._direct_butler_collections.DirectButlerCollections, + "get_info", + side_effect=lambda x: CollectionInfo(name="run", type=CollectionType.RUN), ) def test_purgeImpliedArgsWithCollections(self, mockGetCollectionType): """Verify the arguments implied by --purge, with a COLLECTIONS.""" @@ -457,9 +457,9 @@ def test_purgeImpliedArgsWithCollections(self, mockGetCollectionType): ) @patch.object( - lsst.daf.butler.registry.sql_registry.SqlRegistry, - "getCollectionType", - side_effect=lambda x: CollectionType.TAGGED, + lsst.daf.butler.direct_butler._direct_butler_collections.DirectButlerCollections, + "get_info", + side_effect=lambda x: CollectionInfo(name="myTaggedCollection", type=CollectionType.TAGGED), ) def test_purgeOnNonRunCollection(self, mockGetCollectionType): """Verify calling run on a non-run collection fails with expected From 3c3d3b799c5967778590d7e210f2ee4cd71981ca Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 15 Aug 2024 11:13:25 -0700 Subject: [PATCH 06/26] Replace queryCollections call in export --- python/lsst/daf/butler/transfers/_context.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/lsst/daf/butler/transfers/_context.py b/python/lsst/daf/butler/transfers/_context.py index 37af23a58d..0c293c830d 100644 --- a/python/lsst/daf/butler/transfers/_context.py +++ b/python/lsst/daf/butler/transfers/_context.py @@ -358,11 +358,10 @@ def _computeDatasetAssociations(self) -> dict[str, list[DatasetAssociation]]: collectionTypes = {CollectionType.TAGGED} if datasetType.isCalibration(): collectionTypes.add(CollectionType.CALIBRATION) - resolved_collections = self._butler._registry.queryCollections( + resolved_collections = self._butler.collections.query( self._collections.keys(), - 
datasetType=datasetType, - collectionTypes=collectionTypes, - flattenChains=False, + collection_types=collectionTypes, + flatten_chains=False, ) with self._butler._query() as query: query = query.join_dataset_search(datasetType, resolved_collections) From 077b5e521e6ab45599d03ed32b1d041dc4d9ef6a Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 15 Aug 2024 12:55:16 -0700 Subject: [PATCH 07/26] Add butler.collections.register and use it --- python/lsst/daf/butler/_butler_collections.py | 27 +++++++++++++++++++ .../butler/direct_butler/_direct_butler.py | 2 +- .../_direct_butler_collections.py | 3 +++ .../_remote_butler_collections.py | 3 +++ python/lsst/daf/butler/script/_associate.py | 2 +- .../daf/butler/script/certifyCalibrations.py | 2 +- .../lsst/daf/butler/script/collectionChain.py | 2 +- python/lsst/daf/butler/tests/utils.py | 4 +-- 8 files changed, 39 insertions(+), 6 deletions(-) diff --git a/python/lsst/daf/butler/_butler_collections.py b/python/lsst/daf/butler/_butler_collections.py index c2e1df1120..68c91aa919 100644 --- a/python/lsst/daf/butler/_butler_collections.py +++ b/python/lsst/daf/butler/_butler_collections.py @@ -270,3 +270,30 @@ def get_info(self, name: str, include_doc: bool = False, include_parents: bool = Information on the requested collection. """ raise NotImplementedError() + + @abstractmethod + def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: str | None = None) -> bool: + """Add a new collection if one with the given name does not exist. + + Parameters + ---------- + name : `str` + The name of the collection to create. + type : `CollectionType`, optional + Enum value indicating the type of collection to create. Default + is to create a RUN collection. + doc : `str`, optional + Documentation string for the collection. + + Returns + ------- + registered : `bool` + Boolean indicating whether the collection was already registered + or was created by this call. 
+ + Notes + ----- + This method cannot be called within transactions, as it needs to be + able to perform its own transaction to be concurrent + """ + raise NotImplementedError() diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler.py b/python/lsst/daf/butler/direct_butler/_direct_butler.py index 71e22d045b..2465a026ba 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler.py @@ -1975,7 +1975,7 @@ def transfer_from( if registry := getattr(source_butler, "registry", None): run_doc = registry.getCollectionDocumentation(run) if not dry_run: - registered = self._registry.registerRun(run, doc=run_doc) + registered = self.collections.register(run, doc=run_doc) else: registered = True handled_collections.add(run) diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py index 3705e2d3e8..a2804a0c47 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py @@ -108,3 +108,6 @@ def get_info(self, name: str, include_doc: bool = False, include_parents: bool = if include_parents: parents = self._registry.getCollectionParentChains(name) return CollectionInfo(name=name, type=record.type, doc=doc, parents=parents, children=children) + + def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: str | None = None) -> bool: + return self._registry.registerCollection(name, type, doc) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py index ae572e701c..13baf3a12d 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py @@ -95,3 +95,6 @@ def get_info(self, name: str, include_doc: bool = False, include_parents: 
bool = children = info.children or () parents = info.parents or set() return CollectionInfo(name=name, type=info.type, doc=doc, parents=parents, children=children) + + def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: str | None = None) -> bool: + raise NotImplementedError("Not yet available.") diff --git a/python/lsst/daf/butler/script/_associate.py b/python/lsst/daf/butler/script/_associate.py index 74a4bb5baa..3e1f16e743 100644 --- a/python/lsst/daf/butler/script/_associate.py +++ b/python/lsst/daf/butler/script/_associate.py @@ -60,7 +60,7 @@ def associate( """ butler = Butler.from_config(repo, writeable=True, without_datastore=True) - butler.registry.registerCollection(collection, CollectionType.TAGGED) + butler.collections.register(collection, CollectionType.TAGGED) results = QueryDatasets( butler=butler, diff --git a/python/lsst/daf/butler/script/certifyCalibrations.py b/python/lsst/daf/butler/script/certifyCalibrations.py index 77759badf7..0eb2dfbe69 100644 --- a/python/lsst/daf/butler/script/certifyCalibrations.py +++ b/python/lsst/daf/butler/script/certifyCalibrations.py @@ -87,5 +87,5 @@ def certifyCalibrations( raise RuntimeError( f"No inputs found for dataset {dataset_type_name} in {input_collection}. 
{explanation}" ) - butler.registry.registerCollection(output_collection, type=CollectionType.CALIBRATION) + butler.collections.register(output_collection, type=CollectionType.CALIBRATION) butler.registry.certify(output_collection, refs, timespan) diff --git a/python/lsst/daf/butler/script/collectionChain.py b/python/lsst/daf/butler/script/collectionChain.py index 03054a22bb..8ccc9271ac 100644 --- a/python/lsst/daf/butler/script/collectionChain.py +++ b/python/lsst/daf/butler/script/collectionChain.py @@ -85,7 +85,7 @@ def collectionChain( if mode in ("redefine", "extend", "prepend"): if not doc: doc = None - butler.registry.registerCollection(parent, CollectionType.CHAINED, doc) + butler.collections.register(parent, CollectionType.CHAINED, doc) else: raise RuntimeError( f"Mode '{mode}' requires that the collection exists " diff --git a/python/lsst/daf/butler/tests/utils.py b/python/lsst/daf/butler/tests/utils.py index 8efbe446e5..81a898f5a7 100644 --- a/python/lsst/daf/butler/tests/utils.py +++ b/python/lsst/daf/butler/tests/utils.py @@ -281,7 +281,7 @@ def _do_init(self, butler: Butler, butlerConfigFile: str) -> None: # New datasets will be added to run and tag, but we will only look in # tag when looking up datasets. - self.butler.registry.registerCollection(self._DEFAULT_TAG, CollectionType.TAGGED) + self.butler.collections.register(self._DEFAULT_TAG, CollectionType.TAGGED) # Create and register a DatasetType self.datasetType = addDatasetType( @@ -351,7 +351,7 @@ def addDataset( A reference to the added dataset. 
""" if run: - self.butler.registry.registerCollection(run, type=CollectionType.RUN) + self.butler.collections.register(run) else: run = self._DEFAULT_RUN metric = self._makeExampleMetrics() From 8c7236b160000d03f594bb82b7d77f3e02fc65ed Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 15 Aug 2024 14:21:37 -0700 Subject: [PATCH 08/26] Change collections.query to collections.x_query for "experimental" --- python/lsst/daf/butler/_butler_collections.py | 2 +- .../daf/butler/direct_butler/_direct_butler_collections.py | 2 +- .../daf/butler/remote_butler/_remote_butler_collections.py | 2 +- python/lsst/daf/butler/script/collectionChain.py | 2 +- python/lsst/daf/butler/script/exportCalibs.py | 2 +- python/lsst/daf/butler/script/queryCollections.py | 6 +++--- python/lsst/daf/butler/script/queryDataIds.py | 2 +- python/lsst/daf/butler/script/queryDatasets.py | 2 +- python/lsst/daf/butler/script/queryDimensionRecords.py | 2 +- python/lsst/daf/butler/script/removeCollections.py | 2 +- python/lsst/daf/butler/script/removeRuns.py | 2 +- python/lsst/daf/butler/script/retrieveArtifacts.py | 2 +- python/lsst/daf/butler/script/transferDatasets.py | 2 +- python/lsst/daf/butler/transfers/_context.py | 2 +- 14 files changed, 16 insertions(+), 16 deletions(-) diff --git a/python/lsst/daf/butler/_butler_collections.py b/python/lsst/daf/butler/_butler_collections.py index 68c91aa919..814c4635a8 100644 --- a/python/lsst/daf/butler/_butler_collections.py +++ b/python/lsst/daf/butler/_butler_collections.py @@ -211,7 +211,7 @@ def remove_from_chain( raise NotImplementedError() @abstractmethod - def query( + def x_query( self, expression: str | Iterable[str], collection_types: Set[CollectionType] | CollectionType | None = None, diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py index a2804a0c47..9ce20a3efa 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py +++ 
b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py @@ -79,7 +79,7 @@ def remove_from_chain( parent_collection_name, list(ensure_iterable(child_collection_names)) ) - def query( + def x_query( self, expression: str | Iterable[str], collection_types: Set[CollectionType] | CollectionType | None = None, diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py index 13baf3a12d..4e4d195e62 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py @@ -71,7 +71,7 @@ def remove_from_chain( ) -> None: raise NotImplementedError("Not yet available") - def query( + def x_query( self, expression: str | Iterable[str], collection_types: Set[CollectionType] | CollectionType | None = None, diff --git a/python/lsst/daf/butler/script/collectionChain.py b/python/lsst/daf/butler/script/collectionChain.py index 8ccc9271ac..463fc35aa1 100644 --- a/python/lsst/daf/butler/script/collectionChain.py +++ b/python/lsst/daf/butler/script/collectionChain.py @@ -95,7 +95,7 @@ def collectionChain( if flatten: if mode not in ("redefine", "prepend", "extend"): raise RuntimeError(f"'flatten' flag is not allowed for {mode}") - children = butler.collections.query(children, flatten_chains=True) + children = butler.collections.x_query(children, flatten_chains=True) _modify_collection_chain(butler, mode, parent, children) diff --git a/python/lsst/daf/butler/script/exportCalibs.py b/python/lsst/daf/butler/script/exportCalibs.py index 6d2a9e8233..cddd0f3cfd 100644 --- a/python/lsst/daf/butler/script/exportCalibs.py +++ b/python/lsst/daf/butler/script/exportCalibs.py @@ -133,7 +133,7 @@ def exportCalibs( collectionsToExport = [] datasetsToExport = [] - for collection in butler.collections.query( + for collection in butler.collections.x_query( collections_query, flatten_chains=True, include_chains=True, diff 
--git a/python/lsst/daf/butler/script/queryCollections.py b/python/lsst/daf/butler/script/queryCollections.py index 82d2b6f1ce..43cee5d2ab 100644 --- a/python/lsst/daf/butler/script/queryCollections.py +++ b/python/lsst/daf/butler/script/queryCollections.py @@ -71,7 +71,7 @@ def _getTable( dtype=(str, str, str), ) butler = Butler.from_config(repo) - names = sorted(butler.collections.query(glob or "*", collection_types=frozenset(collection_type))) + names = sorted(butler.collections.x_query(glob or "*", collection_types=frozenset(collection_type))) if inverse: for name in names: info = butler.collections.get_info(name, include_parents=True) @@ -153,7 +153,7 @@ def addCollection(name: str, level: int = 0) -> None: for name in info.children: addCollection(name, level + 1) - collections = butler.collections.query(glob or "*", collection_types=frozenset(collection_type)) + collections = butler.collections.x_query(glob or "*", collection_types=frozenset(collection_type)) for collection in sorted(collections): addCollection(collection) return table @@ -166,7 +166,7 @@ def _getFlatten( ) -> Table: butler = Butler.from_config(repo) collectionNames = list( - butler.collections.query( + butler.collections.x_query( glob or "*", collection_types=frozenset(collection_type), flatten_chains=True ) ) diff --git a/python/lsst/daf/butler/script/queryDataIds.py b/python/lsst/daf/butler/script/queryDataIds.py index dd75eb4824..a70aba1887 100644 --- a/python/lsst/daf/butler/script/queryDataIds.py +++ b/python/lsst/daf/butler/script/queryDataIds.py @@ -170,7 +170,7 @@ def queryDataIds( # Need to constrain results based on dataset type and collection. 
query_collections = collections or "*" - expanded_collections = butler.collections.query(query_collections) + expanded_collections = butler.collections.x_query(query_collections) sub_query = query.join_dataset_search(dataset_types.pop(0), collections=expanded_collections) for dt in dataset_types: diff --git a/python/lsst/daf/butler/script/queryDatasets.py b/python/lsst/daf/butler/script/queryDatasets.py index 325d2bb8e4..c95e56fad9 100644 --- a/python/lsst/daf/butler/script/queryDatasets.py +++ b/python/lsst/daf/butler/script/queryDatasets.py @@ -220,7 +220,7 @@ def getDatasets(self) -> Iterator[DatasetRef]: # query each time. dataset_types = self.butler.registry.queryDatasetTypes(datasetTypes) with self.butler._query() as query: - query_collections = self.butler.collections.query(query_collections) + query_collections = self.butler.collections.x_query(query_collections) # Accumulate over dataset types. for dt in dataset_types: results = query.datasets(dt, collections=query_collections, find_first=self._find_first) diff --git a/python/lsst/daf/butler/script/queryDimensionRecords.py b/python/lsst/daf/butler/script/queryDimensionRecords.py index 074efac915..11470e8f17 100644 --- a/python/lsst/daf/butler/script/queryDimensionRecords.py +++ b/python/lsst/daf/butler/script/queryDimensionRecords.py @@ -80,7 +80,7 @@ def queryDimensionRecords( if datasets: query_collections = collections or "*" - expanded_collections = butler.collections.query(query_collections) + expanded_collections = butler.collections.x_query(query_collections) dataset_types = list(butler.registry.queryDatasetTypes(datasets)) sub_query = query.join_dataset_search(dataset_types.pop(0), collections=expanded_collections) diff --git a/python/lsst/daf/butler/script/removeCollections.py b/python/lsst/daf/butler/script/removeCollections.py index df3f41b106..d6962791aa 100644 --- a/python/lsst/daf/butler/script/removeCollections.py +++ b/python/lsst/daf/butler/script/removeCollections.py @@ -85,7 +85,7 @@ def 
_getCollectionInfo( """ butler = Butler.from_config(repo, without_datastore=True) try: - names = sorted(butler.collections.query(collection, include_chains=True)) + names = sorted(butler.collections.x_query(collection, include_chains=True)) except MissingCollectionError: # Hide the error and act like no collections should be removed. names = [] diff --git a/python/lsst/daf/butler/script/removeRuns.py b/python/lsst/daf/butler/script/removeRuns.py index 50ad5b9b2b..9daea8b55e 100644 --- a/python/lsst/daf/butler/script/removeRuns.py +++ b/python/lsst/daf/butler/script/removeRuns.py @@ -87,7 +87,7 @@ def _getCollectionInfo( """ butler = Butler.from_config(repo) try: - collectionNames = butler.collections.query(collection, CollectionType.RUN, include_chains=False) + collectionNames = butler.collections.x_query(collection, CollectionType.RUN, include_chains=False) except MissingCollectionError: # Act as if no collections matched. collectionNames = [] diff --git a/python/lsst/daf/butler/script/retrieveArtifacts.py b/python/lsst/daf/butler/script/retrieveArtifacts.py index ca25999344..f9abc50669 100644 --- a/python/lsst/daf/butler/script/retrieveArtifacts.py +++ b/python/lsst/daf/butler/script/retrieveArtifacts.py @@ -93,7 +93,7 @@ def retrieveArtifacts( dataset_types = butler.registry.queryDatasetTypes(query_types) refs: list[DatasetRef] = [] with butler._query() as query: - expanded_collections = butler.collections.query(query_collections) + expanded_collections = butler.collections.x_query(query_collections) for dt in dataset_types: results = query.datasets(dt, collections=expanded_collections, find_first=find_first) if where: diff --git a/python/lsst/daf/butler/script/transferDatasets.py b/python/lsst/daf/butler/script/transferDatasets.py index e5e390bc88..5060065bad 100644 --- a/python/lsst/daf/butler/script/transferDatasets.py +++ b/python/lsst/daf/butler/script/transferDatasets.py @@ -83,7 +83,7 @@ def transferDatasets( dataset_types = 
source_butler.registry.queryDatasetTypes(dataset_type_expr) source_refs: list[DatasetRef] = [] with source_butler._query() as query: - query_collections = source_butler.collections.query(collections_expr) + query_collections = source_butler.collections.x_query(collections_expr) # Loop over dataset types and accumulate. for dt in dataset_types: results = query.datasets(dt, collections=query_collections, find_first=find_first) diff --git a/python/lsst/daf/butler/transfers/_context.py b/python/lsst/daf/butler/transfers/_context.py index 0c293c830d..af1a80d551 100644 --- a/python/lsst/daf/butler/transfers/_context.py +++ b/python/lsst/daf/butler/transfers/_context.py @@ -358,7 +358,7 @@ def _computeDatasetAssociations(self) -> dict[str, list[DatasetAssociation]]: collectionTypes = {CollectionType.TAGGED} if datasetType.isCalibration(): collectionTypes.add(CollectionType.CALIBRATION) - resolved_collections = self._butler.collections.query( + resolved_collections = self._butler.collections.x_query( self._collections.keys(), collection_types=collectionTypes, flatten_chains=False, From 5c500572165ea195c924dc605b0eeab5561b4d5c Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 15 Aug 2024 14:40:25 -0700 Subject: [PATCH 09/26] Add butler.collections.x_remove --- python/lsst/daf/butler/_butler_collections.py | 33 +++++++++++++++++++ .../_direct_butler_collections.py | 3 ++ .../_remote_butler_collections.py | 3 ++ .../daf/butler/script/removeCollections.py | 2 +- 4 files changed, 40 insertions(+), 1 deletion(-) diff --git a/python/lsst/daf/butler/_butler_collections.py b/python/lsst/daf/butler/_butler_collections.py index 814c4635a8..969c05f55d 100644 --- a/python/lsst/daf/butler/_butler_collections.py +++ b/python/lsst/daf/butler/_butler_collections.py @@ -297,3 +297,36 @@ def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: st able to perform its own transaction to be concurrent """ raise NotImplementedError() + + @abstractmethod + def 
x_remove(self, name: str) -> None: + """Remove the given collection from the registry. + + Parameters + ---------- + name : `str` + The name of the collection to remove. + + Raises + ------ + lsst.daf.butler.registry.MissingCollectionError + Raised if no collection with the given name exists. + sqlalchemy.exc.IntegrityError + Raised if the database rows associated with the collection are + still referenced by some other table, such as a dataset in a + datastore (for `~CollectionType.RUN` collections only) or a + `~CollectionType.CHAINED` collection of which this collection is + a child. + + Notes + ----- + If this is a `~CollectionType.RUN` collection, all datasets and quanta + in it will removed from the `Registry` database. This requires that + those datasets be removed (or at least trashed) from any datastores + that hold them first. + + A collection may not be deleted as long as it is referenced by a + `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must + be deleted or redefined first. 
+ """ + raise NotImplementedError() diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py index 9ce20a3efa..ab4b0665bc 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py @@ -111,3 +111,6 @@ def get_info(self, name: str, include_doc: bool = False, include_parents: bool = def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: str | None = None) -> bool: return self._registry.registerCollection(name, type, doc) + + def x_remove(self, name: str) -> None: + self._registry.removeCollection(name) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py index 4e4d195e62..3d052b9aa7 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py @@ -98,3 +98,6 @@ def get_info(self, name: str, include_doc: bool = False, include_parents: bool = def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: str | None = None) -> bool: raise NotImplementedError("Not yet available.") + + def x_remove(self, name: str) -> None: + raise NotImplementedError("Not yet available.") diff --git a/python/lsst/daf/butler/script/removeCollections.py b/python/lsst/daf/butler/script/removeCollections.py index d6962791aa..1ae062cac1 100644 --- a/python/lsst/daf/butler/script/removeCollections.py +++ b/python/lsst/daf/butler/script/removeCollections.py @@ -126,7 +126,7 @@ def _doRemove(collections: Table) -> None: """Perform the prune collection step.""" butler = Butler.from_config(repo, writeable=True, without_datastore=True) for name in collections["Collection"]: - butler.registry.removeCollection(name) + butler.collections.x_remove(name) result = RemoveCollectionResult( 
onConfirmation=partial(_doRemove, collectionInfo.nonRunCollections), From 970f5890d4174ceb9b3bb4458bca649a6e36bc4f Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 15 Aug 2024 16:00:37 -0700 Subject: [PATCH 10/26] Add hybrid butler collections class for hybrid butler --- .../lsst/daf/butler/script/collectionChain.py | 10 +- python/lsst/daf/butler/script/removeRuns.py | 2 +- .../lsst/daf/butler/tests/butler_queries.py | 6 +- python/lsst/daf/butler/tests/hybrid_butler.py | 5 +- .../butler/tests/hybrid_butler_collections.py | 103 ++++++++++++++++++ tests/test_butler.py | 38 +++---- 6 files changed, 134 insertions(+), 30 deletions(-) create mode 100644 python/lsst/daf/butler/tests/hybrid_butler_collections.py diff --git a/python/lsst/daf/butler/script/collectionChain.py b/python/lsst/daf/butler/script/collectionChain.py index 463fc35aa1..1857507f57 100644 --- a/python/lsst/daf/butler/script/collectionChain.py +++ b/python/lsst/daf/butler/script/collectionChain.py @@ -104,16 +104,16 @@ def collectionChain( def _modify_collection_chain(butler: Butler, mode: str, parent: str, children: Iterable[str]) -> None: if mode == "prepend": - butler.collection_chains.prepend_chain(parent, children) + butler.collections.prepend_chain(parent, children) elif mode == "redefine": - butler.collection_chains.redefine_chain(parent, children) + butler.collections.redefine_chain(parent, children) elif mode == "remove": - butler.collection_chains.remove_from_chain(parent, children) + butler.collections.remove_from_chain(parent, children) elif mode == "pop": children_to_pop = _find_children_to_pop(butler, parent, children) - butler.collection_chains.remove_from_chain(parent, children_to_pop) + butler.collections.remove_from_chain(parent, children_to_pop) elif mode == "extend": - butler.collection_chains.extend_chain(parent, children) + butler.collections.extend_chain(parent, children) else: raise ValueError(f"Unrecognized update mode: '{mode}'") diff --git 
a/python/lsst/daf/butler/script/removeRuns.py b/python/lsst/daf/butler/script/removeRuns.py index 9daea8b55e..bf17c2f1d5 100644 --- a/python/lsst/daf/butler/script/removeRuns.py +++ b/python/lsst/daf/butler/script/removeRuns.py @@ -134,7 +134,7 @@ def _doRemove(runs: Sequence[RemoveRun]) -> None: with butler.transaction(): for run in runs: for parent in run.parents: - butler.collection_chains.remove_from_chain(parent, run.name) + butler.collections.remove_from_chain(parent, run.name) butler.removeRuns([r.name for r in runs], unstore=True) result = RemoveRunsResult( diff --git a/python/lsst/daf/butler/tests/butler_queries.py b/python/lsst/daf/butler/tests/butler_queries.py index b641bafb84..f39cd89830 100644 --- a/python/lsst/daf/butler/tests/butler_queries.py +++ b/python/lsst/daf/butler/tests/butler_queries.py @@ -434,9 +434,9 @@ def test_dataset_constrained_record_query(self) -> None: """ butler = self.make_butler("base.yaml", "datasets.yaml") butler.registry.insertDimensionData("instrument", {"name": "Cam2"}) - butler.registry.registerCollection("empty", CollectionType.RUN) - butler.registry.registerCollection("chain", CollectionType.CHAINED) - butler.registry.setCollectionChain("chain", ["imported_g", "empty", "imported_r"]) + butler.collections.register("empty", CollectionType.RUN) + butler.collections.register("chain", CollectionType.CHAINED) + butler.collections.redefine_chain("chain", ["imported_g", "empty", "imported_r"]) with butler._query() as query: # No collections here or in defaults is an error. 
with self.assertRaises(NoDefaultCollectionError): diff --git a/python/lsst/daf/butler/tests/hybrid_butler.py b/python/lsst/daf/butler/tests/hybrid_butler.py index 9c66f1af1b..86c9c05c74 100644 --- a/python/lsst/daf/butler/tests/hybrid_butler.py +++ b/python/lsst/daf/butler/tests/hybrid_butler.py @@ -57,6 +57,7 @@ from ..registry import Registry from ..remote_butler import RemoteButler from ..transfers import RepoExportContext +from .hybrid_butler_collections import HybridButlerCollections from .hybrid_butler_registry import HybridButlerRegistry @@ -454,8 +455,8 @@ def _extract_all_dimension_records_from_data_ids( @property def collection_chains(self) -> ButlerCollections: - return self._direct_butler.collection_chains + return HybridButlerCollections(self) @property def collections(self) -> ButlerCollections: - return self._remote_butler.collection_chains + return HybridButlerCollections(self) diff --git a/python/lsst/daf/butler/tests/hybrid_butler_collections.py b/python/lsst/daf/butler/tests/hybrid_butler_collections.py new file mode 100644 index 0000000000..b89fd30f10 --- /dev/null +++ b/python/lsst/daf/butler/tests/hybrid_butler_collections.py @@ -0,0 +1,103 @@ +# This file is part of daf_butler. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This software is dual licensed under the GNU General Public License and also +# under a 3-clause BSD license. Recipients may choose which of these licenses +# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, +# respectively. 
If you choose the GPL option then the following text applies +# (but note that there is still no warranty even if you opt for BSD instead): +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from __future__ import annotations + +__all__ = ("HybridButlerCollections",) + +from collections.abc import Iterable, Sequence, Set +from typing import TYPE_CHECKING + +from .._butler_collections import ButlerCollections, CollectionInfo +from .._collection_type import CollectionType + +if TYPE_CHECKING: + from .hybrid_butler import HybridButler + + +class HybridButlerCollections(ButlerCollections): + """Implementation of ButlerCollections for HybridButler. + + Parameters + ---------- + butler : `~lsst.daf.butler.tests.hybrid_butler.HybridButler` + Hybrid butler to use. 
+ """ + + def __init__(self, butler: HybridButler): + self._hybrid = butler + + @property + def defaults(self) -> Sequence[str]: + return self._hybrid._remote_butler.collections.defaults + + def extend_chain(self, parent_collection_name: str, child_collection_names: str | Iterable[str]) -> None: + return self._hybrid._direct_butler.collections.extend_chain( + parent_collection_name, child_collection_names + ) + + def prepend_chain(self, parent_collection_name: str, child_collection_names: str | Iterable[str]) -> None: + return self._hybrid._direct_butler.collections.prepend_chain( + parent_collection_name, child_collection_names + ) + + def redefine_chain( + self, parent_collection_name: str, child_collection_names: str | Iterable[str] + ) -> None: + self._hybrid._direct_butler.collections.redefine_chain(parent_collection_name, child_collection_names) + + def remove_from_chain( + self, parent_collection_name: str, child_collection_names: str | Iterable[str] + ) -> None: + return self._hybrid._direct_butler.collections.remove_from_chain( + parent_collection_name, child_collection_names + ) + + def x_query( + self, + expression: str | Iterable[str], + collection_types: Set[CollectionType] | CollectionType | None = None, + flatten_chains: bool = False, + include_chains: bool | None = None, + ) -> Sequence[str]: + return self._hybrid._remote_butler.collections.x_query( + expression, + collection_types=collection_types, + flatten_chains=flatten_chains, + include_chains=include_chains, + ) + + def get_info(self, name: str, include_doc: bool = False, include_parents: bool = False) -> CollectionInfo: + return self._hybrid._remote_butler.collections.get_info( + name, include_doc=include_doc, include_parents=include_parents + ) + + def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: str | None = None) -> bool: + return self._hybrid._direct_butler.collections.register(name, type=type, doc=doc) + + def x_remove(self, name: str) -> None: + 
self._hybrid._direct_butler.collections.x_remove(name) diff --git a/tests/test_butler.py b/tests/test_butler.py index f49173dcd3..d395d24f2a 100644 --- a/tests/test_butler.py +++ b/tests/test_butler.py @@ -1390,85 +1390,85 @@ def testGetDatasetCollectionCaching(self): def testCollectionChainRedefine(self): butler = self._setup_to_test_collection_chain() - butler.collection_chains.redefine_chain("chain", "a") + butler.collections.redefine_chain("chain", "a") self._check_chain(butler, ["a"]) # Duplicates are removed from the list of children - butler.collection_chains.redefine_chain("chain", ["c", "b", "c"]) + butler.collections.redefine_chain("chain", ["c", "b", "c"]) self._check_chain(butler, ["c", "b"]) # Empty list clears the chain - butler.collection_chains.redefine_chain("chain", []) + butler.collections.redefine_chain("chain", []) self._check_chain(butler, []) - self._test_common_chain_functionality(butler, butler.collection_chains.redefine_chain) + self._test_common_chain_functionality(butler, butler.collections.redefine_chain) def testCollectionChainPrepend(self): butler = self._setup_to_test_collection_chain() # Duplicates are removed from the list of children - butler.collection_chains.prepend_chain("chain", ["c", "b", "c"]) + butler.collections.prepend_chain("chain", ["c", "b", "c"]) self._check_chain(butler, ["c", "b"]) # Prepend goes on the front of existing chain - butler.collection_chains.prepend_chain("chain", ["a"]) + butler.collections.prepend_chain("chain", ["a"]) self._check_chain(butler, ["a", "c", "b"]) # Empty prepend does nothing - butler.collection_chains.prepend_chain("chain", []) + butler.collections.prepend_chain("chain", []) self._check_chain(butler, ["a", "c", "b"]) # Prepending children that already exist in the chain removes them from # their current position. 
- butler.collection_chains.prepend_chain("chain", ["d", "b", "c"]) + butler.collections.prepend_chain("chain", ["d", "b", "c"]) self._check_chain(butler, ["d", "b", "c", "a"]) - self._test_common_chain_functionality(butler, butler.collection_chains.prepend_chain) + self._test_common_chain_functionality(butler, butler.collections.prepend_chain) def testCollectionChainExtend(self): butler = self._setup_to_test_collection_chain() # Duplicates are removed from the list of children - butler.collection_chains.extend_chain("chain", ["c", "b", "c"]) + butler.collections.extend_chain("chain", ["c", "b", "c"]) self._check_chain(butler, ["c", "b"]) # Extend goes on the end of existing chain - butler.collection_chains.extend_chain("chain", ["a"]) + butler.collections.extend_chain("chain", ["a"]) self._check_chain(butler, ["c", "b", "a"]) # Empty extend does nothing - butler.collection_chains.extend_chain("chain", []) + butler.collections.extend_chain("chain", []) self._check_chain(butler, ["c", "b", "a"]) # Extending children that already exist in the chain removes them from # their current position. 
- butler.collection_chains.extend_chain("chain", ["d", "b", "c"]) + butler.collections.extend_chain("chain", ["d", "b", "c"]) self._check_chain(butler, ["a", "d", "b", "c"]) - self._test_common_chain_functionality(butler, butler.collection_chains.extend_chain) + self._test_common_chain_functionality(butler, butler.collections.extend_chain) def testCollectionChainRemove(self) -> None: butler = self._setup_to_test_collection_chain() butler.registry.setCollectionChain("chain", ["a", "b", "c", "d"]) - butler.collection_chains.remove_from_chain("chain", "c") + butler.collections.remove_from_chain("chain", "c") self._check_chain(butler, ["a", "b", "d"]) # Duplicates are allowed in the list of children - butler.collection_chains.remove_from_chain("chain", ["b", "b", "a"]) + butler.collections.remove_from_chain("chain", ["b", "b", "a"]) self._check_chain(butler, ["d"]) # Empty remove does nothing - butler.collection_chains.remove_from_chain("chain", []) + butler.collections.remove_from_chain("chain", []) self._check_chain(butler, ["d"]) # Removing children that aren't in the chain does nothing - butler.collection_chains.remove_from_chain("chain", ["a", "chain"]) + butler.collections.remove_from_chain("chain", ["a", "chain"]) self._check_chain(butler, ["d"]) self._test_common_chain_functionality( - butler, butler.collection_chains.remove_from_chain, skip_cycle_check=True + butler, butler.collections.remove_from_chain, skip_cycle_check=True ) def _setup_to_test_collection_chain(self) -> Butler: From ebbc097db35bac93d03d199b8bc5762f26dca352 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 15 Aug 2024 16:28:53 -0700 Subject: [PATCH 11/26] Use new collections API in test_butler.py --- tests/test_butler.py | 54 ++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/test_butler.py b/tests/test_butler.py index d395d24f2a..f29a3148ae 100644 --- a/tests/test_butler.py +++ b/tests/test_butler.py @@ -225,7 +225,7 @@ def 
create_butler( """ butler = self.create_empty_butler(run=run) - collections = set(butler.registry.queryCollections()) + collections = set(butler.collections.x_query("*")) self.assertEqual(collections, {run}) # Create and register a DatasetType dimensions = butler.dimensions.conform(["instrument", "visit"]) @@ -301,7 +301,7 @@ def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> But # this by using a distinct run collection each time counter += 1 this_run = f"put_run_{counter}" - butler.registry.registerCollection(this_run, type=CollectionType.RUN) + butler.collections.register(this_run) expected_collections.update({this_run}) with self.subTest(args=args): @@ -418,7 +418,7 @@ def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> But # Do explicit registry removal since we know they are # empty - butler.registry.removeCollection(this_run) + butler.collections.x_remove(this_run) expected_collections.remove(this_run) # Create DatasetRef for put using default run. @@ -502,7 +502,7 @@ def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> But butler.get(ref, parameters={"unsupported": True}) # Check we have a collection - collections = set(butler.registry.queryCollections()) + collections = set(butler.collections.x_query("*")) self.assertEqual(collections, expected_collections) # Clean up to check that we can remove something that may have @@ -575,9 +575,9 @@ def testDeferredCollectionPassing(self) -> None: metric = makeExampleMetrics() # Register a new run and put dataset. run = "deferred" - self.assertTrue(butler.registry.registerRun(run)) + self.assertTrue(butler.collections.register(run)) # Second time it will be allowed but indicate no-op - self.assertFalse(butler.registry.registerRun(run)) + self.assertFalse(butler.collections.register(run)) ref = butler.put(metric, datasetType, dataId, run=run) # Putting with no run should fail with TypeError. 
with self.assertRaises(CollectionError): @@ -594,7 +594,7 @@ def testDeferredCollectionPassing(self) -> None: with self.assertRaises(CollectionError): butler.get(datasetType, dataId) # Associate the dataset with a different collection. - butler.registry.registerCollection("tagged") + butler.collections.register("tagged", type=CollectionType.TAGGED) butler.registry.associate("tagged", [ref]) # Deleting the dataset from the new collection should make it findable # in the original collection. @@ -642,7 +642,7 @@ def testConstructor(self) -> None: butler = Butler.from_config(ResourcePath(config_dir, forceDirectory=True), run=self.default_run) self.assertIsInstance(butler, Butler) - collections = set(butler.registry.queryCollections()) + collections = set(butler.collections.x_query("*")) self.assertEqual(collections, {self.default_run}) # Check that some special characters can be included in run name. @@ -1450,7 +1450,7 @@ def testCollectionChainExtend(self): def testCollectionChainRemove(self) -> None: butler = self._setup_to_test_collection_chain() - butler.registry.setCollectionChain("chain", ["a", "b", "c", "d"]) + butler.collections.redefine_chain("chain", ["a", "b", "c", "d"]) butler.collections.remove_from_chain("chain", "c") self._check_chain(butler, ["a", "b", "d"]) @@ -1474,19 +1474,19 @@ def testCollectionChainRemove(self) -> None: def _setup_to_test_collection_chain(self) -> Butler: butler = self.create_empty_butler(writeable=True) - butler.registry.registerCollection("chain", CollectionType.CHAINED) + butler.collections.register("chain", CollectionType.CHAINED) runs = ["a", "b", "c", "d"] for run in runs: - butler.registry.registerCollection(run) + butler.collections.register(run) - butler.registry.registerCollection("staticchain", CollectionType.CHAINED) - butler.registry.setCollectionChain("staticchain", ["a", "b"]) + butler.collections.register("staticchain", CollectionType.CHAINED) + butler.collections.redefine_chain("staticchain", ["a", "b"]) return 
butler def _check_chain(self, butler: Butler, expected: list[str]) -> None: - children = butler.registry.getCollectionChain("chain") + children = butler.collections.get_info("chain").children self.assertEqual(expected, list(children)) def _test_common_chain_functionality( @@ -1504,14 +1504,14 @@ def _test_common_chain_functionality( # Prevent collection cycles if not skip_cycle_check: - butler.registry.registerCollection("chain2", CollectionType.CHAINED) + butler.collections.register("chain2", CollectionType.CHAINED) func("chain2", "chain") with self.assertRaises(CollectionCycleError): func("chain", "chain2") # Make sure none of the earlier operations interfered with unrelated # chains. - self.assertEqual(["a", "b"], list(butler.registry.getCollectionChain("staticchain"))) + self.assertEqual(["a", "b"], list(butler.collections.get_info("staticchain").children)) with butler._caching_context(): with self.assertRaisesRegex(RuntimeError, "Chained collection modification not permitted"): @@ -1706,9 +1706,9 @@ def testRemoveRuns(self) -> None: butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) # Add some RUN-type collection. run1 = "run1" - butler.registry.registerRun(run1) + butler.collections.register(run1) run2 = "run2" - butler.registry.registerRun(run2) + butler.collections.register(run2) # put a dataset in each metric = makeExampleMetrics() dimensions = butler.dimensions.conform(["instrument", "physical_filter"]) @@ -1729,9 +1729,9 @@ def testRemoveRuns(self) -> None: # Should be nothing in registry for either one, and datastore should # not think either exists. 
with self.assertRaises(MissingCollectionError): - butler.registry.getCollectionType(run1) + butler.collections.get_info(run1) with self.assertRaises(MissingCollectionError): - butler.registry.getCollectionType(run2) + butler.collections.get_info(run1) self.assertFalse(butler.stored(ref1)) self.assertFalse(butler.stored(ref2)) # The ref we unstored should be gone according to the URI, but the @@ -1767,9 +1767,9 @@ def testPruneDatasets(self) -> None: butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) # Add some RUN-type collections. run1 = "run1" - butler.registry.registerRun(run1) + butler.collections.register(run1) run2 = "run2" - butler.registry.registerRun(run2) + butler.collections.register(run2) # put some datasets. ref1 and ref2 have the same data ID, and are in # different runs. ref3 has a different data ID. metric = makeExampleMetrics() @@ -2464,7 +2464,7 @@ def _absolute_transfer(self, transfer: str) -> None: storageClass = self.storageClassFactory.getStorageClass(storageClassName) datasetTypeName = "random_data" run = "run1" - self.source_butler.registry.registerCollection(run, CollectionType.RUN) + self.source_butler.collections.register(run) dimensions = self.source_butler.dimensions.conform(()) datasetType = DatasetType(datasetTypeName, dimensions, storageClass) @@ -2513,7 +2513,7 @@ def assertButlerTransfers( # Create the run collections in the source butler. for run in runs: - self.source_butler.registry.registerCollection(run, CollectionType.RUN) + self.source_butler.collections.register(run) # Create dimensions in source butler. n_exposures = 30 @@ -2746,7 +2746,7 @@ def assertButlerTransfers( # Now prune run2 collection and create instead a CHAINED collection. # This should block the transfer. 
self.target_butler.removeRuns(["run2"], unstore=True) - self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED) + self.target_butler.collections.register("run2", CollectionType.CHAINED) with self.assertRaises(CollectionTypeError): # Re-importing the run1 datasets can be problematic if they # use integer IDs so filter those out. @@ -2797,8 +2797,8 @@ def test_fallback(self) -> None: self.assertIsInstance(butler._datastore, NullDatastore) # Check that registry is working. - butler.registry.registerRun("MYRUN") - collections = butler.registry.queryCollections(...) + butler.collections.register("MYRUN") + collections = butler.collections.x_query("*") self.assertIn("MYRUN", set(collections)) # Create a ref. From 9027b0aa096cdf8f391f9086fb410ca024c532cd Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 16 Aug 2024 09:12:17 -0700 Subject: [PATCH 12/26] Remove __eq__ from ButlerCollections and fix test --- python/lsst/daf/butler/_butler_collections.py | 8 +------- tests/test_butler.py | 2 +- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/python/lsst/daf/butler/_butler_collections.py b/python/lsst/daf/butler/_butler_collections.py index 969c05f55d..418a465ed5 100644 --- a/python/lsst/daf/butler/_butler_collections.py +++ b/python/lsst/daf/butler/_butler_collections.py @@ -31,7 +31,7 @@ from abc import ABC, abstractmethod from collections.abc import Iterable, Sequence, Set -from typing import Any, overload +from typing import overload from pydantic import BaseModel @@ -68,12 +68,6 @@ def __getitem__(self, index: int | slice) -> str | Sequence[str]: def __len__(self) -> int: return len(self.defaults) - def __eq__(self, other: Any) -> bool: - # Do not try to compare registry instances. 
- if not isinstance(other, type(self)): - return False - return self.defaults == other.defaults - @property @abstractmethod def defaults(self) -> Sequence[str]: diff --git a/tests/test_butler.py b/tests/test_butler.py index f29a3148ae..92604ada75 100644 --- a/tests/test_butler.py +++ b/tests/test_butler.py @@ -1092,7 +1092,7 @@ def testPickle(self) -> None: butlerOut = pickle.loads(pickle.dumps(butler)) self.assertIsInstance(butlerOut, Butler) self.assertEqual(butlerOut._config, butler._config) - self.assertEqual(butlerOut.collections, butler.collections) + self.assertEqual(list(butlerOut.collections.defaults), list(butler.collections.defaults)) self.assertEqual(butlerOut.run, butler.run) def testGetDatasetTypes(self) -> None: From 4cc657e5691f76c12db2db4f1b2594f7832c6827 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 16 Aug 2024 10:20:24 -0700 Subject: [PATCH 13/26] Allow CollectionInfo to be sorted by name --- python/lsst/daf/butler/_butler_collections.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/lsst/daf/butler/_butler_collections.py b/python/lsst/daf/butler/_butler_collections.py index 418a465ed5..9c9ff6a8bf 100644 --- a/python/lsst/daf/butler/_butler_collections.py +++ b/python/lsst/daf/butler/_butler_collections.py @@ -31,7 +31,7 @@ from abc import ABC, abstractmethod from collections.abc import Iterable, Sequence, Set -from typing import overload +from typing import Any, overload from pydantic import BaseModel @@ -52,6 +52,12 @@ class CollectionInfo(BaseModel): parents: frozenset[str] = frozenset() """Any parents of this collection.""" + def __lt__(self, other: Any) -> bool: + """Compare objects by collection name.""" + if not isinstance(other, type(self)): + return NotImplemented + return self.name < other.name + class ButlerCollections(ABC, Sequence): """Methods for working with collections stored in the Butler.""" From b234be3c8e94469d2bd7592480786337208e6db1 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: 
Fri, 16 Aug 2024 11:09:03 -0700 Subject: [PATCH 14/26] Add collections.x_query_info and simplify some usage of x_query --- python/lsst/daf/butler/_butler_collections.py | 60 ++++++++++++++++++- .../_direct_butler_collections.py | 23 +++++++ .../_remote_butler_collections.py | 14 +++-- python/lsst/daf/butler/script/exportCalibs.py | 22 +++---- .../daf/butler/script/queryCollections.py | 48 ++++++++------- .../daf/butler/script/removeCollections.py | 11 ++-- python/lsst/daf/butler/script/removeRuns.py | 13 ++-- .../butler/tests/hybrid_butler_collections.py | 8 ++- 8 files changed, 147 insertions(+), 52 deletions(-) diff --git a/python/lsst/daf/butler/_butler_collections.py b/python/lsst/daf/butler/_butler_collections.py index 9c9ff6a8bf..75de1d3b49 100644 --- a/python/lsst/daf/butler/_butler_collections.py +++ b/python/lsst/daf/butler/_butler_collections.py @@ -210,7 +210,6 @@ def remove_from_chain( """ raise NotImplementedError() - @abstractmethod def x_query( self, expression: str | Iterable[str], @@ -220,6 +219,8 @@ def x_query( ) -> Sequence[str]: """Query the butler for collections matching an expression. + **This is an experimental interface that can change at any time.** + Parameters ---------- expression : `str` or `~collections.abc.Iterable` [ `str` ] @@ -240,6 +241,61 @@ def x_query( collections : `~collections.abc.Sequence` [ `str` ] The names of collections that match ``expression``. + Notes + ----- + The order in which collections are returned is unspecified, except that + the children of a `~CollectionType.CHAINED` collection are guaranteed + to be in the order in which they are searched. When multiple parent + `~CollectionType.CHAINED` collections match the same criteria, the + order in which the two lists appear is unspecified, and the lists of + children may be incomplete if a child has multiple parents. + + The default implementation is a wrapper around `x_query_info`. 
+ """ + collections_info = self.x_query_info( + expression, + collection_types=collection_types, + flatten_chains=flatten_chains, + include_chains=include_chains, + ) + return [info.name for info in collections_info] + + @abstractmethod + def x_query_info( + self, + expression: str | Iterable[str], + collection_types: Set[CollectionType] | CollectionType | None = None, + flatten_chains: bool = False, + include_chains: bool | None = None, + include_parents: bool = False, + ) -> Sequence[CollectionInfo]: + """Query the butler for collections matching an expression and + return detailed information about those collections. + + **This is an experimental interface that can change at any time.** + + Parameters + ---------- + expression : `str` or `~collections.abc.Iterable` [ `str` ] + One or more collection names or globs to include in the search. + collection_types : `set` [`CollectionType`], `CollectionType` or `None` + Restrict the types of collections to be searched. If `None` all + collection types are searched. + flatten_chains : `bool`, optional + If `True` (`False` is default), recursively yield the child + collections of matching `~CollectionType.CHAINED` collections. + include_chains : `bool` or `None`, optional + If `True`, yield records for matching `~CollectionType.CHAINED` + collections. Default is the opposite of ``flatten_chains``: + include either CHAINED collections or their children, but not both. + include_parents : `bool`, optional + Whether the returned information includes parents. + + Returns + ------- + collections : `~collections.abc.Sequence` [ `CollectionInfo` ] + The names of collections that match ``expression``. + Notes ----- The order in which collections are returned is unspecified, except that @@ -302,6 +358,8 @@ def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: st def x_remove(self, name: str) -> None: """Remove the given collection from the registry. 
+ **This is an experimental interface that can change at any time.** + Parameters ---------- name : `str` diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py index ab4b0665bc..10b438c824 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py @@ -88,6 +88,8 @@ def x_query( ) -> Sequence[str]: if collection_types is None: collection_types = CollectionType.all() + # Do not use base implementation for now to avoid the additional + # unused queries. return self._registry.queryCollections( expression, collectionTypes=collection_types, @@ -95,6 +97,27 @@ def x_query( includeChains=include_chains, ) + def x_query_info( + self, + expression: str | Iterable[str], + collection_types: Set[CollectionType] | CollectionType | None = None, + flatten_chains: bool = False, + include_chains: bool | None = None, + include_parents: bool = False, + ) -> Sequence[CollectionInfo]: + info = [] + with self._registry.caching_context(): + if collection_types is None: + collection_types = CollectionType.all() + for name in self._registry.queryCollections( + expression, + collectionTypes=collection_types, + flattenChains=flatten_chains, + includeChains=include_chains, + ): + info.append(self.get_info(name, include_parents=include_parents)) + return info + def get_info(self, name: str, include_doc: bool = False, include_parents: bool = False) -> CollectionInfo: record = self._registry.get_collection_record(name) doc = "" diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py index 3d052b9aa7..1e62d9c3e9 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py @@ -71,21 +71,27 @@ def remove_from_chain( ) -> None: raise 
NotImplementedError("Not yet available") - def x_query( + def x_query_info( self, expression: str | Iterable[str], collection_types: Set[CollectionType] | CollectionType | None = None, flatten_chains: bool = False, include_chains: bool | None = None, - ) -> Sequence[str]: + include_parents: bool = False, + ) -> Sequence[CollectionInfo]: + # This should become a single call on the server in the future. if collection_types is None: collection_types = CollectionType.all() - return self._registry.queryCollections( + + info = [] + for name in self._registry.queryCollections( expression, collectionTypes=collection_types, flattenChains=flatten_chains, includeChains=include_chains, - ) + ): + info.append(self.get_info(name, include_parents=include_parents)) + return info def get_info(self, name: str, include_doc: bool = False, include_parents: bool = False) -> CollectionInfo: info = self._registry._get_collection_info( diff --git a/python/lsst/daf/butler/script/exportCalibs.py b/python/lsst/daf/butler/script/exportCalibs.py index cddd0f3cfd..4581082d19 100644 --- a/python/lsst/daf/butler/script/exportCalibs.py +++ b/python/lsst/daf/butler/script/exportCalibs.py @@ -35,6 +35,7 @@ from astropy.table import Table from .._butler import Butler +from .._butler_collections import CollectionInfo from .._collection_type import CollectionType if TYPE_CHECKING: @@ -44,7 +45,7 @@ def find_calibration_datasets( - butler: Butler, collection: str, datasetTypes: Iterable[DatasetType] + butler: Butler, collection: CollectionInfo, datasetTypes: Iterable[DatasetType] ) -> list[DatasetRef]: """Search a calibration collection for calibration datasets. @@ -52,7 +53,7 @@ def find_calibration_datasets( ---------- butler : `lsst.daf.butler.Butler` Butler to use. - collection : `str` + collection : `CollectionInfo` Collection to search. This should be a CALIBRATION collection. 
datasetTypes : `list` [`lsst.daf.Butler.DatasetType`] @@ -68,18 +69,18 @@ def find_calibration_datasets( RuntimeError Raised if the collection to search is not a CALIBRATION collection. """ - if butler.collections.get_info(collection).type != CollectionType.CALIBRATION: - raise RuntimeError(f"Collection {collection} is not a CALIBRATION collection.") + if collection.type != CollectionType.CALIBRATION: + raise RuntimeError(f"Collection {collection.name} is not a CALIBRATION collection.") exportDatasets = [] for calibType in datasetTypes: with butler._query() as query: - results = query.datasets(calibType, collections=collection, find_first=False) + results = query.datasets(calibType, collections=collection.name, find_first=False) try: refs = list(results.with_dimension_records()) except Exception as e: - e.add_note(f"Error from querying dataset type {calibType} and collection {collection}") + e.add_note(f"Error from querying dataset type {calibType} and collection {collection.name}") raise exportDatasets.extend(refs) @@ -133,18 +134,17 @@ def exportCalibs( collectionsToExport = [] datasetsToExport = [] - for collection in butler.collections.x_query( + for collection in butler.collections.x_query_info( collections_query, flatten_chains=True, include_chains=True, collection_types={CollectionType.CALIBRATION, CollectionType.CHAINED}, ): - log.info("Checking collection: %s", collection) + log.info("Checking collection: %s", collection.name) # Get collection information. 
- collectionsToExport.append(collection) - info = butler.collections.get_info(collection) - if info.type == CollectionType.CALIBRATION: + collectionsToExport.append(collection.name) + if collection.type == CollectionType.CALIBRATION: exportDatasets = find_calibration_datasets(butler, collection, calibTypes) datasetsToExport.extend(exportDatasets) diff --git a/python/lsst/daf/butler/script/queryCollections.py b/python/lsst/daf/butler/script/queryCollections.py index 43cee5d2ab..6f1b1ef856 100644 --- a/python/lsst/daf/butler/script/queryCollections.py +++ b/python/lsst/daf/butler/script/queryCollections.py @@ -32,6 +32,7 @@ from astropy.table import Table from .._butler import Butler +from .._butler_collections import CollectionInfo from .._collection_type import CollectionType @@ -71,34 +72,36 @@ def _getTable( dtype=(str, str, str), ) butler = Butler.from_config(repo) - names = sorted(butler.collections.x_query(glob or "*", collection_types=frozenset(collection_type))) + collections = sorted( + butler.collections.x_query_info( + glob or "*", collection_types=frozenset(collection_type), include_parents=inverse + ) + ) if inverse: - for name in names: - info = butler.collections.get_info(name, include_parents=True) + for info in collections: if info.parents: first = True for parentName in sorted(info.parents): - table.add_row((name if first else "", info.type.name if first else "", parentName)) + table.add_row((info.name if first else "", info.type.name if first else "", parentName)) first = False else: - table.add_row((name, info.type.name, "")) + table.add_row((info.name, info.type.name, "")) # If none of the datasets has a parent dataset then remove the # description column. 
if not any(c for c in table[descriptionCol]): del table[descriptionCol] else: - for name in names: - info = butler.collections.get_info(name) + for info in collections: if info.type == CollectionType.CHAINED: if info.children: first = True for child in info.children: - table.add_row((name if first else "", info.type.name if first else "", child)) + table.add_row((info.name if first else "", info.type.name if first else "", child)) first = False else: - table.add_row((name, info.type.name, "")) + table.add_row((info.name, info.type.name, "")) else: - table.add_row((name, info.type.name, "")) + table.add_row((info.name, info.type.name, "")) # If there aren't any CHAINED datasets in the results then remove the # description column. if not any(columnVal == CollectionType.CHAINED.name for columnVal in table[typeCol]): @@ -142,18 +145,21 @@ def _getTree( ) butler = Butler.from_config(repo, without_datastore=True) - def addCollection(name: str, level: int = 0) -> None: - info = butler.collections.get_info(name, include_parents=inverse) - table.add_row((" " * level + name, info.type.name)) + def addCollection(info: CollectionInfo, level: int = 0) -> None: + table.add_row((" " * level + info.name, info.type.name)) if inverse: for pname in sorted(info.parents): - addCollection(pname, level + 1) + pinfo = butler.collections.get_info(pname, include_parents=inverse) + addCollection(pinfo, level + 1) else: if info.type == CollectionType.CHAINED: for name in info.children: - addCollection(name, level + 1) + cinfo = butler.collections.get_info(name) + addCollection(cinfo, level + 1) - collections = butler.collections.x_query(glob or "*", collection_types=frozenset(collection_type)) + collections = butler.collections.x_query_info( + glob or "*", collection_types=frozenset(collection_type), include_parents=inverse + ) for collection in sorted(collections): addCollection(collection) return table @@ -165,14 +171,14 @@ def _getFlatten( collection_type: Iterable[CollectionType], ) -> 
Table: butler = Butler.from_config(repo) - collectionNames = list( - butler.collections.x_query( + collections = list( + butler.collections.x_query_info( glob or "*", collection_types=frozenset(collection_type), flatten_chains=True ) ) - - collectionTypes = [butler.collections.get_info(c).type.name for c in collectionNames] - return Table((collectionNames, collectionTypes), names=("Name", "Type")) + names = [c.name for c in collections] + types = [c.type.name for c in collections] + return Table((names, types), names=("Name", "Type")) def queryCollections( diff --git a/python/lsst/daf/butler/script/removeCollections.py b/python/lsst/daf/butler/script/removeCollections.py index 1ae062cac1..747abaf1dc 100644 --- a/python/lsst/daf/butler/script/removeCollections.py +++ b/python/lsst/daf/butler/script/removeCollections.py @@ -85,18 +85,17 @@ def _getCollectionInfo( """ butler = Butler.from_config(repo, without_datastore=True) try: - names = sorted(butler.collections.x_query(collection, include_chains=True)) + collections_info = sorted(butler.collections.x_query_info(collection, include_chains=True)) except MissingCollectionError: # Hide the error and act like no collections should be removed. 
- names = [] + collections_info = [] collections = Table(names=("Collection", "Collection Type"), dtype=(str, str)) runCollections = Table(names=("Collection",), dtype=(str,)) - for name in names: - collection_info = butler.collections.get_info(name) + for collection_info in collections_info: if collection_info.type == CollectionType.RUN: - runCollections.add_row((name,)) + runCollections.add_row((collection_info.name,)) else: - collections.add_row((name, collection_info.type.name)) + collections.add_row((collection_info.name, collection_info.type.name)) return CollectionInfo(collections, runCollections) diff --git a/python/lsst/daf/butler/script/removeRuns.py b/python/lsst/daf/butler/script/removeRuns.py index bf17c2f1d5..dc1cbe1c94 100644 --- a/python/lsst/daf/butler/script/removeRuns.py +++ b/python/lsst/daf/butler/script/removeRuns.py @@ -87,20 +87,21 @@ def _getCollectionInfo( """ butler = Butler.from_config(repo) try: - collectionNames = butler.collections.x_query(collection, CollectionType.RUN, include_chains=False) + collections = butler.collections.x_query_info( + collection, CollectionType.RUN, include_chains=False, include_parents=True + ) except MissingCollectionError: # Act as if no collections matched. - collectionNames = [] + collections = [] dataset_types = butler.registry.queryDatasetTypes(...) 
runs = [] datasets: dict[str, int] = defaultdict(int) - for collectionName in collectionNames: - collection_info = butler.collections.get_info(collectionName, include_parents=True) + for collection_info in collections: assert collection_info.type == CollectionType.RUN - runs.append(RemoveRun(collectionName, list(collection_info.parents))) + runs.append(RemoveRun(collection_info.name, list(collection_info.parents))) with butler._query() as query: for dt in dataset_types: - results = query.datasets(dt, collections=collectionName) + results = query.datasets(dt, collections=collection_info.name) count = results.count(exact=False) if count: datasets[dt.name] += count diff --git a/python/lsst/daf/butler/tests/hybrid_butler_collections.py b/python/lsst/daf/butler/tests/hybrid_butler_collections.py index b89fd30f10..1a0875e1c8 100644 --- a/python/lsst/daf/butler/tests/hybrid_butler_collections.py +++ b/python/lsst/daf/butler/tests/hybrid_butler_collections.py @@ -77,18 +77,20 @@ def remove_from_chain( parent_collection_name, child_collection_names ) - def x_query( + def x_query_info( self, expression: str | Iterable[str], collection_types: Set[CollectionType] | CollectionType | None = None, flatten_chains: bool = False, include_chains: bool | None = None, - ) -> Sequence[str]: - return self._hybrid._remote_butler.collections.x_query( + include_parents: bool = False, + ) -> Sequence[CollectionInfo]: + return self._hybrid._remote_butler.collections.x_query_info( expression, collection_types=collection_types, flatten_chains=flatten_chains, include_chains=include_chains, + include_parents=include_parents, ) def get_info(self, name: str, include_doc: bool = False, include_parents: bool = False) -> CollectionInfo: From a49c3fa872e9d0fbb0575f5d2ede08e89f117891 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 16 Aug 2024 12:36:29 -0700 Subject: [PATCH 15/26] Deprecate Butler.collection_chains Now called collections --- python/lsst/daf/butler/direct_butler/_direct_butler.py | 
6 ++++++ python/lsst/daf/butler/remote_butler/_remote_butler.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler.py b/python/lsst/daf/butler/direct_butler/_direct_butler.py index 2465a026ba..8a2f67212b 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler.py @@ -46,6 +46,7 @@ from collections.abc import Iterable, Iterator, MutableMapping, Sequence from typing import TYPE_CHECKING, Any, ClassVar, TextIO, cast +from deprecated.sphinx import deprecated from lsst.resources import ResourcePath, ResourcePathExpression from lsst.utils.introspection import get_class_of from lsst.utils.logging import VERBOSE, getLogger @@ -2149,6 +2150,11 @@ def validateConfiguration( raise ValidationError(";\n".join(messages)) @property + @deprecated( + "Please use 'collections' instead. collection_chains will be removed after v28.", + version="v28", + category=FutureWarning, + ) def collection_chains(self) -> DirectButlerCollections: """Object with methods for modifying collection chains.""" return DirectButlerCollections(self._registry) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index fbabfd0f02..bc9d78b8dc 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -35,6 +35,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any, TextIO, cast +from deprecated.sphinx import deprecated from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ( determine_destination_for_retrieved_artifact, ) @@ -144,6 +145,11 @@ def isWriteable(self) -> bool: return False @property + @deprecated( + "Please use 'collections' instead. 
collection_chains will be removed after v28.", + version="v28", + category=FutureWarning, + ) def collection_chains(self) -> ButlerCollections: """Object with methods for modifying collection chains.""" from ._registry import RemoteButlerRegistry From 066bea5ade1d545550121aa886d978d6be7966b2 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 16 Aug 2024 12:45:49 -0700 Subject: [PATCH 16/26] Always include the collection docs in collections.get_info --- python/lsst/daf/butler/_butler_collections.py | 4 +--- .../daf/butler/direct_butler/_direct_butler_collections.py | 6 ++---- python/lsst/daf/butler/remote_butler/_remote_butler.py | 2 +- .../daf/butler/remote_butler/_remote_butler_collections.py | 6 ++---- python/lsst/daf/butler/tests/hybrid_butler_collections.py | 6 ++---- 5 files changed, 8 insertions(+), 16 deletions(-) diff --git a/python/lsst/daf/butler/_butler_collections.py b/python/lsst/daf/butler/_butler_collections.py index 75de1d3b49..9e5a3c843c 100644 --- a/python/lsst/daf/butler/_butler_collections.py +++ b/python/lsst/daf/butler/_butler_collections.py @@ -308,15 +308,13 @@ def x_query_info( raise NotImplementedError() @abstractmethod - def get_info(self, name: str, include_doc: bool = False, include_parents: bool = False) -> CollectionInfo: + def get_info(self, name: str, include_parents: bool = False) -> CollectionInfo: """Obtain information for a specific collection. Parameters ---------- name : `str` The name of the collection of interest. - include_doc : `bool`, optional - If `True` any documentation about this collection will be included. include_parents : `bool`, optional If `True` any parents of this collection will be included. 
diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py index 10b438c824..e047cdb6f7 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py @@ -118,11 +118,9 @@ def x_query_info( info.append(self.get_info(name, include_parents=include_parents)) return info - def get_info(self, name: str, include_doc: bool = False, include_parents: bool = False) -> CollectionInfo: + def get_info(self, name: str, include_parents: bool = False) -> CollectionInfo: record = self._registry.get_collection_record(name) - doc = "" - if include_doc: - doc = self._registry.getCollectionDocumentation(name) or "" + doc = self._registry.getCollectionDocumentation(name) or "" children: tuple[str, ...] = tuple() if record.type == CollectionType.CHAINED: assert isinstance(record, ChainedCollectionRecord) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index bc9d78b8dc..f09a1a62bc 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -158,7 +158,7 @@ def collection_chains(self) -> ButlerCollections: @property def collections(self) -> ButlerCollections: - """Object with methods for modifying collection chains.""" + """Object with methods for modifying and querying collections.""" from ._registry import RemoteButlerRegistry return RemoteButlerCollections(cast(RemoteButlerRegistry, self._registry)) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py index 1e62d9c3e9..7e907a9235 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py @@ -93,10 +93,8 @@ def x_query_info( 
info.append(self.get_info(name, include_parents=include_parents)) return info - def get_info(self, name: str, include_doc: bool = False, include_parents: bool = False) -> CollectionInfo: - info = self._registry._get_collection_info( - name, include_doc=include_doc, include_parents=include_parents - ) + def get_info(self, name: str, include_parents: bool = False) -> CollectionInfo: + info = self._registry._get_collection_info(name, include_doc=True, include_parents=include_parents) doc = info.doc or "" children = info.children or () parents = info.parents or set() diff --git a/python/lsst/daf/butler/tests/hybrid_butler_collections.py b/python/lsst/daf/butler/tests/hybrid_butler_collections.py index 1a0875e1c8..4ac8688e6d 100644 --- a/python/lsst/daf/butler/tests/hybrid_butler_collections.py +++ b/python/lsst/daf/butler/tests/hybrid_butler_collections.py @@ -93,10 +93,8 @@ def x_query_info( include_parents=include_parents, ) - def get_info(self, name: str, include_doc: bool = False, include_parents: bool = False) -> CollectionInfo: - return self._hybrid._remote_butler.collections.get_info( - name, include_doc=include_doc, include_parents=include_parents - ) + def get_info(self, name: str, include_parents: bool = False) -> CollectionInfo: + return self._hybrid._remote_butler.collections.get_info(name, include_parents=include_parents) def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: str | None = None) -> bool: return self._hybrid._direct_butler.collections.register(name, type=type, doc=doc) From ff8f7710248d0d343eaa1a0c0636b4985f5621bb Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 16 Aug 2024 13:01:10 -0700 Subject: [PATCH 17/26] Default CollectionInfo.parents to None --- python/lsst/daf/butler/_butler_collections.py | 7 +++++-- .../daf/butler/direct_butler/_direct_butler_collections.py | 2 +- .../daf/butler/remote_butler/_remote_butler_collections.py | 3 +-- python/lsst/daf/butler/script/queryCollections.py | 1 + 
python/lsst/daf/butler/script/removeRuns.py | 2 +- 5 files changed, 9 insertions(+), 6 deletions(-) diff --git a/python/lsst/daf/butler/_butler_collections.py b/python/lsst/daf/butler/_butler_collections.py index 9e5a3c843c..5d40d20de0 100644 --- a/python/lsst/daf/butler/_butler_collections.py +++ b/python/lsst/daf/butler/_butler_collections.py @@ -49,8 +49,11 @@ class CollectionInfo(BaseModel): """Documentation string associated with this collection.""" children: tuple[str, ...] = tuple() """Children of this collection (only if CHAINED).""" - parents: frozenset[str] = frozenset() - """Any parents of this collection.""" + parents: frozenset[str] | None = None + """Any parents of this collection. + + `None` if the parents were not requested. + """ def __lt__(self, other: Any) -> bool: """Compare objects by collection name.""" diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py index e047cdb6f7..30b3b401bc 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py @@ -125,7 +125,7 @@ def get_info(self, name: str, include_parents: bool = False) -> CollectionInfo: if record.type == CollectionType.CHAINED: assert isinstance(record, ChainedCollectionRecord) children = tuple(record.children) - parents: set[str] = set() + parents: set[str] | None = None if include_parents: parents = self._registry.getCollectionParentChains(name) return CollectionInfo(name=name, type=record.type, doc=doc, parents=parents, children=children) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py index 7e907a9235..35b3d67be0 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py @@ -97,8 +97,7 @@ def get_info(self, 
name: str, include_parents: bool = False) -> CollectionInfo: info = self._registry._get_collection_info(name, include_doc=True, include_parents=include_parents) doc = info.doc or "" children = info.children or () - parents = info.parents or set() - return CollectionInfo(name=name, type=info.type, doc=doc, parents=parents, children=children) + return CollectionInfo(name=name, type=info.type, doc=doc, parents=info.parents, children=children) def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: str | None = None) -> bool: raise NotImplementedError("Not yet available.") diff --git a/python/lsst/daf/butler/script/queryCollections.py b/python/lsst/daf/butler/script/queryCollections.py index 6f1b1ef856..f80b8e71cf 100644 --- a/python/lsst/daf/butler/script/queryCollections.py +++ b/python/lsst/daf/butler/script/queryCollections.py @@ -148,6 +148,7 @@ def _getTree( def addCollection(info: CollectionInfo, level: int = 0) -> None: table.add_row((" " * level + info.name, info.type.name)) if inverse: + assert info.parents is not None # For mypy. 
for pname in sorted(info.parents): pinfo = butler.collections.get_info(pname, include_parents=inverse) addCollection(pinfo, level + 1) diff --git a/python/lsst/daf/butler/script/removeRuns.py b/python/lsst/daf/butler/script/removeRuns.py index dc1cbe1c94..5edb2b2351 100644 --- a/python/lsst/daf/butler/script/removeRuns.py +++ b/python/lsst/daf/butler/script/removeRuns.py @@ -97,7 +97,7 @@ def _getCollectionInfo( runs = [] datasets: dict[str, int] = defaultdict(int) for collection_info in collections: - assert collection_info.type == CollectionType.RUN + assert collection_info.type == CollectionType.RUN and collection_info.parents is not None runs.append(RemoveRun(collection_info.name, list(collection_info.parents))) with butler._query() as query: for dt in dataset_types: From 3d1722331c3b988482ddaef1708b9fe793552655 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 16 Aug 2024 13:30:55 -0700 Subject: [PATCH 18/26] Add dataset_types and governors to CollectionInfo --- python/lsst/daf/butler/_butler_collections.py | 22 +++++++++++++++- .../_direct_butler_collections.py | 26 ++++++++++++++++--- .../_remote_butler_collections.py | 23 +++++++++++++--- .../butler/tests/hybrid_butler_collections.py | 10 +++++-- 4 files changed, 72 insertions(+), 9 deletions(-) diff --git a/python/lsst/daf/butler/_butler_collections.py b/python/lsst/daf/butler/_butler_collections.py index 5d40d20de0..4789a62010 100644 --- a/python/lsst/daf/butler/_butler_collections.py +++ b/python/lsst/daf/butler/_butler_collections.py @@ -54,6 +54,17 @@ class CollectionInfo(BaseModel): `None` if the parents were not requested. """ + dataset_types: frozenset[str] | None = None + """Names of any dataset types associated with datasets in this collection. + + `None` if no dataset type information was requested + """ + governors: dict[str, frozenset[str]] | None = None + """Values of any governor dimensions associated with datasets in this + collection. + + `None` if no governor information was requested. 
+ """ def __lt__(self, other: Any) -> bool: """Compare objects by collection name.""" @@ -271,6 +282,7 @@ def x_query_info( flatten_chains: bool = False, include_chains: bool | None = None, include_parents: bool = False, + include_summary: bool = False, ) -> Sequence[CollectionInfo]: """Query the butler for collections matching an expression and return detailed information about those collections. @@ -293,6 +305,9 @@ def x_query_info( include either CHAINED collections or their children, but not both. include_parents : `bool`, optional Whether the returned information includes parents. + include_summary : `bool`, optional + Whether the returned information includes dataset type and + governor information for the collections. Returns ------- @@ -311,7 +326,9 @@ def x_query_info( raise NotImplementedError() @abstractmethod - def get_info(self, name: str, include_parents: bool = False) -> CollectionInfo: + def get_info( + self, name: str, include_parents: bool = False, include_summary: bool = False + ) -> CollectionInfo: """Obtain information for a specific collection. Parameters @@ -320,6 +337,9 @@ def get_info(self, name: str, include_parents: bool = False) -> CollectionInfo: The name of the collection of interest. include_parents : `bool`, optional If `True` any parents of this collection will be included. + include_summary : `bool`, optional + If `True` dataset type names and governor dimensions of datasets + stored in this collection will be included in the result. 
Returns ------- diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py index 30b3b401bc..15a3f5d3a1 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py @@ -104,6 +104,7 @@ def x_query_info( flatten_chains: bool = False, include_chains: bool | None = None, include_parents: bool = False, + include_summary: bool = False, ) -> Sequence[CollectionInfo]: info = [] with self._registry.caching_context(): @@ -115,10 +116,14 @@ def x_query_info( flattenChains=flatten_chains, includeChains=include_chains, ): - info.append(self.get_info(name, include_parents=include_parents)) + info.append( + self.get_info(name, include_parents=include_parents, include_summary=include_summary) + ) return info - def get_info(self, name: str, include_parents: bool = False) -> CollectionInfo: + def get_info( + self, name: str, include_parents: bool = False, include_summary: bool = False + ) -> CollectionInfo: record = self._registry.get_collection_record(name) doc = self._registry.getCollectionDocumentation(name) or "" children: tuple[str, ...] 
= tuple() @@ -128,7 +133,22 @@ def get_info(self, name: str, include_parents: bool = False) -> CollectionInfo: parents: set[str] | None = None if include_parents: parents = self._registry.getCollectionParentChains(name) - return CollectionInfo(name=name, type=record.type, doc=doc, parents=parents, children=children) + governors: dict[str, frozenset[str]] | None = None + dataset_types: Set[str] | None = None + if include_summary: + summary = self._registry.getCollectionSummary(name) + dataset_types = frozenset([dt.name for dt in summary.dataset_types]) + governors = {k: frozenset(v) for k, v in summary.governors.items()} + + return CollectionInfo( + name=name, + type=record.type, + doc=doc, + parents=parents, + children=children, + dataset_types=dataset_types, + governors=governors, + ) def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: str | None = None) -> bool: return self._registry.registerCollection(name, type, doc) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py index 35b3d67be0..c5cddbef17 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py @@ -78,6 +78,7 @@ def x_query_info( flatten_chains: bool = False, include_chains: bool | None = None, include_parents: bool = False, + include_summary: bool = False, ) -> Sequence[CollectionInfo]: # This should become a single call on the server in the future. 
if collection_types is None: @@ -90,14 +91,30 @@ def x_query_info( flattenChains=flatten_chains, includeChains=include_chains, ): - info.append(self.get_info(name, include_parents=include_parents)) + info.append(self.get_info(name, include_parents=include_parents, include_summary=include_summary)) return info - def get_info(self, name: str, include_parents: bool = False) -> CollectionInfo: + def get_info( + self, name: str, include_parents: bool = False, include_summary: bool = False + ) -> CollectionInfo: info = self._registry._get_collection_info(name, include_doc=True, include_parents=include_parents) doc = info.doc or "" children = info.children or () - return CollectionInfo(name=name, type=info.type, doc=doc, parents=info.parents, children=children) + governors: dict[str, frozenset[str]] | None = None + dataset_types: Set[str] | None = None + if include_summary: + summary = self._registry.getCollectionSummary(name) + dataset_types = frozenset([dt.name for dt in summary.dataset_types]) + governors = {k: frozenset(v) for k, v in summary.governors.items()} + return CollectionInfo( + name=name, + type=info.type, + doc=doc, + parents=info.parents, + children=children, + dataset_types=dataset_types, + governors=governors, + ) def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: str | None = None) -> bool: raise NotImplementedError("Not yet available.") diff --git a/python/lsst/daf/butler/tests/hybrid_butler_collections.py b/python/lsst/daf/butler/tests/hybrid_butler_collections.py index 4ac8688e6d..6c9e438e79 100644 --- a/python/lsst/daf/butler/tests/hybrid_butler_collections.py +++ b/python/lsst/daf/butler/tests/hybrid_butler_collections.py @@ -84,6 +84,7 @@ def x_query_info( flatten_chains: bool = False, include_chains: bool | None = None, include_parents: bool = False, + include_summary: bool = False, ) -> Sequence[CollectionInfo]: return self._hybrid._remote_butler.collections.x_query_info( expression, @@ -91,10 +92,15 @@ def 
x_query_info( flatten_chains=flatten_chains, include_chains=include_chains, include_parents=include_parents, + include_summary=include_summary, ) - def get_info(self, name: str, include_parents: bool = False) -> CollectionInfo: - return self._hybrid._remote_butler.collections.get_info(name, include_parents=include_parents) + def get_info( + self, name: str, include_parents: bool = False, include_summary: bool = False + ) -> CollectionInfo: + return self._hybrid._remote_butler.collections.get_info( + name, include_parents=include_parents, include_summary=include_summary + ) def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: str | None = None) -> bool: return self._hybrid._direct_butler.collections.register(name, type=type, doc=doc) From e6251b864a9df6d141e392a39485ea223a60a326 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 16 Aug 2024 14:35:25 -0700 Subject: [PATCH 19/26] Use collection summaries to speed up butler query-datasets Also sort the dataset types. --- .../lsst/daf/butler/script/queryDatasets.py | 22 ++++++++++++++++--- tests/test_cliCmdQueryDatasets.py | 8 +++---- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/python/lsst/daf/butler/script/queryDatasets.py b/python/lsst/daf/butler/script/queryDatasets.py index c95e56fad9..dab6aba5ea 100644 --- a/python/lsst/daf/butler/script/queryDatasets.py +++ b/python/lsst/daf/butler/script/queryDatasets.py @@ -218,11 +218,27 @@ def getDatasets(self) -> Iterator[DatasetRef]: # Currently need to use old interface to get all the matching # dataset types and loop over the dataset types executing a new # query each time. 
- dataset_types = self.butler.registry.queryDatasetTypes(datasetTypes) + dataset_types: set[str] = {d.name for d in self.butler.registry.queryDatasetTypes(datasetTypes)} with self.butler._query() as query: - query_collections = self.butler.collections.x_query(query_collections) + # If there is more than one dataset type being requested include + # the summary information so we can pre-filter. With only one + # let query.datasets work it out. + include_summary = True if len(dataset_types) > 1 else False + query_collections_info = self.butler.collections.x_query_info( + query_collections, include_summary=include_summary + ) + query_collections = [c.name for c in query_collections_info] + + # Only iterate over dataset types that are relevant for the query. + if include_summary: + collection_dataset_types: set[str] = set() + for info in query_collections_info: + assert info.dataset_types is not None # For mypy. + collection_dataset_types.update(info.dataset_types) + dataset_types = dataset_types.intersection(collection_dataset_types) + # Accumulate over dataset types. 
- for dt in dataset_types: + for dt in sorted(dataset_types): results = query.datasets(dt, collections=query_collections, find_first=self._find_first) if self._where: results = results.where(self._where) diff --git a/tests/test_cliCmdQueryDatasets.py b/tests/test_cliCmdQueryDatasets.py index e253f66f94..3670f5cd96 100644 --- a/tests/test_cliCmdQueryDatasets.py +++ b/tests/test_cliCmdQueryDatasets.py @@ -250,6 +250,10 @@ def testGlobDatasetType(self): tables = self._queryDatasets(repo=self.repoDir) expectedTables = ( + AstropyTable( + array(("alt_test_metric_comp", "ingest/run", "DummyCamComp", "425", "R", "d-r")), + names=("type", "run", "instrument", "visit", "band", "physical_filter"), + ), AstropyTable( array( ( @@ -259,10 +263,6 @@ def testGlobDatasetType(self): ), names=("type", "run", "instrument", "visit", "band", "physical_filter"), ), - AstropyTable( - array(("alt_test_metric_comp", "ingest/run", "DummyCamComp", "425", "R", "d-r")), - names=("type", "run", "instrument", "visit", "band", "physical_filter"), - ), ) self.assertAstropyTablesEqual(tables, expectedTables, filterColumns=True) From eaafc03e64da0ac2c5f5d56d08f304036749f1e4 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 16 Aug 2024 15:01:00 -0700 Subject: [PATCH 20/26] Improve the error message with table comparisons. --- python/lsst/daf/butler/tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lsst/daf/butler/tests/utils.py b/python/lsst/daf/butler/tests/utils.py index 81a898f5a7..3e74b82db5 100644 --- a/python/lsst/daf/butler/tests/utils.py +++ b/python/lsst/daf/butler/tests/utils.py @@ -183,7 +183,7 @@ def assertAstropyTablesEqual( original_max = self.maxDiff self.maxDiff = None # This is required to get the full diff. 
try: - self.assertEqual(table1, expected1) + self.assertEqual(table1, expected1, f"Table:\n{table}\n\nvs Expected:\n{expected}") finally: self.maxDiff = original_max From af98c98d58c2349682f689c5e7804a9d549ffc90 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Mon, 19 Aug 2024 08:24:31 -0700 Subject: [PATCH 21/26] Remove governors from CollectionInfo until we need it --- python/lsst/daf/butler/_butler_collections.py | 6 ------ .../daf/butler/direct_butler/_direct_butler_collections.py | 3 --- .../daf/butler/remote_butler/_remote_butler_collections.py | 3 --- 3 files changed, 12 deletions(-) diff --git a/python/lsst/daf/butler/_butler_collections.py b/python/lsst/daf/butler/_butler_collections.py index 4789a62010..d42d6f5b30 100644 --- a/python/lsst/daf/butler/_butler_collections.py +++ b/python/lsst/daf/butler/_butler_collections.py @@ -59,12 +59,6 @@ class CollectionInfo(BaseModel): `None` if no dataset type information was requested """ - governors: dict[str, frozenset[str]] | None = None - """Values of any governor dimensions associated with datasets in this - collection. - - `None` if no governor information was requested. 
- """ def __lt__(self, other: Any) -> bool: """Compare objects by collection name.""" diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py index 15a3f5d3a1..baa1c92176 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py @@ -133,12 +133,10 @@ def get_info( parents: set[str] | None = None if include_parents: parents = self._registry.getCollectionParentChains(name) - governors: dict[str, frozenset[str]] | None = None dataset_types: Set[str] | None = None if include_summary: summary = self._registry.getCollectionSummary(name) dataset_types = frozenset([dt.name for dt in summary.dataset_types]) - governors = {k: frozenset(v) for k, v in summary.governors.items()} return CollectionInfo( name=name, @@ -147,7 +145,6 @@ def get_info( parents=parents, children=children, dataset_types=dataset_types, - governors=governors, ) def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: str | None = None) -> bool: diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py index c5cddbef17..f021781a05 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py @@ -100,12 +100,10 @@ def get_info( info = self._registry._get_collection_info(name, include_doc=True, include_parents=include_parents) doc = info.doc or "" children = info.children or () - governors: dict[str, frozenset[str]] | None = None dataset_types: Set[str] | None = None if include_summary: summary = self._registry.getCollectionSummary(name) dataset_types = frozenset([dt.name for dt in summary.dataset_types]) - governors = {k: frozenset(v) for k, v in summary.governors.items()} return CollectionInfo( name=name, type=info.type, 
@@ -113,7 +111,6 @@ def get_info( parents=info.parents, children=children, dataset_types=dataset_types, - governors=governors, ) def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: str | None = None) -> bool: From a859c71347a594f509af8bb1198827ddb0200eb5 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Mon, 19 Aug 2024 08:31:06 -0700 Subject: [PATCH 22/26] Fix some spelling errors --- python/lsst/daf/butler/registry/tests/_registry.py | 8 ++++---- python/lsst/daf/butler/script/removeCollections.py | 4 ++-- python/lsst/daf/butler/transfers/_yaml.py | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/lsst/daf/butler/registry/tests/_registry.py b/python/lsst/daf/butler/registry/tests/_registry.py index 0732e03556..0a66d7797b 100644 --- a/python/lsst/daf/butler/registry/tests/_registry.py +++ b/python/lsst/daf/butler/registry/tests/_registry.py @@ -1060,7 +1060,7 @@ def testNestedTransaction(self): # be committed). registry.insertDimensionData(dimension, dataId2) checkpointReached = True - # This should conflict and raise, triggerring a rollback + # This should conflict and raise, triggering a rollback # of the previous insertion within the same transaction # context, but not the original insertion in the outer # block. @@ -2968,7 +2968,7 @@ def testQueryResultSummaries(self): # Second query should yield no results, which we should see when # we attempt to expand the data ID. query2 = registry.queryDataIds(["physical_filter"], band="h") - # There's no execute=False, exact=Fals test here because the behavior + # There's no execute=False, exact=False test here because the behavior # not something we want to guarantee in this case (and exact=False # says either answer is legal). 
self.assertFalse(query2.any(execute=True, exact=False)) @@ -3684,7 +3684,7 @@ def test_skypix_constraint_queries(self) -> None: else: raise RuntimeError("Could not find usable skypix ID for this dimension configuration.") # New query system does not support non-common skypix constraints - # and we are deprecating it to replace with region-based constrints. + # and we are deprecating it to replace with region-based constraints. # TODO: Drop this tests once we remove support for non-common skypix. with contextlib.suppress(NotImplementedError): self.assertEqual( @@ -3895,7 +3895,7 @@ def test_query_find_datasets_drop_postprocessing(self) -> None: # against only one of the two collections. This should work even # though the relation returned by queryDataIds ends with # iteration-engine region-filtering, because we can recognize before - # running the query that there is only one collecton to search and + # running the query that there is only one collection to search and # hence the (default) findFirst=True is irrelevant, and joining in the # dataset query commutes past the iteration-engine postprocessing. query1 = registry.queryDataIds( diff --git a/python/lsst/daf/butler/script/removeCollections.py b/python/lsst/daf/butler/script/removeCollections.py index 747abaf1dc..71606a5461 100644 --- a/python/lsst/daf/butler/script/removeCollections.py +++ b/python/lsst/daf/butler/script/removeCollections.py @@ -41,7 +41,7 @@ class RemoveCollectionResult: """Container to return to the cli command; holds tables describing the collections that will be removed, as well as any found RUN collections - which can not be removed by this command. Also holds the callback funciton + which can not be removed by this command. Also holds the callback function to execute the remove upon user confirmation. """ @@ -73,7 +73,7 @@ def _getCollectionInfo( Parameters ---------- repo : `str` - The URI to the repostiory. + The URI to the repository. 
collection : `str` The collection string to search for. Same as the `expression` argument to `registry.queryCollections`. diff --git a/python/lsst/daf/butler/transfers/_yaml.py b/python/lsst/daf/butler/transfers/_yaml.py index dc9741bc6c..4911201bd5 100644 --- a/python/lsst/daf/butler/transfers/_yaml.py +++ b/python/lsst/daf/butler/transfers/_yaml.py @@ -301,7 +301,7 @@ class YamlRepoImportBackend(RepoImportBackend): stream : `io.IO` A readable file-like object. registry : `SqlRegistry` - The registry datasets will be imported into. Only used to retreive + The registry datasets will be imported into. Only used to retrieve dataset types during construction; all write happen in `register` and `load`. """ @@ -440,7 +440,7 @@ def __init__(self, stream: IO, registry: SqlRegistry): # Must create the visit_system_membership records. # But first create empty list for visits since other # logic in this file depends on self.dimensions being - # populated in an order consisteny with primary keys. + # populated in an order consistent with primary keys. 
self.dimensions[self.registry.dimensions["visit"]] = [] element = self.registry.dimensions["visit_system_membership"] RecordClass = element.RecordClass @@ -509,7 +509,7 @@ def __init__(self, stream: IO, registry: SqlRegistry): if not isinstance(child, str): warnings.warn( f"CHAINED collection {data['name']} includes restrictions on child " - "collection searches, which are no longer suppored and will be ignored.", + "collection searches, which are no longer supported and will be ignored.", stacklevel=find_outside_stacklevel("lsst.daf.butler"), ) # Old form with dataset type restrictions only, From c6d1cb06c1f2cfa2ca5b7bfd2115490acfe20dd8 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Mon, 19 Aug 2024 12:53:35 -0700 Subject: [PATCH 23/26] Rewrite command-lines to filter dataset types by collection summary --- python/lsst/daf/butler/_butler_collections.py | 12 +++++++ python/lsst/daf/butler/script/queryDataIds.py | 21 ++++++++++--- .../lsst/daf/butler/script/queryDatasets.py | 31 ++++++++++++------- .../butler/script/queryDimensionRecords.py | 9 ++++-- python/lsst/daf/butler/script/removeRuns.py | 8 +++-- .../daf/butler/script/retrieveArtifacts.py | 6 ++-- .../daf/butler/script/transferDatasets.py | 10 ++++-- 7 files changed, 72 insertions(+), 25 deletions(-) diff --git a/python/lsst/daf/butler/_butler_collections.py b/python/lsst/daf/butler/_butler_collections.py index d42d6f5b30..71045f2937 100644 --- a/python/lsst/daf/butler/_butler_collections.py +++ b/python/lsst/daf/butler/_butler_collections.py @@ -403,3 +403,15 @@ def x_remove(self, name: str) -> None: be deleted or redefined first. 
""" raise NotImplementedError() + + def _filter_dataset_types( + self, dataset_types: Iterable[str], collections: Iterable[CollectionInfo] + ) -> Iterable[str]: + dataset_types_set = set(dataset_types) + collection_dataset_types: set[str] = set() + for info in collections: + if info.dataset_types is None: + raise RuntimeError("Can only filter by collections if include_summary was True") + collection_dataset_types.update(info.dataset_types) + dataset_types_set = dataset_types_set.intersection(collection_dataset_types) + return dataset_types_set diff --git a/python/lsst/daf/butler/script/queryDataIds.py b/python/lsst/daf/butler/script/queryDataIds.py index a70aba1887..41bbfd4d6d 100644 --- a/python/lsst/daf/butler/script/queryDataIds.py +++ b/python/lsst/daf/butler/script/queryDataIds.py @@ -169,11 +169,22 @@ def queryDataIds( if datasets: # Need to constrain results based on dataset type and collection. query_collections = collections or "*" - - expanded_collections = butler.collections.x_query(query_collections) - - sub_query = query.join_dataset_search(dataset_types.pop(0), collections=expanded_collections) - for dt in dataset_types: + collections_info = butler.collections.x_query_info(query_collections, include_summary=True) + expanded_collections = [info.name for info in collections_info] + filtered_dataset_types = list( + butler.collections._filter_dataset_types([dt.name for dt in dataset_types], collections_info) + ) + if not filtered_dataset_types: + return ( + None, + f"No datasets of type {datasets!r} existed in the specified " + f"collections {','.join(expanded_collections)}.", + ) + + sub_query = query.join_dataset_search( + filtered_dataset_types.pop(0), collections=expanded_collections + ) + for dt in filtered_dataset_types: sub_query = sub_query.join_dataset_search(dt, collections=expanded_collections) results = sub_query.data_ids(dimensions) diff --git a/python/lsst/daf/butler/script/queryDatasets.py b/python/lsst/daf/butler/script/queryDatasets.py 
index dab6aba5ea..5403a44f5b 100644 --- a/python/lsst/daf/butler/script/queryDatasets.py +++ b/python/lsst/daf/butler/script/queryDatasets.py @@ -27,6 +27,7 @@ from __future__ import annotations import dataclasses +import logging from collections import defaultdict from collections.abc import Iterable, Iterator from typing import TYPE_CHECKING @@ -42,6 +43,9 @@ from lsst.resources import ResourcePath +_LOG = logging.getLogger(__name__) + + @dataclasses.dataclass(frozen=True) class _RefInfo: datasetRef: DatasetRef @@ -219,23 +223,27 @@ def getDatasets(self) -> Iterator[DatasetRef]: # dataset types and loop over the dataset types executing a new # query each time. dataset_types: set[str] = {d.name for d in self.butler.registry.queryDatasetTypes(datasetTypes)} + n_dataset_types = len(dataset_types) with self.butler._query() as query: - # If there is more than one dataset type being requested include - # the summary information so we can pre-filter. With only one - # let query.datasets work it out. - include_summary = True if len(dataset_types) > 1 else False + # Expand the collections query and include summary information. query_collections_info = self.butler.collections.x_query_info( - query_collections, include_summary=include_summary + query_collections, include_summary=True ) query_collections = [c.name for c in query_collections_info] # Only iterate over dataset types that are relevant for the query. - if include_summary: - collection_dataset_types: set[str] = set() - for info in query_collections_info: - assert info.dataset_types is not None # For mypy. 
- collection_dataset_types.update(info.dataset_types) - dataset_types = dataset_types.intersection(collection_dataset_types) + dataset_types = set( + self.butler.collections._filter_dataset_types(dataset_types, query_collections_info) + ) + + if (n_filtered := len(dataset_types)) != n_dataset_types: + _LOG.info("Filtered %d dataset types down to %d", n_dataset_types, n_filtered) + elif n_dataset_types == 0: + _LOG.info("The given dataset type, %s, is not known to this butler.", datasetTypes) + else: + _LOG.info( + "Processing %d dataset type%s", n_dataset_types, "" if n_dataset_types == 1 else "s" + ) # Accumulate over dataset types. for dt in sorted(dataset_types): diff --git a/python/lsst/daf/butler/script/queryDimensionRecords.py b/python/lsst/daf/butler/script/queryDimensionRecords.py index 11470e8f17..60ee77c506 100644 --- a/python/lsst/daf/butler/script/queryDimensionRecords.py +++ b/python/lsst/daf/butler/script/queryDimensionRecords.py @@ -80,8 +80,13 @@ def queryDimensionRecords( if datasets: query_collections = collections or "*" - expanded_collections = butler.collections.x_query(query_collections) - dataset_types = list(butler.registry.queryDatasetTypes(datasets)) + collections_info = butler.collections.x_query_info(query_collections, include_summary=True) + expanded_collections = [info.name for info in collections_info] + dataset_types = [dt.name for dt in butler.registry.queryDatasetTypes(datasets)] + dataset_types = list(butler.collections._filter_dataset_types(dataset_types, collections_info)) + + if not dataset_types: + return None sub_query = query.join_dataset_search(dataset_types.pop(0), collections=expanded_collections) for dt in dataset_types: diff --git a/python/lsst/daf/butler/script/removeRuns.py b/python/lsst/daf/butler/script/removeRuns.py index 5edb2b2351..94e85ce26d 100644 --- a/python/lsst/daf/butler/script/removeRuns.py +++ b/python/lsst/daf/butler/script/removeRuns.py @@ -88,12 +88,14 @@ def _getCollectionInfo( butler = 
Butler.from_config(repo) try: collections = butler.collections.x_query_info( - collection, CollectionType.RUN, include_chains=False, include_parents=True + collection, CollectionType.RUN, include_chains=False, include_parents=True, include_summary=True ) except MissingCollectionError: # Act as if no collections matched. collections = [] - dataset_types = butler.registry.queryDatasetTypes(...) + dataset_types = [dt.name for dt in butler.registry.queryDatasetTypes(...)] + dataset_types = list(butler.collections._filter_dataset_types(dataset_types, collections)) + runs = [] datasets: dict[str, int] = defaultdict(int) for collection_info in collections: @@ -104,7 +106,7 @@ def _getCollectionInfo( results = query.datasets(dt, collections=collection_info.name) count = results.count(exact=False) if count: - datasets[dt.name] += count + datasets[dt] += count return runs, {k: datasets[k] for k in sorted(datasets.keys())} diff --git a/python/lsst/daf/butler/script/retrieveArtifacts.py b/python/lsst/daf/butler/script/retrieveArtifacts.py index f9abc50669..066c8b6d56 100644 --- a/python/lsst/daf/butler/script/retrieveArtifacts.py +++ b/python/lsst/daf/butler/script/retrieveArtifacts.py @@ -90,10 +90,12 @@ def retrieveArtifacts( # Need to store in list so we can count the number to give some feedback # to caller. 
- dataset_types = butler.registry.queryDatasetTypes(query_types) + dataset_types = [dt.name for dt in butler.registry.queryDatasetTypes(query_types)] refs: list[DatasetRef] = [] with butler._query() as query: - expanded_collections = butler.collections.x_query(query_collections) + collections_info = butler.collections.x_query_info(query_collections, include_summary=True) + expanded_collections = [info.name for info in collections_info] + dataset_types = list(butler.collections._filter_dataset_types(dataset_types, collections_info)) for dt in dataset_types: results = query.datasets(dt, collections=expanded_collections, find_first=find_first) if where: diff --git a/python/lsst/daf/butler/script/transferDatasets.py b/python/lsst/daf/butler/script/transferDatasets.py index 5060065bad..9839212d2f 100644 --- a/python/lsst/daf/butler/script/transferDatasets.py +++ b/python/lsst/daf/butler/script/transferDatasets.py @@ -80,10 +80,16 @@ def transferDatasets( dataset_type_expr = dataset_type or ... collections_expr: tuple[str, ...] = collections or ("*",) - dataset_types = source_butler.registry.queryDatasetTypes(dataset_type_expr) + dataset_types = [dt.name for dt in source_butler.registry.queryDatasetTypes(dataset_type_expr)] source_refs: list[DatasetRef] = [] with source_butler._query() as query: - query_collections = source_butler.collections.x_query(collections_expr) + query_collections_info = source_butler.collections.x_query_info( + collections_expr, include_summary=True + ) + query_collections = [info.name for info in query_collections_info] + dataset_types = list( + source_butler.collections._filter_dataset_types(dataset_types, query_collections_info) + ) # Loop over dataset types and accumulate. 
for dt in dataset_types: results = query.datasets(dt, collections=query_collections, find_first=find_first) From 467cc550e945101a41ee0c60304acd0095a2ade2 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Mon, 19 Aug 2024 13:42:08 -0700 Subject: [PATCH 24/26] Add --collections to query-dataset-types command-line --- python/lsst/daf/butler/cli/cmd/commands.py | 5 +++++ .../lsst/daf/butler/script/queryDatasetTypes.py | 17 +++++++++++++++-- tests/test_cliCmdQueryDatasetTypes.py | 2 +- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/python/lsst/daf/butler/cli/cmd/commands.py b/python/lsst/daf/butler/cli/cmd/commands.py index 98810481c6..03b2325956 100644 --- a/python/lsst/daf/butler/cli/cmd/commands.py +++ b/python/lsst/daf/butler/cli/cmd/commands.py @@ -439,6 +439,11 @@ def query_collections(*args: Any, **kwargs: Any) -> None: "dataset types to return." ) @verbose_option(help="Include dataset type name, dimensions, and storage class in output.") +@collections_option( + help="Constrain the resulting dataset types by these collections. " + "This constraint does not say that a dataset of this type is definitely present, " + "solely that one may have been present at some point." +) @options_file_option() def query_dataset_types(*args: Any, **kwargs: Any) -> None: """Get the dataset types in a repository.""" diff --git a/python/lsst/daf/butler/script/queryDatasetTypes.py b/python/lsst/daf/butler/script/queryDatasetTypes.py index 58cf68b693..8d657e69bf 100644 --- a/python/lsst/daf/butler/script/queryDatasetTypes.py +++ b/python/lsst/daf/butler/script/queryDatasetTypes.py @@ -34,7 +34,9 @@ from .._butler import Butler -def queryDatasetTypes(repo: str, verbose: bool, glob: Iterable[str]) -> Table: +def queryDatasetTypes( + repo: str, verbose: bool, glob: Iterable[str], collections: Iterable[str] | None = None +) -> Table: """Get the dataset types in a repository. 
Parameters @@ -48,16 +50,27 @@ def queryDatasetTypes(repo: str, verbose: bool, glob: Iterable[str]) -> Table: glob : iterable [`str`] A list of glob-style search string that fully or partially identify the dataset type names to search for. + collections : iterable [`str`] or `None`, optional + Constrains resulting dataset types such that only dataset types + found (at some point) in these collections will be returned. Returns ------- - collections : `astropy.table.Table` + dataset_types_table : `astropy.table.Table` A dict whose key is "datasetTypes" and whose value is a list of collection names. """ butler = Butler.from_config(repo, without_datastore=True) expression = glob or ... datasetTypes = butler.registry.queryDatasetTypes(expression=expression) + + if collections: + collections_info = butler.collections.x_query_info(collections, include_summary=True) + filtered_dataset_types = set( + butler.collections._filter_dataset_types([d.name for d in datasetTypes], collections_info) + ) + datasetTypes = [d for d in datasetTypes if d.name in filtered_dataset_types] + if verbose: table = Table( array( diff --git a/tests/test_cliCmdQueryDatasetTypes.py b/tests/test_cliCmdQueryDatasetTypes.py index c656f4cbe3..8ce01e923c 100644 --- a/tests/test_cliCmdQueryDatasetTypes.py +++ b/tests/test_cliCmdQueryDatasetTypes.py @@ -46,7 +46,7 @@ class QueryDatasetTypesCmdTest(CliCmdTestBase, unittest.TestCase): @staticmethod def defaultExpected(): - return dict(repo=None, verbose=False, glob=()) + return dict(repo=None, verbose=False, glob=(), collections=()) @staticmethod def command(): From 13ac82650bd8b7089d54e5f0eb25ab7e1c7c30b5 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Mon, 19 Aug 2024 13:47:13 -0700 Subject: [PATCH 25/26] Add news fragment --- doc/changes/DM-45738.feature.rst | 4 ++++ doc/changes/DM-45738.removal.rst | 2 ++ 2 files changed, 6 insertions(+) create mode 100644 doc/changes/DM-45738.feature.rst create mode 100644 doc/changes/DM-45738.removal.rst diff --git 
a/doc/changes/DM-45738.feature.rst b/doc/changes/DM-45738.feature.rst new file mode 100644 index 0000000000..df6ddf1f53 --- /dev/null +++ b/doc/changes/DM-45738.feature.rst @@ -0,0 +1,4 @@ +* Added ``--collections`` option to ``butler query-dataset-types`` to allow the resultant dataset types to be constrained by those that are used by specific collections. +* Changed the ``Butler.collections`` property to be a ``ButlerCollections`` instance. + This object can still act as a sequence equivalent to ``ButlerCollections.defaults`` but adds new APIs for querying and manipulating collections. + Any methods with names starting with ``x_`` are deemed to be an experimental API that may change in the future. diff --git a/doc/changes/DM-45738.removal.rst b/doc/changes/DM-45738.removal.rst new file mode 100644 index 0000000000..f165b57d13 --- /dev/null +++ b/doc/changes/DM-45738.removal.rst @@ -0,0 +1,2 @@ +The ``Butler.collection_chains`` property is now deprecated. +Please use ``Butler.collections`` instead. From dc4b3cbb314b6294676436999b6d9e1634926298 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Mon, 19 Aug 2024 14:17:17 -0700 Subject: [PATCH 26/26] Catch and reraise the IntegrityError in collections.x_remove --- python/lsst/daf/butler/_butler_collections.py | 2 +- .../daf/butler/direct_butler/_direct_butler_collections.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/python/lsst/daf/butler/_butler_collections.py b/python/lsst/daf/butler/_butler_collections.py index 71045f2937..8a3d2e2d63 100644 --- a/python/lsst/daf/butler/_butler_collections.py +++ b/python/lsst/daf/butler/_butler_collections.py @@ -384,7 +384,7 @@ def x_remove(self, name: str) -> None: ------ lsst.daf.butler.registry.MissingCollectionError Raised if no collection with the given name exists. 
- sqlalchemy.exc.IntegrityError + lsst.daf.butler.registry.OrphanedRecordError Raised if the database rows associated with the collection are still referenced by some other table, such as a dataset in a datastore (for `~CollectionType.RUN` collections only) or a diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py index baa1c92176..8a4b7c70e8 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py @@ -31,10 +31,12 @@ from collections.abc import Iterable, Sequence, Set +import sqlalchemy from lsst.utils.iteration import ensure_iterable from .._butler_collections import ButlerCollections, CollectionInfo from .._collection_type import CollectionType +from ..registry._exceptions import OrphanedRecordError from ..registry.interfaces import ChainedCollectionRecord from ..registry.sql_registry import SqlRegistry @@ -151,4 +153,7 @@ def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: st return self._registry.registerCollection(name, type, doc) def x_remove(self, name: str) -> None: - self._registry.removeCollection(name) + try: + self._registry.removeCollection(name) + except sqlalchemy.exc.IntegrityError as e: + raise OrphanedRecordError(f"Datasets in run {name} are still referenced elsewhere.") from e