From a3ec38ab357049766cfdc663033f32a1c2589815 Mon Sep 17 00:00:00 2001
From: Jim Bosch
Date: Sat, 8 Jul 2023 11:50:57 -0400
Subject: [PATCH] Use query-results' natural grouping by dataset type when
 possible.

When processing all dataset types in a collection together, this can
dramatically decrease memory usage, because only one dataset type is
queried for and processed at a time.
---
 doc/changes/DM-39939.perf.md                |  1 +
 python/lsst/daf/butler/core/datasets/ref.py | 53 ++++++++++++++++++-
 python/lsst/daf/butler/registries/sql.py    |  8 +--
 .../daf/butler/registry/queries/_results.py | 22 ++++++++
 4 files changed, 79 insertions(+), 5 deletions(-)
 create mode 100644 doc/changes/DM-39939.perf.md

diff --git a/doc/changes/DM-39939.perf.md b/doc/changes/DM-39939.perf.md
new file mode 100644
index 0000000000..abe13aeb64
--- /dev/null
+++ b/doc/changes/DM-39939.perf.md
@@ -0,0 +1 @@
+When passing lazy query-results objects directly to various registry methods (`associate`, `disassociate`, `removeDatasets`, and `certify`), query and process one dataset type at a time instead of querying for all of them and grouping by type in Python.
diff --git a/python/lsst/daf/butler/core/datasets/ref.py b/python/lsst/daf/butler/core/datasets/ref.py
index 9c7faefd63..d036c8857f 100644
--- a/python/lsst/daf/butler/core/datasets/ref.py
+++ b/python/lsst/daf/butler/core/datasets/ref.py
@@ -33,7 +33,7 @@
 import sys
 import uuid
 from collections.abc import Iterable
-from typing import TYPE_CHECKING, Any, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar, Protocol, runtime_checkable
 
 from lsst.utils.classes import immutable
 
@@ -62,6 +62,27 @@ class AmbiguousDatasetError(Exception):
     """
 
 
+@runtime_checkable
+class _DatasetRefGroupedIterable(Protocol):
+    """A package-private interface for iterables of `DatasetRef` that know how
+    to efficiently group their contents by `DatasetType`.
+
+    """
+
+    def _iter_by_dataset_type(self) -> Iterable[tuple[DatasetType, Iterable[DatasetRef]]]:
+        """Iterate over `DatasetRef` instances, one `DatasetType` at a time.
+
+        Returns
+        -------
+        grouped : `~collections.abc.Iterable` [ `tuple` [ `DatasetType`, \
+                `~collections.abc.Iterable` [ `DatasetRef` ] ] ]
+            An iterable of tuples, in which the first element is a dataset
+            type and the second is an iterable of `DatasetRef` objects with
+            exactly that dataset type.
+        """
+        ...
+
+
 class DatasetIdGenEnum(enum.Enum):
     """Enum used to specify dataset ID generation options."""
 
@@ -580,12 +601,42 @@ def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, list[Da
         -------
         grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
             Grouped `DatasetRef` instances.
+
+        Notes
+        -----
+        When a lazy iterable of grouped refs is acceptable instead of a full
+        mapping, `iter_by_type` can in some cases be far more efficient.
         """
         result: NamedKeyDict[DatasetType, list[DatasetRef]] = NamedKeyDict()
         for ref in refs:
             result.setdefault(ref.datasetType, []).append(ref)
         return result
 
+    @staticmethod
+    def iter_by_type(
+        refs: Iterable[DatasetRef],
+    ) -> Iterable[tuple[DatasetType, Iterable[DatasetRef]]]:
+        """Group an iterable of `DatasetRef` by `DatasetType`, with a special
+        hook for custom iterables that can do this efficiently.
+
+        Parameters
+        ----------
+        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
+            `DatasetRef` instances to group.  If this satisfies the
+            `_DatasetRefGroupedIterable` protocol, its
+            `~_DatasetRefGroupedIterable._iter_by_dataset_type` method will
+            be called.
+
+        Returns
+        -------
+        grouped : `~collections.abc.Iterable` [ `tuple` [ `DatasetType`, \
+                `~collections.abc.Iterable` [ `DatasetRef` ] ] ]
+            Grouped `DatasetRef` instances.
+        """
+        if isinstance(refs, _DatasetRefGroupedIterable):
+            return refs._iter_by_dataset_type()
+        return DatasetRef.groupByType(refs).items()
+
     def makeCompositeRef(self) -> DatasetRef:
         """Create a `DatasetRef` of the composite from a component ref.
 
diff --git a/python/lsst/daf/butler/registries/sql.py b/python/lsst/daf/butler/registries/sql.py
index dc112eeafb..18983607da 100644
--- a/python/lsst/daf/butler/registries/sql.py
+++ b/python/lsst/daf/butler/registries/sql.py
@@ -679,7 +679,7 @@ def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
         # Docstring inherited from lsst.daf.butler.registry.Registry
         progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
         for datasetType, refsForType in progress.iter_item_chunks(
-            DatasetRef.groupByType(refs).items(), desc="Removing datasets by type"
+            DatasetRef.iter_by_type(refs), desc="Removing datasets by type"
         ):
             storage = self._managers.datasets[datasetType.name]
             try:
@@ -699,7 +699,7 @@ def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
                 f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED."
             )
         for datasetType, refsForType in progress.iter_item_chunks(
-            DatasetRef.groupByType(refs).items(), desc="Associating datasets by type"
+            DatasetRef.iter_by_type(refs), desc="Associating datasets by type"
         ):
             storage = self._managers.datasets[datasetType.name]
             try:
@@ -727,7 +727,7 @@ def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
                 f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
             )
         for datasetType, refsForType in progress.iter_item_chunks(
-            DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type"
+            DatasetRef.iter_by_type(refs), desc="Disassociating datasets by type"
         ):
             storage = self._managers.datasets[datasetType.name]
             storage.disassociate(collectionRecord, refsForType)
@@ -740,7 +740,7 @@ def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespa
         progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
         collectionRecord = self._managers.collections.find(collection)
         for datasetType, refsForType in progress.iter_item_chunks(
-            DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type"
+            DatasetRef.iter_by_type(refs), desc="Certifying datasets by type"
         ):
             storage = self._managers.datasets[datasetType.name]
             storage.certify(
diff --git a/python/lsst/daf/butler/registry/queries/_results.py b/python/lsst/daf/butler/registry/queries/_results.py
index ac2073ea92..c024037455 100644
--- a/python/lsst/daf/butler/registry/queries/_results.py
+++ b/python/lsst/daf/butler/registry/queries/_results.py
@@ -529,6 +529,21 @@ def explain_no_results(self, execute: bool = True) -> Iterable[str]:
         """
         raise NotImplementedError()
 
+    def _iter_by_dataset_type(self) -> Iterator[tuple[DatasetType, Iterable[DatasetRef]]]:
+        """Group results by dataset type.
+
+        This is a private hook for the interface defined by
+        `DatasetRef.iter_by_type`, enabling much more efficient
+        processing of heterogeneous `DatasetRef` iterables when they come
+        directly from queries.
+ """ + for parent_results in self.byParentDatasetType(): + for component in parent_results.components: + dataset_type = parent_results.parentDatasetType + if component is not None: + dataset_type = dataset_type.makeComponentDatasetType(component) + yield (dataset_type, parent_results.withComponents((component,))) + class ParentDatasetQueryResults(DatasetQueryResults): """An object that represents results from a query for datasets with a @@ -570,6 +585,13 @@ def __iter__(self) -> Iterator[DatasetRef]: def __repr__(self) -> str: return f"" + @property + def components(self) -> Sequence[str | None]: + """The components of the parent dataset type included in these results + (`~collections.abc.Sequence` [ `str` or `None` ]). + """ + return self._components + def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]: # Docstring inherited from DatasetQueryResults. yield self