Use query-results' natural grouping by dataset type when possible.
When processing all dataset types in a collection together, this can yield a
huge decrease in memory usage, because only one dataset type is queried for
and processed at a time.
TallJimbo committed Jul 14, 2023
1 parent 9af7b4f commit a3ec38a
Showing 4 changed files with 79 additions and 5 deletions.
1 change: 1 addition & 0 deletions doc/changes/DM-39939.perf.md
@@ -0,0 +1 @@
When passing lazy query-results objects directly to various registry methods (`associate`, `disassociate`, `removeDatasets`, and `certify`), query and process one dataset type at a time instead of querying for all of them and grouping by type in Python.
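
As a hedged sketch of the caller-visible effect (the repository path and collection names below are hypothetical), passing the lazy query-results object straight to a registry method now avoids materializing and grouping every ref in Python:

from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", writeable=True)  # hypothetical repository
# `...` means "all dataset types"; the result is a lazy query-results object.
refs = butler.registry.queryDatasets(..., collections="my_run")
# associate() now queries and processes one dataset type at a time instead
# of fetching every ref up front and grouping them in Python.
butler.registry.associate("my_tagged_collection", refs)
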
53 changes: 52 additions & 1 deletion python/lsst/daf/butler/core/datasets/ref.py
@@ -33,7 +33,7 @@
import sys
import uuid
from collections.abc import Iterable
-from typing import TYPE_CHECKING, Any, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar, Protocol, runtime_checkable

from lsst.utils.classes import immutable

@@ -62,6 +62,27 @@ class AmbiguousDatasetError(Exception):
"""


@runtime_checkable
class _DatasetRefGroupedIterable(Protocol):
    """A package-private interface for iterables of `DatasetRef` that know
    how to efficiently group their contents by `DatasetType`.
    """

    def _iter_by_dataset_type(self) -> Iterable[tuple[DatasetType, Iterable[DatasetRef]]]:
        """Iterate over `DatasetRef` instances, one `DatasetType` at a time.

        Returns
        -------
        grouped : `~collections.abc.Iterable` [ `tuple` [ `DatasetType`, \
                `~collections.abc.Iterable` [ `DatasetRef` ] ] ]
            An iterable of tuples, in which the first element is a dataset
            type and the second is an iterable of `DatasetRef` objects with
            exactly that dataset type.
        """
        ...


class DatasetIdGenEnum(enum.Enum):
    """Enum used to specify dataset ID generation options."""

@@ -580,12 +601,42 @@ def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.

        Notes
        -----
        When lazy item-iterables are acceptable instead of a full mapping,
        `iter_by_type` can in some cases be far more efficient.
        """
        result: NamedKeyDict[DatasetType, list[DatasetRef]] = NamedKeyDict()
        for ref in refs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result

    @staticmethod
    def iter_by_type(
        refs: Iterable[DatasetRef],
    ) -> Iterable[tuple[DatasetType, Iterable[DatasetRef]]]:
        """Group an iterable of `DatasetRef` by `DatasetType` with special
        hooks for custom iterables that can do this efficiently.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.  If this satisfies the
            `_DatasetRefGroupedIterable` protocol, its
            `~_DatasetRefGroupedIterable._iter_by_dataset_type` method will
            be called.

        Returns
        -------
        grouped : `~collections.abc.Iterable` [ `tuple` [ `DatasetType`, \
                `~collections.abc.Iterable` [ `DatasetRef` ] ] ]
            Grouped `DatasetRef` instances.
        """
        if isinstance(refs, _DatasetRefGroupedIterable):
            return refs._iter_by_dataset_type()
        return DatasetRef.groupByType(refs).items()
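
A short usage sketch (assuming only that `lsst.daf.butler` is importable): with a plain list the call simply falls back to `groupByType`, while query-results objects take the per-type fast path.

from lsst.daf.butler import DatasetRef

refs: list[DatasetRef] = []  # in practice, refs gathered from earlier calls
for dataset_type, refs_for_type in DatasetRef.iter_by_type(refs):
    # With a plain list this is groupByType(refs).items(); a query-results
    # object would instead yield one lazily-queried group per dataset type.
    print(dataset_type.name, len(list(refs_for_type)))
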

    def makeCompositeRef(self) -> DatasetRef:
        """Create a `DatasetRef` of the composite from a component ref.
8 changes: 4 additions & 4 deletions python/lsst/daf/butler/registries/sql.py
@@ -679,7 +679,7 @@ def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
        for datasetType, refsForType in progress.iter_item_chunks(
-            DatasetRef.groupByType(refs).items(), desc="Removing datasets by type"
+            DatasetRef.iter_by_type(refs), desc="Removing datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            try:
@@ -699,7 +699,7 @@ def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED."
)
for datasetType, refsForType in progress.iter_item_chunks(
DatasetRef.groupByType(refs).items(), desc="Associating datasets by type"
DatasetRef.iter_by_type(refs), desc="Associating datasets by type"
):
storage = self._managers.datasets[datasetType.name]
try:
@@ -727,7 +727,7 @@ def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
)
for datasetType, refsForType in progress.iter_item_chunks(
DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type"
DatasetRef.iter_by_type(refs), desc="Disassociating datasets by type"
):
storage = self._managers.datasets[datasetType.name]
storage.disassociate(collectionRecord, refsForType)
@@ -740,7 +740,7 @@ def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
        progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        for datasetType, refsForType in progress.iter_item_chunks(
-            DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type"
+            DatasetRef.iter_by_type(refs), desc="Certifying datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            storage.certify(
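All four registry methods now share the same shape: one storage-manager lookup and one bulk operation per dataset type, with at most one type's refs in flight. A minimal sketch of that pattern (the `apply_by_type` helper and its parameters are hypothetical, written only to illustrate the flow; they are not daf_butler API):

from collections.abc import Callable, Iterable, Mapping
from typing import Any

from lsst.daf.butler import DatasetRef

def apply_by_type(
    storages: Mapping[str, Any],
    refs: Iterable[DatasetRef],
    operation: Callable[[Any, Iterable[DatasetRef]], None],
) -> None:
    # One lookup and one bulk call per dataset type; when `refs` is a lazy
    # query-results object, refs for other types are never held in memory
    # at the same time.
    for dataset_type, refs_for_type in DatasetRef.iter_by_type(refs):
        operation(storages[dataset_type.name], refs_for_type)
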
22 changes: 22 additions & 0 deletions python/lsst/daf/butler/registry/queries/_results.py
@@ -529,6 +529,21 @@ def explain_no_results(self, execute: bool = True) -> Iterable[str]:
"""
raise NotImplementedError()

    def _iter_by_dataset_type(self) -> Iterator[tuple[DatasetType, Iterable[DatasetRef]]]:
        """Group results by dataset type.

        This is a private hook for the interface defined by
        `DatasetRef.iter_by_type`, enabling much more efficient
        processing of heterogeneous `DatasetRef` iterables when they come
        directly from queries.
        """
        for parent_results in self.byParentDatasetType():
            for component in parent_results.components:
                dataset_type = parent_results.parentDatasetType
                if component is not None:
                    dataset_type = dataset_type.makeComponentDatasetType(component)
                yield (dataset_type, parent_results.withComponents((component,)))


class ParentDatasetQueryResults(DatasetQueryResults):
    """An object that represents results from a query for datasets with a
@@ -570,6 +585,13 @@ def __iter__(self) -> Iterator[DatasetRef]:
    def __repr__(self) -> str:
        return f"<DatasetRef iterator for [components of] {self._dataset_type.name}>"

    @property
    def components(self) -> Sequence[str | None]:
        """The components of the parent dataset type included in these results
        (`~collections.abc.Sequence` [ `str` or `None` ]).
        """
        return self._components

    def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
        # Docstring inherited from DatasetQueryResults.
        yield self
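Putting the pieces together, a usage sketch (it assumes an existing butler repository at a hypothetical path, so it is illustrative rather than directly runnable): because `DatasetQueryResults` implements `_iter_by_dataset_type`, iterating its results by type issues one narrower query per dataset type instead of fetching and grouping every ref eagerly.

from lsst.daf.butler import Butler, DatasetRef

butler = Butler("/path/to/repo")  # hypothetical repository
results = butler.registry.queryDatasets(..., collections="my_run")
for dataset_type, refs_for_type in DatasetRef.iter_by_type(results):
    # Each iteration runs and consumes a query restricted to one dataset
    # type, so memory stays bounded by the largest single group.
    print(dataset_type.name, sum(1 for _ in refs_for_type))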
