
DM-46129: Make collections.query_info a single HTTP call #1074

Merged (8 commits, Sep 11, 2024)
9 changes: 6 additions & 3 deletions python/lsst/daf/butler/_butler_collections.py
@@ -45,6 +45,9 @@
 class CollectionInfo(BaseModel):
     """Information about a single Butler collection."""

+    # This class is serialized for the server API -- any new properties you add
+    # must have default values provided to preserve backwards compatibility.
+
     name: str
     """Name of the collection."""
     type: CollectionType
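
Aside on that new comment: CollectionInfo is a Pydantic model exchanged between client and server, so a newly added property needs a default or payloads from older peers stop validating. A minimal sketch of the pattern, with a hypothetical new_field and the model trimmed down:

from pydantic import BaseModel


class TrimmedCollectionInfo(BaseModel):
    """Trimmed stand-in for CollectionInfo."""

    name: str
    # A property added after the initial release must carry a default so
    # that payloads from peers that predate it (and so omit it) still parse.
    new_field: str = ""  # hypothetical new property


# An older payload without "new_field" validates cleanly:
info = TrimmedCollectionInfo.model_validate({"name": "HSC/RC2"})
assert info.new_field == ""
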
@@ -280,7 +283,7 @@ def query_info(
         include_parents: bool = False,
         include_summary: bool = False,
         include_doc: bool = False,
-        summary_datasets: Iterable[DatasetType] | None = None,
+        summary_datasets: Iterable[DatasetType] | Iterable[str] | None = None,
Contributor: Should we allow `Iterable[str]` only, to avoid all the complications? I think that clients of this method that actually pass summary_datasets should already have a list of names for other purposes. I guess this also means that the datasets manager has to accept `Iterable[str]` instead of `Iterable[DatasetType]`?

Contributor (author): I was thinking the same thing, but the convention in the rest of the new query system is that dataset types can be specified as either the name or a `DatasetType` instance, so it makes sense to follow that. It's also common for this parameter to come from the result of a `queryDatasetTypes` call.

     ) -> Sequence[CollectionInfo]:
         """Query the butler for collections matching an expression and
         return detailed information about those collections.
@@ -307,8 +310,8 @@ def query_info(
         include_doc : `bool`, optional
             Whether the returned information includes collection documentation
             string.
-        summary_datasets : `~collections.abc.Iterable` [ `DatasetType` ], \
-            optional
+        summary_datasets : `~collections.abc.Iterable` [ `DatasetType` ] or \
+            `~collections.abc.Iterable` [ `str` ], optional
             Dataset types to include in returned summaries. Only used if
             ``include_summary`` is `True`. If not specified then all dataset
             types will be included.
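To make the widened parameter concrete, a sketch of both call styles; the repository path, collection pattern, and dataset type names below are placeholders, not taken from this PR:

from lsst.daf.butler import Butler

butler = Butler("repo")  # placeholder repository

# Dataset types given as plain names:
info = butler.collections.query_info(
    "HSC/runs/*",
    include_summary=True,
    summary_datasets=["calexp", "src"],
)

# Or as DatasetType instances, e.g. straight from a queryDatasetTypes call:
dataset_types = butler.registry.queryDatasetTypes("calexp")
info = butler.collections.query_info(
    "HSC/runs/*",
    include_summary=True,
    summary_datasets=dataset_types,
)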
22 changes: 22 additions & 0 deletions python/lsst/daf/butler/_dataset_type.py
@@ -796,3 +796,25 @@
         arguments as well as positional arguments.
     """
     return factory(*args, **kwargs)
+
+
+def get_dataset_type_name(datasetTypeOrName: DatasetType | str) -> str:
Contributor: Maybe make it a classmethod of `DatasetType` to avoid extra imports?

"""Given a `DatasetType` object or a dataset type name, return a dataset
type name.

Parameters
----------
datasetTypeOrName : `DatasetType` | `str`
A DatasetType, or the name of a DatasetType.

Returns
-------
name
The name associated with the given DatasetType, or the given string.
"""
Contributor: Docstring also needs Returns.

+    if isinstance(datasetTypeOrName, DatasetType):
+        return datasetTypeOrName.name
+    elif isinstance(datasetTypeOrName, str):
+        return datasetTypeOrName
+    else:
+        raise TypeError(f"Expected DatasetType or str, got unexpected object: {datasetTypeOrName}")

Codecov / codecov/patch warning: added line python/lsst/daf/butler/_dataset_type.py#L820 was not covered by tests.
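
A short sketch of the helper's contract; constructing a real DatasetType requires a dimension universe, so that branch is shown as a comment rather than executed:

from lsst.daf.butler._dataset_type import get_dataset_type_name

# A plain string passes straight through:
assert get_dataset_type_name("calexp") == "calexp"

# A DatasetType instance is reduced to its name:
#     get_dataset_type_name(dataset_type) == dataset_type.name

# Anything else raises TypeError -- the branch Codecov flagged as untested:
try:
    get_dataset_type_name(42)  # type: ignore[arg-type]
except TypeError:
    pass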
89 changes: 44 additions & 45 deletions python/lsst/daf/butler/direct_butler/_direct_butler_collections.py
@@ -114,55 +114,54 @@ def query_info(
         include_parents: bool = False,
         include_summary: bool = False,
         include_doc: bool = False,
-        summary_datasets: Iterable[DatasetType] | None = None,
+        summary_datasets: Iterable[DatasetType] | Iterable[str] | None = None,
     ) -> Sequence[CollectionInfo]:
         info = []
-        with self._registry.caching_context():
-            if collection_types is None:
-                collection_types = CollectionType.all()
-            elif isinstance(collection_types, CollectionType):
-                collection_types = {collection_types}
-
-            records = self._registry._managers.collections.resolve_wildcard(
-                CollectionWildcard.from_expression(expression),
-                collection_types=collection_types,
-                flatten_chains=flatten_chains,
-                include_chains=include_chains,
-            )
+        if collection_types is None:
+            collection_types = CollectionType.all()
+        elif isinstance(collection_types, CollectionType):
+            collection_types = {collection_types}
+
+        records = self._registry._managers.collections.resolve_wildcard(
+            CollectionWildcard.from_expression(expression),
+            collection_types=collection_types,
+            flatten_chains=flatten_chains,
+            include_chains=include_chains,
+        )

-            summaries: Mapping[Any, CollectionSummary] = {}
-            if include_summary:
-                summaries = self._registry._managers.datasets.fetch_summaries(records, summary_datasets)
-
-            docs: Mapping[Any, str] = {}
-            if include_doc:
-                docs = self._registry._managers.collections.get_docs(record.key for record in records)
-
-            for record in records:
-                doc = docs.get(record.key, "")
-                children: tuple[str, ...] = tuple()
-                if record.type == CollectionType.CHAINED:
-                    assert isinstance(record, ChainedCollectionRecord)
-                    children = tuple(record.children)
-                parents: frozenset[str] | None = None
-                if include_parents:
-                    # TODO: This is non-vectorized, so expensive to do in a
-                    # loop.
-                    parents = frozenset(self._registry.getCollectionParentChains(record.name))
-                dataset_types: Set[str] | None = None
-                if summary := summaries.get(record.key):
-                    dataset_types = frozenset([dt.name for dt in summary.dataset_types])
-
-                info.append(
-                    CollectionInfo(
-                        name=record.name,
-                        type=record.type,
-                        doc=doc,
-                        parents=parents,
-                        children=children,
-                        dataset_types=dataset_types,
-                    )
-                )
+        summaries: Mapping[Any, CollectionSummary] = {}
+        if include_summary:
+            summaries = self._registry._managers.datasets.fetch_summaries(records, summary_datasets)
+
+        docs: Mapping[Any, str] = {}
+        if include_doc:
+            docs = self._registry._managers.collections.get_docs(record.key for record in records)
+
+        for record in records:
+            doc = docs.get(record.key, "")
+            children: tuple[str, ...] = tuple()
+            if record.type == CollectionType.CHAINED:
+                assert isinstance(record, ChainedCollectionRecord)
+                children = tuple(record.children)
+            parents: frozenset[str] | None = None
+            if include_parents:
+                # TODO: This is non-vectorized, so expensive to do in a
+                # loop.
+                parents = frozenset(self._registry.getCollectionParentChains(record.name))
+            dataset_types: Set[str] | None = None
+            if summary := summaries.get(record.key):
+                dataset_types = frozenset([dt.name for dt in summary.dataset_types])
+
+            info.append(
+                CollectionInfo(
+                    name=record.name,
+                    type=record.type,
+                    doc=doc,
+                    parents=parents,
+                    children=children,
+                    dataset_types=dataset_types,
+                )
+            )

         return info
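
The shape of the rewrite is the interesting part: summaries and docs are now fetched for all records up front, leaving only the parent-chain lookup per record (hence the surviving TODO). A self-contained sketch of that batched shape, using stand-in types rather than the real managers:

from dataclasses import dataclass


@dataclass(frozen=True)
class Record:
    key: int
    name: str


def fetch_summaries_batch(records: list[Record]) -> dict[int, str]:
    """Stand-in for a vectorized fetch: one query serves every record."""
    return {r.key: f"summary of {r.name}" for r in records}


def build_infos(records: list[Record]) -> list[tuple[str, str | None]]:
    # One round trip up front, then a pure in-memory loop -- the shape
    # query_info now has for summaries and docs.
    summaries = fetch_summaries_batch(records)
    return [(r.name, summaries.get(r.key)) for r in records]


print(build_infos([Record(1, "a"), Record(2, "b")]))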

@@ -11,7 +11,8 @@

 import sqlalchemy

-from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType
+from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
+from ...._dataset_type import DatasetType, get_dataset_type_name
 from ...._exceptions_legacy import DatasetTypeError
 from ....dimensions import DimensionUniverse
 from ..._collection_summary import CollectionSummary
@@ -511,12 +512,14 @@ def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
         return summaries[collection.key]

     def fetch_summaries(
-        self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
+        self,
+        collections: Iterable[CollectionRecord],
+        dataset_types: Iterable[DatasetType] | Iterable[str] | None = None,
     ) -> Mapping[Any, CollectionSummary]:
         # Docstring inherited from DatasetRecordStorageManager.
         dataset_type_names: Iterable[str] | None = None
         if dataset_types is not None:
-            dataset_type_names = set(dataset_type.name for dataset_type in dataset_types)
+            dataset_type_names = set(get_dataset_type_name(dt) for dt in dataset_types)
         return self._summaries.fetch_summaries(collections, dataset_type_names, self._dataset_type_from_row)

     _versions: list[VersionTuple]
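
The manager reduces whatever it is given to a set of names once, at the boundary. A minimal, self-contained sketch of that idiom; FakeDatasetType stands in for the real class:

from dataclasses import dataclass


@dataclass(frozen=True)
class FakeDatasetType:  # stand-in for lsst.daf.butler.DatasetType
    name: str


def to_name(dt: FakeDatasetType | str) -> str:
    # Mirrors get_dataset_type_name: objects yield .name, strings pass through.
    return dt.name if isinstance(dt, FakeDatasetType) else dt


mixed = [FakeDatasetType("calexp"), "src"]
assert {to_name(dt) for dt in mixed} == {"calexp", "src"}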
4 changes: 3 additions & 1 deletion python/lsst/daf/butler/registry/interfaces/_datasets.py
@@ -674,7 +674,9 @@ def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:

     @abstractmethod
     def fetch_summaries(
-        self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
+        self,
+        collections: Iterable[CollectionRecord],
+        dataset_types: Iterable[DatasetType] | Iterable[str] | None = None,
     ) -> Mapping[Any, CollectionSummary]:
         """Fetch collection summaries given their names and dataset types.

60 changes: 60 additions & 0 deletions python/lsst/daf/butler/remote_butler/_defaults.py
@@ -0,0 +1,60 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from ..registry import RegistryDefaults


class DefaultsHolder:
    """Holds a `RegistryDefaults` object and allows it to be set.

    Parameters
    ----------
    defaults : `RegistryDefaults`
        Initial value for the defaults object.

    Notes
    -----
    This exists to work around circular dependency issues (RemoteButler,
    ButlerCollections, and Registry all need to know/modify the defaults.)
    """

    def __init__(self, defaults: RegistryDefaults) -> None:
        self._defaults = defaults

    def get(self) -> RegistryDefaults:
        """Retrieve the current registry defaults."""
        return self._defaults

    def set(self, defaults: RegistryDefaults) -> None:
        """Set a new value for the registry defaults.

        Parameters
        ----------
        defaults : `RegistryDefaults`
            New value for defaults object.
        """
        self._defaults = defaults
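
A sketch of why the holder breaks the cycle: each collaborator keeps a reference to the same DefaultsHolder, so a later set() is visible everywhere without the classes importing one another. The stand-in classes below are illustrative, not the real RemoteButler wiring:

class Defaults:  # stand-in for RegistryDefaults
    def __init__(self, collections: list[str]) -> None:
        self.collections = collections


class Holder:  # same get/set shape as DefaultsHolder
    def __init__(self, defaults: Defaults) -> None:
        self._defaults = defaults

    def get(self) -> Defaults:
        return self._defaults

    def set(self, defaults: Defaults) -> None:
        self._defaults = defaults


holder = Holder(Defaults(["ingest/run"]))
registry_view = holder      # Registry holds the holder, not the value
collections_view = holder   # ButlerCollections holds the same holder

holder.set(Defaults(["HSC/defaults"]))
assert registry_view.get().collections == ["HSC/defaults"]
assert collections_view.get().collections == ["HSC/defaults"]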
9 changes: 2 additions & 7 deletions python/lsst/daf/butler/remote_butler/_ref_utils.py
@@ -36,7 +36,7 @@
 from pydantic import TypeAdapter

 from .._dataset_ref import DatasetRef
-from .._dataset_type import DatasetType
+from .._dataset_type import DatasetType, get_dataset_type_name
 from .._storage_class import StorageClass
 from ..dimensions import DataCoordinate, DataId, DataIdValue, SerializedDataId
 from .server_models import DatasetTypeName
@@ -85,12 +85,7 @@ def normalize_dataset_type_name(datasetTypeOrName: DatasetType | str) -> DatasetTypeName:
         A DatasetType, or the name of a DatasetType. This union is a common
         parameter in many `Butler` methods.
     """
-    if isinstance(datasetTypeOrName, DatasetType):
-        return DatasetTypeName(datasetTypeOrName.name)
-    elif isinstance(datasetTypeOrName, str):
-        return DatasetTypeName(datasetTypeOrName)
-    else:
-        raise TypeError(f"Got unexpected object for DatasetType: {datasetTypeOrName}")
+    return DatasetTypeName(get_dataset_type_name(datasetTypeOrName))


 def simplify_dataId(dataId: DataId | None, kwargs: dict[str, DataIdValue]) -> SerializedDataId: