Skip to content

Commit

Permalink
Initial work on extending butler collections API
Browse files Browse the repository at this point in the history
  • Loading branch information
timj committed Aug 15, 2024
1 parent 16a6763 commit 3578e01
Show file tree
Hide file tree
Showing 5 changed files with 235 additions and 36 deletions.
84 changes: 82 additions & 2 deletions python/lsst/daf/butler/_butler_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,31 @@

from __future__ import annotations

__all__ = ("ButlerCollections",)
__all__ = ("ButlerCollections", "CollectionInfo")

from abc import ABC, abstractmethod
from collections.abc import Iterable, Sequence
from collections.abc import Iterable, Sequence, Set
from typing import Any, overload

from pydantic import BaseModel

from ._collection_type import CollectionType


class CollectionInfo(BaseModel):
"""Information about a single Butler collection."""

name: str
"""Name of the collection."""
type: CollectionType
"""Type of the collection."""
doc: str = ""
"""Documentation string associated with this collection."""
children: tuple[str, ...] = tuple()
"""Children of this collection (only if CHAINED)."""
parents: frozenset[str] = frozenset()
"""Any parents of this collection."""


class ButlerCollections(ABC, Sequence):
"""Methods for working with collections stored in the Butler."""
Expand Down Expand Up @@ -190,3 +209,64 @@ def remove_from_chain(
transactions short.
"""
raise NotImplementedError()

@abstractmethod
def query(
self,
expression: str | Iterable[str],
collection_types: Set[CollectionType] | CollectionType | None = None,
flatten_chains: bool = False,
include_chains: bool | None = None,
) -> Sequence[str]:
"""Query the butler for collections matching an expression.
Parameters
----------
expression : `str` or `~collections.abc.Iterable` [ `str` ]
One or more collection names or globs to include in the search.
collection_types : `set` [`CollectionType`], `CollectionType` or `None`
Restrict the types of collections to be searched. If `None` all
collection types are searched.
flatten_chains : `bool`, optional
If `True` (`False` is default), recursively yield the child
collections of matching `~CollectionType.CHAINED` collections.
include_chains : `bool` or `None`, optional
If `True`, yield records for matching `~CollectionType.CHAINED`
collections. Default is the opposite of ``flatten_chains``:
include either CHAINED collections or their children, but not both.
Returns
-------
collections : `~collections.abc.Sequence` [ `str` ]
The names of collections that match ``expression``.
Notes
-----
The order in which collections are returned is unspecified, except that
the children of a `~CollectionType.CHAINED` collection are guaranteed
to be in the order in which they are searched. When multiple parent
`~CollectionType.CHAINED` collections match the same criteria, the
order in which the two lists appear is unspecified, and the lists of
children may be incomplete if a child has multiple parents.
"""
raise NotImplementedError()

@abstractmethod
def get_info(self, name: str, include_doc: bool = False, include_parents: bool = False) -> CollectionInfo:
"""Obtain information for a specific collection.
Parameters
----------
name : `str`
The name of the collection of interest.
include_doc : `bool`, optional
If `True` any documentation about this collection will be included.
include_parents : `bool`, optional
If `True` any parents of this collection will be included.
Returns
-------
info : `CollectionInfo`
Information on the requested collection.
"""
raise NotImplementedError()
36 changes: 34 additions & 2 deletions python/lsst/daf/butler/direct_butler/_direct_butler_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,13 @@

__all__ = ("DirectButlerCollections",)

from collections.abc import Iterable, Sequence
from collections.abc import Iterable, Sequence, Set

from lsst.utils.iteration import ensure_iterable

from .._butler_collections import ButlerCollections
from .._butler_collections import ButlerCollections, CollectionInfo
from ..registry import CollectionType
from ..registry.interfaces import ChainedCollectionRecord
from ..registry.sql_registry import SqlRegistry


Expand Down Expand Up @@ -76,3 +78,33 @@ def remove_from_chain(
return self._registry._managers.collections.remove_from_collection_chain(
parent_collection_name, list(ensure_iterable(child_collection_names))
)

def query(
self,
expression: str | Iterable[str],
collection_types: Set[CollectionType] | CollectionType | None = None,
flatten_chains: bool = False,
include_chains: bool | None = None,
) -> Sequence[str]:
if collection_types is None:
collection_types = CollectionType.all()

Check warning on line 90 in python/lsst/daf/butler/direct_butler/_direct_butler_collections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/direct_butler/_direct_butler_collections.py#L90

Added line #L90 was not covered by tests
return self._registry.queryCollections(
expression,
collectionTypes=collection_types,
flattenChains=flatten_chains,
includeChains=include_chains,
)

def get_info(self, name: str, include_doc: bool = False, include_parents: bool = False) -> CollectionInfo:
record = self._registry.get_collection_record(name)
doc = ""
if include_doc:
doc = self._registry.getCollectionDocumentation(name) or ""

Check warning on line 102 in python/lsst/daf/butler/direct_butler/_direct_butler_collections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/direct_butler/_direct_butler_collections.py#L102

Added line #L102 was not covered by tests
children: tuple[str, ...] = tuple()
if record.type == CollectionType.CHAINED:
assert isinstance(record, ChainedCollectionRecord)
children = tuple(record.children)
parents: set[str] = set()
if include_parents:
parents = self._registry.getCollectionParentChains(name)
return CollectionInfo(name=name, type=record.type, doc=doc, parents=parents, children=children)
5 changes: 3 additions & 2 deletions python/lsst/daf/butler/remote_butler/_remote_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
from .._timespan import Timespan
from ..dimensions import DataId
from ..transfers import RepoExportContext
from ._registry import RemoteButlerRegistry

from ._http_connection import RemoteButlerHttpConnection, parse_model, quote_path_variable

Expand Down Expand Up @@ -146,12 +147,12 @@ def isWriteable(self) -> bool:
@property
def collection_chains(self) -> ButlerCollections:
"""Object with methods for modifying collection chains."""
return RemoteButlerCollections(self._registry)
return RemoteButlerCollections(cast(RemoteButlerRegistry, self._registry))

Check warning on line 150 in python/lsst/daf/butler/remote_butler/_remote_butler.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/remote_butler/_remote_butler.py#L150

Added line #L150 was not covered by tests

@property

Check warning on line 152 in python/lsst/daf/butler/remote_butler/_remote_butler.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/remote_butler/_remote_butler.py#L152

Added line #L152 was not covered by tests
def collections(self) -> ButlerCollections:
"""Object with methods for modifying collection chains."""
return RemoteButlerCollections(self._registry)
return RemoteButlerCollections(cast(RemoteButlerRegistry, self._registry))

Check warning on line 155 in python/lsst/daf/butler/remote_butler/_remote_butler.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/remote_butler/_remote_butler.py#L155

Added line #L155 was not covered by tests

@property
def dimensions(self) -> DimensionUniverse:
Expand Down
94 changes: 94 additions & 0 deletions python/lsst/daf/butler/remote_butler/_remote_butler_collections.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("RemoteButlerCollections",)

from collections.abc import Iterable, Sequence, Set

from .._butler_collections import ButlerCollections, CollectionInfo
from .._collection_type import CollectionType
from ._registry import RemoteButlerRegistry


class RemoteButlerCollections(ButlerCollections):

Check warning on line 39 in python/lsst/daf/butler/remote_butler/_remote_butler_collections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/remote_butler/_remote_butler_collections.py#L39

Added line #L39 was not covered by tests
"""Implementation of ButlerCollections for RemoteButler.
Parameters
----------
registry : `~lsst.daf.butler.registry.sql_registry.SqlRegistry`
Registry object used to work with the collections database.
"""

def __init__(self, registry: RemoteButlerRegistry):
self._registry = registry

Check warning on line 49 in python/lsst/daf/butler/remote_butler/_remote_butler_collections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/remote_butler/_remote_butler_collections.py#L48-L49

Added lines #L48 - L49 were not covered by tests

@property

Check warning on line 51 in python/lsst/daf/butler/remote_butler/_remote_butler_collections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/remote_butler/_remote_butler_collections.py#L51

Added line #L51 was not covered by tests
def defaults(self) -> Sequence[str]:
return self._registry.defaults.collections

Check warning on line 53 in python/lsst/daf/butler/remote_butler/_remote_butler_collections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/remote_butler/_remote_butler_collections.py#L53

Added line #L53 was not covered by tests

def extend_chain(self, parent_collection_name: str, child_collection_names: str | Iterable[str]) -> None:

Check warning on line 55 in python/lsst/daf/butler/remote_butler/_remote_butler_collections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/remote_butler/_remote_butler_collections.py#L55

Added line #L55 was not covered by tests
raise NotImplementedError("Not yet available")

def prepend_chain(self, parent_collection_name: str, child_collection_names: str | Iterable[str]) -> None:

Check warning on line 58 in python/lsst/daf/butler/remote_butler/_remote_butler_collections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/remote_butler/_remote_butler_collections.py#L58

Added line #L58 was not covered by tests
raise NotImplementedError("Not yet available")

def redefine_chain(

Check warning on line 61 in python/lsst/daf/butler/remote_butler/_remote_butler_collections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/remote_butler/_remote_butler_collections.py#L61

Added line #L61 was not covered by tests
self, parent_collection_name: str, child_collection_names: str | Iterable[str]
) -> None:
raise NotImplementedError("Not yet available")

def remove_from_chain(

Check warning on line 66 in python/lsst/daf/butler/remote_butler/_remote_butler_collections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/remote_butler/_remote_butler_collections.py#L66

Added line #L66 was not covered by tests
self, parent_collection_name: str, child_collection_names: str | Iterable[str]
) -> None:
raise NotImplementedError("Not yet available")

def query(

Check warning on line 71 in python/lsst/daf/butler/remote_butler/_remote_butler_collections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/remote_butler/_remote_butler_collections.py#L71

Added line #L71 was not covered by tests
self,
expression: str | Iterable[str],
collection_types: Set[CollectionType] | CollectionType | None = None,
flatten_chains: bool = False,
include_chains: bool | None = None,
) -> Sequence[str]:
if collection_types is None:
collection_types = CollectionType.all()
return self._registry.queryCollections(

Check warning on line 80 in python/lsst/daf/butler/remote_butler/_remote_butler_collections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/remote_butler/_remote_butler_collections.py#L79-L80

Added lines #L79 - L80 were not covered by tests
expression,
collectionTypes=collection_types,
flattenChains=flatten_chains,
includeChains=include_chains,
)

def get_info(self, name: str, include_doc: bool = False, include_parents: bool = False) -> CollectionInfo:
info = self._registry._get_collection_info(

Check warning on line 88 in python/lsst/daf/butler/remote_butler/_remote_butler_collections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/remote_butler/_remote_butler_collections.py#L87-L88

Added lines #L87 - L88 were not covered by tests
name, include_doc=include_doc, include_parents=include_parents
)
doc = info.doc or ""
children = info.children or ()
parents = info.parents or set()
return CollectionInfo(name=name, type=info.type, doc=doc, parents=parents, children=children)

Check warning on line 94 in python/lsst/daf/butler/remote_butler/_remote_butler_collections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/remote_butler/_remote_butler_collections.py#L91-L94

Added lines #L91 - L94 were not covered by tests
52 changes: 22 additions & 30 deletions python/lsst/daf/butler/script/queryCollections.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,38 +71,34 @@ def _getTable(
dtype=(str, str, str),
)
butler = Butler.from_config(repo)
names = sorted(
butler.registry.queryCollections(collectionTypes=frozenset(collection_type), expression=glob or ...)
)
names = sorted(butler.collections.query(glob or "*", collection_types=frozenset(collection_type)))
if inverse:
for name in names:
type = butler.registry.getCollectionType(name)
parentNames = butler.registry.getCollectionParentChains(name)
if parentNames:
info = butler.collections.get_info(name, include_parents=True)
if info.parents:
first = True
for parentName in sorted(parentNames):
table.add_row((name if first else "", type.name if first else "", parentName))
for parentName in sorted(info.parents):
table.add_row((name if first else "", info.type.name if first else "", parentName))
first = False
else:
table.add_row((name, type.name, ""))
table.add_row((name, info.type.name, ""))
# If none of the datasets has a parent dataset then remove the
# description column.
if not any(c for c in table[descriptionCol]):
del table[descriptionCol]
else:
for name in names:
type = butler.registry.getCollectionType(name)
if type == CollectionType.CHAINED:
children = butler.registry.getCollectionChain(name)
if children:
info = butler.collections.get_info(name)
if info.type == CollectionType.CHAINED:
if info.children:
first = True
for child in children:
table.add_row((name if first else "", type.name if first else "", child))
for child in info.children:
table.add_row((name if first else "", info.type.name if first else "", child))
first = False
else:
table.add_row((name, type.name, ""))
table.add_row((name, info.type.name, ""))

Check warning on line 99 in python/lsst/daf/butler/script/queryCollections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/script/queryCollections.py#L99

Added line #L99 was not covered by tests
else:
table.add_row((name, type.name, ""))
table.add_row((name, info.type.name, ""))
# If there aren't any CHAINED datasets in the results then remove the
# description column.
if not any(columnVal == CollectionType.CHAINED.name for columnVal in table[typeCol]):
Expand Down Expand Up @@ -147,21 +143,17 @@ def _getTree(
butler = Butler.from_config(repo, without_datastore=True)

def addCollection(name: str, level: int = 0) -> None:
collectionType = butler.registry.getCollectionType(name)
table.add_row((" " * level + name, collectionType.name))
info = butler.collections.get_info(name, include_parents=inverse)
table.add_row((" " * level + name, info.type.name))
if inverse:
parentNames = butler.registry.getCollectionParentChains(name)
for pname in sorted(parentNames):
for pname in sorted(info.parents):
addCollection(pname, level + 1)
else:
if collectionType == CollectionType.CHAINED:
childNames = butler.registry.getCollectionChain(name)
for name in childNames:
if info.type == CollectionType.CHAINED:
for name in info.children:
addCollection(name, level + 1)

collections = butler.registry.queryCollections(
collectionTypes=frozenset(collection_type), expression=glob or ...
)
collections = butler.collections.query(glob or "*", collection_types=frozenset(collection_type))
for collection in sorted(collections):
addCollection(collection)
return table
Expand All @@ -174,12 +166,12 @@ def _getFlatten(
) -> Table:
butler = Butler.from_config(repo)
collectionNames = list(
butler.registry.queryCollections(
collectionTypes=frozenset(collection_type), flattenChains=True, expression=glob or ...
butler.collections.query(
glob or "*", collection_types=frozenset(collection_type), flatten_chains=True
)
)

collectionTypes = [butler.registry.getCollectionType(c).name for c in collectionNames]
collectionTypes = [butler.collections.get_info(c).type.name for c in collectionNames]
return Table((collectionNames, collectionTypes), names=("Name", "Type"))


Expand Down

0 comments on commit 3578e01

Please sign in to comment.