diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index b3a368e223..a4d3000e07 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -226,10 +226,10 @@ def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool ) @classmethod - def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: + def makeTableSpec(cls) -> ddl.TableSpec: return ddl.TableSpec( fields=[ - ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), + ddl.FieldSpec(name="dataset_id", dtype=ddl.GUID, primaryKey=True), ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), @@ -274,9 +274,7 @@ def __init__( self._opaque_table_name = self.config["records", "table"] try: # Storage of paths and formatters, keyed by dataset_id - self._table = bridgeManager.opaque.register( - self._opaque_table_name, self.makeTableSpec(bridgeManager.datasetIdColumnType) - ) + self._table = bridgeManager.opaque.register(self._opaque_table_name, self.makeTableSpec()) # Interface to Registry. self._bridge = bridgeManager.register(self.name) except ReadOnlyDatabaseError: @@ -2855,7 +2853,7 @@ def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef: def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]: # Docstring inherited from the base class. - return {self._opaque_table_name: DatastoreOpaqueTable(self.makeTableSpec(ddl.GUID), StoredFileInfo)} + return {self._opaque_table_name: DatastoreOpaqueTable(self.makeTableSpec(), StoredFileInfo)} def _to_file_info_payload( diff --git a/python/lsst/daf/butler/direct_query_driver/_driver.py b/python/lsst/daf/butler/direct_query_driver/_driver.py index fe3547c7c0..069f43e77b 100644 --- a/python/lsst/daf/butler/direct_query_driver/_driver.py +++ b/python/lsst/daf/butler/direct_query_driver/_driver.py @@ -422,7 +422,7 @@ def explain_no_results(self, tree: qt.QueryTree, execute: bool) -> Iterable[str] def get_dataset_type(self, name: str) -> DatasetType: # Docstring inherited - return self.managers.datasets[name].datasetType + return self.managers.datasets.get_dataset_type(name) def get_default_collections(self) -> tuple[str, ...]: # Docstring inherited. @@ -1133,7 +1133,6 @@ def _join_dataset_search( fields : `~collections.abc.Set` [ `str` ] Dataset fields to include. """ - storage = self.managers.datasets[resolved_search.name] # The next two asserts will need to be dropped (and the implications # dealt with instead) if materializations start having dataset fields. assert ( @@ -1142,7 +1141,11 @@ def _join_dataset_search( assert ( resolved_search.name not in joiner.timespans ), "Dataset timespan has unexpectedly already been joined in." 
- joiner.join(storage.make_query_joiner(resolved_search.collection_records, fields)) + joiner.join( + self.managers.datasets.make_query_joiner( + self.get_dataset_type(resolved_search.name), resolved_search.collection_records, fields + ) + ) @dataclasses.dataclass diff --git a/python/lsst/daf/butler/queries/_general_query_results.py b/python/lsst/daf/butler/queries/_general_query_results.py index 2cee5be2da..3a98264bff 100644 --- a/python/lsst/daf/butler/queries/_general_query_results.py +++ b/python/lsst/daf/butler/queries/_general_query_results.py @@ -118,19 +118,19 @@ def iter_tuples(self, *dataset_types: DatasetType) -> Iterator[GeneralResultTupl Structure containing data coordinate, refs, and a copy of the row. """ all_dimensions = self._spec.dimensions - dataset_keys: list[tuple[DimensionGroup, str, str]] = [] + dataset_keys: list[tuple[DatasetType, DimensionGroup, str, str]] = [] for dataset_type in dataset_types: dimensions = dataset_type.dimensions id_key = f"{dataset_type.name}.dataset_id" run_key = f"{dataset_type.name}.run" - dataset_keys.append((dimensions, id_key, run_key)) + dataset_keys.append((dataset_type, dimensions, id_key, run_key)) for row in self: values = tuple( row[key] for key in itertools.chain(all_dimensions.required, all_dimensions.implied) ) data_coordinate = DataCoordinate.from_full_values(all_dimensions, values) refs = [] - for dimensions, id_key, run_key in dataset_keys: + for dataset_type, dimensions, id_key, run_key in dataset_keys: values = tuple(row[key] for key in itertools.chain(dimensions.required, dimensions.implied)) data_id = DataCoordinate.from_full_values(dimensions, values) refs.append(DatasetRef(dataset_type, data_id, row[run_key], id=row[id_key])) diff --git a/python/lsst/daf/butler/queries/result_specs.py b/python/lsst/daf/butler/queries/result_specs.py index a4dd7eb301..10d6835415 100644 --- a/python/lsst/daf/butler/queries/result_specs.py +++ b/python/lsst/daf/butler/queries/result_specs.py @@ -221,6 +221,11 @@ class GeneralResultSpec(ResultSpecBase): def find_first_dataset(self) -> str | None: # Docstring inherited. if self.find_first: + if len(self.dataset_fields) != 1: + raise InvalidQueryError( + "General query with find_first=True cannot have results from multiple " + "dataset searches." + ) (dataset_type,) = self.dataset_fields.keys() return dataset_type return None diff --git a/python/lsst/daf/butler/queries/tree/_query_tree.py b/python/lsst/daf/butler/queries/tree/_query_tree.py index 61e4e65a45..6cf30ef3f4 100644 --- a/python/lsst/daf/butler/queries/tree/_query_tree.py +++ b/python/lsst/daf/butler/queries/tree/_query_tree.py @@ -217,7 +217,7 @@ def join_materialization(self, key: MaterializationKey, dimensions: DimensionGro ) def join_dataset(self, dataset_type: str, search: DatasetSearch) -> QueryTree: - """Return a new tree joins in a search for a dataset. + """Return a new tree that joins in a search for a dataset. Parameters ---------- @@ -231,11 +231,6 @@ def join_dataset(self, dataset_type: str, search: DatasetSearch) -> QueryTree: ------- result : `QueryTree` A new tree that joins in the dataset search. - - Notes - ----- - If this dataset type was already joined in, the new `DatasetSearch` - replaces the old one. """ if existing := self.datasets.get(dataset_type): assert existing == search, "Dataset search should be new or the same." 
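Review note (not part of the patch): the `GeneralResultSpec.find_first_dataset` hunk above replaces what would otherwise be a bare `ValueError` from single-element unpacking with an explicit `InvalidQueryError` whenever `find_first=True` is combined with more than one joined dataset search. A minimal standalone sketch of that guard follows; the exception class here is a stand-in, not the real `lsst.daf.butler` one, and only this one method's control flow is mirrored.

```python
# Stand-in for lsst.daf.butler's InvalidQueryError; only the control flow of
# find_first_dataset from the patch is mirrored, not the rest of GeneralResultSpec.
class InvalidQueryError(RuntimeError):
    pass


def find_first_dataset(dataset_fields: dict[str, set[str]], find_first: bool) -> str | None:
    """Return the single dataset type name a find-first search applies to."""
    if not find_first:
        return None
    if len(dataset_fields) != 1:
        raise InvalidQueryError(
            "General query with find_first=True cannot have results from multiple dataset searches."
        )
    (dataset_type,) = dataset_fields.keys()
    return dataset_type


# One dataset search is accepted; two are rejected up front with a clear error.
print(find_first_dataset({"raw": {"dataset_id", "run"}}, find_first=True))
try:
    find_first_dataset({"dstype1": set(), "dstype2": set()}, find_first=True)
except InvalidQueryError as err:
    print(f"rejected: {err}")
```
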
diff --git a/python/lsst/daf/butler/registry/_caching_context.py b/python/lsst/daf/butler/registry/_caching_context.py index 9a8461a312..81362df826 100644 --- a/python/lsst/daf/butler/registry/_caching_context.py +++ b/python/lsst/daf/butler/registry/_caching_context.py @@ -27,19 +27,19 @@ from __future__ import annotations -__all__ = ["CachingContext"] +__all__ = ["CachingContext", "GenericCachingContext"] -from typing import TYPE_CHECKING +from typing import Generic, TypeAlias, TypeVar from ._collection_record_cache import CollectionRecordCache from ._collection_summary_cache import CollectionSummaryCache from ._dataset_type_cache import DatasetTypeCache -if TYPE_CHECKING: - from .interfaces import DatasetRecordStorage +_T = TypeVar("_T") +_U = TypeVar("_U") -class CachingContext: +class GenericCachingContext(Generic[_T, _U]): """Collection of caches for various types of records retrieved from database. @@ -54,10 +54,16 @@ class is passed to the relevant managers that can use it to query or Dataset type cache is always enabled for now, this avoids the need for explicitly enabling caching in pipetask executors. + + `GenericCachingContext` is generic over two kinds of opaque dataset type + data, with the expectation that most code will use the ``CachingContext`` + type alias (which resolves to `GenericCachingContext[object, object]`); + the `DatasetRecordStorageManager` can then cast this to a + `GenericCachingContext` with the actual opaque data types it uses. """ def __init__(self) -> None: - self._dataset_types: DatasetTypeCache[DatasetRecordStorage] = DatasetTypeCache() + self._dataset_types: DatasetTypeCache[_T, _U] = DatasetTypeCache() self._collection_records: CollectionRecordCache | None = None self._collection_summaries: CollectionSummaryCache | None = None self._depth = 0 @@ -103,6 +109,9 @@ def collection_summaries(self) -> CollectionSummaryCache | None: return self._collection_summaries @property - def dataset_types(self) -> DatasetTypeCache[DatasetRecordStorage]: + def dataset_types(self) -> DatasetTypeCache[_T, _U]: """Cache for dataset types, never disabled (`DatasetTypeCache`).""" return self._dataset_types + + +CachingContext: TypeAlias = GenericCachingContext[object, object] diff --git a/python/lsst/daf/butler/registry/_dataset_type_cache.py b/python/lsst/daf/butler/registry/_dataset_type_cache.py index 3f1665dfa3..ab43c80ca5 100644 --- a/python/lsst/daf/butler/registry/_dataset_type_cache.py +++ b/python/lsst/daf/butler/registry/_dataset_type_cache.py @@ -33,35 +33,51 @@ from typing import Generic, TypeVar from .._dataset_type import DatasetType +from ..dimensions import DimensionGroup _T = TypeVar("_T") +_U = TypeVar("_U") -class DatasetTypeCache(Generic[_T]): +class DatasetTypeCache(Generic[_T, _U]): """Cache for dataset types. Notes ----- - This class caches mapping of dataset type name to a corresponding - `DatasetType` instance. Registry manager also needs to cache corresponding - "storage" instance, so this class allows storing additional opaque object - along with the dataset type. + This cache is a pair of mappings with different kinds of keys: - In come contexts (e.g. ``resolve_wildcard``) a full list of dataset types + - the `DatasetType` itself is cached by name, as is some opaque data used + only by a `DatasetRecordStorageManager` implementation; + - additional opaque data (also used only by `DatasetRecordStorageManager` + implementations can be cached by the dimensions dataset types (i.e. a + `DimensionGroup`). 
+ + `DatasetTypeCache` is generic over these two opaque data types. + + In some contexts (e.g. ``resolve_wildcard``) a full list of dataset types is needed. To signify that cache content can be used in such contexts, - cache defines special ``full`` flag that needs to be set by client. + cache defines a special ``full`` flag that needs to be set by client. The + ``dimensions_full`` flag similarly reports whether all per-dimension-group + state is present in the cache. """ def __init__(self) -> None: - self._cache: dict[str, tuple[DatasetType, _T | None]] = {} + self._by_name_cache: dict[str, tuple[DatasetType, _T]] = {} + self._by_dimensions_cache: dict[DimensionGroup, _U] = {} self._full = False + self._dimensions_full = False @property def full(self) -> bool: """`True` if cache holds all known dataset types (`bool`).""" return self._full - def add(self, dataset_type: DatasetType, extra: _T | None = None) -> None: + @property + def dimensions_full(self) -> bool: + """`True` if cache holds all known dataset type dimensions (`bool`).""" + return self._dimensions_full + + def add(self, dataset_type: DatasetType, extra: _T) -> None: """Add one record to the cache. Parameters @@ -69,33 +85,46 @@ def add(self, dataset_type: DatasetType, extra: _T | None = None) -> None: dataset_type : `DatasetType` Dataset type, replaces any existing dataset type with the same name. - extra : `Any`, optional + extra : `Any` Additional opaque object stored with this dataset type. """ - self._cache[dataset_type.name] = (dataset_type, extra) - - def set(self, data: Iterable[DatasetType | tuple[DatasetType, _T | None]], *, full: bool = False) -> None: + self._by_name_cache[dataset_type.name] = (dataset_type, extra) + + def set( + self, + data: Iterable[tuple[DatasetType, _T]], + *, + full: bool = False, + dimensions_data: Iterable[tuple[DimensionGroup, _U]] | None = None, + dimensions_full: bool = False, + ) -> None: """Replace cache contents with the new set of dataset types. Parameters ---------- data : `~collections.abc.Iterable` - Sequence of `DatasetType` instances or tuples of `DatasetType` and - an extra opaque object. - full : `bool` + Sequence of tuples of `DatasetType` and an extra opaque object. + full : `bool`, optional If `True` then ``data`` contains all known dataset types. + dimensions_data : `~collections.abc.Iterable`, optional + Sequence of tuples of `DimensionGroup` and an extra opaque object. + dimensions_full : `bool`, optional + If `True` then ``data`` contains all known dataset type dimensions. """ self.clear() for item in data: - if isinstance(item, DatasetType): - item = (item, None) - self._cache[item[0].name] = item + self._by_name_cache[item[0].name] = item self._full = full + if dimensions_data is not None: + self._by_dimensions_cache.update(dimensions_data) + self._dimensions_full = dimensions_full def clear(self) -> None: """Remove everything from the cache.""" - self._cache = {} + self._by_name_cache = {} + self._by_dimensions_cache = {} self._full = False + self._dimensions_full = False def discard(self, name: str) -> None: """Remove named dataset type from the cache. @@ -105,7 +134,7 @@ def discard(self, name: str) -> None: name : `str` Name of the dataset type to remove. """ - self._cache.pop(name, None) + self._by_name_cache.pop(name, None) def get(self, name: str) -> tuple[DatasetType | None, _T | None]: """Return cached info given dataset type name. @@ -122,9 +151,9 @@ def get(self, name: str) -> tuple[DatasetType | None, _T | None]: cache. 
extra : `Any` or `None` Cached opaque data, `None` is returned if the name is not in the - cache or no extra info was stored for this dataset type. + cache. """ - item = self._cache.get(name) + item = self._by_name_cache.get(name) if item is None: return (None, None) return item @@ -143,15 +172,20 @@ def get_dataset_type(self, name: str) -> DatasetType | None: Cached dataset type, `None` is returned if the name is not in the cache. """ - item = self._cache.get(name) + item = self._by_name_cache.get(name) if item is None: return None return item[0] - def items(self) -> Iterator[tuple[DatasetType, _T | None]]: + def items(self) -> Iterator[tuple[DatasetType, _T]]: """Return iterator for the set of items in the cache, can only be used if `full` is true. + Returns + ------- + iter : `~collections.abc.Iterator` + Iterator over tuples of `DatasetType` and opaque data. + Raises ------ RuntimeError @@ -159,4 +193,51 @@ def items(self) -> Iterator[tuple[DatasetType, _T | None]]: """ if not self._full: raise RuntimeError("cannot call items() if cache is not full") - return iter(self._cache.values()) + return iter(self._by_name_cache.values()) + + def add_by_dimensions(self, dimensions: DimensionGroup, extra: _U) -> None: + """Add information about a set of dataset type dimensions to the cache. + + Parameters + ---------- + dimensions : `DimensionGroup` + Dimensions of one or more dataset types. + extra : `Any` + Additional opaque object stored with these dimensions. + """ + self._by_dimensions_cache[dimensions] = extra + + def get_by_dimensions(self, dimensions: DimensionGroup) -> _U | None: + """Get information about a set of dataset type dimensions. + + Parameters + ---------- + dimensions : `DimensionGroup` + Dimensions of one or more dataset types. + + Returns + ------- + extra : `Any` or `None` + Additional opaque object stored with these dimensions, or `None` if + these dimensions are not present in the cache. + """ + return self._by_dimensions_cache.get(dimensions) + + def by_dimensions_items(self) -> Iterator[tuple[DimensionGroup, _U]]: + """Return iterator for all dimensions-keyed data in the cache. + + This can only be called if `dimensions_full` is `True`. + + Returns + ------- + iter : `~collections.abc.Iterator` + Iterator over tuples of `DimensionGroup` and opaque data. + + Raises + ------ + RuntimeError + Raised if ``self.dimensions_full`` is `False`. + """ + if not self._dimensions_full: + raise RuntimeError("cannot call by_dimensions_items() if cache does not have full dimensions.") + return iter(self._by_dimensions_cache.items()) diff --git a/python/lsst/daf/butler/registry/bridge/monolithic.py b/python/lsst/daf/butler/registry/bridge/monolithic.py index 99f165c022..14ce7594d2 100644 --- a/python/lsst/daf/butler/registry/bridge/monolithic.py +++ b/python/lsst/daf/butler/registry/bridge/monolithic.py @@ -316,8 +316,6 @@ class MonolithicDatastoreRegistryBridgeManager(DatastoreRegistryBridgeManager): Manager object for opaque table storage in the `Registry`. universe : `DimensionUniverse` All dimensions know to the `Registry`. - datasetIdColumnType : `type` - Type for dataset ID column. registry_schema_version : `VersionTuple` or `None`, optional The version of the registry schema. 
""" @@ -329,13 +327,11 @@ def __init__( tables: _TablesTuple, opaque: OpaqueTableStorageManager, universe: DimensionUniverse, - datasetIdColumnType: type, registry_schema_version: VersionTuple | None = None, ): super().__init__( opaque=opaque, universe=universe, - datasetIdColumnType=datasetIdColumnType, registry_schema_version=registry_schema_version, ) self._db = db @@ -348,7 +344,6 @@ def clone(self, *, db: Database, opaque: OpaqueTableStorageManager) -> Datastore tables=self._tables, opaque=opaque, universe=self.universe, - datasetIdColumnType=self.datasetIdColumnType, registry_schema_version=self._registry_schema_version, ) @@ -370,7 +365,6 @@ def initialize( tables=cast(_TablesTuple, tables), opaque=opaque, universe=universe, - datasetIdColumnType=datasets.getIdColumnType(), registry_schema_version=registry_schema_version, ) diff --git a/python/lsst/daf/butler/registry/datasets/byDimensions/__init__.py b/python/lsst/daf/butler/registry/datasets/byDimensions/__init__.py index 26fd39b361..3697e36509 100644 --- a/python/lsst/daf/butler/registry/datasets/byDimensions/__init__.py +++ b/python/lsst/daf/butler/registry/datasets/byDimensions/__init__.py @@ -26,4 +26,3 @@ # along with this program. If not, see . from ._manager import * -from ._storage import * diff --git a/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py b/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py index b2b6af3c35..7d0a9c5e0b 100644 --- a/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py +++ b/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py @@ -1,37 +1,39 @@ from __future__ import annotations -from .... import ddl - __all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",) import dataclasses +import datetime import logging -from collections.abc import Iterable, Mapping -from typing import TYPE_CHECKING, Any +from collections.abc import Iterable, Mapping, Sequence, Set +from typing import TYPE_CHECKING, Any, ClassVar, cast +import astropy.time import sqlalchemy +from lsst.daf.relation import Relation, sql -from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef +from .... 
import ddl +from ...._collection_type import CollectionType +from ...._column_tags import DatasetColumnTag, DimensionKeyColumnTag +from ...._column_type_info import LogicalColumn +from ...._dataset_ref import DatasetId, DatasetIdFactory, DatasetIdGenEnum, DatasetRef from ...._dataset_type import DatasetType, get_dataset_type_name +from ...._exceptions import CollectionTypeError, MissingDatasetTypeError from ...._exceptions_legacy import DatasetTypeError -from ....dimensions import DimensionUniverse +from ...._timespan import Timespan +from ....dimensions import DataCoordinate, DimensionGroup, DimensionUniverse +from ....direct_query_driver import QueryBuilder, QueryJoiner # new query system, server+direct only +from ....queries import tree as qt # new query system, both clients + server +from ..._caching_context import CachingContext, GenericCachingContext from ..._collection_summary import CollectionSummary from ..._exceptions import ConflictingDefinitionError, DatasetTypeExpressionError, OrphanedRecordError -from ...interfaces import DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple +from ...interfaces import DatasetRecordStorageManager, RunRecord, VersionTuple +from ...queries import SqlQueryContext # old registry query system from ...wildcards import DatasetTypeWildcard -from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID from .summaries import CollectionSummaryManager -from .tables import ( - addDatasetForeignKey, - makeCalibTableName, - makeCalibTableSpec, - makeStaticTableSpecs, - makeTagTableName, - makeTagTableSpec, -) +from .tables import DynamicTables, addDatasetForeignKey, makeStaticTableSpecs, makeTagTableSpec if TYPE_CHECKING: - from ..._caching_context import CachingContext from ...interfaces import ( CollectionManager, CollectionRecord, @@ -53,39 +55,52 @@ _LOG = logging.getLogger(__name__) -class MissingDatabaseTableError(RuntimeError): - """Exception raised when a table is not found in a database.""" - - @dataclasses.dataclass class _DatasetTypeRecord: """Contents of a single dataset type record.""" dataset_type: DatasetType dataset_type_id: int + dimensions_key: int tag_table_name: str calib_table_name: str | None + def make_dynamic_tables(self) -> DynamicTables: + return DynamicTables( + self.dataset_type.dimensions, self.dimensions_key, self.tag_table_name, self.calib_table_name + ) -class _SpecTableFactory: - """Factory for `sqlalchemy.schema.Table` instances that builds table - instances using provided `ddl.TableSpec` definition and verifies that - table exists in the database. - """ + def update_dynamic_tables(self, current: DynamicTables) -> DynamicTables: + assert self.dimensions_key == current.dimensions_key + assert self.tag_table_name == current.tags_name + if self.calib_table_name is not None: + if current.calibs_name is not None: + assert self.calib_table_name == current.calibs_name + else: + # Some previously-cached dataset type had the same dimensions + # but was not a calibration. + current.calibs_name = self.calib_table_name + # If some previously-cached dataset type was a calibration but this + # one isn't, we don't want to forget the calibs table. 
+ return current - def __init__(self, db: Database, name: str, spec: ddl.TableSpec): - self._db = db - self._name = name - self._spec = spec - def __call__(self) -> sqlalchemy.schema.Table: - table = self._db.getExistingTable(self._name, self._spec) - if table is None: - raise MissingDatabaseTableError(f"Table {self._name} is missing from database schema.") - return table +@dataclasses.dataclass +class _DatasetRecordStorage: + """Information cached about a dataset type. + + This combines information cached with different keys - the dataset type + and its ID are cached by name, while the tables are cached by the dataset + types dimensions (and hence shared with other dataset types that have the + same dimensions). + """ + + dataset_type: DatasetType + dataset_type_id: int + dynamic_tables: DynamicTables -class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager): +class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager): """A manager class for datasets that uses one dataset-collection table for each group of dataset types that share the same dimensions. @@ -104,10 +119,6 @@ class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager): Alternative implementations that make different choices for these while keeping the same general table organization might be reasonable as well. - This class provides complete implementation of manager logic but it is - parametrized by few class attributes that have to be defined by - sub-classes. - Parameters ---------- db : `Database` @@ -144,7 +155,16 @@ def __init__( self._dimensions = dimensions self._static = static self._summaries = summaries - self._caching_context = caching_context + self._caching_context = cast(GenericCachingContext[int, DynamicTables], caching_context) + self._use_astropy_ingest_date = self.ingest_date_dtype() is ddl.AstropyTimeNsecTai + self._run_key_column = collections.getRunForeignKeyName() + + _versions: ClassVar[list[VersionTuple]] = [_VERSION_UUID, _VERSION_UUID_NS] + + _id_maker: ClassVar[DatasetIdFactory] = DatasetIdFactory() + """Factory for dataset IDs. In the future this factory may be shared with + other classes (e.g. Registry). + """ @classmethod def initialize( @@ -217,16 +237,9 @@ def makeStaticTableSpecs( return makeStaticTableSpecs( collections, universe=universe, - dtype=cls.getIdColumnType(), - autoincrement=cls._autoincrement, schema_version=schema_version, ) - @classmethod - def getIdColumnType(cls) -> type: - # Docstring inherited from base class. - return cls._idColumnType - @classmethod def addDatasetForeignKey( cls, @@ -238,8 +251,29 @@ def addDatasetForeignKey( **kwargs: Any, ) -> ddl.FieldSpec: # Docstring inherited from DatasetRecordStorageManager. - return addDatasetForeignKey( - tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs + return addDatasetForeignKey(tableSpec, name=name, onDelete=onDelete, constraint=constraint, **kwargs) + + @classmethod + def _newDefaultSchemaVersion(cls) -> VersionTuple: + # Docstring inherited from VersionedExtension. 
+ return _VERSION_UUID_NS + + def clone( + self, + *, + db: Database, + collections: CollectionManager, + dimensions: DimensionRecordStorageManager, + caching_context: CachingContext, + ) -> ByDimensionsDatasetRecordStorageManagerUUID: + return ByDimensionsDatasetRecordStorageManagerUUID( + db=db, + collections=collections, + dimensions=dimensions, + static=self._static, + summaries=self._summaries.clone(db=db, collections=collections, caching_context=caching_context), + caching_context=caching_context, + registry_schema_version=self._registry_schema_version, ) def refresh(self) -> None: @@ -247,33 +281,7 @@ def refresh(self) -> None: if self._caching_context.dataset_types is not None: self._caching_context.dataset_types.clear() - def _make_storage(self, record: _DatasetTypeRecord) -> ByDimensionsDatasetRecordStorage: - """Create storage instance for a dataset type record.""" - tags_spec = makeTagTableSpec(record.dataset_type, type(self._collections), self.getIdColumnType()) - tags_table_factory = _SpecTableFactory(self._db, record.tag_table_name, tags_spec) - calibs_table_factory = None - if record.calib_table_name is not None: - calibs_spec = makeCalibTableSpec( - record.dataset_type, - type(self._collections), - self._db.getTimespanRepresentation(), - self.getIdColumnType(), - ) - calibs_table_factory = _SpecTableFactory(self._db, record.calib_table_name, calibs_spec) - storage = self._recordStorageType( - db=self._db, - datasetType=record.dataset_type, - static=self._static, - summaries=self._summaries, - tags_table_factory=tags_table_factory, - calibs_table_factory=calibs_table_factory, - dataset_type_id=record.dataset_type_id, - collections=self._collections, - use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai, - ) - return storage - - def remove(self, name: str) -> None: + def remove_dataset_type(self, name: str) -> None: # Docstring inherited from DatasetRecordStorageManager. compositeName, componentName = DatasetType.splitDatasetTypeName(name) if componentName is not None: @@ -292,90 +300,63 @@ def remove(self, name: str) -> None: # not need to be fast. self.refresh() - def find(self, name: str) -> DatasetRecordStorage | None: + def get_dataset_type(self, name: str) -> DatasetType: # Docstring inherited from DatasetRecordStorageManager. - if self._caching_context.dataset_types is not None: - _, storage = self._caching_context.dataset_types.get(name) - if storage is not None: - return storage - else: - # On the first cache miss populate the cache with complete list - # of dataset types (if it was not done yet). - if not self._caching_context.dataset_types.full: - self._fetch_dataset_types() - # Try again - _, storage = self._caching_context.dataset_types.get(name) - if storage is not None: - return storage - record = self._fetch_dataset_type_record(name) - if record is not None: - storage = self._make_storage(record) - if self._caching_context.dataset_types is not None: - self._caching_context.dataset_types.add(storage.datasetType, storage) - return storage - else: - return None + return self._find_storage(name).dataset_type - def register(self, datasetType: DatasetType) -> bool: + def register_dataset_type(self, dataset_type: DatasetType) -> bool: # Docstring inherited from DatasetRecordStorageManager. - if datasetType.isComponent(): + # + # This is one of three places where we populate the dataset type cache. + # See the comment in _fetch_dataset_types for how these are related and + # invariants they must maintain. 
+ # + if dataset_type.isComponent(): raise ValueError( - f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}" + f"Component dataset types can not be stored in registry. Rejecting {dataset_type.name}" ) - record = self._fetch_dataset_type_record(datasetType.name) + record = self._fetch_dataset_type_record(dataset_type.name) if record is None: - dimensionsKey = self._dimensions.save_dimension_group(datasetType.dimensions) - tagTableName = makeTagTableName(datasetType, dimensionsKey) - self._db.ensureTableExists( - tagTableName, - makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), - ) - calibTableName = ( - makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None - ) - if calibTableName is not None: - self._db.ensureTableExists( - calibTableName, - makeCalibTableSpec( - datasetType, - type(self._collections), - self._db.getTimespanRepresentation(), - self.getIdColumnType(), - ), + if ( + dynamic_tables := self._caching_context.dataset_types.get_by_dimensions( + dataset_type.dimensions ) + ) is None: + dimensions_key = self._dimensions.save_dimension_group(dataset_type.dimensions) + dynamic_tables = DynamicTables.from_dimensions_key( + dataset_type.dimensions, dimensions_key, dataset_type.isCalibration() + ) + dynamic_tables.create(self._db, type(self._collections)) + elif dataset_type.isCalibration() and dynamic_tables.calibs_name is None: + dynamic_tables.add_calibs(self._db, type(self._collections)) row, inserted = self._db.sync( self._static.dataset_type, - keys={"name": datasetType.name}, + keys={"name": dataset_type.name}, compared={ - "dimensions_key": dimensionsKey, + "dimensions_key": dynamic_tables.dimensions_key, # Force the storage class to be loaded to ensure it # exists and there is no typo in the name. - "storage_class": datasetType.storageClass.name, + "storage_class": dataset_type.storageClass.name, }, extra={ - "tag_association_table": tagTableName, - "calibration_association_table": calibTableName, + "tag_association_table": dynamic_tables.tags_name, + "calibration_association_table": ( + dynamic_tables.calibs_name if dataset_type.isCalibration() else None + ), }, returning=["id", "tag_association_table"], ) # Make sure that cache is updated if self._caching_context.dataset_types is not None and row is not None: - record = _DatasetTypeRecord( - dataset_type=datasetType, - dataset_type_id=row["id"], - tag_table_name=tagTableName, - calib_table_name=calibTableName, - ) - storage = self._make_storage(record) - self._caching_context.dataset_types.add(datasetType, storage) + self._caching_context.dataset_types.add(dataset_type, row["id"]) + self._caching_context.dataset_types.add_by_dimensions(dataset_type.dimensions, dynamic_tables) else: - if datasetType != record.dataset_type: + if dataset_type != record.dataset_type: raise ConflictingDefinitionError( - f"Given dataset type {datasetType} is inconsistent " + f"Given dataset type {dataset_type} is inconsistent " f"with database definition {record.dataset_type}." ) inserted = False - return bool(inserted) def resolve_wildcard( @@ -393,8 +374,12 @@ def resolve_wildcard( "Component dataset types are not supported in Registry methods; use DatasetRef or " "DatasetType methods to obtain components from parents instead." 
) - if (found_storage := self.find(parent_name)) is not None: - resolved_dataset_type = found_storage.datasetType + try: + resolved_dataset_type = self.get_dataset_type(parent_name) + except MissingDatasetTypeError: + if missing is not None: + missing.append(name) + else: if dataset_type is not None: if dataset_type.is_compatible_with(resolved_dataset_type): # Prefer the given dataset type to enable storage class @@ -406,8 +391,6 @@ def resolve_wildcard( f"not compatible with the registered type {resolved_dataset_type}." ) result.append(resolved_dataset_type) - elif missing is not None: - missing.append(name) if wildcard.patterns is ...: if explicit_only: raise TypeError( @@ -429,6 +412,11 @@ def resolve_wildcard( def getDatasetRef(self, id: DatasetId) -> DatasetRef | None: # Docstring inherited from DatasetRecordStorageManager. + # + # This is one of three places where we populate the dataset type cache. + # See the comment in _fetch_dataset_types for how these are related and + # invariants they must maintain. + # sql = ( sqlalchemy.sql.select( self._static.dataset.columns.dataset_type_id, @@ -443,20 +431,54 @@ def getDatasetRef(self, id: DatasetId) -> DatasetRef | None: row = sql_result.mappings().fetchone() if row is None: return None + run = row[self._run_key_column] record = self._record_from_row(row) - storage: DatasetRecordStorage | None = None + dynamic_tables: DynamicTables | None = None if self._caching_context.dataset_types is not None: - _, storage = self._caching_context.dataset_types.get(record.dataset_type.name) - if storage is None: - storage = self._make_storage(record) + _, dataset_type_id = self._caching_context.dataset_types.get(record.dataset_type.name) + if dataset_type_id is None: + if self._caching_context.dataset_types is not None: + self._caching_context.dataset_types.add(record.dataset_type, record.dataset_type_id) + else: + assert record.dataset_type_id == dataset_type_id, "Two IDs for the same dataset type name!" + dynamic_tables = self._caching_context.dataset_types.get_by_dimensions( + record.dataset_type.dimensions + ) + if dynamic_tables is None: + dynamic_tables = record.make_dynamic_tables() if self._caching_context.dataset_types is not None: - self._caching_context.dataset_types.add(storage.datasetType, storage) - assert isinstance(storage, ByDimensionsDatasetRecordStorage), "Not expected storage class" + self._caching_context.dataset_types.add_by_dimensions( + record.dataset_type.dimensions, dynamic_tables + ) + if record.dataset_type.dimensions: + # This query could return multiple rows (one for each tagged + # collection the dataset is in, plus one for its run collection), + # and we don't care which of those we get. + tags_table = dynamic_tables.tags(self._db, type(self._collections)) + data_id_sql = ( + tags_table.select() + .where( + sqlalchemy.sql.and_( + tags_table.columns.dataset_id == id, + tags_table.columns.dataset_type_id == record.dataset_type_id, + ) + ) + .limit(1) + ) + with self._db.query(data_id_sql) as sql_result: + data_id_row = sql_result.mappings().fetchone() + assert data_id_row is not None, "Data ID should be present if dataset is." 
"" + data_id = DataCoordinate.from_required_values( + record.dataset_type.dimensions, + tuple(data_id_row[dimension] for dimension in record.dataset_type.dimensions.required), + ) + else: + data_id = DataCoordinate.make_empty(self._dimensions.universe) return DatasetRef( - storage.datasetType, - dataId=storage.getDataId(id=id), + record.dataset_type, + dataId=data_id, id=id, - run=self._collections[row[self._collections.getRunForeignKeyName()]].name, + run=self._collections[run].name, ) def _fetch_dataset_type_record(self, name: str) -> _DatasetTypeRecord | None: @@ -486,6 +508,7 @@ def _record_from_row(self, row: Mapping) -> _DatasetTypeRecord: return _DatasetTypeRecord( dataset_type=datasetType, dataset_type_id=row["id"], + dimensions_key=row["dimensions_key"], tag_table_name=row["tag_association_table"], calib_table_name=calibTableName, ) @@ -495,6 +518,28 @@ def _dataset_type_from_row(self, row: Mapping) -> DatasetType: def _fetch_dataset_types(self) -> list[DatasetType]: """Fetch list of all defined dataset types.""" + # This is one of three places we populate the dataset type cache: + # + # - This method handles almost all requests for dataset types that + # should already exist. It always marks the cache as "full" in both + # dataset type names and dimensions. + # + # - register_dataset_type handles the case where the dataset type might + # not existing yet. Since it can only add a single dataset type, it + # never changes whether the cache is full. + # + # - getDatasetRef is a special case for a dataset type that should + # already exist, but is looked up via a dataset ID rather than its + # name. It also never changes whether the cache is full, and it's + # handles separately essentially as an optimization: we can fetch a + # single dataset type definition record in a join when we query for + # the dataset type based on the dataset ID, and this is better than + # blindly fetching all dataset types in a separate query. + # + # In all three cases, we require that the per-dimensions data be cached + # whenever a dataset type is added to the cache by name, to reduce the + # number of possible states the cache can be in and minimize the number + # of queries. if self._caching_context.dataset_types is not None: if self._caching_context.dataset_types.full: return [dataset_type for dataset_type, _ in self._caching_context.dataset_types.items()] @@ -503,10 +548,57 @@ def _fetch_dataset_types(self) -> list[DatasetType]: records = [self._record_from_row(row) for row in sql_rows] # Cache everything and specify that cache is complete. 
if self._caching_context.dataset_types is not None: - cache_data = [(record.dataset_type, self._make_storage(record)) for record in records] - self._caching_context.dataset_types.set(cache_data, full=True) + cache_data: list[tuple[DatasetType, int]] = [] + cache_dimensions_data: dict[DimensionGroup, DynamicTables] = {} + for record in records: + cache_data.append((record.dataset_type, record.dataset_type_id)) + if (dynamic_tables := cache_dimensions_data.get(record.dataset_type.dimensions)) is None: + cache_dimensions_data[record.dataset_type.dimensions] = record.make_dynamic_tables() + else: + record.update_dynamic_tables(dynamic_tables) + self._caching_context.dataset_types.set( + cache_data, full=True, dimensions_data=cache_dimensions_data.items(), dimensions_full=True + ) return [record.dataset_type for record in records] + def _find_storage(self, name: str) -> _DatasetRecordStorage: + """Find a dataset type and the extra information needed to work with + it, utilizing and populating the cache as needed. + """ + if self._caching_context.dataset_types is not None: + dataset_type, dataset_type_id = self._caching_context.dataset_types.get(name) + if dataset_type is not None: + tables = self._caching_context.dataset_types.get_by_dimensions(dataset_type.dimensions) + assert ( + dataset_type_id is not None and tables is not None + ), "Dataset type cache population is incomplete." + return _DatasetRecordStorage( + dataset_type=dataset_type, dataset_type_id=dataset_type_id, dynamic_tables=tables + ) + else: + # On the first cache miss populate the cache with complete list + # of dataset types (if it was not done yet). + if not self._caching_context.dataset_types.full: + self._fetch_dataset_types() + # Try again + dataset_type, dataset_type_id = self._caching_context.dataset_types.get(name) + if dataset_type is not None: + tables = self._caching_context.dataset_types.get_by_dimensions(dataset_type.dimensions) + assert ( + dataset_type_id is not None and tables is not None + ), "Dataset type cache population is incomplete." + return _DatasetRecordStorage( + dataset_type=dataset_type, dataset_type_id=dataset_type_id, dynamic_tables=tables + ) + record = self._fetch_dataset_type_record(name) + if record is not None: + if self._caching_context.dataset_types is not None: + self._caching_context.dataset_types.add(record.dataset_type, record.dataset_type_id) + return _DatasetRecordStorage( + record.dataset_type, record.dataset_type_id, record.make_dynamic_tables() + ) + raise MissingDatasetTypeError(f"Dataset type {name!r} does not exist.") + def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary: # Docstring inherited from DatasetRecordStorageManager. 
summaries = self._summaries.fetch_summaries([collection], None, self._dataset_type_from_row) @@ -523,64 +615,1047 @@ def fetch_summaries( dataset_type_names = set(get_dataset_type_name(dt) for dt in dataset_types) return self._summaries.fetch_summaries(collections, dataset_type_names, self._dataset_type_from_row) - _versions: list[VersionTuple] - """Schema version for this class.""" + def ingest_date_dtype(self) -> type: + """Return type of the ``ingest_date`` column.""" + schema_version = self.newSchemaVersion() + if schema_version is not None and schema_version.major > 1: + return ddl.AstropyTimeNsecTai + else: + return sqlalchemy.TIMESTAMP - _recordStorageType: type[ByDimensionsDatasetRecordStorage] - """Type of the storage class returned by this manager.""" + def insert( + self, + dataset_type_name: str, + run: RunRecord, + data_ids: Iterable[DataCoordinate], + id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, + ) -> list[DatasetRef]: + # Docstring inherited from DatasetRecordStorageManager. + if (storage := self._find_storage(dataset_type_name)) is None: + raise MissingDatasetTypeError(f"Dataset type {dataset_type_name!r} has not been registered.") + # Current timestamp, type depends on schema version. Use microsecond + # precision for astropy time to keep things consistent with + # TIMESTAMP(6) SQL type. + timestamp: datetime.datetime | astropy.time.Time + if self._use_astropy_ingest_date: + # Astropy `now()` precision should be the same as `now()` which + # should mean microsecond. + timestamp = astropy.time.Time.now() + else: + timestamp = datetime.datetime.now(datetime.UTC) + + # Iterate over data IDs, transforming a possibly-single-pass iterable + # into a list. + data_id_list: list[DataCoordinate] = [] + rows = [] + summary = CollectionSummary() + for dataId in summary.add_data_ids_generator(storage.dataset_type, data_ids): + data_id_list.append(dataId) + rows.append( + { + "id": self._id_maker.makeDatasetId( + run.name, storage.dataset_type, dataId, id_generation_mode + ), + "dataset_type_id": storage.dataset_type_id, + self._run_key_column: run.key, + "ingest_date": timestamp, + } + ) + if not rows: + # Just in case an empty collection is provided we want to avoid + # adding dataset type to summary tables. + return [] + + with self._db.transaction(): + # Insert into the static dataset table. + self._db.insert(self._static.dataset, *rows) + # Update the summary tables for this collection in case this is the + # first time this dataset type or these governor values will be + # inserted there. + self._summaries.update(run, [storage.dataset_type_id], summary) + # Combine the generated dataset_id values and data ID fields to + # form rows to be inserted into the tags table. + protoTagsRow = { + "dataset_type_id": storage.dataset_type_id, + self._collections.getCollectionForeignKeyName(): run.key, + } + tagsRows = [ + dict(protoTagsRow, dataset_id=row["id"], **dataId.required) + for dataId, row in zip(data_id_list, rows, strict=True) + ] + # Insert those rows into the tags table. 
+ self._db.insert(storage.dynamic_tables.tags(self._db, type(self._collections)), *tagsRows) + + return [ + DatasetRef( + datasetType=storage.dataset_type, + dataId=dataId, + id=row["id"], + run=run.name, + ) + for dataId, row in zip(data_id_list, rows, strict=True) + ] - _autoincrement: bool - """If True then PK column of the dataset table is auto-increment.""" + def import_( + self, + dataset_type: DatasetType, + run: RunRecord, + data_ids: Mapping[DatasetId, DataCoordinate], + ) -> list[DatasetRef]: + # Docstring inherited from DatasetRecordStorageManager. + if not data_ids: + # Just in case an empty mapping is provided we want to avoid + # adding dataset type to summary tables. + return [] + if (storage := self._find_storage(dataset_type.name)) is None: + raise MissingDatasetTypeError(f"Dataset type {dataset_type.name!r} has not been registered.") + # Current timestamp, type depends on schema version. + if self._use_astropy_ingest_date: + # Astropy `now()` precision should be the same as `now()` which + # should mean microsecond. + timestamp = sqlalchemy.sql.literal(astropy.time.Time.now(), type_=ddl.AstropyTimeNsecTai) + else: + timestamp = sqlalchemy.sql.literal(datetime.datetime.now(datetime.UTC)) + # We'll insert all new rows into a temporary table + table_spec = makeTagTableSpec( + storage.dataset_type.dimensions, type(self._collections), constraints=False + ) + collection_fkey_name = self._collections.getCollectionForeignKeyName() + proto_ags_row = { + "dataset_type_id": storage.dataset_type_id, + collection_fkey_name: run.key, + } + tmpRows = [ + dict(proto_ags_row, dataset_id=dataset_id, **data_id.required) + for dataset_id, data_id in data_ids.items() + ] + with self._db.transaction(for_temp_tables=True), self._db.temporary_table(table_spec) as tmp_tags: + # store all incoming data in a temporary table + self._db.insert(tmp_tags, *tmpRows) + # There are some checks that we want to make for consistency + # of the new datasets with existing ones. + self._validate_import(storage, tmp_tags, run) + # Before we merge temporary table into dataset/tags we need to + # drop datasets which are already there (and do not conflict). + self._db.deleteWhere( + tmp_tags, + tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)), + ) + # Copy it into dataset table, need to re-label some columns. + self._db.insert( + self._static.dataset, + select=sqlalchemy.sql.select( + tmp_tags.columns.dataset_id.label("id"), + tmp_tags.columns.dataset_type_id, + tmp_tags.columns[collection_fkey_name].label(self._run_key_column), + timestamp.label("ingest_date"), + ), + ) + refs = [ + DatasetRef( + datasetType=dataset_type, + id=dataset_id, + dataId=dataId, + run=run.name, + ) + for dataset_id, dataId in data_ids.items() + ] + # Update the summary tables for this collection in case this + # is the first time this dataset type or these governor values + # will be inserted there. + summary = CollectionSummary() + summary.add_datasets(refs) + self._summaries.update(run, [storage.dataset_type_id], summary) + # Copy from temp table into tags table. + self._db.insert( + storage.dynamic_tables.tags(self._db, type(self._collections)), select=tmp_tags.select() + ) + return refs - _idColumnType: type - """Type of dataset column used to store dataset ID.""" + def _validate_import( + self, storage: _DatasetRecordStorage, tmp_tags: sqlalchemy.schema.Table, run: RunRecord + ) -> None: + """Validate imported refs against existing datasets. 
+ Parameters + ---------- + storage : `_DatasetREcordStorage` + Struct that holds the tables and ID for a dataset type. + tmp_tags : `sqlalchemy.schema.Table` + Temporary table with new datasets and the same schema as tags + table. + run : `RunRecord` + The record object describing the `~CollectionType.RUN` collection. + + Raises + ------ + ConflictingDefinitionError + Raise if new datasets conflict with existing ones. + """ + dataset = self._static.dataset + tags = storage.dynamic_tables.tags(self._db, type(self._collections)) + collection_fkey_name = self._collections.getCollectionForeignKeyName() -class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase): - """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses - UUID for dataset primary key. - """ + # Check that existing datasets have the same dataset type and + # run. + query = ( + sqlalchemy.sql.select( + dataset.columns.id.label("dataset_id"), + dataset.columns.dataset_type_id.label("dataset_type_id"), + tmp_tags.columns.dataset_type_id.label("new_dataset_type_id"), + dataset.columns[self._run_key_column].label("run"), + tmp_tags.columns[collection_fkey_name].label("new_run"), + ) + .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id)) + .where( + sqlalchemy.sql.or_( + dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, + dataset.columns[self._run_key_column] != tmp_tags.columns[collection_fkey_name], + ) + ) + .limit(1) + ) + with self._db.query(query) as result: + # Only include the first one in the exception message + if (row := result.first()) is not None: + existing_run = self._collections[row.run].name + new_run = self._collections[row.new_run].name + if row.dataset_type_id == storage.dataset_type_id: + if row.new_dataset_type_id == storage.dataset_type_id: + raise ConflictingDefinitionError( + f"Current run {existing_run!r} and new run {new_run!r} do not agree for " + f"dataset {row.dataset_id}." + ) + else: + raise ConflictingDefinitionError( + f"Dataset {row.dataset_id} was provided with type {storage.dataset_type.name!r} " + f"in run {new_run!r}, but was already defined with type ID {row.dataset_type_id} " + f"in run {run!r}." + ) + else: + raise ConflictingDefinitionError( + f"Dataset {row.dataset_id} was provided with type ID {row.new_dataset_type_id} " + f"in run {new_run!r}, but was already defined with type " + f"{storage.dataset_type.name!r} in run {run!r}." + ) + + # Check that matching dataset in tags table has the same DataId. 
+ query = ( + sqlalchemy.sql.select( + tags.columns.dataset_id, + tags.columns.dataset_type_id.label("type_id"), + tmp_tags.columns.dataset_type_id.label("new_type_id"), + *[tags.columns[dim] for dim in storage.dataset_type.dimensions.required], + *[ + tmp_tags.columns[dim].label(f"new_{dim}") + for dim in storage.dataset_type.dimensions.required + ], + ) + .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id)) + .where( + sqlalchemy.sql.or_( + tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, + *[ + tags.columns[dim] != tmp_tags.columns[dim] + for dim in storage.dataset_type.dimensions.required + ], + ) + ) + .limit(1) + ) - _versions: list[VersionTuple] = [_VERSION_UUID, _VERSION_UUID_NS] - _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID - _autoincrement: bool = False - _idColumnType: type = ddl.GUID + with self._db.query(query) as result: + if (row := result.first()) is not None: + # Only include the first one in the exception message + raise ConflictingDefinitionError( + f"Existing dataset type or dataId do not match new dataset: {row._asdict()}" + ) - def clone( + # Check that matching run+dataId have the same dataset ID. + query = ( + sqlalchemy.sql.select( + *[tags.columns[dim] for dim in storage.dataset_type.dimensions.required], + tags.columns.dataset_id, + tmp_tags.columns.dataset_id.label("new_dataset_id"), + tags.columns[collection_fkey_name], + tmp_tags.columns[collection_fkey_name].label(f"new_{collection_fkey_name}"), + ) + .select_from( + tags.join( + tmp_tags, + sqlalchemy.sql.and_( + tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id, + tags.columns[collection_fkey_name] == tmp_tags.columns[collection_fkey_name], + *[ + tags.columns[dim] == tmp_tags.columns[dim] + for dim in storage.dataset_type.dimensions.required + ], + ), + ) + ) + .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id) + .limit(1) + ) + with self._db.query(query) as result: + # only include the first one in the exception message + if (row := result.first()) is not None: + data_id = {dim: getattr(row, dim) for dim in storage.dataset_type.dimensions.required} + existing_collection = self._collections[getattr(row, collection_fkey_name)].name + new_collection = self._collections[getattr(row, f"new_{collection_fkey_name}")].name + raise ConflictingDefinitionError( + f"Dataset with type {storage.dataset_type.name!r} and data ID {data_id} " + f"has ID {row.dataset_id} in existing collection {existing_collection!r} " + f"but ID {row.new_dataset_id} in new collection {new_collection!r}." + ) + + def delete(self, datasets: Iterable[DatasetId | DatasetRef]) -> None: + # Docstring inherited from DatasetRecordStorageManager. + # Only delete from common dataset table; ON DELETE foreign key clauses + # will handle the rest. + self._db.delete( + self._static.dataset, + ["id"], + *[{"id": getattr(dataset, "id", dataset)} for dataset in datasets], + ) + + def associate( + self, dataset_type: DatasetType, collection: CollectionRecord, datasets: Iterable[DatasetRef] + ) -> None: + # Docstring inherited from DatasetRecordStorageManager. + if (storage := self._find_storage(dataset_type.name)) is None: + raise MissingDatasetTypeError(f"Dataset type {dataset_type.name!r} has not been registered.") + if collection.type is not CollectionType.TAGGED: + raise CollectionTypeError( + f"Cannot associate into collection '{collection.name}' " + f"of type {collection.type.name}; must be TAGGED." 
+ ) + proto_row = { + self._collections.getCollectionForeignKeyName(): collection.key, + "dataset_type_id": storage.dataset_type_id, + } + rows = [] + summary = CollectionSummary() + for dataset in summary.add_datasets_generator(datasets): + rows.append(dict(proto_row, dataset_id=dataset.id, **dataset.dataId.required)) + if rows: + # Update the summary tables for this collection in case this is the + # first time this dataset type or these governor values will be + # inserted there. + self._summaries.update(collection, [storage.dataset_type_id], summary) + # Update the tag table itself. + self._db.replace(storage.dynamic_tables.tags(self._db, type(self._collections)), *rows) + + def disassociate( + self, dataset_type: DatasetType, collection: CollectionRecord, datasets: Iterable[DatasetRef] + ) -> None: + # Docstring inherited from DatasetRecordStorageManager. + if (storage := self._find_storage(dataset_type.name)) is None: + raise MissingDatasetTypeError(f"Dataset type {dataset_type.name!r} has not been registered.") + if collection.type is not CollectionType.TAGGED: + raise CollectionTypeError( + f"Cannot disassociate from collection '{collection.name}' " + f"of type {collection.type.name}; must be TAGGED." + ) + rows = [ + { + "dataset_id": dataset.id, + self._collections.getCollectionForeignKeyName(): collection.key, + } + for dataset in datasets + ] + self._db.delete( + storage.dynamic_tables.tags(self._db, type(self._collections)), + ["dataset_id", self._collections.getCollectionForeignKeyName()], + *rows, + ) + + def certify( + self, + dataset_type: DatasetType, + collection: CollectionRecord, + datasets: Iterable[DatasetRef], + timespan: Timespan, + context: SqlQueryContext, + ) -> None: + # Docstring inherited from DatasetRecordStorageManager. + if (storage := self._find_storage(dataset_type.name)) is None: + raise MissingDatasetTypeError(f"Dataset type {dataset_type.name!r} has not been registered.") + if not dataset_type.isCalibration(): + raise DatasetTypeError( + f"Cannot certify datasets of type {dataset_type.name!r}, for which " + "DatasetType.isCalibration() is False." + ) + if collection.type is not CollectionType.CALIBRATION: + raise CollectionTypeError( + f"Cannot certify into collection '{collection.name}' " + f"of type {collection.type.name}; must be CALIBRATION." + ) + TimespanReprClass = self._db.getTimespanRepresentation() + proto_row = { + self._collections.getCollectionForeignKeyName(): collection.key, + "dataset_type_id": storage.dataset_type_id, + } + rows = [] + data_ids: set[DataCoordinate] | None = ( + set() if not TimespanReprClass.hasExclusionConstraint() else None + ) + summary = CollectionSummary() + for dataset in summary.add_datasets_generator(datasets): + row = dict(proto_row, dataset_id=dataset.id, **dataset.dataId.required) + TimespanReprClass.update(timespan, result=row) + rows.append(row) + if data_ids is not None: + data_ids.add(dataset.dataId) + if not rows: + # Just in case an empty dataset collection is provided we want to + # avoid adding dataset type to summary tables. + return + # Update the summary tables for this collection in case this is the + # first time this dataset type or these governor values will be + # inserted there. + self._summaries.update(collection, [storage.dataset_type_id], summary) + # Update the association table itself. 
+ calibs_table = storage.dynamic_tables.calibs(self._db, type(self._collections)) + if TimespanReprClass.hasExclusionConstraint(): + # Rely on database constraint to enforce invariants; we just + # reraise the exception for consistency across DB engines. + try: + self._db.insert(calibs_table, *rows) + except sqlalchemy.exc.IntegrityError as err: + raise ConflictingDefinitionError( + f"Validity range conflict certifying datasets of type {dataset_type.name!r} " + f"into {collection.name!r} for range {timespan}." + ) from err + else: + # Have to implement exclusion constraint ourselves. + # Start by building a SELECT query for any rows that would overlap + # this one. + relation = self._build_calib_overlap_query(dataset_type, collection, data_ids, timespan, context) + # Acquire a table lock to ensure there are no concurrent writes + # could invalidate our checking before we finish the inserts. We + # use a SAVEPOINT in case there is an outer transaction that a + # failure here should not roll back. + with self._db.transaction(lock=[calibs_table], savepoint=True): + # Enter SqlQueryContext in case we need to use a temporary + # table to include the give data IDs in the query. Note that + # by doing this inside the transaction, we make sure it doesn't + # attempt to close the session when its done, since it just + # sees an already-open session that it knows it shouldn't + # manage. + with context: + # Run the check SELECT query. + conflicting = context.count(context.process(relation)) + if conflicting > 0: + raise ConflictingDefinitionError( + f"{conflicting} validity range conflicts certifying datasets of type " + f"{dataset_type.name} into {collection.name} for range " + f"[{timespan.begin}, {timespan.end})." + ) + # Proceed with the insert. + self._db.insert(calibs_table, *rows) + + def decertify( self, + dataset_type: DatasetType, + collection: CollectionRecord, + timespan: Timespan, *, - db: Database, - collections: CollectionManager, - dimensions: DimensionRecordStorageManager, - caching_context: CachingContext, - ) -> ByDimensionsDatasetRecordStorageManagerUUID: - return ByDimensionsDatasetRecordStorageManagerUUID( - db=db, - collections=collections, - dimensions=dimensions, - static=self._static, - summaries=self._summaries.clone(db=db, collections=collections, caching_context=caching_context), - caching_context=caching_context, - registry_schema_version=self._registry_schema_version, + data_ids: Iterable[DataCoordinate] | None = None, + context: SqlQueryContext, + ) -> None: + # Docstring inherited from DatasetRecordStorageManager. + if (storage := self._find_storage(dataset_type.name)) is None: + raise MissingDatasetTypeError(f"Dataset type {dataset_type.name!r} has not been registered.") + if not dataset_type.isCalibration(): + raise DatasetTypeError( + f"Cannot certify datasets of type {dataset_type.name!r}, for which " + "DatasetType.isCalibration() is False." + ) + if collection.type is not CollectionType.CALIBRATION: + raise CollectionTypeError( + f"Cannot decertify from collection '{collection.name}' " + f"of type {collection.type.name}; must be CALIBRATION." + ) + TimespanReprClass = self._db.getTimespanRepresentation() + # Construct a SELECT query to find all rows that overlap our inputs. 
+        data_id_set: set[DataCoordinate] | None
+        if data_ids is not None:
+            data_id_set = set(data_ids)
+        else:
+            data_id_set = None
+        relation = self._build_calib_overlap_query(dataset_type, collection, data_id_set, timespan, context)
+        calib_pkey_tag = DatasetColumnTag(dataset_type.name, "calib_pkey")
+        dataset_id_tag = DatasetColumnTag(dataset_type.name, "dataset_id")
+        timespan_tag = DatasetColumnTag(dataset_type.name, "timespan")
+        data_id_tags = [(name, DimensionKeyColumnTag(name)) for name in dataset_type.dimensions.required]
+        # Set up collections to populate with the rows we'll want to modify.
+        # The insert rows will have the same values for collection and
+        # dataset type.
+        proto_insert_row = {
+            self._collections.getCollectionForeignKeyName(): collection.key,
+            "dataset_type_id": storage.dataset_type_id,
+        }
+        rows_to_delete = []
+        rows_to_insert = []
+        # Acquire a table lock to ensure there are no concurrent writes
+        # between the SELECT and the DELETE and INSERT queries based on it.
+        calibs_table = storage.dynamic_tables.calibs(self._db, type(self._collections))
+        with self._db.transaction(lock=[calibs_table], savepoint=True):
+            # Enter SqlQueryContext in case we need to use a temporary table to
+            # include the given data IDs in the query (see similar block in
+            # certify for details).
+            with context:
+                for row in context.fetch_iterable(relation):
+                    rows_to_delete.append({"id": row[calib_pkey_tag]})
+                    # Construct the insert row(s) by copying the prototype row,
+                    # then adding the dimension column values, then adding
+                    # what's left of the timespan from that row after we
+                    # subtract the given timespan.
+                    new_insert_row = proto_insert_row.copy()
+                    new_insert_row["dataset_id"] = row[dataset_id_tag]
+                    for name, tag in data_id_tags:
+                        new_insert_row[name] = row[tag]
+                    row_timespan = row[timespan_tag]
+                    assert row_timespan is not None, "Field should have a NOT NULL constraint."
+                    for diff_timespan in row_timespan.difference(timespan):
+                        rows_to_insert.append(
+                            TimespanReprClass.update(diff_timespan, result=new_insert_row.copy())
+                        )
+            # Run the DELETE and INSERT queries.
+            self._db.delete(calibs_table, ["id"], *rows_to_delete)
+            self._db.insert(calibs_table, *rows_to_insert)
+
+    def _build_calib_overlap_query(
+        self,
+        dataset_type: DatasetType,
+        collection: CollectionRecord,
+        data_ids: set[DataCoordinate] | None,
+        timespan: Timespan,
+        context: SqlQueryContext,
+    ) -> Relation:
+        relation = self.make_relation(
+            dataset_type, collection, columns={"timespan", "dataset_id", "calib_pkey"}, context=context
+        ).with_rows_satisfying(
+            context.make_timespan_overlap_predicate(
+                DatasetColumnTag(dataset_type.name, "timespan"), timespan
+            ),
         )
+        if data_ids is not None:
+            relation = relation.join(
+                context.make_data_id_relation(data_ids, dataset_type.dimensions.required).transferred_to(
+                    context.sql_engine
+                ),
+            )
+        return relation
-    @classmethod
-    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
+    def make_relation(
+        self,
+        dataset_type: DatasetType,
+        *collections: CollectionRecord,
+        columns: Set[str],
+        context: SqlQueryContext,
+    ) -> Relation:
         # Docstring inherited from DatasetRecordStorageManager.
-        return True
+        if (storage := self._find_storage(dataset_type.name)) is None:
+            raise MissingDatasetTypeError(f"Dataset type {dataset_type.name!r} has not been registered.")
+        collection_types = {collection.type for collection in collections}
+        assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened."
+ TimespanReprClass = self._db.getTimespanRepresentation() + # + # There are two kinds of table in play here: + # + # - the static dataset table (with the dataset ID, dataset type ID, + # run ID/name, and ingest date); + # + # - the dynamic tags/calibs table (with the dataset ID, dataset type + # type ID, collection ID/name, data ID, and possibly validity + # range). + # + # That means that we might want to return a query against either table + # or a JOIN of both, depending on which quantities the caller wants. + # But the data ID is always included, which means we'll always include + # the tags/calibs table and join in the static dataset table only if we + # need things from it that we can't get from the tags/calibs table. + # + # Note that it's important that we include a WHERE constraint on both + # tables for any column (e.g. dataset_type_id) that is in both when + # it's given explicitly; not doing can prevent the query planner from + # using very important indexes. At present, we don't include those + # redundant columns in the JOIN ON expression, however, because the + # FOREIGN KEY (and its index) are defined only on dataset_id. + tag_relation: Relation | None = None + calib_relation: Relation | None = None + if collection_types != {CollectionType.CALIBRATION}: + tags_table = storage.dynamic_tables.tags(self._db, type(self._collections)) + # We'll need a subquery for the tags table if any of the given + # collections are not a CALIBRATION collection. This intentionally + # also fires when the list of collections is empty as a way to + # create a dummy subquery that we know will fail. + # We give the table an alias because it might appear multiple times + # in the same query, for different dataset types. + tags_parts = sql.Payload[LogicalColumn](tags_table.alias(f"{dataset_type.name}_tags")) + if "timespan" in columns: + tags_parts.columns_available[DatasetColumnTag(dataset_type.name, "timespan")] = ( + TimespanReprClass.fromLiteral(Timespan(None, None)) + ) + tag_relation = self._finish_single_relation( + storage, + tags_parts, + columns, + [ + (record, rank) + for rank, record in enumerate(collections) + if record.type is not CollectionType.CALIBRATION + ], + context, + ) + assert "calib_pkey" not in columns, "For internal use only, and only for pure-calib queries." + if CollectionType.CALIBRATION in collection_types: + # If at least one collection is a CALIBRATION collection, we'll + # need a subquery for the calibs table, and could include the + # timespan as a result or constraint. + calibs_table = storage.dynamic_tables.calibs(self._db, type(self._collections)) + calibs_parts = sql.Payload[LogicalColumn](calibs_table.alias(f"{dataset_type.name}_calibs")) + if "timespan" in columns: + calibs_parts.columns_available[DatasetColumnTag(dataset_type.name, "timespan")] = ( + TimespanReprClass.from_columns(calibs_parts.from_clause.columns) + ) + if "calib_pkey" in columns: + # This is a private extension not included in the base class + # interface, for internal use only in _buildCalibOverlapQuery, + # which needs access to the autoincrement primary key for the + # calib association table. 
+ calibs_parts.columns_available[DatasetColumnTag(dataset_type.name, "calib_pkey")] = ( + calibs_parts.from_clause.columns.id + ) + calib_relation = self._finish_single_relation( + storage, + calibs_parts, + columns, + [ + (record, rank) + for rank, record in enumerate(collections) + if record.type is CollectionType.CALIBRATION + ], + context, + ) + if tag_relation is not None: + if calib_relation is not None: + # daf_relation's chain operation does not automatically + # deduplicate; it's more like SQL's UNION ALL. To get UNION + # in SQL here, we add an explicit deduplication. + return tag_relation.chain(calib_relation).without_duplicates() + else: + return tag_relation + elif calib_relation is not None: + return calib_relation + else: + raise AssertionError("Branch should be unreachable.") - @classmethod - def _newDefaultSchemaVersion(cls) -> VersionTuple: - # Docstring inherited from VersionedExtension. + def _finish_single_relation( + self, + storage: _DatasetRecordStorage, + payload: sql.Payload[LogicalColumn], + requested_columns: Set[str], + collections: Sequence[tuple[CollectionRecord, int]], + context: SqlQueryContext, + ) -> Relation: + """Handle adding columns and WHERE terms that are not specific to + either the tags or calibs tables. - # By default return latest version so that new repos are created with - # nanoseconds ingest_date. - return _VERSION_UUID_NS + Helper method for `make_relation`. - def ingest_date_dtype(self) -> type: - """Return type of the ``ingest_date`` column.""" - schema_version = self.newSchemaVersion() - if schema_version is not None and schema_version.major > 1: - return ddl.AstropyTimeNsecTai + Parameters + ---------- + storage : `ByDimensionsDatasetRecordStorageUUID` + Struct that holds the tables and ID for the dataset type. + payload : `lsst.daf.relation.sql.Payload` + SQL query parts under construction, to be modified in-place and + used to construct the new relation. + requested_columns : `~collections.abc.Set` [ `str` ] + Columns the relation should include. + collections : `~collections.abc.Sequence` [ `tuple` \ + [ `CollectionRecord`, `int` ] ] + Collections to search for the dataset and their ranks. + context : `SqlQueryContext` + Context that manages engines and state for the query. + + Returns + ------- + relation : `lsst.daf.relation.Relation` + New dataset query relation. + """ + payload.where.append(payload.from_clause.columns.dataset_type_id == storage.dataset_type_id) + dataset_id_col = payload.from_clause.columns.dataset_id + collection_col = payload.from_clause.columns[self._collections.getCollectionForeignKeyName()] + # We always constrain and optionally retrieve the collection(s) via the + # tags/calibs table. + if len(collections) == 1: + payload.where.append(collection_col == collections[0][0].key) + if "collection" in requested_columns: + payload.columns_available[DatasetColumnTag(storage.dataset_type.name, "collection")] = ( + sqlalchemy.sql.literal(collections[0][0].key) + ) else: - return sqlalchemy.TIMESTAMP + assert collections, "The no-collections case should be in calling code for better diagnostics." + payload.where.append(collection_col.in_([collection.key for collection, _ in collections])) + if "collection" in requested_columns: + payload.columns_available[DatasetColumnTag(storage.dataset_type.name, "collection")] = ( + collection_col + ) + # Add rank if requested as a CASE-based calculation the collection + # column. 
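+        # (Roughly: CASE <collection FK> WHEN <key_0> THEN 0
+        # WHEN <key_1> THEN 1 ... END.)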
+ if "rank" in requested_columns: + payload.columns_available[DatasetColumnTag(storage.dataset_type.name, "rank")] = ( + sqlalchemy.sql.case( + {record.key: rank for record, rank in collections}, + value=collection_col, + ) + ) + # Add more column definitions, starting with the data ID. + for dimension_name in storage.dataset_type.dimensions.required: + payload.columns_available[DimensionKeyColumnTag(dimension_name)] = payload.from_clause.columns[ + dimension_name + ] + # We can always get the dataset_id from the tags/calibs table. + if "dataset_id" in requested_columns: + payload.columns_available[DatasetColumnTag(storage.dataset_type.name, "dataset_id")] = ( + dataset_id_col + ) + # It's possible we now have everything we need, from just the + # tags/calibs table. The things we might need to get from the static + # dataset table are the run key and the ingest date. + need_static_table = False + if "run" in requested_columns: + if len(collections) == 1 and collections[0][0].type is CollectionType.RUN: + # If we are searching exactly one RUN collection, we + # know that if we find the dataset in that collection, + # then that's the datasets's run; we don't need to + # query for it. + payload.columns_available[DatasetColumnTag(storage.dataset_type.name, "run")] = ( + sqlalchemy.sql.literal(collections[0][0].key) + ) + else: + payload.columns_available[DatasetColumnTag(storage.dataset_type.name, "run")] = ( + self._static.dataset.columns[self._run_key_column] + ) + need_static_table = True + # Ingest date can only come from the static table. + if "ingest_date" in requested_columns: + need_static_table = True + payload.columns_available[DatasetColumnTag(storage.dataset_type.name, "ingest_date")] = ( + self._static.dataset.columns.ingest_date + ) + # If we need the static table, join it in via dataset_id and + # dataset_type_id + if need_static_table: + payload.from_clause = payload.from_clause.join( + self._static.dataset, onclause=(dataset_id_col == self._static.dataset.columns.id) + ) + # Also constrain dataset_type_id in static table in case that helps + # generate a better plan. + # We could also include this in the JOIN ON clause, but my guess is + # that that's a good idea IFF it's in the foreign key, and right + # now it isn't. + payload.where.append(self._static.dataset.columns.dataset_type_id == storage.dataset_type_id) + leaf = context.sql_engine.make_leaf( + payload.columns_available.keys(), + payload=payload, + name=storage.dataset_type.name, + parameters={record.name: rank for record, rank in collections}, + ) + return leaf + + def make_query_joiner( + self, dataset_type: DatasetType, collections: Sequence[CollectionRecord], fields: Set[str] + ) -> QueryJoiner: + if (storage := self._find_storage(dataset_type.name)) is None: + raise MissingDatasetTypeError(f"Dataset type {dataset_type.name!r} has not been registered.") + # This method largely mimics `make_relation`, but it uses the new query + # system primitives instead of the old one. In terms of the SQL + # queries it builds, there are two more main differences: + # + # - Collection and run columns are now string names rather than IDs. + # This insulates the query result-processing code from collection + # caching and the collection manager subclass details. + # + # - The subquery always has unique rows, which is achieved by using + # SELECT DISTINCT when necessary. 
+ # + collection_types = {collection.type for collection in collections} + assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened." + # + # There are two kinds of table in play here: + # + # - the static dataset table (with the dataset ID, dataset type ID, + # run ID/name, and ingest date); + # + # - the dynamic tags/calibs table (with the dataset ID, dataset type + # type ID, collection ID/name, data ID, and possibly validity + # range). + # + # That means that we might want to return a query against either table + # or a JOIN of both, depending on which quantities the caller wants. + # But the data ID is always included, which means we'll always include + # the tags/calibs table and join in the static dataset table only if we + # need things from it that we can't get from the tags/calibs table. + # + # Note that it's important that we include a WHERE constraint on both + # tables for any column (e.g. dataset_type_id) that is in both when + # it's given explicitly; not doing can prevent the query planner from + # using very important indexes. At present, we don't include those + # redundant columns in the JOIN ON expression, however, because the + # FOREIGN KEY (and its index) are defined only on dataset_id. + columns = qt.ColumnSet(dataset_type.dimensions) + columns.drop_implied_dimension_keys() + columns.dataset_fields[dataset_type.name].update(fields) + tags_builder: QueryBuilder | None = None + if collection_types != {CollectionType.CALIBRATION}: + # We'll need a subquery for the tags table if any of the given + # collections are not a CALIBRATION collection. This intentionally + # also fires when the list of collections is empty as a way to + # create a dummy subquery that we know will fail. + # We give the table an alias because it might appear multiple times + # in the same query, for different dataset types. + tags_table = storage.dynamic_tables.tags(self._db, type(self._collections)) + tags_builder = self._finish_query_builder( + storage, + QueryJoiner(self._db, tags_table.alias(f"{dataset_type.name}_tags")).to_builder(columns), + [record for record in collections if record.type is not CollectionType.CALIBRATION], + fields, + ) + if "timespan" in fields: + tags_builder.joiner.timespans[dataset_type.name] = ( + self._db.getTimespanRepresentation().fromLiteral(None) + ) + calibs_builder: QueryBuilder | None = None + if CollectionType.CALIBRATION in collection_types: + # If at least one collection is a CALIBRATION collection, we'll + # need a subquery for the calibs table, and could include the + # timespan as a result or constraint. + calibs_table = storage.dynamic_tables.calibs(self._db, type(self._collections)).alias( + f"{dataset_type.name}_calibs" + ) + calibs_builder = self._finish_query_builder( + storage, + QueryJoiner(self._db, calibs_table).to_builder(columns), + [record for record in collections if record.type is CollectionType.CALIBRATION], + fields, + ) + if "timespan" in fields: + calibs_builder.joiner.timespans[dataset_type.name] = ( + self._db.getTimespanRepresentation().from_columns(calibs_table.columns) + ) + + # In calibration collections, we need timespan as well as data ID + # to ensure unique rows. + calibs_builder.distinct = calibs_builder.distinct and "timespan" not in fields + if tags_builder is not None: + if calibs_builder is not None: + # Need a UNION subquery. 
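+                # The same dataset may be found via both subqueries, so use
+                # UNION rather than UNION ALL to keep the combined rows unique.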
+                return tags_builder.union_subquery([calibs_builder])
+            else:
+                return tags_builder.to_joiner()
+        elif calibs_builder is not None:
+            return calibs_builder.to_joiner()
+        else:
+            raise AssertionError("Branch should be unreachable.")
+
+    def _finish_query_builder(
+        self,
+        storage: _DatasetRecordStorage,
+        sql_projection: QueryBuilder,
+        collections: Sequence[CollectionRecord],
+        fields: Set[str],
+    ) -> QueryBuilder:
+        # This method plays the same role in the new query system that
+        # _finish_single_relation plays in the old one. It is called exactly
+        # one or two times by make_query_joiner, just as _finish_single_relation
+        # is called exactly one or two times by make_relation. See
+        # make_query_joiner comments for what's different.
+        assert sql_projection.joiner.from_clause is not None
+        run_collections_only = all(record.type is CollectionType.RUN for record in collections)
+        sql_projection.joiner.where(
+            sql_projection.joiner.from_clause.c.dataset_type_id == storage.dataset_type_id
+        )
+        dataset_id_col = sql_projection.joiner.from_clause.c.dataset_id
+        collection_col = sql_projection.joiner.from_clause.c[self._collections.getCollectionForeignKeyName()]
+        fields_provided = sql_projection.joiner.fields[storage.dataset_type.name]
+        # We always constrain and optionally retrieve the collection(s) via the
+        # tags/calibs table.
+        if "collection_key" in fields:
+            sql_projection.joiner.fields[storage.dataset_type.name]["collection_key"] = collection_col
+        if len(collections) == 1:
+            only_collection_record = collections[0]
+            sql_projection.joiner.where(collection_col == only_collection_record.key)
+            if "collection" in fields:
+                fields_provided["collection"] = sqlalchemy.literal(only_collection_record.name).cast(
+                    # This cast is necessary to ensure that Postgres knows the
+                    # type of this column if it is used in an aggregate
+                    # function.
+                    sqlalchemy.String
+                )
+
+        elif not collections:
+            sql_projection.joiner.where(sqlalchemy.literal(False))
+            if "collection" in fields:
+                fields_provided["collection"] = sqlalchemy.literal("NO COLLECTIONS")
+        else:
+            sql_projection.joiner.where(collection_col.in_([collection.key for collection in collections]))
+            if "collection" in fields:
+                # Avoid a join to the collection table to get the name by using
+                # a CASE statement. The SQL will be a bit more verbose but
+                # more efficient.
+                fields_provided["collection"] = _create_case_expression_for_collections(
+                    collections, collection_col
+                )
+        # Add more column definitions, starting with the data ID.
+        sql_projection.joiner.extract_dimensions(storage.dataset_type.dimensions.required)
+        # We can always get the dataset_id from the tags/calibs table, even if
+        # we could also get it from the 'static' dataset table.
+        if "dataset_id" in fields:
+            fields_provided["dataset_id"] = dataset_id_col
+
+        # It's possible we now have everything we need, from just the
+        # tags/calibs table. The things we might need to get from the static
+        # dataset table are the run key and the ingest date.
+        need_static_table = False
+        need_collection_table = False
+        # Ingest date can only come from the static table.
+        if "ingest_date" in fields:
+            fields_provided["ingest_date"] = self._static.dataset.c.ingest_date
+            need_static_table = True
+        if "run" in fields:
+            if len(collections) == 1 and run_collections_only:
+                # If we are searching exactly one RUN collection, we
+                # know that if we find the dataset in that collection,
+                # then that's the dataset's run; we don't need to
+                # query for it.
+ # + fields_provided["run"] = sqlalchemy.literal(only_collection_record.name).cast( + # This cast is necessary to ensure that Postgres knows the + # type of this column if it is used in an aggregate + # function. + sqlalchemy.String + ) + elif run_collections_only: + # Once again we can avoid joining to the collection table by + # adding a CASE statement. + fields_provided["run"] = _create_case_expression_for_collections( + collections, self._static.dataset.c[self._run_key_column] + ) + need_static_table = True + else: + # Here we can't avoid a join to the collection table, because + # we might find a dataset via something other than its RUN + # collection. + # + # We have to defer adding the join until after we have joined + # in the static dataset table, because the ON clause involves + # the run collection from the static dataset table. Postgres + # cares about the join ordering (though SQLite does not.) + need_collection_table = True + need_static_table = True + if need_static_table: + # If we need the static table, join it in via dataset_id. We don't + # use QueryJoiner.join because we're joining on dataset ID, not + # dimensions. + sql_projection.joiner.from_clause = sql_projection.joiner.from_clause.join( + self._static.dataset, onclause=(dataset_id_col == self._static.dataset.c.id) + ) + # Also constrain dataset_type_id in static table in case that helps + # generate a better plan. We could also include this in the JOIN ON + # clause, but my guess is that that's a good idea IFF it's in the + # foreign key, and right now it isn't. + sql_projection.joiner.where(self._static.dataset.c.dataset_type_id == storage.dataset_type_id) + if need_collection_table: + # Join the collection table to look up the RUN collection name + # associated with the dataset. + ( + fields_provided["run"], + sql_projection.joiner.from_clause, + ) = self._collections.lookup_name_sql( + self._static.dataset.c[self._run_key_column], + sql_projection.joiner.from_clause, + ) + + sql_projection.distinct = ( + # If there are multiple collections, this subquery might have + # non-unique rows. + len(collections) > 1 + and not fields + ) + return sql_projection + + def refresh_collection_summaries(self, dataset_type: DatasetType) -> None: + # Docstring inherited. + if (storage := self._find_storage(dataset_type.name)) is None: + raise MissingDatasetTypeError(f"Dataset type {dataset_type.name!r} has not been registered.") + with self._db.transaction(): + # The main issue here is consistency in the presence of concurrent + # updates (using default READ COMMITTED isolation). Regular clients + # only add to summary tables, and we want to avoid deleting what + # other concurrent transactions may add while we are in this + # transaction. This ordering of operations should guarantee it: + # - read collections for this dataset type from summary tables, + # - read collections for this dataset type from dataset tables + # (both tags and calibs), + # - whatever is in the first set but not in the second can be + # dropped from summary tables. + summary_collection_ids = set(self._summaries.get_collection_ids(storage.dataset_type_id)) + + # Query datasets tables for associated collections. 
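+            # The union of distinct collection keys from the tags table (and,
+            # for calibration dataset types, the calibs table) gives every
+            # collection that actually contains this dataset type.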
+ column_name = self._collections.getCollectionForeignKeyName() + tags_table = storage.dynamic_tables.tags(self._db, type(self._collections)) + query: sqlalchemy.sql.expression.SelectBase = ( + sqlalchemy.select(tags_table.columns[column_name]) + .where(tags_table.columns.dataset_type_id == storage.dataset_type_id) + .distinct() + ) + if dataset_type.isCalibration(): + calibs_table = storage.dynamic_tables.calibs(self._db, type(self._collections)) + query2 = ( + sqlalchemy.select(calibs_table.columns[column_name]) + .where(calibs_table.columns.dataset_type_id == storage.dataset_type_id) + .distinct() + ) + query = sqlalchemy.sql.expression.union(query, query2) + + with self._db.query(query) as result: + collection_ids = set(result.scalars()) + + collections_to_delete = summary_collection_ids - collection_ids + self._summaries.delete_collections(storage.dataset_type_id, collections_to_delete) + + +def _create_case_expression_for_collections( + collections: Iterable[CollectionRecord], id_column: sqlalchemy.ColumnElement +) -> sqlalchemy.Case | sqlalchemy.Null: + """Return a SQLAlchemy Case expression that converts collection IDs to + collection names for the given set of collections. + + Parameters + ---------- + collections : `~collections.abc.Iterable` [ `CollectionRecord` ] + List of collections to include in conversion table. This should be an + exhaustive list of collections that could appear in `id_column`. + id_column : `sqlalchemy.ColumnElement` + The column containing the collection ID that we want to convert to a + collection name. + """ + mapping = {record.key: record.name for record in collections} + if not mapping: + # SQLAlchemy does not correctly handle an empty mapping in case() -- it + # crashes when trying to compile the expression with an + # "AttributeError('NoneType' object has no attribute 'dialect_impl')" + # when trying to access the 'type' property of the Case object. If you + # explicitly specify a type via type_coerce it instead generates + # invalid SQL syntax. + # + # We can end up with empty mappings here in certain "doomed query" edge + # cases, e.g. we start with a list of valid collections but they are + # all filtered out by higher-level code on the basis of collection + # summaries. + return sqlalchemy.null() + + return sqlalchemy.case(mapping, value=id_column) diff --git a/python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py b/python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py deleted file mode 100644 index fe38ed42e9..0000000000 --- a/python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py +++ /dev/null @@ -1,1165 +0,0 @@ -# This file is part of daf_butler. -# -# Developed for the LSST Data Management System. -# This product includes software developed by the LSST Project -# (http://www.lsst.org). -# See the COPYRIGHT file at the top-level directory of this distribution -# for details of code ownership. -# -# This software is dual licensed under the GNU General Public License and also -# under a 3-clause BSD license. Recipients may choose which of these licenses -# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, -# respectively. 
If you choose the GPL option then the following text applies -# (but note that there is still no warranty even if you opt for BSD instead): -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - - -from __future__ import annotations - -from .... import ddl - -__all__ = ("ByDimensionsDatasetRecordStorage",) - -import datetime -from collections.abc import Callable, Iterable, Iterator, Sequence, Set -from typing import TYPE_CHECKING - -import astropy.time -import sqlalchemy -from lsst.daf.relation import Relation, sql - -from ...._collection_type import CollectionType -from ...._column_tags import DatasetColumnTag, DimensionKeyColumnTag -from ...._column_type_info import LogicalColumn -from ...._dataset_ref import DatasetId, DatasetIdFactory, DatasetIdGenEnum, DatasetRef -from ...._dataset_type import DatasetType -from ...._exceptions import CollectionTypeError -from ...._timespan import Timespan -from ....dimensions import DataCoordinate -from ....direct_query_driver import QueryBuilder, QueryJoiner # new query system, server+direct only -from ....queries import tree as qt # new query system, both clients + server -from ..._collection_summary import CollectionSummary -from ..._exceptions import ConflictingDefinitionError -from ...interfaces import DatasetRecordStorage -from ...queries import SqlQueryContext # old registry query system -from .tables import makeTagTableSpec - -if TYPE_CHECKING: - from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord - from .summaries import CollectionSummaryManager - from .tables import StaticDatasetTablesTuple - - -class ByDimensionsDatasetRecordStorage(DatasetRecordStorage): - """Dataset record storage implementation paired with - `ByDimensionsDatasetRecordStorageManagerUUID`; see that class for more - information. - - Instances of this class should never be constructed directly; use - `DatasetRecordStorageManager.register` instead. - - Parameters - ---------- - datasetType : `DatasetType` - The dataset type to use. - db : `Database` - Database connection. - dataset_type_id : `int` - Dataset type identifier. - collections : `CollectionManager` - The collection manager. - static : `StaticDatasetTablesTuple` - Unknown. - summaries : `CollectionSummaryManager` - Collection summary manager. - tags_table_factory : `~collections.abc.Callable` - Factory for creating tags tables. - use_astropy_ingest_date : `bool` - Whether to use Astropy for ingest date. - calibs_table_factory : `~collections.abc.Callable` - Factory for creating calibration tables. 
- """ - - def __init__( - self, - *, - datasetType: DatasetType, - db: Database, - dataset_type_id: int, - collections: CollectionManager, - static: StaticDatasetTablesTuple, - summaries: CollectionSummaryManager, - tags_table_factory: Callable[[], sqlalchemy.schema.Table], - use_astropy_ingest_date: bool, - calibs_table_factory: Callable[[], sqlalchemy.schema.Table] | None, - ): - super().__init__(datasetType=datasetType) - self._dataset_type_id = dataset_type_id - self._db = db - self._collections = collections - self._static = static - self._summaries = summaries - self._tags_table_factory = tags_table_factory - self._calibs_table_factory = calibs_table_factory - self._runKeyColumn = collections.getRunForeignKeyName() - self._use_astropy = use_astropy_ingest_date - self._tags_table: sqlalchemy.schema.Table | None = None - self._calibs_table: sqlalchemy.schema.Table | None = None - - @property - def _tags(self) -> sqlalchemy.schema.Table: - if self._tags_table is None: - self._tags_table = self._tags_table_factory() - return self._tags_table - - @property - def _calibs(self) -> sqlalchemy.schema.Table | None: - if self._calibs_table is None: - if self._calibs_table_factory is None: - return None - self._calibs_table = self._calibs_table_factory() - return self._calibs_table - - def delete(self, datasets: Iterable[DatasetRef]) -> None: - # Docstring inherited from DatasetRecordStorage. - # Only delete from common dataset table; ON DELETE foreign key clauses - # will handle the rest. - self._db.delete( - self._static.dataset, - ["id"], - *[{"id": dataset.id} for dataset in datasets], - ) - - def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: - # Docstring inherited from DatasetRecordStorage. - if collection.type is not CollectionType.TAGGED: - raise TypeError( - f"Cannot associate into collection '{collection.name}' " - f"of type {collection.type.name}; must be TAGGED." - ) - protoRow = { - self._collections.getCollectionForeignKeyName(): collection.key, - "dataset_type_id": self._dataset_type_id, - } - rows = [] - summary = CollectionSummary() - for dataset in summary.add_datasets_generator(datasets): - rows.append(dict(protoRow, dataset_id=dataset.id, **dataset.dataId.required)) - if rows: - # Update the summary tables for this collection in case this is the - # first time this dataset type or these governor values will be - # inserted there. - self._summaries.update(collection, [self._dataset_type_id], summary) - # Update the tag table itself. - self._db.replace(self._tags, *rows) - - def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: - # Docstring inherited from DatasetRecordStorage. - if collection.type is not CollectionType.TAGGED: - raise TypeError( - f"Cannot disassociate from collection '{collection.name}' " - f"of type {collection.type.name}; must be TAGGED." 
- ) - rows = [ - { - "dataset_id": dataset.id, - self._collections.getCollectionForeignKeyName(): collection.key, - } - for dataset in datasets - ] - self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows) - - def _buildCalibOverlapQuery( - self, - collection: CollectionRecord, - data_ids: set[DataCoordinate] | None, - timespan: Timespan, - context: SqlQueryContext, - ) -> Relation: - relation = self.make_relation( - collection, columns={"timespan", "dataset_id", "calib_pkey"}, context=context - ).with_rows_satisfying( - context.make_timespan_overlap_predicate( - DatasetColumnTag(self.datasetType.name, "timespan"), timespan - ), - ) - if data_ids is not None: - relation = relation.join( - context.make_data_id_relation(data_ids, self.datasetType.dimensions.required).transferred_to( - context.sql_engine - ), - ) - return relation - - def certify( - self, - collection: CollectionRecord, - datasets: Iterable[DatasetRef], - timespan: Timespan, - context: SqlQueryContext, - ) -> None: - # Docstring inherited from DatasetRecordStorage. - if self._calibs is None: - raise CollectionTypeError( - f"Cannot certify datasets of type {self.datasetType.name}, for which " - "DatasetType.isCalibration() is False." - ) - if collection.type is not CollectionType.CALIBRATION: - raise CollectionTypeError( - f"Cannot certify into collection '{collection.name}' " - f"of type {collection.type.name}; must be CALIBRATION." - ) - TimespanReprClass = self._db.getTimespanRepresentation() - protoRow = { - self._collections.getCollectionForeignKeyName(): collection.key, - "dataset_type_id": self._dataset_type_id, - } - rows = [] - dataIds: set[DataCoordinate] | None = ( - set() if not TimespanReprClass.hasExclusionConstraint() else None - ) - summary = CollectionSummary() - for dataset in summary.add_datasets_generator(datasets): - row = dict(protoRow, dataset_id=dataset.id, **dataset.dataId.required) - TimespanReprClass.update(timespan, result=row) - rows.append(row) - if dataIds is not None: - dataIds.add(dataset.dataId) - if not rows: - # Just in case an empty dataset collection is provided we want to - # avoid adding dataset type to summary tables. - return - # Update the summary tables for this collection in case this is the - # first time this dataset type or these governor values will be - # inserted there. - self._summaries.update(collection, [self._dataset_type_id], summary) - # Update the association table itself. - if TimespanReprClass.hasExclusionConstraint(): - # Rely on database constraint to enforce invariants; we just - # reraise the exception for consistency across DB engines. - try: - self._db.insert(self._calibs, *rows) - except sqlalchemy.exc.IntegrityError as err: - raise ConflictingDefinitionError( - f"Validity range conflict certifying datasets of type {self.datasetType.name} " - f"into {collection.name} for range [{timespan.begin}, {timespan.end})." - ) from err - else: - # Have to implement exclusion constraint ourselves. - # Start by building a SELECT query for any rows that would overlap - # this one. - relation = self._buildCalibOverlapQuery(collection, dataIds, timespan, context) - # Acquire a table lock to ensure there are no concurrent writes - # could invalidate our checking before we finish the inserts. We - # use a SAVEPOINT in case there is an outer transaction that a - # failure here should not roll back. 
- with self._db.transaction(lock=[self._calibs], savepoint=True): - # Enter SqlQueryContext in case we need to use a temporary - # table to include the give data IDs in the query. Note that - # by doing this inside the transaction, we make sure it doesn't - # attempt to close the session when its done, since it just - # sees an already-open session that it knows it shouldn't - # manage. - with context: - # Run the check SELECT query. - conflicting = context.count(context.process(relation)) - if conflicting > 0: - raise ConflictingDefinitionError( - f"{conflicting} validity range conflicts certifying datasets of type " - f"{self.datasetType.name} into {collection.name} for range " - f"[{timespan.begin}, {timespan.end})." - ) - # Proceed with the insert. - self._db.insert(self._calibs, *rows) - - def decertify( - self, - collection: CollectionRecord, - timespan: Timespan, - *, - dataIds: Iterable[DataCoordinate] | None = None, - context: SqlQueryContext, - ) -> None: - # Docstring inherited from DatasetRecordStorage. - if self._calibs is None: - raise CollectionTypeError( - f"Cannot decertify datasets of type {self.datasetType.name}, for which " - "DatasetType.isCalibration() is False." - ) - if collection.type is not CollectionType.CALIBRATION: - raise CollectionTypeError( - f"Cannot decertify from collection '{collection.name}' " - f"of type {collection.type.name}; must be CALIBRATION." - ) - TimespanReprClass = self._db.getTimespanRepresentation() - # Construct a SELECT query to find all rows that overlap our inputs. - dataIdSet: set[DataCoordinate] | None - if dataIds is not None: - dataIdSet = set(dataIds) - else: - dataIdSet = None - relation = self._buildCalibOverlapQuery(collection, dataIdSet, timespan, context) - calib_pkey_tag = DatasetColumnTag(self.datasetType.name, "calib_pkey") - dataset_id_tag = DatasetColumnTag(self.datasetType.name, "dataset_id") - timespan_tag = DatasetColumnTag(self.datasetType.name, "timespan") - data_id_tags = [(name, DimensionKeyColumnTag(name)) for name in self.datasetType.dimensions.required] - # Set up collections to populate with the rows we'll want to modify. - # The insert rows will have the same values for collection and - # dataset type. - protoInsertRow = { - self._collections.getCollectionForeignKeyName(): collection.key, - "dataset_type_id": self._dataset_type_id, - } - rowsToDelete = [] - rowsToInsert = [] - # Acquire a table lock to ensure there are no concurrent writes - # between the SELECT and the DELETE and INSERT queries based on it. - with self._db.transaction(lock=[self._calibs], savepoint=True): - # Enter SqlQueryContext in case we need to use a temporary table to - # include the give data IDs in the query (see similar block in - # certify for details). - with context: - for row in context.fetch_iterable(relation): - rowsToDelete.append({"id": row[calib_pkey_tag]}) - # Construct the insert row(s) by copying the prototype row, - # then adding the dimension column values, then adding - # what's left of the timespan from that row after we - # subtract the given timespan. - newInsertRow = protoInsertRow.copy() - newInsertRow["dataset_id"] = row[dataset_id_tag] - for name, tag in data_id_tags: - newInsertRow[name] = row[tag] - rowTimespan = row[timespan_tag] - assert rowTimespan is not None, "Field should have a NOT NULL constraint." - for diffTimespan in rowTimespan.difference(timespan): - rowsToInsert.append( - TimespanReprClass.update(diffTimespan, result=newInsertRow.copy()) - ) - # Run the DELETE and INSERT queries. 
- self._db.delete(self._calibs, ["id"], *rowsToDelete) - self._db.insert(self._calibs, *rowsToInsert) - - def make_relation( - self, - *collections: CollectionRecord, - columns: Set[str], - context: SqlQueryContext, - ) -> Relation: - # Docstring inherited from DatasetRecordStorage. - collection_types = {collection.type for collection in collections} - assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened." - TimespanReprClass = self._db.getTimespanRepresentation() - # - # There are two kinds of table in play here: - # - # - the static dataset table (with the dataset ID, dataset type ID, - # run ID/name, and ingest date); - # - # - the dynamic tags/calibs table (with the dataset ID, dataset type - # type ID, collection ID/name, data ID, and possibly validity - # range). - # - # That means that we might want to return a query against either table - # or a JOIN of both, depending on which quantities the caller wants. - # But the data ID is always included, which means we'll always include - # the tags/calibs table and join in the static dataset table only if we - # need things from it that we can't get from the tags/calibs table. - # - # Note that it's important that we include a WHERE constraint on both - # tables for any column (e.g. dataset_type_id) that is in both when - # it's given explicitly; not doing can prevent the query planner from - # using very important indexes. At present, we don't include those - # redundant columns in the JOIN ON expression, however, because the - # FOREIGN KEY (and its index) are defined only on dataset_id. - tag_relation: Relation | None = None - calib_relation: Relation | None = None - if collection_types != {CollectionType.CALIBRATION}: - # We'll need a subquery for the tags table if any of the given - # collections are not a CALIBRATION collection. This intentionally - # also fires when the list of collections is empty as a way to - # create a dummy subquery that we know will fail. - # We give the table an alias because it might appear multiple times - # in the same query, for different dataset types. - tags_parts = sql.Payload[LogicalColumn](self._tags.alias(f"{self.datasetType.name}_tags")) - if "timespan" in columns: - tags_parts.columns_available[DatasetColumnTag(self.datasetType.name, "timespan")] = ( - TimespanReprClass.fromLiteral(Timespan(None, None)) - ) - tag_relation = self._finish_single_relation( - tags_parts, - columns, - [ - (record, rank) - for rank, record in enumerate(collections) - if record.type is not CollectionType.CALIBRATION - ], - context, - ) - assert "calib_pkey" not in columns, "For internal use only, and only for pure-calib queries." - if CollectionType.CALIBRATION in collection_types: - # If at least one collection is a CALIBRATION collection, we'll - # need a subquery for the calibs table, and could include the - # timespan as a result or constraint. - assert ( - self._calibs is not None - ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection." 
- calibs_parts = sql.Payload[LogicalColumn](self._calibs.alias(f"{self.datasetType.name}_calibs")) - if "timespan" in columns: - calibs_parts.columns_available[DatasetColumnTag(self.datasetType.name, "timespan")] = ( - TimespanReprClass.from_columns(calibs_parts.from_clause.columns) - ) - if "calib_pkey" in columns: - # This is a private extension not included in the base class - # interface, for internal use only in _buildCalibOverlapQuery, - # which needs access to the autoincrement primary key for the - # calib association table. - calibs_parts.columns_available[DatasetColumnTag(self.datasetType.name, "calib_pkey")] = ( - calibs_parts.from_clause.columns.id - ) - calib_relation = self._finish_single_relation( - calibs_parts, - columns, - [ - (record, rank) - for rank, record in enumerate(collections) - if record.type is CollectionType.CALIBRATION - ], - context, - ) - if tag_relation is not None: - if calib_relation is not None: - # daf_relation's chain operation does not automatically - # deduplicate; it's more like SQL's UNION ALL. To get UNION - # in SQL here, we add an explicit deduplication. - return tag_relation.chain(calib_relation).without_duplicates() - else: - return tag_relation - elif calib_relation is not None: - return calib_relation - else: - raise AssertionError("Branch should be unreachable.") - - def _finish_single_relation( - self, - payload: sql.Payload[LogicalColumn], - requested_columns: Set[str], - collections: Sequence[tuple[CollectionRecord, int]], - context: SqlQueryContext, - ) -> Relation: - """Handle adding columns and WHERE terms that are not specific to - either the tags or calibs tables. - - Helper method for `make_relation`. - - Parameters - ---------- - payload : `lsst.daf.relation.sql.Payload` - SQL query parts under construction, to be modified in-place and - used to construct the new relation. - requested_columns : `~collections.abc.Set` [ `str` ] - Columns the relation should include. - collections : `~collections.abc.Sequence` [ `tuple` \ - [ `CollectionRecord`, `int` ] ] - Collections to search for the dataset and their ranks. - context : `SqlQueryContext` - Context that manages engines and state for the query. - - Returns - ------- - relation : `lsst.daf.relation.Relation` - New dataset query relation. - """ - payload.where.append(payload.from_clause.columns.dataset_type_id == self._dataset_type_id) - dataset_id_col = payload.from_clause.columns.dataset_id - collection_col = payload.from_clause.columns[self._collections.getCollectionForeignKeyName()] - # We always constrain and optionally retrieve the collection(s) via the - # tags/calibs table. - if len(collections) == 1: - payload.where.append(collection_col == collections[0][0].key) - if "collection" in requested_columns: - payload.columns_available[DatasetColumnTag(self.datasetType.name, "collection")] = ( - sqlalchemy.sql.literal(collections[0][0].key) - ) - else: - assert collections, "The no-collections case should be in calling code for better diagnostics." - payload.where.append(collection_col.in_([collection.key for collection, _ in collections])) - if "collection" in requested_columns: - payload.columns_available[DatasetColumnTag(self.datasetType.name, "collection")] = ( - collection_col - ) - # Add rank if requested as a CASE-based calculation the collection - # column. 
- if "rank" in requested_columns: - payload.columns_available[DatasetColumnTag(self.datasetType.name, "rank")] = sqlalchemy.sql.case( - {record.key: rank for record, rank in collections}, - value=collection_col, - ) - # Add more column definitions, starting with the data ID. - for dimension_name in self.datasetType.dimensions.required: - payload.columns_available[DimensionKeyColumnTag(dimension_name)] = payload.from_clause.columns[ - dimension_name - ] - # We can always get the dataset_id from the tags/calibs table. - if "dataset_id" in requested_columns: - payload.columns_available[DatasetColumnTag(self.datasetType.name, "dataset_id")] = dataset_id_col - # It's possible we now have everything we need, from just the - # tags/calibs table. The things we might need to get from the static - # dataset table are the run key and the ingest date. - need_static_table = False - if "run" in requested_columns: - if len(collections) == 1 and collections[0][0].type is CollectionType.RUN: - # If we are searching exactly one RUN collection, we - # know that if we find the dataset in that collection, - # then that's the datasets's run; we don't need to - # query for it. - payload.columns_available[DatasetColumnTag(self.datasetType.name, "run")] = ( - sqlalchemy.sql.literal(collections[0][0].key) - ) - else: - payload.columns_available[DatasetColumnTag(self.datasetType.name, "run")] = ( - self._static.dataset.columns[self._runKeyColumn] - ) - need_static_table = True - # Ingest date can only come from the static table. - if "ingest_date" in requested_columns: - need_static_table = True - payload.columns_available[DatasetColumnTag(self.datasetType.name, "ingest_date")] = ( - self._static.dataset.columns.ingest_date - ) - # If we need the static table, join it in via dataset_id and - # dataset_type_id - if need_static_table: - payload.from_clause = payload.from_clause.join( - self._static.dataset, onclause=(dataset_id_col == self._static.dataset.columns.id) - ) - # Also constrain dataset_type_id in static table in case that helps - # generate a better plan. - # We could also include this in the JOIN ON clause, but my guess is - # that that's a good idea IFF it's in the foreign key, and right - # now it isn't. - payload.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id) - leaf = context.sql_engine.make_leaf( - payload.columns_available.keys(), - payload=payload, - name=self.datasetType.name, - parameters={record.name: rank for record, rank in collections}, - ) - return leaf - - def make_query_joiner(self, collections: Sequence[CollectionRecord], fields: Set[str]) -> QueryJoiner: - # This method largely mimics `make_relation`, but it uses the new query - # system primitives instead of the old one. In terms of the SQL - # queries it builds, there are two more main differences: - # - # - Collection and run columns are now string names rather than IDs. - # This insulates the query result-processing code from collection - # caching and the collection manager subclass details. - # - # - The subquery always has unique rows, which is achieved by using - # SELECT DISTINCT when necessary. - # - collection_types = {collection.type for collection in collections} - assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened." 
- # - # There are two kinds of table in play here: - # - # - the static dataset table (with the dataset ID, dataset type ID, - # run ID/name, and ingest date); - # - # - the dynamic tags/calibs table (with the dataset ID, dataset type - # type ID, collection ID/name, data ID, and possibly validity - # range). - # - # That means that we might want to return a query against either table - # or a JOIN of both, depending on which quantities the caller wants. - # But the data ID is always included, which means we'll always include - # the tags/calibs table and join in the static dataset table only if we - # need things from it that we can't get from the tags/calibs table. - # - # Note that it's important that we include a WHERE constraint on both - # tables for any column (e.g. dataset_type_id) that is in both when - # it's given explicitly; not doing can prevent the query planner from - # using very important indexes. At present, we don't include those - # redundant columns in the JOIN ON expression, however, because the - # FOREIGN KEY (and its index) are defined only on dataset_id. - columns = qt.ColumnSet(self.datasetType.dimensions) - columns.drop_implied_dimension_keys() - columns.dataset_fields[self.datasetType.name].update(fields) - tags_builder: QueryBuilder | None = None - if collection_types != {CollectionType.CALIBRATION}: - # We'll need a subquery for the tags table if any of the given - # collections are not a CALIBRATION collection. This intentionally - # also fires when the list of collections is empty as a way to - # create a dummy subquery that we know will fail. - # We give the table an alias because it might appear multiple times - # in the same query, for different dataset types. - tags_builder = self._finish_query_builder( - QueryJoiner(self._db, self._tags.alias(f"{self.datasetType.name}_tags")).to_builder(columns), - [record for record in collections if record.type is not CollectionType.CALIBRATION], - fields, - ) - if "timespan" in fields: - tags_builder.joiner.timespans[self.datasetType.name] = ( - self._db.getTimespanRepresentation().fromLiteral(None) - ) - calibs_builder: QueryBuilder | None = None - if CollectionType.CALIBRATION in collection_types: - # If at least one collection is a CALIBRATION collection, we'll - # need a subquery for the calibs table, and could include the - # timespan as a result or constraint. - assert ( - self._calibs is not None - ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection." - calibs_table = self._calibs.alias(f"{self.datasetType.name}_calibs") - calibs_builder = self._finish_query_builder( - QueryJoiner(self._db, calibs_table).to_builder(columns), - [record for record in collections if record.type is CollectionType.CALIBRATION], - fields, - ) - if "timespan" in fields: - calibs_builder.joiner.timespans[self.datasetType.name] = ( - self._db.getTimespanRepresentation().from_columns(calibs_table.columns) - ) - - # In calibration collections, we need timespan as well as data ID - # to ensure unique rows. - calibs_builder.distinct = calibs_builder.distinct and "timespan" not in fields - if tags_builder is not None: - if calibs_builder is not None: - # Need a UNION subquery. 
- return tags_builder.union_subquery([calibs_builder]) - else: - return tags_builder.to_joiner() - elif calibs_builder is not None: - return calibs_builder.to_joiner() - else: - raise AssertionError("Branch should be unreachable.") - - def _finish_query_builder( - self, - sql_projection: QueryBuilder, - collections: Sequence[CollectionRecord], - fields: Set[str], - ) -> QueryBuilder: - # This method plays the same role as _finish_single_relation in the new - # query system. It is called exactly one or two times by - # make_sql_builder, just as _finish_single_relation is called exactly - # one or two times by make_relation. See make_sql_builder comments for - # what's different. - assert sql_projection.joiner.from_clause is not None - run_collections_only = all(record.type is CollectionType.RUN for record in collections) - sql_projection.joiner.where( - sql_projection.joiner.from_clause.c.dataset_type_id == self._dataset_type_id - ) - dataset_id_col = sql_projection.joiner.from_clause.c.dataset_id - collection_col = sql_projection.joiner.from_clause.c[self._collections.getCollectionForeignKeyName()] - fields_provided = sql_projection.joiner.fields[self.datasetType.name] - # We always constrain and optionally retrieve the collection(s) via the - # tags/calibs table. - if "collection_key" in fields: - sql_projection.joiner.fields[self.datasetType.name]["collection_key"] = collection_col - if len(collections) == 1: - only_collection_record = collections[0] - sql_projection.joiner.where(collection_col == only_collection_record.key) - if "collection" in fields: - fields_provided["collection"] = sqlalchemy.literal(only_collection_record.name).cast( - # This cast is necessary to ensure that Postgres knows the - # type of this column if it is used in an aggregate - # function. - sqlalchemy.String - ) - - elif not collections: - sql_projection.joiner.where(sqlalchemy.literal(False)) - if "collection" in fields: - fields_provided["collection"] = sqlalchemy.literal("NO COLLECTIONS") - else: - sql_projection.joiner.where(collection_col.in_([collection.key for collection in collections])) - if "collection" in fields: - # Avoid a join to the collection table to get the name by using - # a CASE statement. The SQL will be a bit more verbose but - # more efficient. - fields_provided["collection"] = _create_case_expression_for_collections( - collections, collection_col - ) - # Add more column definitions, starting with the data ID. - sql_projection.joiner.extract_dimensions(self.datasetType.dimensions.required) - # We can always get the dataset_id from the tags/calibs table, even if - # could also get it from the 'static' dataset table. - if "dataset_id" in fields: - fields_provided["dataset_id"] = dataset_id_col - - # It's possible we now have everything we need, from just the - # tags/calibs table. The things we might need to get from the static - # dataset table are the run key and the ingest date. - need_static_table = False - need_collection_table = False - # Ingest date can only come from the static table. - if "ingest_date" in fields: - fields_provided["ingest_date"] = self._static.dataset.c.ingest_date - need_static_table = True - if "run" in fields: - if len(collections) == 1 and run_collections_only: - # If we are searching exactly one RUN collection, we - # know that if we find the dataset in that collection, - # then that's the datasets's run; we don't need to - # query for it. 
- # - fields_provided["run"] = sqlalchemy.literal(only_collection_record.name).cast( - # This cast is necessary to ensure that Postgres knows the - # type of this column if it is used in an aggregate - # function. - sqlalchemy.String - ) - elif run_collections_only: - # Once again we can avoid joining to the collection table by - # adding a CASE statement. - fields_provided["run"] = _create_case_expression_for_collections( - collections, self._static.dataset.c[self._runKeyColumn] - ) - need_static_table = True - else: - # Here we can't avoid a join to the collection table, because - # we might find a dataset via something other than its RUN - # collection. - # - # We have to defer adding the join until after we have joined - # in the static dataset table, because the ON clause involves - # the run collection from the static dataset table. Postgres - # cares about the join ordering (though SQLite does not.) - need_collection_table = True - need_static_table = True - if need_static_table: - # If we need the static table, join it in via dataset_id. We don't - # use QueryJoiner.join because we're joining on dataset ID, not - # dimensions. - sql_projection.joiner.from_clause = sql_projection.joiner.from_clause.join( - self._static.dataset, onclause=(dataset_id_col == self._static.dataset.c.id) - ) - # Also constrain dataset_type_id in static table in case that helps - # generate a better plan. We could also include this in the JOIN ON - # clause, but my guess is that that's a good idea IFF it's in the - # foreign key, and right now it isn't. - sql_projection.joiner.where(self._static.dataset.c.dataset_type_id == self._dataset_type_id) - if need_collection_table: - # Join the collection table to look up the RUN collection name - # associated with the dataset. - ( - fields_provided["run"], - sql_projection.joiner.from_clause, - ) = self._collections.lookup_name_sql( - self._static.dataset.c[self._runKeyColumn], - sql_projection.joiner.from_clause, - ) - - sql_projection.distinct = ( - # If there are multiple collections, this subquery might have - # non-unique rows. - len(collections) > 1 - and not fields - ) - return sql_projection - - def getDataId(self, id: DatasetId) -> DataCoordinate: - """Return DataId for a dataset. - - Parameters - ---------- - id : `DatasetId` - Unique dataset identifier. - - Returns - ------- - dataId : `DataCoordinate` - DataId for the dataset. - """ - # This query could return multiple rows (one for each tagged collection - # the dataset is in, plus one for its run collection), and we don't - # care which of those we get. - sql = ( - self._tags.select() - .where( - sqlalchemy.sql.and_( - self._tags.columns.dataset_id == id, - self._tags.columns.dataset_type_id == self._dataset_type_id, - ) - ) - .limit(1) - ) - with self._db.query(sql) as sql_result: - row = sql_result.mappings().fetchone() - assert row is not None, "Should be guaranteed by caller and foreign key constraints." - return DataCoordinate.from_required_values( - self.datasetType.dimensions, - tuple(row[dimension] for dimension in self.datasetType.dimensions.required), - ) - - def refresh_collection_summaries(self) -> None: - """Make sure that dataset type collection summaries for this dataset - type are consistent with the contents of the dataset tables. - """ - with self._db.transaction(): - # The main issue here is consistency in the presence of concurrent - # updates (using default READ COMMITTED isolation). 
Regular clients - # only add to summary tables, and we want to avoid deleting what - # other concurrent transactions may add while we are in this - # transaction. This ordering of operations should guarantee it: - # - read collections for this dataset type from summary tables, - # - read collections for this dataset type from dataset tables - # (both tags and calibs), - # - whatever is in the first set but not in the second can be - # dropped from summary tables. - summary_collection_ids = set(self._summaries.get_collection_ids(self._dataset_type_id)) - - # Query datasets tables for associated collections. - column_name = self._collections.getCollectionForeignKeyName() - query: sqlalchemy.sql.expression.SelectBase = ( - sqlalchemy.select(self._tags.columns[column_name]) - .where(self._tags.columns.dataset_type_id == self._dataset_type_id) - .distinct() - ) - if (calibs := self._calibs) is not None: - query2 = ( - sqlalchemy.select(calibs.columns[column_name]) - .where(calibs.columns.dataset_type_id == self._dataset_type_id) - .distinct() - ) - query = sqlalchemy.sql.expression.union(query, query2) - - with self._db.query(query) as result: - collection_ids = set(result.scalars()) - - collections_to_delete = summary_collection_ids - collection_ids - self._summaries.delete_collections(self._dataset_type_id, collections_to_delete) - - -class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage): - """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for - dataset IDs. - """ - - idMaker = DatasetIdFactory() - """Factory for dataset IDs. In the future this factory may be shared with - other classes (e.g. Registry).""" - - def insert( - self, - run: RunRecord, - dataIds: Iterable[DataCoordinate], - idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, - ) -> Iterator[DatasetRef]: - # Docstring inherited from DatasetRecordStorage. - - # Current timestamp, type depends on schema version. Use microsecond - # precision for astropy time to keep things consistent with - # TIMESTAMP(6) SQL type. - timestamp: datetime.datetime | astropy.time.Time - if self._use_astropy: - # Astropy `now()` precision should be the same as `now()` which - # should mean microsecond. - timestamp = astropy.time.Time.now() - else: - timestamp = datetime.datetime.now(datetime.UTC) - - # Iterate over data IDs, transforming a possibly-single-pass iterable - # into a list. - dataIdList: list[DataCoordinate] = [] - rows = [] - summary = CollectionSummary() - for dataId in summary.add_data_ids_generator(self.datasetType, dataIds): - dataIdList.append(dataId) - rows.append( - { - "id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode), - "dataset_type_id": self._dataset_type_id, - self._runKeyColumn: run.key, - "ingest_date": timestamp, - } - ) - if not rows: - # Just in case an empty collection is provided we want to avoid - # adding dataset type to summary tables. - return - - with self._db.transaction(): - # Insert into the static dataset table. - self._db.insert(self._static.dataset, *rows) - # Update the summary tables for this collection in case this is the - # first time this dataset type or these governor values will be - # inserted there. - self._summaries.update(run, [self._dataset_type_id], summary) - # Combine the generated dataset_id values and data ID fields to - # form rows to be inserted into the tags table. 
- protoTagsRow = { - "dataset_type_id": self._dataset_type_id, - self._collections.getCollectionForeignKeyName(): run.key, - } - tagsRows = [ - dict(protoTagsRow, dataset_id=row["id"], **dataId.required) - for dataId, row in zip(dataIdList, rows, strict=True) - ] - # Insert those rows into the tags table. - self._db.insert(self._tags, *tagsRows) - - for dataId, row in zip(dataIdList, rows, strict=True): - yield DatasetRef( - datasetType=self.datasetType, - dataId=dataId, - id=row["id"], - run=run.name, - ) - - def import_( - self, - run: RunRecord, - datasets: Iterable[DatasetRef], - ) -> Iterator[DatasetRef]: - # Docstring inherited from DatasetRecordStorage. - - # Current timestamp, type depends on schema version. - if self._use_astropy: - # Astropy `now()` precision should be the same as `now()` which - # should mean microsecond. - timestamp = sqlalchemy.sql.literal(astropy.time.Time.now(), type_=ddl.AstropyTimeNsecTai) - else: - timestamp = sqlalchemy.sql.literal(datetime.datetime.now(datetime.UTC)) - - # Iterate over data IDs, transforming a possibly-single-pass iterable - # into a list. - dataIds: dict[DatasetId, DataCoordinate] = {} - summary = CollectionSummary() - for dataset in summary.add_datasets_generator(datasets): - dataIds[dataset.id] = dataset.dataId - - if not dataIds: - # Just in case an empty collection is provided we want to avoid - # adding dataset type to summary tables. - return - - # We'll insert all new rows into a temporary table - tableSpec = makeTagTableSpec(self.datasetType, type(self._collections), ddl.GUID, constraints=False) - collFkName = self._collections.getCollectionForeignKeyName() - protoTagsRow = { - "dataset_type_id": self._dataset_type_id, - collFkName: run.key, - } - tmpRows = [ - dict(protoTagsRow, dataset_id=dataset_id, **dataId.required) - for dataset_id, dataId in dataIds.items() - ] - with self._db.transaction(for_temp_tables=True), self._db.temporary_table(tableSpec) as tmp_tags: - # store all incoming data in a temporary table - self._db.insert(tmp_tags, *tmpRows) - - # There are some checks that we want to make for consistency - # of the new datasets with existing ones. - self._validateImport(tmp_tags, run) - - # Before we merge temporary table into dataset/tags we need to - # drop datasets which are already there (and do not conflict). - self._db.deleteWhere( - tmp_tags, - tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)), - ) - - # Copy it into dataset table, need to re-label some columns. - self._db.insert( - self._static.dataset, - select=sqlalchemy.sql.select( - tmp_tags.columns.dataset_id.label("id"), - tmp_tags.columns.dataset_type_id, - tmp_tags.columns[collFkName].label(self._runKeyColumn), - timestamp.label("ingest_date"), - ), - ) - - # Update the summary tables for this collection in case this - # is the first time this dataset type or these governor values - # will be inserted there. - self._summaries.update(run, [self._dataset_type_id], summary) - - # Copy it into tags table. - self._db.insert(self._tags, select=tmp_tags.select()) - - # Return refs in the same order as in the input list. - for dataset_id, dataId in dataIds.items(): - yield DatasetRef( - datasetType=self.datasetType, - id=dataset_id, - dataId=dataId, - run=run.name, - ) - - def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None: - """Validate imported refs against existing datasets. 
- - Parameters - ---------- - tmp_tags : `sqlalchemy.schema.Table` - Temporary table with new datasets and the same schema as tags - table. - run : `RunRecord` - The record object describing the `~CollectionType.RUN` collection. - - Raises - ------ - ConflictingDefinitionError - Raise if new datasets conflict with existing ones. - """ - dataset = self._static.dataset - tags = self._tags - collFkName = self._collections.getCollectionForeignKeyName() - - # Check that existing datasets have the same dataset type and - # run. - query = ( - sqlalchemy.sql.select( - dataset.columns.id.label("dataset_id"), - dataset.columns.dataset_type_id.label("dataset_type_id"), - tmp_tags.columns.dataset_type_id.label("new_dataset_type_id"), - dataset.columns[self._runKeyColumn].label("run"), - tmp_tags.columns[collFkName].label("new_run"), - ) - .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id)) - .where( - sqlalchemy.sql.or_( - dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, - dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName], - ) - ) - .limit(1) - ) - with self._db.query(query) as result: - # Only include the first one in the exception message - if (row := result.first()) is not None: - existing_run = self._collections[row.run].name - new_run = self._collections[row.new_run].name - if row.dataset_type_id == self._dataset_type_id: - if row.new_dataset_type_id == self._dataset_type_id: - raise ConflictingDefinitionError( - f"Current run {existing_run!r} and new run {new_run!r} do not agree for " - f"dataset {row.dataset_id}." - ) - else: - raise ConflictingDefinitionError( - f"Dataset {row.dataset_id} was provided with type {self.datasetType.name!r} " - f"in run {new_run!r}, but was already defined with type ID {row.dataset_type_id} " - f"in run {run!r}." - ) - else: - raise ConflictingDefinitionError( - f"Dataset {row.dataset_id} was provided with type ID {row.new_dataset_type_id} " - f"in run {new_run!r}, but was already defined with type {self.datasetType.name!r} " - f"in run {run!r}." - ) - - # Check that matching dataset in tags table has the same DataId. - query = ( - sqlalchemy.sql.select( - tags.columns.dataset_id, - tags.columns.dataset_type_id.label("type_id"), - tmp_tags.columns.dataset_type_id.label("new_type_id"), - *[tags.columns[dim] for dim in self.datasetType.dimensions.required], - *[tmp_tags.columns[dim].label(f"new_{dim}") for dim in self.datasetType.dimensions.required], - ) - .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id)) - .where( - sqlalchemy.sql.or_( - tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, - *[ - tags.columns[dim] != tmp_tags.columns[dim] - for dim in self.datasetType.dimensions.required - ], - ) - ) - .limit(1) - ) - - with self._db.query(query) as result: - if (row := result.first()) is not None: - # Only include the first one in the exception message - raise ConflictingDefinitionError( - f"Existing dataset type or dataId do not match new dataset: {row._asdict()}" - ) - - # Check that matching run+dataId have the same dataset ID. 
- query = ( - sqlalchemy.sql.select( - *[tags.columns[dim] for dim in self.datasetType.dimensions.required], - tags.columns.dataset_id, - tmp_tags.columns.dataset_id.label("new_dataset_id"), - tags.columns[collFkName], - tmp_tags.columns[collFkName].label(f"new_{collFkName}"), - ) - .select_from( - tags.join( - tmp_tags, - sqlalchemy.sql.and_( - tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id, - tags.columns[collFkName] == tmp_tags.columns[collFkName], - *[ - tags.columns[dim] == tmp_tags.columns[dim] - for dim in self.datasetType.dimensions.required - ], - ), - ) - ) - .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id) - .limit(1) - ) - with self._db.query(query) as result: - # only include the first one in the exception message - if (row := result.first()) is not None: - data_id = {dim: getattr(row, dim) for dim in self.datasetType.dimensions.required} - existing_collection = self._collections[getattr(row, collFkName)].name - new_collection = self._collections[getattr(row, f"new_{collFkName}")].name - raise ConflictingDefinitionError( - f"Dataset with type {self.datasetType.name!r} and data ID {data_id} " - f"has ID {row.dataset_id} in existing collection {existing_collection!r} " - f"but ID {row.new_dataset_id} in new collection {new_collection!r}." - ) - - -def _create_case_expression_for_collections( - collections: Iterable[CollectionRecord], id_column: sqlalchemy.ColumnElement -) -> sqlalchemy.Case | sqlalchemy.Null: - """Return a SQLAlchemy Case expression that converts collection IDs to - collection names for the given set of collections. - - Parameters - ---------- - collections : `~collections.abc.Iterable` [ `CollectionRecord` ] - List of collections to include in conversion table. This should be an - exhaustive list of collections that could appear in `id_column`. - id_column : `sqlalchemy.ColumnElement` - The column containing the collection ID that we want to convert to a - collection name. - """ - mapping = {record.key: record.name for record in collections} - if not mapping: - # SQLAlchemy does not correctly handle an empty mapping in case() -- it - # crashes when trying to compile the expression with an - # "AttributeError('NoneType' object has no attribute 'dialect_impl')" - # when trying to access the 'type' property of the Case object. If you - # explicitly specify a type via type_coerce it instead generates - # invalid SQL syntax. - # - # We can end up with empty mappings here in certain "doomed query" edge - # cases, e.g. we start with a list of valid collections but they are - # all filtered out by higher-level code on the basis of collection - # summaries. - return sqlalchemy.null() - - return sqlalchemy.case(mapping, value=id_column) diff --git a/python/lsst/daf/butler/registry/datasets/byDimensions/tables.py b/python/lsst/daf/butler/registry/datasets/byDimensions/tables.py index 5729bd2b80..029997ba64 100644 --- a/python/lsst/daf/butler/registry/datasets/byDimensions/tables.py +++ b/python/lsst/daf/butler/registry/datasets/byDimensions/tables.py @@ -27,8 +27,6 @@ from __future__ import annotations -from .... import ddl - __all__ = ( "addDatasetForeignKey", "makeCalibTableName", @@ -44,14 +42,18 @@ import sqlalchemy -from ...._dataset_type import DatasetType -from ....dimensions import DimensionUniverse, GovernorDimension, addDimensionForeignKey +from .... 
import ddl +from ....dimensions import DimensionGroup, DimensionUniverse, GovernorDimension, addDimensionForeignKey from ....timespan_database_representation import TimespanDatabaseRepresentation -from ...interfaces import CollectionManager, VersionTuple +from ...interfaces import CollectionManager, Database, VersionTuple DATASET_TYPE_NAME_LENGTH = 128 +class MissingDatabaseTableError(RuntimeError): + """Exception raised when a table is not found in a database.""" + + StaticDatasetTablesTuple = namedtuple( "StaticDatasetTablesTuple", [ @@ -63,7 +65,6 @@ def addDatasetForeignKey( tableSpec: ddl.TableSpec, - dtype: type, *, name: str = "dataset", onDelete: str | None = None, @@ -81,9 +82,6 @@ def addDatasetForeignKey( tableSpec : `ddl.TableSpec` Specification for the table that should reference the dataset table. Will be modified in place. - dtype : `type` - Type of the column, same as the column type of the PK column of - a referenced table (``dataset.id``). name : `str`, optional A name to use for the prefix of the new field; the full name is ``{name}_id``. @@ -104,7 +102,7 @@ def addDatasetForeignKey( idSpec : `ddl.FieldSpec` Specification for the ID field. """ - idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=dtype, **kwargs) + idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs) tableSpec.fields.add(idFieldSpec) if constraint: tableSpec.foreignKeys.append( @@ -116,8 +114,6 @@ def addDatasetForeignKey( def makeStaticTableSpecs( collections: type[CollectionManager], universe: DimensionUniverse, - dtype: type, - autoincrement: bool, schema_version: VersionTuple, ) -> StaticDatasetTablesTuple: """Construct all static tables used by the classes in this package. @@ -131,10 +127,6 @@ def makeStaticTableSpecs( Manager object for the collections in this `Registry`. universe : `DimensionUniverse` Universe graph containing all dimensions known to this `Registry`. - dtype : `type` - Type of the dataset ID (primary key) column. - autoincrement : `bool` - If `True` then dataset ID column will be auto-incrementing. schema_version : `VersionTuple` The version of this schema. @@ -222,8 +214,7 @@ def makeStaticTableSpecs( fields=[ ddl.FieldSpec( name="id", - dtype=dtype, - autoincrement=autoincrement, + dtype=ddl.GUID, primaryKey=True, doc="A unique field used as the primary key for dataset.", ), @@ -252,15 +243,12 @@ def makeStaticTableSpecs( return specs -def makeTagTableName(datasetType: DatasetType, dimensionsKey: int) -> str: +def makeTagTableName(dimensionsKey: int) -> str: """Construct the name for a dynamic (DatasetType-dependent) tag table used by the classes in this package. Parameters ---------- - datasetType : `DatasetType` - Dataset type to construct a name for. Multiple dataset types may - share the same table. dimensionsKey : `int` Integer key used to save ``datasetType.dimensions`` to the database. @@ -272,15 +260,12 @@ def makeTagTableName(datasetType: DatasetType, dimensionsKey: int) -> str: return f"dataset_tags_{dimensionsKey:08d}" -def makeCalibTableName(datasetType: DatasetType, dimensionsKey: int) -> str: +def makeCalibTableName(dimensionsKey: int) -> str: """Construct the name for a dynamic (DatasetType-dependent) tag + validity range table used by the classes in this package. Parameters ---------- - datasetType : `DatasetType` - Dataset type to construct a name for. Multiple dataset types may - share the same table. dimensionsKey : `int` Integer key used to save ``datasetType.dimensions`` to the database. 
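As an aside on the two helpers above: with the `DatasetType` argument dropped, the dynamic table names depend only on the integer key under which the dimension group was saved. A minimal sketch of that naming convention, assuming nothing beyond the f-strings shown in `makeTagTableName` and `makeCalibTableName` (the standalone helper name `_sketch_dynamic_table_names` is made up purely for illustration and does not exist in the patch):

def _sketch_dynamic_table_names(dimensions_key: int, is_calibration: bool) -> tuple[str, str | None]:
    # Every dataset type whose dimensions were saved under the same key shares one tags table.
    tags_name = f"dataset_tags_{dimensions_key:08d}"
    # A calibs table is only needed if at least one of those dataset types is a calibration.
    calibs_name = f"dataset_calibs_{dimensions_key:08d}" if is_calibration else None
    return tags_name, calibs_name

For example, a dimension group saved under key 42 would map to "dataset_tags_00000042" and, when a calibration dataset type uses it, "dataset_calibs_00000042"; this mirrors what `DynamicTables.from_dimensions_key` below constructs.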
@@ -289,27 +274,22 @@ def makeCalibTableName(datasetType: DatasetType, dimensionsKey: int) -> str: name : `str` Name for the table. """ - assert datasetType.isCalibration() return f"dataset_calibs_{dimensionsKey:08d}" def makeTagTableSpec( - datasetType: DatasetType, collections: type[CollectionManager], dtype: type, *, constraints: bool = True + dimensions: DimensionGroup, collections: type[CollectionManager], *, constraints: bool = True ) -> ddl.TableSpec: """Construct the specification for a dynamic (DatasetType-dependent) tag table used by the classes in this package. Parameters ---------- - datasetType : `DatasetType` - Dataset type to construct a spec for. Multiple dataset types may - share the same table. + dimensions : `DimensionGroup` + Dimensions of the dataset type. collections : `type` [ `CollectionManager` ] `CollectionManager` subclass that can be used to construct foreign keys to the run and/or collection tables. - dtype : `type` - Type of the FK column, same as the column type of the PK column of - a referenced table (``dataset.id``). constraints : `bool`, optional If `False` (`True` is default), do not define foreign key constraints. @@ -337,7 +317,7 @@ def makeTagTableSpec( # sufficient and saves us from worrying about nulls in the constraint. constraint = ["dataset_type_id"] # Add foreign key fields to dataset table (part of the primary key) - addDatasetForeignKey(tableSpec, dtype, primaryKey=True, onDelete="CASCADE", constraint=constraints) + addDatasetForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE", constraint=constraints) # Add foreign key fields to collection table (part of the primary key and # the data ID unique constraint). collectionFieldSpec = collections.addCollectionForeignKey( @@ -353,8 +333,8 @@ def makeTagTableSpec( target=(collectionFieldSpec.name, "dataset_type_id"), ) ) - for dimension_name in datasetType.dimensions.required: - dimension = datasetType.dimensions.universe.dimensions[dimension_name] + for dimension_name in dimensions.required: + dimension = dimensions.universe.dimensions[dimension_name] fieldSpec = addDimensionForeignKey( tableSpec, dimension=dimension, nullable=False, primaryKey=False, constraint=constraints ) @@ -375,27 +355,22 @@ def makeTagTableSpec( def makeCalibTableSpec( - datasetType: DatasetType, + dimensions: DimensionGroup, collections: type[CollectionManager], TimespanReprClass: type[TimespanDatabaseRepresentation], - dtype: type, ) -> ddl.TableSpec: """Construct the specification for a dynamic (DatasetType-dependent) tag + validity range table used by the classes in this package. Parameters ---------- - datasetType : `DatasetType` - Dataset type to construct a spec for. Multiple dataset types may - share the same table. + dimensions : `DimensionGroup` + Dimensions of the dataset type. collections : `type` [ `CollectionManager` ] `CollectionManager` subclass that can be used to construct foreign keys to the run and/or collection tables. TimespanReprClass : `type` of `TimespanDatabaseRepresentation` The Python type to use to represent a timespan. - dtype : `type` - Type of the FK column, same as the column type of the PK column of - a referenced table (``dataset.id``). Returns ------- @@ -425,7 +400,7 @@ def makeCalibTableSpec( index: list[str | type[TimespanDatabaseRepresentation]] = ["dataset_type_id"] # Add foreign key fields to dataset table (not part of the temporal # lookup/constraint). 
- addDatasetForeignKey(tableSpec, dtype, nullable=False, onDelete="CASCADE") + addDatasetForeignKey(tableSpec, nullable=False, onDelete="CASCADE") # Add foreign key fields to collection table (part of the temporal lookup # index/constraint). collectionFieldSpec = collections.addCollectionForeignKey(tableSpec, nullable=False, onDelete="CASCADE") @@ -439,8 +414,8 @@ def makeCalibTableSpec( ) ) # Add dimension fields (part of the temporal lookup index.constraint). - for dimension_name in datasetType.dimensions.required: - dimension = datasetType.dimensions.universe.dimensions[dimension_name] + for dimension_name in dimensions.required: + dimension = dimensions.universe.dimensions[dimension_name] fieldSpec = addDimensionForeignKey(tableSpec, dimension=dimension, nullable=False, primaryKey=False) index.append(fieldSpec.name) # If this is a governor dimension, add a foreign key constraint to the @@ -472,3 +447,166 @@ def makeCalibTableSpec( index.extend(fieldSpec.name for fieldSpec in tsFieldSpecs) tableSpec.indexes.add(ddl.IndexSpec(*index)) # type: ignore return tableSpec + + +class DynamicTables: + """A struct that holds the "dynamic" tables common to dataset types that + share the same dimensions. + + Parameters + ---------- + dimensions : `DimensionGroup` + Dimensions of the dataset types that use these tables. + dimensions_key : `int` + Integer key used to persist this dimension group in the database and + name the associated tables. + tags_name : `str` + Name of the "tags" table that associates datasets with data IDs in + RUN and TAGGED collections. + calibs_name : `str` or `None` + Name of the "calibs" table that associates datasets with data IDs and + timespans in CALIBRATION collections. This is `None` if none of the + dataset types (or at least none of those seen by this client) are + calibrations. + """ + + def __init__( + self, dimensions: DimensionGroup, dimensions_key: int, tags_name: str, calibs_name: str | None + ): + self._dimensions = dimensions + self.dimensions_key = dimensions_key + self.tags_name = tags_name + self.calibs_name = calibs_name + self._tags_table: sqlalchemy.Table | None = None + self._calibs_table: sqlalchemy.Table | None = None + + @classmethod + def from_dimensions_key( + cls, dimensions: DimensionGroup, dimensions_key: int, is_calibration: bool + ) -> DynamicTables: + """Construct with table names generated from the dimension key. + + Parameters + ---------- + dimensions : `DimensionGroup` + Dimensions of the dataset types that use these tables. + dimensions_key : `int` + Integer key used to persist this dimension group in the database + and name the associated tables. + is_calibration : `bool` + Whether any of the dataset types that use these tables are + calibrations. + + Returns + ------- + dynamic_tables : `DynamicTables` + Struct that holds tables for a group of dataset types. + """ + return cls( + dimensions, + dimensions_key=dimensions_key, + tags_name=makeTagTableName(dimensions_key), + calibs_name=makeCalibTableName(dimensions_key) if is_calibration else None, + ) + + def create(self, db: Database, collections: type[CollectionManager]) -> None: + """Create the tables if they don't already exist. + + Parameters + ---------- + db : `Database` + Database interface. + collections : `type` [ `CollectionManager` ] + Manager class for collections; used to create foreign key columns + for collections. 
+ """ + if self._tags_table is None: + self._tags_table = db.ensureTableExists( + self.tags_name, + makeTagTableSpec(self._dimensions, collections), + ) + if self.calibs_name is not None and self._calibs_table is None: + self._calibs_table = db.ensureTableExists( + self.calibs_name, + makeCalibTableSpec(self._dimensions, collections, db.getTimespanRepresentation()), + ) + + def add_calibs(self, db: Database, collections: type[CollectionManager]) -> None: + """Create a calibs table for a dataset type whose dimensions already + have a tags table. + + Parameters + ---------- + db : `Database` + Database interface. + collections : `type` [ `CollectionManager` ] + Manager class for collections; used to create foreign key columns + for collections. + """ + self.calibs_name = makeCalibTableName(self.dimensions_key) + self._calibs_table = db.ensureTableExists( + self.calibs_name, + makeCalibTableSpec(self._dimensions, collections, db.getTimespanRepresentation()), + ) + + def tags(self, db: Database, collections: type[CollectionManager]) -> sqlalchemy.Table: + """Return the "tags" table that associates datasets with data IDs in + TAGGED and RUN collections. + + This method caches its result the first time it is called (and assumes + the arguments it is given never change). + + Parameters + ---------- + db : `Database` + Database interface. + collections : `type` [ `CollectionManager` ] + Manager class for collections; used to create foreign key columns + for collections. + + Returns + ------- + table : `sqlalchemy.Table` + SQLAlchemy table object. + """ + if self._tags_table is None: + spec = makeTagTableSpec(self._dimensions, collections) + table = db.getExistingTable(self.tags_name, spec) + if table is None: + raise MissingDatabaseTableError(f"Table {self.tags_name!r} is missing from database schema.") + self._tags_table = table + return self._tags_table + + def calibs(self, db: Database, collections: type[CollectionManager]) -> sqlalchemy.Table: + """Return the "calibs" table that associates datasets with data IDs and + timespans in CALIBRATION collections. + + This method caches its result the first time it is called (and assumes + the arguments it is given never change). It may only be called if the + dataset type is calibration. + + Parameters + ---------- + db : `Database` + Database interface. + collections : `type` [ `CollectionManager` ] + Manager class for collections; used to create foreign key columns + for collections. + + Returns + ------- + table : `sqlalchemy.Table` + SQLAlchemy table object. + """ + assert ( + self.calibs_name is not None + ), "Dataset type should be checked to be calibration by calling code." + if self._calibs_table is None: + spec = makeCalibTableSpec(self._dimensions, collections, db.getTimespanRepresentation()) + table = db.getExistingTable(self.calibs_name, spec) + if table is None: + raise MissingDatabaseTableError( + f"Table {self.calibs_name!r} is missing from database schema." 
+ ) + self._calibs_table = table + return self._calibs_table diff --git a/python/lsst/daf/butler/registry/dimensions/static.py b/python/lsst/daf/butler/registry/dimensions/static.py index 8e5bd0afc9..ed4f4ca0df 100644 --- a/python/lsst/daf/butler/registry/dimensions/static.py +++ b/python/lsst/daf/butler/registry/dimensions/static.py @@ -324,9 +324,9 @@ def fetch_one( mapping = row._mapping return element.RecordClass(**mapping) - def save_dimension_group(self, graph: DimensionGroup) -> int: + def save_dimension_group(self, group: DimensionGroup) -> int: # Docstring inherited from DimensionRecordStorageManager. - return self._dimension_group_storage.save(graph) + return self._dimension_group_storage.save(group) def load_dimension_group(self, key: int) -> DimensionGroup: # Docstring inherited from DimensionRecordStorageManager. diff --git a/python/lsst/daf/butler/registry/interfaces/_bridge.py b/python/lsst/daf/butler/registry/interfaces/_bridge.py index 784ba555db..406074c26d 100644 --- a/python/lsst/daf/butler/registry/interfaces/_bridge.py +++ b/python/lsst/daf/butler/registry/interfaces/_bridge.py @@ -259,8 +259,6 @@ class DatastoreRegistryBridgeManager(VersionedExtension): Manager object for opaque table storage in the `Registry`. universe : `DimensionUniverse` All dimensions know to the `Registry`. - datasetIdColumnType : `type` - Type for dataset ID column. registry_schema_version : `VersionTuple` or `None`, optional Version of registry schema. @@ -285,13 +283,11 @@ def __init__( *, opaque: OpaqueTableStorageManager, universe: DimensionUniverse, - datasetIdColumnType: type, registry_schema_version: VersionTuple | None = None, ): super().__init__(registry_schema_version=registry_schema_version) self.opaque = opaque self.universe = universe - self.datasetIdColumnType = datasetIdColumnType @abstractmethod def clone( diff --git a/python/lsst/daf/butler/registry/interfaces/_datasets.py b/python/lsst/daf/butler/registry/interfaces/_datasets.py index be8a22f910..2ee7d7b265 100644 --- a/python/lsst/daf/butler/registry/interfaces/_datasets.py +++ b/python/lsst/daf/butler/registry/interfaces/_datasets.py @@ -29,17 +29,17 @@ from ... import ddl -__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage") +__all__ = ("DatasetRecordStorageManager",) -from abc import ABC, abstractmethod -from collections.abc import Iterable, Iterator, Mapping, Sequence, Set +from abc import abstractmethod +from collections.abc import Iterable, Mapping, Sequence, Set from typing import TYPE_CHECKING, Any from lsst.daf.relation import Relation from ..._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef from ..._dataset_type import DatasetType -from ..._exceptions import MissingDatasetTypeError +from ..._exceptions import DatasetTypeError, DatasetTypeNotSupportedError from ..._timespan import Timespan from ...dimensions import DataCoordinate from ._versioning import VersionedExtension, VersionTuple @@ -54,307 +54,6 @@ from ._dimensions import DimensionRecordStorageManager -class DatasetRecordStorage(ABC): - """An interface that manages the records associated with a particular - `DatasetType`. - - Parameters - ---------- - datasetType : `DatasetType` - Dataset type whose records this object manages. 
- """ - - def __init__(self, datasetType: DatasetType): - self.datasetType = datasetType - - @abstractmethod - def insert( - self, - run: RunRecord, - dataIds: Iterable[DataCoordinate], - idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, - ) -> Iterator[DatasetRef]: - """Insert one or more dataset entries into the database. - - Parameters - ---------- - run : `RunRecord` - The record object describing the `~CollectionType.RUN` collection - this dataset will be associated with. - dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ] - Expanded data IDs (`DataCoordinate` instances) for the - datasets to be added. The dimensions of all data IDs must be the - same as ``self.datasetType.dimensions``. - idGenerationMode : `DatasetIdGenEnum` - With `UNIQUE` each new dataset is inserted with its new unique ID. - With non-`UNIQUE` mode ID is computed from some combination of - dataset type, dataId, and run collection name; if the same ID is - already in the database then new record is not inserted. - - Returns - ------- - datasets : `~collections.abc.Iterable` [ `DatasetRef` ] - References to the inserted datasets. - """ - raise NotImplementedError() - - @abstractmethod - def import_( - self, - run: RunRecord, - datasets: Iterable[DatasetRef], - ) -> Iterator[DatasetRef]: - """Insert one or more dataset entries into the database. - - Parameters - ---------- - run : `RunRecord` - The record object describing the `~CollectionType.RUN` collection - this dataset will be associated with. - datasets : `~collections.abc.Iterable` of `DatasetRef` - Datasets to be inserted. Datasets can specify ``id`` attribute - which will be used for inserted datasets. All dataset IDs must - have the same type (`int` or `uuid.UUID`), if type of dataset IDs - does not match type supported by this class then IDs will be - ignored and new IDs will be generated by backend. - - Returns - ------- - datasets : `~collections.abc.Iterable` [ `DatasetRef` ] - References to the inserted or existing datasets. - - Notes - ----- - The ``datasetType`` and ``run`` attributes of datasets are supposed to - be identical across all datasets but this is not checked and it should - be enforced by higher level registry code. This method does not need - to use those attributes from datasets, only ``dataId`` and ``id`` are - relevant. - """ - raise NotImplementedError() - - @abstractmethod - def delete(self, datasets: Iterable[DatasetRef]) -> None: - """Fully delete the given datasets from the registry. - - Parameters - ---------- - datasets : `~collections.abc.Iterable` [ `DatasetRef` ] - Datasets to be deleted. All datasets must be resolved and have - the same `DatasetType` as ``self``. - - Raises - ------ - AmbiguousDatasetError - Raised if any of the given `DatasetRef` instances is unresolved. - """ - raise NotImplementedError() - - @abstractmethod - def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: - """Associate one or more datasets with a collection. - - Parameters - ---------- - collection : `CollectionRecord` - The record object describing the collection. ``collection.type`` - must be `~CollectionType.TAGGED`. - datasets : `~collections.abc.Iterable` [ `DatasetRef` ] - Datasets to be associated. All datasets must be resolved and have - the same `DatasetType` as ``self``. - - Raises - ------ - AmbiguousDatasetError - Raised if any of the given `DatasetRef` instances is unresolved. 
- - Notes - ----- - Associating a dataset with into collection that already contains a - different dataset with the same `DatasetType` and data ID will remove - the existing dataset from that collection. - - Associating the same dataset into a collection multiple times is a - no-op, but is still not permitted on read-only databases. - """ - raise NotImplementedError() - - @abstractmethod - def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: - """Remove one or more datasets from a collection. - - Parameters - ---------- - collection : `CollectionRecord` - The record object describing the collection. ``collection.type`` - must be `~CollectionType.TAGGED`. - datasets : `~collections.abc.Iterable` [ `DatasetRef` ] - Datasets to be disassociated. All datasets must be resolved and - have the same `DatasetType` as ``self``. - - Raises - ------ - AmbiguousDatasetError - Raised if any of the given `DatasetRef` instances is unresolved. - """ - raise NotImplementedError() - - @abstractmethod - def certify( - self, - collection: CollectionRecord, - datasets: Iterable[DatasetRef], - timespan: Timespan, - context: SqlQueryContext, - ) -> None: - """Associate one or more datasets with a calibration collection and a - validity range within it. - - Parameters - ---------- - collection : `CollectionRecord` - The record object describing the collection. ``collection.type`` - must be `~CollectionType.CALIBRATION`. - datasets : `~collections.abc.Iterable` [ `DatasetRef` ] - Datasets to be associated. All datasets must be resolved and have - the same `DatasetType` as ``self``. - timespan : `Timespan` - The validity range for these datasets within the collection. - context : `SqlQueryContext` - The object that manages database connections, temporary tables and - relation engines for this query. - - Raises - ------ - AmbiguousDatasetError - Raised if any of the given `DatasetRef` instances is unresolved. - ConflictingDefinitionError - Raised if the collection already contains a different dataset with - the same `DatasetType` and data ID and an overlapping validity - range. - CollectionTypeError - Raised if - ``collection.type is not CollectionType.CALIBRATION`` or if - ``self.datasetType.isCalibration() is False``. - """ - raise NotImplementedError() - - @abstractmethod - def decertify( - self, - collection: CollectionRecord, - timespan: Timespan, - *, - dataIds: Iterable[DataCoordinate] | None = None, - context: SqlQueryContext, - ) -> None: - """Remove or adjust datasets to clear a validity range within a - calibration collection. - - Parameters - ---------- - collection : `CollectionRecord` - The record object describing the collection. ``collection.type`` - must be `~CollectionType.CALIBRATION`. - timespan : `Timespan` - The validity range to remove datasets from within the collection. - Datasets that overlap this range but are not contained by it will - have their validity ranges adjusted to not overlap it, which may - split a single dataset validity range into two. - dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ], optional - Data IDs that should be decertified within the given validity range - If `None`, all data IDs for ``self.datasetType`` will be - decertified. - context : `SqlQueryContext` - The object that manages database connections, temporary tables and - relation engines for this query. - - Raises - ------ - CollectionTypeError - Raised if ``collection.type is not CollectionType.CALIBRATION``. 
- """ - raise NotImplementedError() - - @abstractmethod - def make_relation( - self, - *collections: CollectionRecord, - columns: Set[str], - context: SqlQueryContext, - ) -> Relation: - """Return a `sql.Relation` that represents a query for for this - `DatasetType` in one or more collections. - - Parameters - ---------- - *collections : `CollectionRecord` - The record object(s) describing the collection(s) to query. May - not be of type `CollectionType.CHAINED`. If multiple collections - are passed, the query will search all of them in an unspecified - order, and all collections must have the same type. Must include - at least one collection. - columns : `~collections.abc.Set` [ `str` ] - Columns to include in the relation. See `Query.find_datasets` for - most options, but this method supports one more: - - - ``rank``: a calculated integer column holding the index of the - collection the dataset was found in, within the ``collections`` - sequence given. - context : `SqlQueryContext` - The object that manages database connections, temporary tables and - relation engines for this query. - - Returns - ------- - relation : `~lsst.daf.relation.Relation` - Representation of the query. - """ - raise NotImplementedError() - - @abstractmethod - def make_query_joiner(self, collections: Sequence[CollectionRecord], fields: Set[str]) -> QueryJoiner: - """Make a `..direct_query_driver.QueryJoiner` that represents a search - for datasets of this type. - - Parameters - ---------- - collections : `~collections.abc.Sequence` [ `CollectionRecord` ] - Collections to search, in order, after filtering out collections - with no datasets of this type via collection summaries. - fields : `~collections.abc.Set` [ `str` ] - Names of fields to make available in the joiner. Options include: - - - ``dataset_id`` (UUID) - - ``run`` (collection name, `str`) - - ``collection`` (collection name, `str`) - - ``collection_key`` (collection primary key, manager-dependent) - - ``timespan`` (validity range, or unbounded for non-calibrations) - - ``ingest_date`` (time dataset was ingested into repository) - - Dimension keys for the dataset type's required dimensions are - always included. - - Returns - ------- - joiner : `..direct_query_driver.QueryJoiner` - A query-construction object representing a table or subquery. If - ``fields`` is empty or ``len(collections) <= 1``, this is - guaranteed to have rows that are unique over dimension keys. - """ - raise NotImplementedError() - - @abstractmethod - def refresh_collection_summaries(self) -> None: - """Make sure that collection summaries for this dataset type are - consistent with the contents of the dataset tables. - """ - - datasetType: DatasetType - """Dataset type whose records this object manages (`DatasetType`). - """ - - class DatasetRecordStorageManager(VersionedExtension): """An interface that manages the tables that describe datasets. @@ -441,39 +140,6 @@ def initialize( """ raise NotImplementedError() - @classmethod - @abstractmethod - def getIdColumnType(cls) -> type: - """Return type used for columns storing dataset IDs. - - This type is used for columns storing `DatasetRef.id` values, usually - a `type` subclass provided by SQLAlchemy. - - Returns - ------- - dtype : `type` - Type used for dataset identification in database. - """ - raise NotImplementedError() - - @classmethod - @abstractmethod - def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool: - """Test whether the given dataset ID generation mode is supported by - `insert`. 
- - Parameters - ---------- - mode : `DatasetIdGenEnum` - Enum value for the mode to test. - - Returns - ------- - supported : `bool` - Whether the given mode is supported. - """ - raise NotImplementedError() - @classmethod @abstractmethod def addDatasetForeignKey( @@ -523,64 +189,78 @@ def refresh(self) -> None: """ raise NotImplementedError() - def __getitem__(self, name: str) -> DatasetRecordStorage: - """Return the object that provides access to the records associated - with the given `DatasetType` name. + @abstractmethod + def get_dataset_type(self, name: str) -> DatasetType: + """Look up a dataset type by name. - This is simply a convenience wrapper for `find` that raises `KeyError` - when the dataset type is not found. + Parameters + ---------- + name : `str` + Name of a parent dataset type. Returns ------- - records : `DatasetRecordStorage` + dataset_type : `DatasetType` The object representing the records for the given dataset type. Raises ------ - KeyError + MissingDatasetTypeError Raised if there is no dataset type with the given name. - - Notes - ----- - Dataset types registered by another client of the same repository since - the last call to `initialize` or `refresh` may not be found. """ - result = self.find(name) - if result is None: - raise MissingDatasetTypeError(f"Dataset type with name '{name}' not found.") - return result + raise NotImplementedError() - @abstractmethod - def find(self, name: str) -> DatasetRecordStorage | None: - """Return an object that provides access to the records associated with - the given `DatasetType` name, if one exists. + def conform_exact_dataset_type(self, dataset_type: DatasetType | str) -> DatasetType: + """Conform a value that may be a dataset type or dataset type name to + just the dataset type name, while checking that the dataset type is not + a component and (if a `DatasetType` instance is given) has the exact + same definition in the registry. Parameters ---------- - name : `str` - Name of the dataset type. + dataset_type : `str` or `DatasetType` + Dataset type object or name. Returns ------- - records : `DatasetRecordStorage` or `None` - The object representing the records for the given dataset type, or - `None` if there are no records for that dataset type. + dataset_type : `DatasetType` + The corresponding registered dataset type. - Notes - ----- - Dataset types registered by another client of the same repository since - the last call to `initialize` or `refresh` may not be found. + Raises + ------ + DatasetTypeError + Raised if ``dataset_type`` is a component, or if its definition + does not exactly match the registered dataset type. + MissingDatasetTypeError + Raised if this dataset type is not registered at all. """ - raise NotImplementedError() + if isinstance(dataset_type, DatasetType): + dataset_type_name = dataset_type.name + given_dataset_type = dataset_type + else: + dataset_type_name = dataset_type + given_dataset_type = None + parent_name, component = DatasetType.splitDatasetTypeName(dataset_type_name) + if component is not None: + raise DatasetTypeNotSupportedError( + f"Component dataset {dataset_type_name!r} is not supported in this context." + ) + registered_dataset_type = self.get_dataset_type(dataset_type_name) + if given_dataset_type is not None and registered_dataset_type != given_dataset_type: + raise DatasetTypeError( + f"Given dataset type {given_dataset_type} is not identical to the " + f"registered one {registered_dataset_type}." 
+ ) + return registered_dataset_type @abstractmethod - def register(self, datasetType: DatasetType) -> bool: + def register_dataset_type(self, dataset_type: DatasetType) -> bool: """Ensure that this `Registry` can hold records for the given `DatasetType`, creating new tables as necessary. Parameters ---------- - datasetType : `DatasetType` + dataset_type : `DatasetType` Dataset type for which a table should created (as necessary) and an associated `DatasetRecordStorage` returned. @@ -597,7 +277,7 @@ def register(self, datasetType: DatasetType) -> bool: raise NotImplementedError() @abstractmethod - def remove(self, name: str) -> None: + def remove_dataset_type(self, name: str) -> None: """Remove the dataset type. Parameters @@ -701,3 +381,293 @@ def fetch_summaries( def ingest_date_dtype(self) -> type: """Return type of the ``ingest_date`` column.""" raise NotImplementedError() + + @abstractmethod + def insert( + self, + dataset_type_name: str, + run: RunRecord, + data_ids: Iterable[DataCoordinate], + id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, + ) -> list[DatasetRef]: + """Insert one or more dataset entries into the database. + + Parameters + ---------- + dataset_type_name : `str` + Name of the dataset type. + run : `RunRecord` + The record object describing the `~CollectionType.RUN` collection + these datasets will be associated with. + data_ids : `~collections.abc.Iterable` [ `DataCoordinate` ] + Expanded data IDs (`DataCoordinate` instances) for the + datasets to be added. The dimensions of all data IDs must be the + same as ``dataset_type.dimensions``. + id_generation_mode : `DatasetIdGenEnum` + With `UNIQUE` each new dataset is inserted with its new unique ID. + With non-`UNIQUE` mode ID is computed from some combination of + dataset type, dataId, and run collection name; if the same ID is + already in the database then new record is not inserted. + + Returns + ------- + datasets : `list` [ `DatasetRef` ] + References to the inserted datasets. + """ + raise NotImplementedError() + + @abstractmethod + def import_( + self, + dataset_type: DatasetType, + run: RunRecord, + data_ids: Mapping[DatasetId, DataCoordinate], + ) -> list[DatasetRef]: + """Insert one or more dataset entries into the database. + + Parameters + ---------- + dataset_type : `DatasetType` + Type of dataset to import. Also used as the dataset type for + the returned refs. + run : `RunRecord` + The record object describing the `~CollectionType.RUN` collection + these datasets will be associated with. + data_ids : `~collections.abc.Mapping` + Mapping from dataset ID to data ID. + + Returns + ------- + datasets : `list` [ `DatasetRef` ] + References to the inserted or existing datasets. + """ + raise NotImplementedError() + + @abstractmethod + def delete(self, datasets: Iterable[DatasetId | DatasetRef]) -> None: + """Fully delete the given datasets from the registry. + + Parameters + ---------- + datasets : `~collections.abc.Iterable` [ `DatasetId` or `DatasetRef` ] + Datasets to be deleted. If `DatasetRef` instances are passed, + only the `DatasetRef.id` attribute is used. + """ + raise NotImplementedError() + + @abstractmethod + def associate( + self, dataset_type: DatasetType, collection: CollectionRecord, datasets: Iterable[DatasetRef] + ) -> None: + """Associate one or more datasets with a collection. + + Parameters + ---------- + dataset_type : `DatasetType` + Type of all datasets. + collection : `CollectionRecord` + The record object describing the collection. 
``collection.type`` + must be `~CollectionType.TAGGED`. + datasets : `~collections.abc.Iterable` [ `DatasetRef` ] + Datasets to be associated. All datasets must have the same + `DatasetType` as ``dataset_type``, but this is not checked. + + Notes + ----- + Associating a dataset into collection that already contains a + different dataset with the same `DatasetType` and data ID will remove + the existing dataset from that collection. + + Associating the same dataset into a collection multiple times is a + no-op, but is still not permitted on read-only databases. + """ + raise NotImplementedError() + + @abstractmethod + def disassociate( + self, dataset_type: DatasetType, collection: CollectionRecord, datasets: Iterable[DatasetRef] + ) -> None: + """Remove one or more datasets from a collection. + + Parameters + ---------- + dataset_type : `DatasetType` + Type of all datasets. + collection : `CollectionRecord` + The record object describing the collection. ``collection.type`` + must be `~CollectionType.TAGGED`. + datasets : `~collections.abc.Iterable` [ `DatasetRef` ] + Datasets to be disassociated. All datasets must have the same + `DatasetType` as ``dataset_type``, but this is not checked. + """ + raise NotImplementedError() + + @abstractmethod + def certify( + self, + dataset_type: DatasetType, + collection: CollectionRecord, + datasets: Iterable[DatasetRef], + timespan: Timespan, + context: SqlQueryContext, + ) -> None: + """Associate one or more datasets with a calibration collection and a + validity range within it. + + Parameters + ---------- + dataset_type : `DatasetType` + Type of all datasets. + collection : `CollectionRecord` + The record object describing the collection. ``collection.type`` + must be `~CollectionType.CALIBRATION`. + datasets : `~collections.abc.Iterable` [ `DatasetRef` ] + Datasets to be associated. All datasets must have the same + `DatasetType` as ``dataset_type``, but this is not checked. + timespan : `Timespan` + The validity range for these datasets within the collection. + context : `SqlQueryContext` + The object that manages database connections, temporary tables and + relation engines for this query. + + Raises + ------ + ConflictingDefinitionError + Raised if the collection already contains a different dataset with + the same `DatasetType` and data ID and an overlapping validity + range. + DatasetTypeError + Raised if ``dataset_type.isCalibration() is False``. + CollectionTypeError + Raised if + ``collection.type is not CollectionType.CALIBRATION``. + """ + raise NotImplementedError() + + @abstractmethod + def decertify( + self, + dataset_type: DatasetType, + collection: CollectionRecord, + timespan: Timespan, + *, + data_ids: Iterable[DataCoordinate] | None = None, + context: SqlQueryContext, + ) -> None: + """Remove or adjust datasets to clear a validity range within a + calibration collection. + + Parameters + ---------- + dataset_type : `DatasetType` + Type of all datasets. + collection : `CollectionRecord` + The record object describing the collection. ``collection.type`` + must be `~CollectionType.CALIBRATION`. + timespan : `Timespan` + The validity range to remove datasets from within the collection. + Datasets that overlap this range but are not contained by it will + have their validity ranges adjusted to not overlap it, which may + split a single dataset validity range into two. 
+ data_ids : `~collections.abc.Iterable` [ `DataCoordinate` ], optional + Data IDs that should be decertified within the given validity range + If `None`, all data IDs for ``dataset_type`` in ``collection`` will + be decertified. + context : `SqlQueryContext` + The object that manages database connections, temporary tables and + relation engines for this query. + + Raises + ------ + DatasetTypeError + Raised if ``dataset_type.isCalibration() is False``. + CollectionTypeError + Raised if + ``collection.type is not CollectionType.CALIBRATION``. + """ + raise NotImplementedError() + + @abstractmethod + def make_relation( + self, + dataset_type: DatasetType, + *collections: CollectionRecord, + columns: Set[str], + context: SqlQueryContext, + ) -> Relation: + """Return a `sql.Relation` that represents a query for this + `DatasetType` in one or more collections. + + Parameters + ---------- + dataset_type : `DatasetType` + Type of dataset to query for. + *collections : `CollectionRecord` + The record object(s) describing the collection(s) to query. May + not be of type `CollectionType.CHAINED`. If multiple collections + are passed, the query will search all of them in an unspecified + order, and all collections must have the same type. Must include + at least one collection. + columns : `~collections.abc.Set` [ `str` ] + Columns to include in the relation. See `Query.find_datasets` for + most options, but this method supports one more: + + - ``rank``: a calculated integer column holding the index of the + collection the dataset was found in, within the ``collections`` + sequence given. + context : `SqlQueryContext` + The object that manages database connections, temporary tables and + relation engines for this query. + + Returns + ------- + relation : `~lsst.daf.relation.Relation` + Representation of the query. + """ + raise NotImplementedError() + + @abstractmethod + def make_query_joiner( + self, dataset_type: DatasetType, collections: Sequence[CollectionRecord], fields: Set[str] + ) -> QueryJoiner: + """Make a `..direct_query_driver.QueryJoiner` that represents a search + for datasets of this type. + + Parameters + ---------- + dataset_type : `DatasetType` + Type of dataset to query for. + collections : `~collections.abc.Sequence` [ `CollectionRecord` ] + Collections to search, in order, after filtering out collections + with no datasets of this type via collection summaries. + fields : `~collections.abc.Set` [ `str` ] + Names of fields to make available in the joiner. Options include: + + - ``dataset_id`` (UUID) + - ``run`` (collection name, `str`) + - ``collection`` (collection name, `str`) + - ``collection_key`` (collection primary key, manager-dependent) + - ``timespan`` (validity range, or unbounded for non-calibrations) + - ``ingest_date`` (time dataset was ingested into repository) + + Dimension keys for the dataset type's required dimensions are + always included. + + Returns + ------- + joiner : `..direct_query_driver.QueryJoiner` + A query-construction object representing a table or subquery. + """ + raise NotImplementedError() + + @abstractmethod + def refresh_collection_summaries(self, dataset_type: DatasetType) -> None: + """Make sure that collection summaries for this dataset type are + consistent with the contents of the dataset tables. + + Parameters + ---------- + dataset_type : `DatasetType` + Dataset type whose summary entries should be refreshed. 
+ """ + raise NotImplementedError() diff --git a/python/lsst/daf/butler/registry/interfaces/_dimensions.py b/python/lsst/daf/butler/registry/interfaces/_dimensions.py index ebf612ac39..ae1b14cb49 100644 --- a/python/lsst/daf/butler/registry/interfaces/_dimensions.py +++ b/python/lsst/daf/butler/registry/interfaces/_dimensions.py @@ -238,6 +238,9 @@ def save_dimension_group(self, group: DimensionGroup) -> int: """Save a `DimensionGroup` definition to the database, allowing it to be retrieved later via the returned key. + If this dimension group has already been saved, this method just + returns the key already associated with it. + Parameters ---------- group : `DimensionGroup` diff --git a/python/lsst/daf/butler/registry/managers.py b/python/lsst/daf/butler/registry/managers.py index 0b8af228ff..159330defc 100644 --- a/python/lsst/daf/butler/registry/managers.py +++ b/python/lsst/daf/butler/registry/managers.py @@ -40,7 +40,6 @@ from contextlib import contextmanager from typing import Any, Generic, TypeVar -import sqlalchemy from lsst.utils import doImportType from .._column_type_info import ColumnTypeInfo @@ -229,11 +228,6 @@ def makeRepo(self, database: Database, dimensionConfig: DimensionConfig) -> Regi universe = DimensionUniverse(dimensionConfig) with database.declareStaticTables(create=True) as context: - if self.datasets.getIdColumnType() is sqlalchemy.BigInteger: - raise RuntimeError( - "New data repositories should be created with UUID dataset IDs instead of autoincrement " - "integer dataset IDs.", - ) instances = RegistryManagerInstances.initialize(database, context, types=self, universe=universe) # store managers and their versions in attributes table diff --git a/python/lsst/daf/butler/registry/queries/_sql_query_backend.py b/python/lsst/daf/butler/registry/queries/_sql_query_backend.py index b3d632211b..dfc06f1712 100644 --- a/python/lsst/daf/butler/registry/queries/_sql_query_backend.py +++ b/python/lsst/daf/butler/registry/queries/_sql_query_backend.py @@ -39,7 +39,7 @@ from ..._column_categorization import ColumnCategorization from ..._column_tags import DimensionKeyColumnTag, DimensionRecordColumnTag from ..._dataset_type import DatasetType -from ..._exceptions import DataIdValueError +from ..._exceptions import DataIdValueError, MissingDatasetTypeError from ...dimensions import DimensionGroup, DimensionRecordSet, DimensionUniverse from ...dimensions.record_cache import DimensionRecordCache from ..interfaces import CollectionRecord, Database @@ -163,24 +163,25 @@ def _make_dataset_query_relation_impl( "Caller is responsible for handling the case of all collections being rejected (we can't " "write a good error message without knowing why collections were rejected)." ) - dataset_storage = self._managers.datasets.find(dataset_type.name) - if dataset_storage is None: - # Unrecognized dataset type means no results. - return self.make_doomed_dataset_relation( + try: + return self._managers.datasets.make_relation( dataset_type, - columns, - messages=[ - f"Dataset type {dataset_type.name!r} is not registered, " - "so no instances of it can exist in any collection." - ], - context=context, - ) - else: - return dataset_storage.make_relation( *collections, columns=columns, context=context, ) + except MissingDatasetTypeError: + pass + # Unrecognized dataset type means no results. + return self.make_doomed_dataset_relation( + dataset_type, + columns, + messages=[ + f"Dataset type {dataset_type.name!r} is not registered, " + "so no instances of it can exist in any collection." 
+ ], + context=context, + ) def make_dimension_relation( self, diff --git a/python/lsst/daf/butler/registry/sql_registry.py b/python/lsst/daf/butler/registry/sql_registry.py index a2ad378e23..5365db1733 100644 --- a/python/lsst/daf/butler/registry/sql_registry.py +++ b/python/lsst/daf/butler/registry/sql_registry.py @@ -344,8 +344,7 @@ def refresh_collection_summaries(self) -> None: governor summaries later. """ for dataset_type in self.queryDatasetTypes(): - if storage := self._managers.datasets.find(dataset_type.name): - storage.refresh_collection_summaries() + self._managers.datasets.refresh_collection_summaries(dataset_type) def caching_context(self) -> contextlib.AbstractContextManager[None]: """Return context manager that enables caching. @@ -740,7 +739,7 @@ def registerDatasetType(self, datasetType: DatasetType) -> bool: This method cannot be called within transactions, as it needs to be able to perform its own transaction to be concurrent. """ - return self._managers.datasets.register(datasetType) + return self._managers.datasets.register_dataset_type(datasetType) def removeDatasetType(self, name: str | tuple[str, ...]) -> None: """Remove the named `DatasetType` from the registry. @@ -781,7 +780,7 @@ def removeDatasetType(self, name: str | tuple[str, ...]) -> None: _LOG.info("Dataset type %r not defined", datasetTypeExpression) else: for datasetType in datasetTypes: - self._managers.datasets.remove(datasetType.name) + self._managers.datasets.remove_dataset_type(datasetType.name) _LOG.info("Removed dataset type %r", datasetType.name) def getDatasetType(self, name: str) -> DatasetType: @@ -808,11 +807,11 @@ def getDatasetType(self, name: str) -> DatasetType: other registry operations do not. """ parent_name, component = DatasetType.splitDatasetTypeName(name) - storage = self._managers.datasets[parent_name] + parent_dataset_type = self._managers.datasets.get_dataset_type(parent_name) if component is None: - return storage.datasetType + return parent_dataset_type else: - return storage.datasetType.makeComponentDatasetType(component) + return parent_dataset_type.makeComponentDatasetType(component) def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool: """Test whether the given dataset ID generation mode is supported by @@ -828,7 +827,7 @@ def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool: supported : `bool` Whether the given mode is supported. """ - return self._managers.datasets.supportsIdGenerationMode(mode) + return True def findDataset( self, @@ -1045,14 +1044,7 @@ def insertDatasets( lsst.daf.butler.registry.MissingCollectionError Raised if ``run`` does not exist in the registry. 
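# [Editor's note - illustrative sketch, not part of the patch.]  The renamed
# manager methods used above keep `SqlRegistry` free of per-type storage
# objects; dataset-type administration becomes a set of direct manager calls.
# `manager` and `dataset_type` are hypothetical names.

def _sketch_dataset_type_admin(manager, dataset_type):
    inserted = manager.register_dataset_type(dataset_type)  # formerly .register()
    current = manager.get_dataset_type(dataset_type.name)   # formerly [name].datasetType
    manager.remove_dataset_type(dataset_type.name)          # formerly .remove()
    return inserted, current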
""" - if isinstance(datasetType, DatasetType): - storage = self._managers.datasets.find(datasetType.name) - if storage is None: - raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.") - else: - storage = self._managers.datasets.find(datasetType) - if storage is None: - raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.") + datasetType = self._managers.datasets.conform_exact_dataset_type(datasetType) if run is None: if self.defaults.run is None: raise NoDefaultCollectionError( @@ -1068,21 +1060,23 @@ def insertDatasets( progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) if expand: expandedDataIds = [ - self.expandDataId(dataId, dimensions=storage.datasetType.dimensions) - for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs") + self.expandDataId(dataId, dimensions=datasetType.dimensions) + for dataId in progress.wrap(dataIds, f"Expanding {datasetType.name} data IDs") ] else: expandedDataIds = [ - DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds + DataCoordinate.standardize(dataId, dimensions=datasetType.dimensions) for dataId in dataIds ] try: - refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode)) + refs = list( + self._managers.datasets.insert(datasetType.name, runRecord, expandedDataIds, idGenerationMode) + ) if self._managers.obscore: self._managers.obscore.add_datasets(refs) except sqlalchemy.exc.IntegrityError as err: raise ConflictingDefinitionError( "A database constraint failure was triggered by inserting " - f"one or more datasets of type {storage.datasetType} into " + f"one or more datasets of type {datasetType} into " f"collection '{run}'. " "This probably means a dataset with the same data ID " "and dataset type already exists, but it may also mean a " @@ -1158,11 +1152,6 @@ def _importDatasets( raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}") datasetType = datasetTypes.pop() - # get storage handler for this dataset type - storage = self._managers.datasets.find(datasetType.name) - if storage is None: - raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.") - # find run name runs = {dataset.run for dataset in datasets} if len(runs) != 1: @@ -1177,26 +1166,23 @@ def _importDatasets( ) assert isinstance(runRecord, RunRecord) - progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) + progress = Progress("daf.butler.Registry.importDatasets", level=logging.DEBUG) if expand: - expandedDatasets = [ - dataset.expanded(self.expandDataId(dataset.dataId, dimensions=storage.datasetType.dimensions)) - for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs") - ] + data_ids = { + dataset.id: self.expandDataId(dataset.dataId, dimensions=datasetType.dimensions) + for dataset in progress.wrap(datasets, f"Expanding {datasetType.name} data IDs") + } else: - expandedDatasets = [ - DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True) - for dataset in datasets - ] + data_ids = {dataset.id: dataset.dataId for dataset in datasets} try: - refs = list(storage.import_(runRecord, expandedDatasets)) + refs = list(self._managers.datasets.import_(datasetType, runRecord, data_ids)) if self._managers.obscore: self._managers.obscore.add_datasets(refs) except sqlalchemy.exc.IntegrityError as err: raise ConflictingDefinitionError( "A database constraint failure was triggered by 
inserting " - f"one or more datasets of type {storage.datasetType} into " + f"one or more datasets of type {datasetType} into " f"collection '{run}'. " "This probably means a dataset with the same data ID " "and dataset type already exists, but it may also mean a " @@ -1231,36 +1217,28 @@ def getDataset(self, id: DatasetId) -> DatasetRef | None: def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: """Remove datasets from the Registry. - The datasets will be removed unconditionally from all collections, and - any `Quantum` that consumed this dataset will instead be marked with - having a NULL input. `Datastore` records will *not* be deleted; the - caller is responsible for ensuring that the dataset has already been - removed from all Datastores. + The datasets will be removed unconditionally from all collections. + `Datastore` records will *not* be deleted; the caller is responsible + for ensuring that the dataset has already been removed from all + Datastores. Parameters ---------- refs : `~collections.abc.Iterable` [`DatasetRef`] - References to the datasets to be removed. Must include a valid - ``id`` attribute, and should be considered invalidated upon return. + References to the datasets to be removed. Should be considered + invalidated upon return. Raises ------ - lsst.daf.butler.AmbiguousDatasetError - Raised if any ``ref.id`` is `None`. lsst.daf.butler.registry.OrphanedRecordError Raised if any dataset is still present in any `Datastore`. """ - progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG) - for datasetType, refsForType in progress.iter_item_chunks( - DatasetRef.iter_by_type(refs), desc="Removing datasets by type" - ): - storage = self._managers.datasets[datasetType.name] - try: - storage.delete(refsForType) - except sqlalchemy.exc.IntegrityError as err: - raise OrphanedRecordError( - "One or more datasets is still present in one or more Datastores." - ) from err + try: + self._managers.datasets.delete(refs) + except sqlalchemy.exc.IntegrityError as err: + raise OrphanedRecordError( + "One or more datasets is still present in one or more Datastores." + ) from err @transactional def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: @@ -1292,16 +1270,11 @@ def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: """ progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG) collectionRecord = self._managers.collections.find(collection) - if collectionRecord.type is not CollectionType.TAGGED: - raise CollectionTypeError( - f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED." - ) for datasetType, refsForType in progress.iter_item_chunks( DatasetRef.iter_by_type(refs), desc="Associating datasets by type" ): - storage = self._managers.datasets[datasetType.name] try: - storage.associate(collectionRecord, refsForType) + self._managers.datasets.associate(datasetType, collectionRecord, refsForType) if self._managers.obscore: # If a TAGGED collection is being monitored by ObsCore # manager then we may need to save the dataset. @@ -1341,15 +1314,10 @@ def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: """ progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG) collectionRecord = self._managers.collections.find(collection) - if collectionRecord.type is not CollectionType.TAGGED: - raise CollectionTypeError( - f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED." 
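# [Editor's note - illustrative sketch, not part of the patch.]  The write path
# changes the same way: insert, associate, disassociate, and delete are single
# manager calls with the dataset type (or its name) passed explicitly, and
# `import_` similarly takes a mapping from dataset UUID to data ID.  All
# argument names below are hypothetical.

def _sketch_dataset_writes(manager, dataset_type, run_record, tagged_record, data_ids, id_mode):
    # Bulk-insert new datasets; note that `insert` takes the dataset type *name*.
    refs = list(manager.insert(dataset_type.name, run_record, data_ids, id_mode))
    # Tagging and untagging now also go through the manager.
    manager.associate(dataset_type, tagged_record, refs)
    manager.disassociate(dataset_type, tagged_record, refs)
    # Deletion takes the refs directly, with no per-type grouping by the caller.
    manager.delete(refs)
    return refs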
- ) for datasetType, refsForType in progress.iter_item_chunks( DatasetRef.iter_by_type(refs), desc="Disassociating datasets by type" ): - storage = self._managers.datasets[datasetType.name] - storage.disassociate(collectionRecord, refsForType) + self._managers.datasets.disassociate(datasetType, collectionRecord, refsForType) if self._managers.obscore: self._managers.obscore.disassociate(refsForType, collectionRecord) @@ -1376,18 +1344,19 @@ def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespa Raised if the collection already contains a different dataset with the same `DatasetType` and data ID and an overlapping validity range. - lsst.daf.butler.registry.CollectionTypeError - Raised if ``collection`` is not a `~CollectionType.CALIBRATION` - collection or if one or more datasets are of a dataset type for - which `DatasetType.isCalibration` returns `False`. + DatasetTypeError + Raised if ``ref.datasetType.isCalibration() is False`` for any ref. + CollectionTypeError + Raised if + ``collection.type is not CollectionType.CALIBRATION``. """ progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG) collectionRecord = self._managers.collections.find(collection) for datasetType, refsForType in progress.iter_item_chunks( DatasetRef.iter_by_type(refs), desc="Certifying datasets by type" ): - storage = self._managers.datasets[datasetType.name] - storage.certify( + self._managers.datasets.certify( + datasetType, collectionRecord, refsForType, timespan, @@ -1425,24 +1394,25 @@ def decertify( Raises ------ - lsst.daf.butler.registry.CollectionTypeError - Raised if ``collection`` is not a `~CollectionType.CALIBRATION` - collection or if ``datasetType.isCalibration() is False``. + DatasetTypeError + Raised if ``datasetType.isCalibration() is False``. + CollectionTypeError + Raised if + ``collection.type is not CollectionType.CALIBRATION``. 
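# [Editor's note - illustrative sketch, not part of the patch.]  Calibration
# handling follows the same pattern: `certify` and `decertify` are manager
# methods taking the DatasetType first, matching the calls shown above.
# Argument names are hypothetical.

def _sketch_calibration_admin(manager, dataset_type, calib_record, refs, timespan, context):
    # Mark the refs as valid for `timespan` in a CALIBRATION collection.
    manager.certify(dataset_type, calib_record, refs, timespan, context=context)
    # Remove certifications for all data IDs within that same validity range.
    manager.decertify(dataset_type, calib_record, timespan, data_ids=None, context=context)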
""" collectionRecord = self._managers.collections.find(collection) if isinstance(datasetType, str): - storage = self._managers.datasets[datasetType] - else: - storage = self._managers.datasets[datasetType.name] + datasetType = self.getDatasetType(datasetType) standardizedDataIds = None if dataIds is not None: standardizedDataIds = [ - DataCoordinate.standardize(d, dimensions=storage.datasetType.dimensions) for d in dataIds + DataCoordinate.standardize(d, dimensions=datasetType.dimensions) for d in dataIds ] - storage.decertify( + self._managers.datasets.decertify( + datasetType, collectionRecord, timespan, - dataIds=standardizedDataIds, + data_ids=standardizedDataIds, context=queries.SqlQueryContext(self._db, self._managers.column_types), ) diff --git a/python/lsst/daf/butler/tests/_dummyRegistry.py b/python/lsst/daf/butler/tests/_dummyRegistry.py index e8e573b465..09bc21c574 100644 --- a/python/lsst/daf/butler/tests/_dummyRegistry.py +++ b/python/lsst/daf/butler/tests/_dummyRegistry.py @@ -31,7 +31,6 @@ from collections.abc import Iterable, Iterator from typing import Any -import sqlalchemy from lsst.daf.butler import DimensionUniverse, ddl from lsst.daf.butler.registry.bridge.ephemeral import EphemeralDatastoreRegistryBridge from lsst.daf.butler.registry.interfaces import ( @@ -173,13 +172,11 @@ def __init__( self, opaque: OpaqueTableStorageManager, universe: DimensionUniverse, - datasetIdColumnType: type, registry_schema_version: VersionTuple | None = None, ): super().__init__( opaque=opaque, universe=universe, - datasetIdColumnType=datasetIdColumnType, registry_schema_version=registry_schema_version, ) self._bridges: dict[str, EphemeralDatastoreRegistryBridge] = {} @@ -188,7 +185,6 @@ def clone(self, *, db: Database, opaque: OpaqueTableStorageManager) -> Datastore return DummyDatastoreRegistryBridgeManager( opaque=opaque, universe=self.universe, - datasetIdColumnType=self.datasetIdColumnType, registry_schema_version=self._registry_schema_version, ) @@ -208,7 +204,6 @@ def initialize( return cls( opaque=opaque, universe=universe, - datasetIdColumnType=datasets.getIdColumnType(), registry_schema_version=registry_schema_version, ) @@ -238,9 +233,7 @@ class DummyRegistry: def __init__(self) -> None: self._opaque = DummyOpaqueTableStorageManager() self.dimensions = DimensionUniverse() - self._datastoreBridges = DummyDatastoreRegistryBridgeManager( - self._opaque, self.dimensions, sqlalchemy.BigInteger - ) + self._datastoreBridges = DummyDatastoreRegistryBridgeManager(self._opaque, self.dimensions, None) def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: return self._datastoreBridges diff --git a/tests/test_simpleButler.py b/tests/test_simpleButler.py index 6de7b81dfb..595b08957f 100644 --- a/tests/test_simpleButler.py +++ b/tests/test_simpleButler.py @@ -823,6 +823,35 @@ def test_clone(self): self.assertCountEqual(clone5.registry.defaults.collections, ["imported_r"]) self.assertEqual(clone5.run, "imported_r") + def test_calibration_dataset_type_registration(self) -> None: + # Register two dataset types that should share the same tags table, + # but only one is a calibration and hence needs a calibs table. 
+ butler1 = self.makeButler(writeable=True) + a = DatasetType("a", ["instrument"], universe=butler1.dimensions, storageClass="StructuredDataDict") + b = DatasetType( + "b", + ["instrument"], + universe=butler1.dimensions, + storageClass="StructuredDataDict", + isCalibration=True, + ) + butler1.registry.registerDatasetType(a) + butler1.registry.registerDatasetType(b) + self.assertEqual(butler1.get_dataset_type("a"), a) + self.assertEqual(butler1.get_dataset_type("b"), b) + butler1.registry.refresh() + self.assertEqual(butler1.get_dataset_type("a"), a) + self.assertEqual(butler1.get_dataset_type("b"), b) + # Register them in the opposite order in a new repo. + butler2 = self.makeButler(writeable=True) + butler2.registry.registerDatasetType(b) + butler2.registry.registerDatasetType(a) + self.assertEqual(butler2.get_dataset_type("a"), a) + self.assertEqual(butler2.get_dataset_type("b"), b) + butler2.registry.refresh() + self.assertEqual(butler2.get_dataset_type("a"), a) + self.assertEqual(butler2.get_dataset_type("b"), b) + class DirectSimpleButlerTestCase(SimpleButlerTests, unittest.TestCase): """Run tests against DirectButler implementation."""
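# [Editor's note - illustrative sketch, not part of the patch.]  None of the
# changes above alter the public `Registry` API; a calibration round trip still
# reads as below.  The collection names, the data ID, and the assumption that
# the `instrument` record and a calibration-flagged dataset type already exist
# are all hypothetical.

from lsst.daf.butler.registry import CollectionType


def _sketch_public_calibration_round_trip(butler, calib_dataset_type, timespan):
    butler.registry.registerDatasetType(calib_dataset_type)
    butler.registry.registerRun("sketch/run")
    butler.registry.registerCollection("sketch/calib", CollectionType.CALIBRATION)
    refs = butler.registry.insertDatasets(
        calib_dataset_type, [{"instrument": "DummyCam"}], run="sketch/run"
    )
    butler.registry.certify("sketch/calib", refs, timespan)
    butler.registry.decertify("sketch/calib", calib_dataset_type, timespan)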