Skip to content

Commit

Permalink
Improve error messages for dataset import conflicts.
Browse files Browse the repository at this point in the history
Just had to debug some of these, and found the integer primary keys
embedded in the old messages hard to use. Unfortunately I couldn't get
rid of all of them: this code has no access to the general mapping
from dataset type ID to dataset type name. But I think that's the
field least likely to be causing the conflict here, so it's not a
huge loss.
  • Loading branch information
TallJimbo committed Aug 21, 2023
1 parent 75408b4 commit 83620cf
Showing 1 changed file with 36 additions and 9 deletions.
45 changes: 36 additions & 9 deletions python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -728,9 +728,9 @@ def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) ->
sqlalchemy.sql.select(
dataset.columns.id.label("dataset_id"),
dataset.columns.dataset_type_id.label("dataset_type_id"),
tmp_tags.columns.dataset_type_id.label("new dataset_type_id"),
tmp_tags.columns.dataset_type_id.label("new_dataset_type_id"),
dataset.columns[self._runKeyColumn].label("run"),
tmp_tags.columns[collFkName].label("new run"),
tmp_tags.columns[collFkName].label("new_run"),
)
.select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id))
.where(
Expand All @@ -742,8 +742,28 @@ def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) ->
.limit(1)
)
with self._db.query(query) as result:
# Only include the first one in the exception message
if (row := result.first()) is not None:
# Only include the first one in the exception message
existing_run = self._collections[row.run].name
new_run = self._collections[row.new_run].name
if row.dataset_type_id == self._dataset_type_id:
if row.new_dataset_type_id == self._dataset_type_id:
raise ConflictingDefinitionError(
f"Current run {existing_run!r} and new run {new_run!r} do not agree for "
f"dataset {row.dataset_id}."
)
else:
raise ConflictingDefinitionError(
f"Dataset {row.dataset_id} was provided with type {self.datasetType.name!r} "
f"in run {new_run!r}, but was already defined with type ID {row.dataset_type_id} "
f"in run {run!r}."
)
else:
raise ConflictingDefinitionError(
f"Dataset {row.dataset_id} was provided with type ID {row.new_dataset_type_id} "
f"in run {new_run!r}, but was already defined with type {self.datasetType.name!r} "
f"in run {run!r}."
)
raise ConflictingDefinitionError(
f"Existing dataset type or run do not match new dataset: {row._asdict()}"
)
Expand All @@ -753,10 +773,10 @@ def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) ->
sqlalchemy.sql.select(
tags.columns.dataset_id,
tags.columns.dataset_type_id.label("type_id"),
tmp_tags.columns.dataset_type_id.label("new type_id"),
tmp_tags.columns.dataset_type_id.label("new_type_id"),
*[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
*[
tmp_tags.columns[dim].label(f"new {dim}")
tmp_tags.columns[dim].label(f"new_{dim}")
for dim in self.datasetType.dimensions.required.names
],
)
Expand All @@ -783,12 +803,11 @@ def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) ->
# Check that matching run+dataId have the same dataset ID.
query = (
sqlalchemy.sql.select(
tags.columns.dataset_type_id.label("dataset_type_id"),
*[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
tags.columns.dataset_id,
tmp_tags.columns.dataset_id.label("new dataset_id"),
tmp_tags.columns.dataset_id.label("new_dataset_id"),
tags.columns[collFkName],
tmp_tags.columns[collFkName].label(f"new {collFkName}"),
tmp_tags.columns[collFkName].label(f"new_{collFkName}"),
)
.select_from(
tags.join(
Expand All @@ -807,8 +826,16 @@ def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) ->
.limit(1)
)
with self._db.query(query) as result:
# only include the first one in the exception message
if (row := result.first()) is not None:
# only include the first one in the exception message
data_id = {dim: getattr(row, dim) for dim in self.datasetType.dimensions.required.names}
existing_collection = self._collections[getattr(row, collFkName)].name
new_collection = self._collections[getattr(row, f"new_{collFkName}")].name
raise ConflictingDefinitionError(
f"Dataset with type {self.datasetType.name!r} and data ID {data_id} "
f"has ID {row.dataset_id} in existing collection {existing_collection!r} "
f"but ID {row.new_dataset_id} in new collection {new_collection!r}."
)
raise ConflictingDefinitionError(
f"Existing dataset type and dataId does not match new dataset: {row._asdict()}"
)

0 comments on commit 83620cf

Please sign in to comment.