From 93e3286a36c6cdc942daa3d6707c0abbdeaa783c Mon Sep 17 00:00:00 2001
From: Nate Lust <nlust@astro.princeton.edu>
Date: Tue, 27 Jun 2023 14:55:04 -0400
Subject: [PATCH] Convert integer ids to UUID early

Downstream code now depends on refs holding UUIDs. Have the YAML
loader convert old-style integer IDs to UUIDs early rather than
waiting for downstream cleanups.
---
 python/lsst/daf/butler/transfers/_yaml.py | 30 +++++++++++++++++++----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/python/lsst/daf/butler/transfers/_yaml.py b/python/lsst/daf/butler/transfers/_yaml.py
index 25ef39e04a..e1309ef72f 100644
--- a/python/lsst/daf/butler/transfers/_yaml.py
+++ b/python/lsst/daf/butler/transfers/_yaml.py
@@ -28,7 +28,7 @@
 from collections import defaultdict
 from collections.abc import Iterable, Mapping
 from datetime import datetime
-from typing import IO, TYPE_CHECKING, Any
+from typing import IO, TYPE_CHECKING, Any, cast
 
 import astropy.time
 import yaml
@@ -64,6 +64,8 @@
 this version of the code.
 """
 
+_refIntId2UUID = defaultdict[int, uuid.UUID](uuid.uuid4)
+
 
 def _uuid_representer(dumper: yaml.Dumper, data: uuid.UUID) -> yaml.Node:
     """Generate YAML representation for UUID.
@@ -338,16 +340,27 @@ def __init__(self, stream: IO, registry: Registry):
             elif data["type"] == "associations":
                 collectionType = CollectionType.from_name(data["collection_type"])
                 if collectionType is CollectionType.TAGGED:
-                    self.tagAssociations[data["collection"]].extend(data["dataset_ids"])
+                    self.tagAssociations[data["collection"]].extend(
+                        [
+                            x if not isinstance(x, int) else cast(DatasetId, _refIntId2UUID[x])
+                            for x in data["dataset_ids"]
+                        ]
+                    )
                 elif collectionType is CollectionType.CALIBRATION:
                     assocsByTimespan = self.calibAssociations[data["collection"]]
                     for d in data["validity_ranges"]:
                         if "timespan" in d:
-                            assocsByTimespan[d["timespan"]] = d["dataset_ids"]
+                            assocsByTimespan[d["timespan"]] = [
+                                x if not isinstance(x, int) else cast(DatasetId, _refIntId2UUID[x])
+                                for x in d["dataset_ids"]
+                            ]
                         else:
                             # TODO: this is for backward compatibility, should
                             # be removed at some point.
-                            assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = d["dataset_ids"]
+                            assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = [
+                                x if not isinstance(x, int) else cast(DatasetId, _refIntId2UUID[x])
+                                for x in d["dataset_ids"]
+                            ]
                 else:
                     raise ValueError(f"Unexpected calibration type for association: {collectionType.name}.")
             else:
@@ -362,7 +375,14 @@ def __init__(self, stream: IO, registry: Registry):
                 FileDataset(
                     d.get("path"),
                     [
-                        DatasetRef(datasetType, dataId, run=data["run"], id=refid)
+                        DatasetRef(
+                            datasetType,
+                            dataId,
+                            run=data["run"],
+                            id=refid
+                            if not isinstance(refid, int)
+                            else cast(DatasetId, _refIntId2UUID[refid]),
+                        )
                         for dataId, refid in zip(
                             ensure_iterable(d["data_id"]), ensure_iterable(d["dataset_id"])
                         )