From 38575abf11e463993bdb953fc3ad3653cbf8a3b6 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 22 Jul 2024 16:33:30 -0700 Subject: [PATCH 1/3] [nexus] Decommission disks in reconfigurator, clean their DB state (#6059) This PR adds the following step to the reconfigurator's execution step: For all disks that are marked **expunged**, mark them **decommissioned**. This notably happens after the `deploy_disks` step of execution. This PR also adds a background task that looks for all disks that are **decommissioned**, but have **zpools**. It deletes these zpools (and their **datasets**) as long as no regions nor region snapshots are referencing the contained datasets. Fixes https://github.com/oxidecomputer/omicron/issues/6051 --- dev-tools/omdb/tests/env.out | 15 + dev-tools/omdb/tests/successes.out | 12 + nexus-config/src/nexus_config.rs | 23 + .../src/db/datastore/physical_disk.rs | 74 ++- nexus/db-queries/src/db/datastore/zpool.rs | 135 ++++++ nexus/examples/config-second.toml | 1 + nexus/examples/config.toml | 1 + .../execution/src/cockroachdb.rs | 5 +- .../reconfigurator/execution/src/datasets.rs | 5 +- nexus/reconfigurator/execution/src/dns.rs | 5 +- nexus/reconfigurator/execution/src/lib.rs | 27 +- .../execution/src/omicron_physical_disks.rs | 258 +++++++++- nexus/src/app/background/init.rs | 21 + .../tasks/decommissioned_disk_cleaner.rs | 458 ++++++++++++++++++ nexus/src/app/background/tasks/mod.rs | 1 + nexus/tests/config.test.toml | 3 + nexus/tests/integration_tests/sleds.rs | 9 +- nexus/types/src/deployment/planning_input.rs | 7 + smf/nexus/multi-sled/config-partial.toml | 1 + smf/nexus/single-sled/config-partial.toml | 1 + uuid-kinds/src/lib.rs | 2 + 21 files changed, 1013 insertions(+), 51 deletions(-) create mode 100644 nexus/src/app/background/tasks/decommissioned_disk_cleaner.rs diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 75acc5c584..a6bf4d4667 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -47,6 +47,11 @@ task: "crdb_node_id_collector" Collects node IDs of running CockroachDB zones +task: "decommissioned_disk_cleaner" + deletes DB records for decommissioned disks, after regions and region + snapshots have been replaced + + task: "dns_config_external" watches external DNS data stored in CockroachDB @@ -187,6 +192,11 @@ task: "crdb_node_id_collector" Collects node IDs of running CockroachDB zones +task: "decommissioned_disk_cleaner" + deletes DB records for decommissioned disks, after regions and region + snapshots have been replaced + + task: "dns_config_external" watches external DNS data stored in CockroachDB @@ -314,6 +324,11 @@ task: "crdb_node_id_collector" Collects node IDs of running CockroachDB zones +task: "decommissioned_disk_cleaner" + deletes DB records for decommissioned disks, after regions and region + snapshots have been replaced + + task: "dns_config_external" watches external DNS data stored in CockroachDB diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 982a9d8403..cec3fa3052 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -248,6 +248,11 @@ task: "crdb_node_id_collector" Collects node IDs of running CockroachDB zones +task: "decommissioned_disk_cleaner" + deletes DB records for decommissioned disks, after regions and region + snapshots have been replaced + + task: "dns_config_external" watches external DNS data stored in CockroachDB @@ -453,6 +458,13 @@ task: "crdb_node_id_collector" started at (s ago) 
and ran for ms last completion reported error: no blueprint +task: "decommissioned_disk_cleaner" + configured period: every 1m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms +warning: unknown background task: "decommissioned_disk_cleaner" (don't know how to interpret details: Object {"deleted": Number(0), "error": Null, "error_count": Number(0), "found": Number(0), "not_ready_to_be_deleted": Number(0)}) + task: "external_endpoints" configured period: every 1m currently executing: no diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 3bc3a36126..6e9d6b0cf0 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -361,6 +361,8 @@ pub struct BackgroundTaskConfig { pub inventory: InventoryConfig, /// configuration for physical disk adoption tasks pub physical_disk_adoption: PhysicalDiskAdoptionConfig, + /// configuration for decommissioned disk cleaner task + pub decommissioned_disk_cleaner: DecommissionedDiskCleanerConfig, /// configuration for phantom disks task pub phantom_disks: PhantomDiskConfig, /// configuration for blueprint related tasks @@ -444,6 +446,20 @@ pub struct PhysicalDiskAdoptionConfig { pub disable: bool, } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct DecommissionedDiskCleanerConfig { + /// period (in seconds) for periodic activations of this background task + #[serde_as(as = "DurationSeconds")] + pub period_secs: Duration, + + /// A toggle to disable automated disk cleanup + /// + /// Default: Off + #[serde(default)] + pub disable: bool, +} + #[serde_as] #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub struct NatCleanupConfig { @@ -822,6 +838,7 @@ mod test { inventory.nkeep = 11 inventory.disable = false physical_disk_adoption.period_secs = 30 + decommissioned_disk_cleaner.period_secs = 30 phantom_disks.period_secs = 30 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 @@ -947,6 +964,11 @@ mod test { period_secs: Duration::from_secs(30), disable: false, }, + decommissioned_disk_cleaner: + DecommissionedDiskCleanerConfig { + period_secs: Duration::from_secs(30), + disable: false, + }, phantom_disks: PhantomDiskConfig { period_secs: Duration::from_secs(30), }, @@ -1049,6 +1071,7 @@ mod test { inventory.nkeep = 3 inventory.disable = false physical_disk_adoption.period_secs = 30 + decommissioned_disk_cleaner.period_secs = 30 phantom_disks.period_secs = 30 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 diff --git a/nexus/db-queries/src/db/datastore/physical_disk.rs b/nexus/db-queries/src/db/datastore/physical_disk.rs index 11e056d19b..5e3b51f228 100644 --- a/nexus/db-queries/src/db/datastore/physical_disk.rs +++ b/nexus/db-queries/src/db/datastore/physical_disk.rs @@ -37,6 +37,7 @@ use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; use omicron_uuid_kinds::CollectionUuid; use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::PhysicalDiskUuid; use uuid::Uuid; impl DataStore { @@ -278,23 +279,36 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + /// Decommissions all expunged disks. 
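+    ///
+    /// Every physical disk that has not been deleted, whose policy is
+    /// `Expunged` and whose state is still `Active`, has its state set to
+    /// `Decommissioned`. Zpool and dataset records for such disks are left in
+    /// place; the `decommissioned_disk_cleaner` background task removes them
+    /// later, once no regions or region snapshots reference them.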
+ pub async fn physical_disk_decommission_all_expunged( + &self, + opctx: &OpContext, + ) -> Result<(), Error> { + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + use db::schema::physical_disk::dsl; + + let conn = &*self.pool_connection_authorized(&opctx).await?; + diesel::update(dsl::physical_disk) + .filter(dsl::time_deleted.is_null()) + .physical_disk_filter(DiskFilter::ExpungedButActive) + .set(dsl::disk_state.eq(PhysicalDiskState::Decommissioned)) + .execute_async(conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + Ok(()) + } + /// Deletes a disk from the database. pub async fn physical_disk_delete( &self, opctx: &OpContext, - vendor: String, - serial: String, - model: String, - sled_id: Uuid, + id: PhysicalDiskUuid, ) -> DeleteResult { opctx.authorize(authz::Action::Read, &authz::FLEET).await?; let now = Utc::now(); use db::schema::physical_disk::dsl; diesel::update(dsl::physical_disk) - .filter(dsl::vendor.eq(vendor)) - .filter(dsl::serial.eq(serial)) - .filter(dsl::model.eq(model)) - .filter(dsl::sled_id.eq(sled_id)) + .filter(dsl::id.eq(id.into_untyped_uuid())) .filter(dsl::time_deleted.is_null()) .set(dsl::time_deleted.eq(now)) .execute_async(&*self.pool_connection_authorized(opctx).await?) @@ -451,8 +465,9 @@ mod test { let sled = create_test_sled(&datastore).await; // Insert a disk + let disk_id = PhysicalDiskUuid::new_v4(); let disk = PhysicalDisk::new( - Uuid::new_v4(), + disk_id.into_untyped_uuid(), String::from("Oxide"), String::from("123"), String::from("FakeDisk"), @@ -472,13 +487,7 @@ mod test { // Delete the inserted disk datastore - .physical_disk_delete( - &opctx, - disk.vendor.clone(), - disk.serial.clone(), - disk.model.clone(), - disk.sled_id, - ) + .physical_disk_delete(&opctx, disk_id) .await .expect("Failed to delete disk"); let disks = datastore @@ -489,13 +498,7 @@ mod test { // Deleting again should not throw an error datastore - .physical_disk_delete( - &opctx, - disk.vendor, - disk.serial, - disk.model, - disk.sled_id, - ) + .physical_disk_delete(&opctx, disk_id) .await .expect("Failed to delete disk"); @@ -520,8 +523,9 @@ mod test { let sled_b = create_test_sled(&datastore).await; // Insert a disk + let disk_id = PhysicalDiskUuid::new_v4(); let disk = PhysicalDisk::new( - Uuid::new_v4(), + disk_id.into_untyped_uuid(), String::from("Oxide"), String::from("123"), String::from("FakeDisk"), @@ -546,13 +550,7 @@ mod test { // Delete the inserted disk datastore - .physical_disk_delete( - &opctx, - disk.vendor, - disk.serial, - disk.model, - disk.sled_id, - ) + .physical_disk_delete(&opctx, disk_id) .await .expect("Failed to delete disk"); let disks = datastore @@ -567,8 +565,9 @@ mod test { assert!(disks.is_empty()); // Attach the disk to the second sled + let disk_id = PhysicalDiskUuid::new_v4(); let disk = PhysicalDisk::new( - Uuid::new_v4(), + disk_id.into_untyped_uuid(), String::from("Oxide"), String::from("123"), String::from("FakeDisk"), @@ -613,8 +612,9 @@ mod test { let sled_b = create_test_sled(&datastore).await; // Insert a disk + let disk_id = PhysicalDiskUuid::new_v4(); let disk = PhysicalDisk::new( - Uuid::new_v4(), + disk_id.into_untyped_uuid(), String::from("Oxide"), String::from("123"), String::from("FakeDisk"), @@ -639,13 +639,7 @@ mod test { // Remove the disk from the first sled datastore - .physical_disk_delete( - &opctx, - disk.vendor.clone(), - disk.serial.clone(), - disk.model.clone(), - disk.sled_id, - ) + .physical_disk_delete(&opctx, disk_id) .await .expect("Failed to delete disk"); diff 
--git a/nexus/db-queries/src/db/datastore/zpool.rs b/nexus/db-queries/src/db/datastore/zpool.rs index a771202387..a314b9d371 100644 --- a/nexus/db-queries/src/db/datastore/zpool.rs +++ b/nexus/db-queries/src/db/datastore/zpool.rs @@ -15,6 +15,7 @@ use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::identity::Asset; use crate::db::model::PhysicalDisk; +use crate::db::model::PhysicalDiskState; use crate::db::model::Sled; use crate::db::model::Zpool; use crate::db::pagination::paginated; @@ -27,10 +28,13 @@ use diesel::upsert::excluded; use nexus_db_model::PhysicalDiskKind; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; +use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; +use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::ZpoolUuid; use uuid::Uuid; impl DataStore { @@ -135,4 +139,135 @@ impl DataStore { Ok(zpools) } + + /// Returns all (non-deleted) zpools on decommissioned (or deleted) disks + pub async fn zpool_on_decommissioned_disk_list( + &self, + opctx: &OpContext, + pagparams: &DataPageParams<'_, Uuid>, + ) -> ListResultVec { + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; + use db::schema::physical_disk::dsl as physical_disk_dsl; + use db::schema::zpool::dsl as zpool_dsl; + + paginated(zpool_dsl::zpool, zpool_dsl::id, pagparams) + .filter(zpool_dsl::time_deleted.is_null()) + // Note the LEFT JOIN here -- we want to see zpools where the + // physical disk has been deleted too. + .left_join( + physical_disk_dsl::physical_disk + .on(physical_disk_dsl::id.eq(zpool_dsl::physical_disk_id)), + ) + .filter( + // The physical disk has been either explicitly decommissioned, + // or has been deleted altogether. + physical_disk_dsl::disk_state + .eq(PhysicalDiskState::Decommissioned) + .or(physical_disk_dsl::id.is_null()) + .or( + // NOTE: We should probably get rid of this altogether + // (it's kinda implied by "Decommissioned", being a terminal + // state) but this is an extra cautious statement. + physical_disk_dsl::time_deleted.is_not_null(), + ), + ) + .select(Zpool::as_select()) + .load_async(&*self.pool_connection_authorized(opctx).await?) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Soft-deletes the Zpools and cleans up all associated DB resources. + /// + /// This should only be called for zpools on physical disks which + /// have been decommissioned. 
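+    ///
+    /// If any region or region snapshot still references a dataset within the
+    /// zpool, this returns a `ServiceUnavailable` error and deletes nothing.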
+ /// + /// In order: + /// - Finds all datasets within the zpool + /// - Ensures that no regions nor region snapshots are using these datasets + /// - Soft-deletes the datasets within the zpool + /// - Soft-deletes the zpool itself + pub async fn zpool_delete_self_and_all_datasets( + &self, + opctx: &OpContext, + zpool_id: ZpoolUuid, + ) -> DeleteResult { + let conn = &*self.pool_connection_authorized(&opctx).await?; + Self::zpool_delete_self_and_all_datasets_on_connection( + &conn, opctx, zpool_id, + ) + .await + } + + /// See: [Self::zpool_delete_self_and_all_datasets] + pub(crate) async fn zpool_delete_self_and_all_datasets_on_connection( + conn: &async_bb8_diesel::Connection, + opctx: &OpContext, + zpool_id: ZpoolUuid, + ) -> DeleteResult { + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + let now = Utc::now(); + use db::schema::dataset::dsl as dataset_dsl; + use db::schema::zpool::dsl as zpool_dsl; + + let zpool_id = *zpool_id.as_untyped_uuid(); + + // Get the IDs of all datasets to-be-deleted + let dataset_ids: Vec = dataset_dsl::dataset + .filter(dataset_dsl::time_deleted.is_null()) + .filter(dataset_dsl::pool_id.eq(zpool_id)) + .select(dataset_dsl::id) + .load_async(conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + // Verify that there are no regions nor region snapshots using this dataset + use db::schema::region::dsl as region_dsl; + let region_count = region_dsl::region + .filter(region_dsl::dataset_id.eq_any(dataset_ids.clone())) + .count() + .first_async::(conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + if region_count > 0 { + return Err(Error::unavail(&format!( + "Cannot delete this zpool; it has {region_count} regions" + ))); + } + + use db::schema::region_snapshot::dsl as region_snapshot_dsl; + let region_snapshot_count = region_snapshot_dsl::region_snapshot + .filter(region_snapshot_dsl::dataset_id.eq_any(dataset_ids)) + .count() + .first_async::(conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + if region_snapshot_count > 0 { + return Err( + Error::unavail(&format!("Cannot delete this zpool; it has {region_snapshot_count} region snapshots")) + ); + } + + // Ensure the datasets are deleted + diesel::update(dataset_dsl::dataset) + .filter(dataset_dsl::time_deleted.is_null()) + .filter(dataset_dsl::pool_id.eq(zpool_id)) + .set(dataset_dsl::time_deleted.eq(now)) + .execute_async(conn) + .await + .map(|_rows_modified| ()) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + // Ensure the zpool is deleted + diesel::update(zpool_dsl::zpool) + .filter(zpool_dsl::id.eq(zpool_id)) + .filter(zpool_dsl::time_deleted.is_null()) + .set(zpool_dsl::time_deleted.eq(now)) + .execute_async(conn) + .await + .map(|_rows_modified| ()) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } } diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index ef67749a4b..40f5d95a5f 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -122,6 +122,7 @@ inventory.nkeep = 5 inventory.disable = false phantom_disks.period_secs = 30 physical_disk_adoption.period_secs = 30 +decommissioned_disk_cleaner.period_secs = 60 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 blueprints.period_secs_collect_crdb_node_ids = 180 diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index 6ec80359ab..b194ecf1b6 100644 --- a/nexus/examples/config.toml +++ 
b/nexus/examples/config.toml @@ -108,6 +108,7 @@ inventory.nkeep = 5 inventory.disable = false phantom_disks.period_secs = 30 physical_disk_adoption.period_secs = 30 +decommissioned_disk_cleaner.period_secs = 60 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 blueprints.period_secs_collect_crdb_node_ids = 180 diff --git a/nexus/reconfigurator/execution/src/cockroachdb.rs b/nexus/reconfigurator/execution/src/cockroachdb.rs index 5a8710a1c5..498944598d 100644 --- a/nexus/reconfigurator/execution/src/cockroachdb.rs +++ b/nexus/reconfigurator/execution/src/cockroachdb.rs @@ -90,7 +90,10 @@ mod test { ); // Record the zpools so we don't fail to ensure datasets (unrelated to // crdb settings) during blueprint execution. - crate::tests::insert_zpool_records(datastore, &opctx, &blueprint).await; + crate::tests::create_disks_for_zones_using_datasets( + datastore, &opctx, &blueprint, + ) + .await; // Execute the initial blueprint. let overrides = Overridables::for_test(cptestctx); crate::realize_blueprint_with_overrides( diff --git a/nexus/reconfigurator/execution/src/datasets.rs b/nexus/reconfigurator/execution/src/datasets.rs index 139c94c53f..6b7e30a738 100644 --- a/nexus/reconfigurator/execution/src/datasets.rs +++ b/nexus/reconfigurator/execution/src/datasets.rs @@ -153,7 +153,10 @@ mod tests { // Record the sleds and zpools. crate::tests::insert_sled_records(datastore, &blueprint).await; - crate::tests::insert_zpool_records(datastore, opctx, &blueprint).await; + crate::tests::create_disks_for_zones_using_datasets( + datastore, opctx, &blueprint, + ) + .await; // Prior to ensuring datasets exist, there should be none. assert_eq!( diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index f3210a12aa..bdcfc66bad 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -1182,7 +1182,10 @@ mod test { // Record the zpools so we don't fail to ensure datasets (unrelated to // DNS) during blueprint execution. - crate::tests::insert_zpool_records(datastore, &opctx, &blueprint).await; + crate::tests::create_disks_for_zones_using_datasets( + datastore, &opctx, &blueprint, + ) + .await; // Now, execute the initial blueprint. let overrides = Overridables::for_test(cptestctx); diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index 8cdbd46265..e3d2019230 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -227,6 +227,12 @@ where ) .await?; + // This depends on the "deploy_disks" call earlier -- disk expungement is a + // statement of policy, but we need to be assured that the Sled Agent has + // stopped using that disk before we can mark its state as decommissioned. + omicron_physical_disks::decommission_expunged_disks(&opctx, datastore) + .await?; + // This is likely to error if any cluster upgrades are in progress (which // can take some time), so it should remain at the end so that other parts // of the blueprint can progress normally. 
@@ -241,6 +247,8 @@ where mod tests { use super::*; use nexus_db_model::Generation; + use nexus_db_model::PhysicalDisk; + use nexus_db_model::PhysicalDiskKind; use nexus_db_model::SledBaseboard; use nexus_db_model::SledSystemHardware; use nexus_db_model::SledUpdate; @@ -290,7 +298,7 @@ mod tests { // tests expect to be able to realize the the blueprint created from an // initial collection, and ensuring the zones' datasets exist requires first // inserting the sled and zpool records. - pub(crate) async fn insert_zpool_records( + pub(crate) async fn create_disks_for_zones_using_datasets( datastore: &DataStore, opctx: &OpContext, blueprint: &Blueprint, @@ -304,12 +312,27 @@ mod tests { continue; }; + let physical_disk_id = Uuid::new_v4(); let pool_id = dataset.dataset.pool_name.id(); + + let disk = PhysicalDisk::new( + physical_disk_id, + String::from("Oxide"), + format!("PhysDisk of {}", pool_id), + String::from("FakeDisk"), + PhysicalDiskKind::U2, + sled_id.into_untyped_uuid(), + ); + datastore + .physical_disk_insert(&opctx, disk.clone()) + .await + .expect("failed to upsert physical disk"); + if pool_inserted.insert(pool_id) { let zpool = Zpool::new( pool_id.into_untyped_uuid(), sled_id.into_untyped_uuid(), - Uuid::new_v4(), // physical_disk_id + physical_disk_id, ); datastore .zpool_insert(opctx, zpool) diff --git a/nexus/reconfigurator/execution/src/omicron_physical_disks.rs b/nexus/reconfigurator/execution/src/omicron_physical_disks.rs index d7d8604e7e..9ae72d2049 100644 --- a/nexus/reconfigurator/execution/src/omicron_physical_disks.rs +++ b/nexus/reconfigurator/execution/src/omicron_physical_disks.rs @@ -10,6 +10,7 @@ use anyhow::Context; use futures::stream; use futures::StreamExt; use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; use nexus_types::deployment::BlueprintPhysicalDisksConfig; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::SledUuid; @@ -97,26 +98,58 @@ pub(crate) async fn deploy_disks( } } +/// Decommissions all disks which are currently expunged +pub(crate) async fn decommission_expunged_disks( + opctx: &OpContext, + datastore: &DataStore, +) -> Result<(), Vec> { + datastore + .physical_disk_decommission_all_expunged(&opctx) + .await + .map_err(|e| vec![anyhow!(e)])?; + Ok(()) +} + #[cfg(test)] mod test { use super::deploy_disks; + + use crate::DataStore; use crate::Sled; + use async_bb8_diesel::AsyncRunQueryDsl; + use diesel::ExpressionMethods; + use diesel::QueryDsl; use httptest::matchers::{all_of, json_decoded, request}; use httptest::responders::json_encoded; use httptest::responders::status_code; use httptest::Expectation; + use nexus_db_model::Dataset; + use nexus_db_model::DatasetKind; + use nexus_db_model::PhysicalDisk; + use nexus_db_model::PhysicalDiskKind; + use nexus_db_model::PhysicalDiskPolicy; + use nexus_db_model::PhysicalDiskState; + use nexus_db_model::Region; + use nexus_db_model::Zpool; use nexus_db_queries::context::OpContext; + use nexus_db_queries::db; + use nexus_test_utils::SLED_AGENT_UUID; use nexus_test_utils_macros::nexus_test; use nexus_types::deployment::{ Blueprint, BlueprintPhysicalDiskConfig, BlueprintPhysicalDisksConfig, - BlueprintTarget, CockroachDbPreserveDowngrade, + BlueprintTarget, CockroachDbPreserveDowngrade, DiskFilter, }; + use nexus_types::identity::Asset; + use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Generation; use omicron_common::disk::DiskIdentity; + use omicron_uuid_kinds::GenericUuid; + use omicron_uuid_kinds::PhysicalDiskUuid; use 
omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::ZpoolUuid; use std::collections::BTreeMap; use std::net::SocketAddr; + use std::str::FromStr; use uuid::Uuid; type ControlPlaneTestContext = @@ -350,4 +383,227 @@ mod test { s1.verify_and_clear(); s2.verify_and_clear(); } + + async fn make_disk_in_db( + datastore: &DataStore, + opctx: &OpContext, + i: usize, + sled_id: SledUuid, + ) -> PhysicalDiskUuid { + let id = PhysicalDiskUuid::from_untyped_uuid(Uuid::new_v4()); + let physical_disk = PhysicalDisk::new( + id.into_untyped_uuid(), + "v".into(), + format!("s-{i})"), + "m".into(), + PhysicalDiskKind::U2, + sled_id.into_untyped_uuid(), + ); + datastore + .physical_disk_insert(&opctx, physical_disk.clone()) + .await + .unwrap(); + id + } + + async fn add_zpool_dataset_and_region( + datastore: &DataStore, + opctx: &OpContext, + id: PhysicalDiskUuid, + sled_id: SledUuid, + ) { + let zpool = datastore + .zpool_insert( + opctx, + Zpool::new( + Uuid::new_v4(), + sled_id.into_untyped_uuid(), + id.into_untyped_uuid(), + ), + ) + .await + .unwrap(); + + let dataset = datastore + .dataset_upsert(Dataset::new( + Uuid::new_v4(), + zpool.id(), + Some(std::net::SocketAddrV6::new( + std::net::Ipv6Addr::LOCALHOST, + 0, + 0, + 0, + )), + DatasetKind::Crucible, + )) + .await + .unwrap(); + + // There isn't a great API to insert regions (we normally allocate!) + // so insert the record manually here. + let region = { + let volume_id = Uuid::new_v4(); + Region::new( + dataset.id(), + volume_id, + 512_i64.try_into().unwrap(), + 10, + 10, + 1, + ) + }; + let conn = datastore.pool_connection_for_tests().await.unwrap(); + use nexus_db_model::schema::region::dsl; + diesel::insert_into(dsl::region) + .values(region) + .execute_async(&*conn) + .await + .unwrap(); + } + + async fn get_pools( + datastore: &DataStore, + id: PhysicalDiskUuid, + ) -> Vec { + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + use db::schema::zpool::dsl; + dsl::zpool + .filter(dsl::time_deleted.is_null()) + .filter(dsl::physical_disk_id.eq(id.into_untyped_uuid())) + .select(dsl::id) + .load_async::(&*conn) + .await + .map(|ids| { + ids.into_iter() + .map(|id| ZpoolUuid::from_untyped_uuid(id)) + .collect() + }) + .unwrap() + } + + async fn get_datasets(datastore: &DataStore, id: ZpoolUuid) -> Vec { + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + use db::schema::dataset::dsl; + dsl::dataset + .filter(dsl::time_deleted.is_null()) + .filter(dsl::pool_id.eq(id.into_untyped_uuid())) + .select(dsl::id) + .load_async(&*conn) + .await + .unwrap() + } + + async fn get_regions(datastore: &DataStore, id: Uuid) -> Vec { + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + use db::schema::region::dsl; + dsl::region + .filter(dsl::dataset_id.eq(id.into_untyped_uuid())) + .select(dsl::id) + .load_async(&*conn) + .await + .unwrap() + } + + #[nexus_test] + async fn test_decommission_expunged_disks( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let sled_id = SledUuid::from_untyped_uuid( + Uuid::from_str(&SLED_AGENT_UUID).unwrap(), + ); + + // Create a couple disks in the database + let disks = [ + make_disk_in_db(&datastore, &opctx, 0, sled_id).await, + make_disk_in_db(&datastore, &opctx, 1, sled_id).await, + ]; + + // Add a zpool, dataset, and region to each disk. 
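+        // The region is what would normally block cleanup of these zpools;
+        // decommissioning by itself should not remove any of these records.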
+ for disk_id in disks { + add_zpool_dataset_and_region(&datastore, &opctx, disk_id, sled_id) + .await; + } + + let disk_to_decommission = disks[0]; + let other_disk = disks[1]; + + // Expunge one of the disks + datastore + .physical_disk_update_policy( + &opctx, + disk_to_decommission.into_untyped_uuid(), + PhysicalDiskPolicy::Expunged, + ) + .await + .unwrap(); + + // Verify that the state of both disks is "active" + let all_disks = datastore + .physical_disk_list( + &opctx, + &DataPageParams::max_page(), + DiskFilter::All, + ) + .await + .unwrap() + .into_iter() + .map(|disk| (disk.id(), disk)) + .collect::>(); + let d = &all_disks[&disk_to_decommission.into_untyped_uuid()]; + assert_eq!(d.disk_state, PhysicalDiskState::Active); + assert_eq!(d.disk_policy, PhysicalDiskPolicy::Expunged); + let d = &all_disks[&other_disk.into_untyped_uuid()]; + assert_eq!(d.disk_state, PhysicalDiskState::Active); + assert_eq!(d.disk_policy, PhysicalDiskPolicy::InService); + + super::decommission_expunged_disks(&opctx, &datastore).await.unwrap(); + + // After decommissioning, we see the expunged disk become + // decommissioned. The other disk remains in-service. + let all_disks = datastore + .physical_disk_list( + &opctx, + &DataPageParams::max_page(), + DiskFilter::All, + ) + .await + .unwrap() + .into_iter() + .map(|disk| (disk.id(), disk)) + .collect::>(); + let d = &all_disks[&disk_to_decommission.into_untyped_uuid()]; + assert_eq!(d.disk_state, PhysicalDiskState::Decommissioned); + assert_eq!(d.disk_policy, PhysicalDiskPolicy::Expunged); + let d = &all_disks[&other_disk.into_untyped_uuid()]; + assert_eq!(d.disk_state, PhysicalDiskState::Active); + assert_eq!(d.disk_policy, PhysicalDiskPolicy::InService); + + // Even though we've decommissioned this disk, the pools, datasets, and + // regions should remain. Refer to the "decommissioned_disk_cleaner" + // for how these get eventually cleared up. + let pools = get_pools(&datastore, disk_to_decommission).await; + assert_eq!(pools.len(), 1); + let datasets = get_datasets(&datastore, pools[0]).await; + assert_eq!(datasets.len(), 1); + let regions = get_regions(&datastore, datasets[0]).await; + assert_eq!(regions.len(), 1); + + // Similarly, the "other disk" should still exist. 
+ let pools = get_pools(&datastore, other_disk).await; + assert_eq!(pools.len(), 1); + let datasets = get_datasets(&datastore, pools[0]).await; + assert_eq!(datasets.len(), 1); + let regions = get_regions(&datastore, datasets[0]).await; + assert_eq!(regions.len(), 1); + } } diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 4a5d792c80..2f1c4cd738 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -93,6 +93,7 @@ use super::tasks::bfd; use super::tasks::blueprint_execution; use super::tasks::blueprint_load; use super::tasks::crdb_node_id_collector; +use super::tasks::decommissioned_disk_cleaner; use super::tasks::dns_config; use super::tasks::dns_propagation; use super::tasks::dns_servers; @@ -142,6 +143,7 @@ pub struct BackgroundTasks { pub task_bfd_manager: Activator, pub task_inventory_collection: Activator, pub task_physical_disk_adoption: Activator, + pub task_decommissioned_disk_cleaner: Activator, pub task_phantom_disks: Activator, pub task_blueprint_loader: Activator, pub task_blueprint_executor: Activator, @@ -221,6 +223,7 @@ impl BackgroundTasksInitializer { task_bfd_manager: Activator::new(), task_inventory_collection: Activator::new(), task_physical_disk_adoption: Activator::new(), + task_decommissioned_disk_cleaner: Activator::new(), task_phantom_disks: Activator::new(), task_blueprint_loader: Activator::new(), task_blueprint_executor: Activator::new(), @@ -280,6 +283,7 @@ impl BackgroundTasksInitializer { task_bfd_manager, task_inventory_collection, task_physical_disk_adoption, + task_decommissioned_disk_cleaner, task_phantom_disks, task_blueprint_loader, task_blueprint_executor, @@ -511,6 +515,23 @@ impl BackgroundTasksInitializer { activator: task_physical_disk_adoption, }); + driver.register(TaskDefinition { + name: "decommissioned_disk_cleaner", + description: + "deletes DB records for decommissioned disks, after regions \ + and region snapshots have been replaced", + period: config.decommissioned_disk_cleaner.period_secs, + task_impl: Box::new( + decommissioned_disk_cleaner::DecommissionedDiskCleaner::new( + datastore.clone(), + config.decommissioned_disk_cleaner.disable, + ), + ), + opctx: opctx.child(BTreeMap::new()), + watchers: vec![], + activator: task_decommissioned_disk_cleaner, + }); + driver.register(TaskDefinition { name: "service_zone_nat_tracker", description: diff --git a/nexus/src/app/background/tasks/decommissioned_disk_cleaner.rs b/nexus/src/app/background/tasks/decommissioned_disk_cleaner.rs new file mode 100644 index 0000000000..cb3ef9a569 --- /dev/null +++ b/nexus/src/app/background/tasks/decommissioned_disk_cleaner.rs @@ -0,0 +1,458 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Cleans up old database state from decommissioned disks. +//! +//! This cannot happen at decommissioning time, because it depends on region +//! (and snapshot) replacement, which happens in the background. +//! +//! Cleanup involves deleting database records for disks (datasets, zpools) +//! that are no longer viable after the physical disk has been decommissioned. 
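+//!
+//! Each activation pages through zpools whose physical disk has been
+//! decommissioned (or deleted outright) and soft-deletes their dataset
+//! records and the zpool record itself. Zpools that still have regions or
+//! region snapshots are skipped and retried on a later activation.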
+ +use crate::app::background::BackgroundTask; +use anyhow::Context; +use futures::future::BoxFuture; +use futures::FutureExt; +use nexus_db_model::Zpool; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::pagination::Paginator; +use nexus_db_queries::db::DataStore; +use nexus_types::identity::Asset; +use omicron_common::api::external::Error; +use omicron_uuid_kinds::{GenericUuid, ZpoolUuid}; +use std::num::NonZeroU32; +use std::sync::Arc; + +/// Background task that cleans decommissioned disk DB records. +pub struct DecommissionedDiskCleaner { + datastore: Arc, + disable: bool, +} + +#[derive(Debug, Default)] +struct ActivationResults { + found: usize, + not_ready_to_be_deleted: usize, + deleted: usize, + error_count: usize, +} + +const MAX_BATCH: NonZeroU32 = unsafe { + // Safety: last time I checked, 100 was greater than zero. + NonZeroU32::new_unchecked(100) +}; + +impl DecommissionedDiskCleaner { + pub fn new(datastore: Arc, disable: bool) -> Self { + Self { datastore, disable } + } + + async fn clean_all( + &mut self, + results: &mut ActivationResults, + opctx: &OpContext, + ) -> Result<(), anyhow::Error> { + slog::info!(opctx.log, "Decommissioned disk cleaner running"); + + let mut paginator = Paginator::new(MAX_BATCH); + let mut last_err = Ok(()); + while let Some(p) = paginator.next() { + let zpools = self + .datastore + .zpool_on_decommissioned_disk_list( + opctx, + &p.current_pagparams(), + ) + .await + .context("failed to list zpools on decommissioned disks")?; + paginator = p.found_batch(&zpools, &|zpool| zpool.id()); + self.clean_batch(results, &mut last_err, opctx, &zpools).await; + } + + last_err + } + + async fn clean_batch( + &mut self, + results: &mut ActivationResults, + last_err: &mut Result<(), anyhow::Error>, + opctx: &OpContext, + zpools: &[Zpool], + ) { + results.found += zpools.len(); + slog::debug!(opctx.log, "Found zpools on decommissioned disks"; "count" => zpools.len()); + + for zpool in zpools { + let zpool_id = ZpoolUuid::from_untyped_uuid(zpool.id()); + slog::trace!(opctx.log, "Deleting Zpool"; "zpool" => %zpool_id); + + match self + .datastore + .zpool_delete_self_and_all_datasets(opctx, zpool_id) + .await + { + Ok(_) => { + slog::info!( + opctx.log, + "Deleted zpool and datasets within"; + "zpool" => %zpool_id, + ); + results.deleted += 1; + } + Err(Error::ServiceUnavailable { internal_message }) => { + slog::trace!( + opctx.log, + "Zpool on decommissioned disk not ready for deletion"; + "zpool" => %zpool_id, + "error" => internal_message, + ); + results.not_ready_to_be_deleted += 1; + } + Err(e) => { + slog::warn!( + opctx.log, + "Failed to zpool on decommissioned disk"; + "zpool" => %zpool_id, + "error" => %e, + ); + results.error_count += 1; + *last_err = Err(e).with_context(|| { + format!("failed to delete zpool record {zpool_id}") + }); + } + } + } + } +} + +impl BackgroundTask for DecommissionedDiskCleaner { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async move { + let mut results = ActivationResults::default(); + + let error = if !self.disable { + match self.clean_all(&mut results, opctx).await { + Ok(_) => { + slog::info!(opctx.log, "Cleaned decommissioned zpools"; + "found" => results.found, + "not_ready_to_be_deleted" => results.not_ready_to_be_deleted, + "deleted" => results.deleted, + "error_count" => results.error_count, + ); + None + } + Err(err) => { + slog::error!(opctx.log, "Failed to clean decommissioned zpools"; + "error" => %err, + "found" => results.found, 
+ "not_ready_to_be_deleted" => results.not_ready_to_be_deleted, + "deleted" => results.deleted, + "error_count" => results.error_count, + ); + Some(err.to_string()) + } + } + } else { + slog::info!(opctx.log, "Decommissioned Disk Cleaner disabled"); + None + }; + serde_json::json!({ + "found": results.found, + "not_ready_to_be_deleted": results.not_ready_to_be_deleted, + "deleted": results.deleted, + "error_count": results.error_count, + "error": error, + }) + } + .boxed() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use async_bb8_diesel::AsyncRunQueryDsl; + use diesel::ExpressionMethods; + use diesel::QueryDsl; + use nexus_db_model::Dataset; + use nexus_db_model::DatasetKind; + use nexus_db_model::PhysicalDisk; + use nexus_db_model::PhysicalDiskKind; + use nexus_db_model::PhysicalDiskPolicy; + use nexus_db_model::Region; + use nexus_test_utils::SLED_AGENT_UUID; + use nexus_test_utils_macros::nexus_test; + use omicron_uuid_kinds::{ + DatasetUuid, PhysicalDiskUuid, RegionUuid, SledUuid, + }; + use std::str::FromStr; + use uuid::Uuid; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + async fn make_disk_in_db( + datastore: &DataStore, + opctx: &OpContext, + i: usize, + sled_id: SledUuid, + ) -> PhysicalDiskUuid { + let id = PhysicalDiskUuid::from_untyped_uuid(Uuid::new_v4()); + let physical_disk = PhysicalDisk::new( + id.into_untyped_uuid(), + "v".into(), + format!("s-{i})"), + "m".into(), + PhysicalDiskKind::U2, + sled_id.into_untyped_uuid(), + ); + datastore + .physical_disk_insert(&opctx, physical_disk.clone()) + .await + .unwrap(); + id + } + + async fn add_zpool_dataset_and_region( + datastore: &DataStore, + opctx: &OpContext, + id: PhysicalDiskUuid, + sled_id: SledUuid, + ) -> (ZpoolUuid, DatasetUuid, RegionUuid) { + let zpool = datastore + .zpool_insert( + opctx, + Zpool::new( + Uuid::new_v4(), + sled_id.into_untyped_uuid(), + id.into_untyped_uuid(), + ), + ) + .await + .unwrap(); + + let dataset = datastore + .dataset_upsert(Dataset::new( + Uuid::new_v4(), + zpool.id(), + Some(std::net::SocketAddrV6::new( + std::net::Ipv6Addr::LOCALHOST, + 0, + 0, + 0, + )), + DatasetKind::Crucible, + )) + .await + .unwrap(); + + // There isn't a great API to insert regions (we normally allocate!) + // so insert the record manually here. 
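+        // The region references the dataset above, so the cleaner should
+        // refuse to delete this zpool until the region row is removed.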
+ let region = { + let volume_id = Uuid::new_v4(); + Region::new( + dataset.id(), + volume_id, + 512_i64.try_into().unwrap(), + 10, + 10, + 1, + ) + }; + let region_id = region.id(); + let conn = datastore.pool_connection_for_tests().await.unwrap(); + use nexus_db_model::schema::region::dsl; + diesel::insert_into(dsl::region) + .values(region) + .execute_async(&*conn) + .await + .unwrap(); + + ( + ZpoolUuid::from_untyped_uuid(zpool.id()), + DatasetUuid::from_untyped_uuid(dataset.id()), + RegionUuid::from_untyped_uuid(region_id), + ) + } + + struct TestFixture { + zpool_id: ZpoolUuid, + dataset_id: DatasetUuid, + region_id: RegionUuid, + } + + impl TestFixture { + async fn setup(datastore: &Arc, opctx: &OpContext) -> Self { + let sled_id = SledUuid::from_untyped_uuid( + Uuid::from_str(&SLED_AGENT_UUID).unwrap(), + ); + + let disk_id = make_disk_in_db(datastore, opctx, 0, sled_id).await; + let (zpool_id, dataset_id, region_id) = + add_zpool_dataset_and_region( + &datastore, &opctx, disk_id, sled_id, + ) + .await; + datastore + .physical_disk_update_policy( + &opctx, + disk_id.into_untyped_uuid(), + PhysicalDiskPolicy::Expunged, + ) + .await + .unwrap(); + + Self { zpool_id, dataset_id, region_id } + } + + async fn delete_region(&self, datastore: &DataStore) { + let conn = datastore.pool_connection_for_tests().await.unwrap(); + use nexus_db_model::schema::region::dsl; + + diesel::delete( + dsl::region + .filter(dsl::id.eq(self.region_id.into_untyped_uuid())), + ) + .execute_async(&*conn) + .await + .unwrap(); + } + + async fn has_been_cleaned(&self, datastore: &DataStore) -> bool { + use async_bb8_diesel::AsyncRunQueryDsl; + use diesel::{ + ExpressionMethods, OptionalExtension, QueryDsl, + SelectableHelper, + }; + use nexus_db_queries::db::schema::zpool::dsl as zpool_dsl; + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + let fetched_zpool = zpool_dsl::zpool + .filter(zpool_dsl::id.eq(self.zpool_id.into_untyped_uuid())) + .filter(zpool_dsl::time_deleted.is_null()) + .select(Zpool::as_select()) + .first_async(&*conn) + .await + .optional() + .expect("Zpool query should succeed"); + + use nexus_db_queries::db::schema::dataset::dsl as dataset_dsl; + let fetched_dataset = dataset_dsl::dataset + .filter(dataset_dsl::id.eq(self.dataset_id.into_untyped_uuid())) + .filter(dataset_dsl::time_deleted.is_null()) + .select(Dataset::as_select()) + .first_async(&*conn) + .await + .optional() + .expect("Dataset query should succeed"); + + match (fetched_zpool, fetched_dataset) { + (Some(_), Some(_)) => false, + (None, None) => true, + _ => panic!("If zpool and dataset were cleaned, they should be cleaned together"), + } + } + } + + #[nexus_test(server = crate::Server)] + async fn test_disk_cleanup_ignores_active_disks( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + let fixture = TestFixture::setup(datastore, &opctx).await; + + let mut task = DecommissionedDiskCleaner::new(datastore.clone(), false); + + // Setup: Disk is expunged, not decommissioned. + // Expectation: We ignore it. 
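+        //
+        // The cleaner only lists zpools whose physical disk is decommissioned
+        // (or deleted), so a disk that is merely expunged is never visited.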
+ + let mut results = ActivationResults::default(); + dbg!(task.clean_all(&mut results, &opctx).await) + .expect("activation completes successfully"); + dbg!(&results); + assert_eq!(results.found, 0); + assert_eq!(results.not_ready_to_be_deleted, 0); + assert_eq!(results.deleted, 0); + assert_eq!(results.error_count, 0); + + assert!(!fixture.has_been_cleaned(&datastore).await); + } + + #[nexus_test(server = crate::Server)] + async fn test_disk_cleanup_does_not_clean_disks_with_regions( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + let fixture = TestFixture::setup(datastore, &opctx).await; + + let mut task = DecommissionedDiskCleaner::new(datastore.clone(), false); + + datastore + .physical_disk_decommission_all_expunged(&opctx) + .await + .unwrap(); + + // Setup: Disk is decommissioned, but has a region. + // Expectation: We ignore it. + + let mut results = ActivationResults::default(); + dbg!(task.clean_all(&mut results, &opctx).await) + .expect("activation completes successfully"); + dbg!(&results); + assert_eq!(results.found, 1); + assert_eq!(results.not_ready_to_be_deleted, 1); + assert_eq!(results.deleted, 0); + assert_eq!(results.error_count, 0); + + assert!(!fixture.has_been_cleaned(&datastore).await); + } + + #[nexus_test(server = crate::Server)] + async fn test_disk_cleanup_cleans_disks_with_no_regions( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + let fixture = TestFixture::setup(datastore, &opctx).await; + + let mut task = DecommissionedDiskCleaner::new(datastore.clone(), false); + + datastore + .physical_disk_decommission_all_expunged(&opctx) + .await + .unwrap(); + fixture.delete_region(&datastore).await; + + // Setup: Disk is decommissioned and has no regions. + // Expectation: We clean it. + + let mut results = ActivationResults::default(); + dbg!(task.clean_all(&mut results, &opctx).await) + .expect("activation completes successfully"); + dbg!(&results); + assert_eq!(results.found, 1); + assert_eq!(results.not_ready_to_be_deleted, 0); + assert_eq!(results.deleted, 1); + assert_eq!(results.error_count, 0); + + assert!(fixture.has_been_cleaned(&datastore).await); + } +} diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index a5204588d8..5062799bdb 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -9,6 +9,7 @@ pub mod bfd; pub mod blueprint_execution; pub mod blueprint_load; pub mod crdb_node_id_collector; +pub mod decommissioned_disk_cleaner; pub mod dns_config; pub mod dns_propagation; pub mod dns_servers; diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 415727693b..e231f665fa 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -104,6 +104,9 @@ phantom_disks.period_secs = 30 physical_disk_adoption.period_secs = 30 # Disable automatic disk adoption to avoid interfering with tests. physical_disk_adoption.disable = true +decommissioned_disk_cleaner.period_secs = 60 +# Disable disk decommissioning cleanup to avoid interfering with tests. 
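+# Tests that exercise the cleaner construct it directly instead.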
+decommissioned_disk_cleaner.disable = true blueprints.period_secs_load = 100 blueprints.period_secs_execute = 600 blueprints.period_secs_collect_crdb_node_ids = 600 diff --git a/nexus/tests/integration_tests/sleds.rs b/nexus/tests/integration_tests/sleds.rs index bd8a8877fc..aa1ff3a667 100644 --- a/nexus/tests/integration_tests/sleds.rs +++ b/nexus/tests/integration_tests/sleds.rs @@ -20,6 +20,8 @@ use nexus_test_utils_macros::nexus_test; use nexus_types::external_api::views::SledInstance; use nexus_types::external_api::views::{PhysicalDisk, Sled}; use omicron_sled_agent::sim; +use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use std::str::FromStr; use uuid::Uuid; @@ -128,7 +130,7 @@ async fn test_physical_disk_create_list_delete( let disks = physical_disks_list(&external_client, &disks_url).await; assert_eq!(disks.len(), disks_initial.len() + 1); - let _new_disk = disks + let new_disk = disks .iter() .find(|found_disk| { found_disk.vendor == "v" @@ -141,10 +143,7 @@ async fn test_physical_disk_create_list_delete( datastore .physical_disk_delete( &opctx, - "v".into(), - "s".into(), - "m".into(), - sled_id, + PhysicalDiskUuid::from_untyped_uuid(new_disk.identity.id), ) .await .expect("Failed to upsert physical disk"); diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index a8f3989da4..a5feff067a 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -347,6 +347,9 @@ pub enum DiskFilter { /// All disks which are in-service. InService, + + /// All disks which are expunged but still active. + ExpungedButActive, } impl DiskFilter { @@ -366,10 +369,12 @@ impl PhysicalDiskPolicy { PhysicalDiskPolicy::InService => match filter { DiskFilter::All => true, DiskFilter::InService => true, + DiskFilter::ExpungedButActive => false, }, PhysicalDiskPolicy::Expunged => match filter { DiskFilter::All => true, DiskFilter::InService => false, + DiskFilter::ExpungedButActive => true, }, } } @@ -391,10 +396,12 @@ impl PhysicalDiskState { PhysicalDiskState::Active => match filter { DiskFilter::All => true, DiskFilter::InService => true, + DiskFilter::ExpungedButActive => true, }, PhysicalDiskState::Decommissioned => match filter { DiskFilter::All => true, DiskFilter::InService => false, + DiskFilter::ExpungedButActive => false, }, } } diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 50f9bf646e..396e3615b2 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -50,6 +50,7 @@ inventory.nkeep = 3 inventory.disable = false phantom_disks.period_secs = 30 physical_disk_adoption.period_secs = 30 +decommissioned_disk_cleaner.period_secs = 60 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 blueprints.period_secs_collect_crdb_node_ids = 180 diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index 31db278616..df49476eed 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -50,6 +50,7 @@ inventory.nkeep = 3 inventory.disable = false phantom_disks.period_secs = 30 physical_disk_adoption.period_secs = 30 +decommissioned_disk_cleaner.period_secs = 60 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 blueprints.period_secs_collect_crdb_node_ids = 180 diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 
fb8a6f6fa9..8f695d2399 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -50,6 +50,7 @@ macro_rules! impl_typed_uuid_kind { impl_typed_uuid_kind! { Collection => "collection", + Dataset => "dataset", Downstairs => "downstairs", DownstairsRegion => "downstairs_region", ExternalIp => "external_ip", @@ -60,6 +61,7 @@ impl_typed_uuid_kind! { Propolis => "propolis", RackInit => "rack_init", RackReset => "rack_reset", + Region => "region", Sled => "sled", TufRepo => "tuf_repo", Upstairs => "upstairs", From 12df35d64d0d2be87d258662f6e03c263b955d6b Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Wed, 24 Jul 2024 15:39:03 -0400 Subject: [PATCH 2/3] Correctly mark RoT Bootloader as skipped (#6117) When the RoT bootloader is skipped we need to update it similarly to the RoT and SP. --- wicket/src/state/update.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/wicket/src/state/update.rs b/wicket/src/state/update.rs index 3e0c89e83e..97c0b49d2d 100644 --- a/wicket/src/state/update.rs +++ b/wicket/src/state/update.rs @@ -258,12 +258,15 @@ impl UpdateItem { } | StepEventKind::StepCompleted { step, outcome, .. } => { if step.info.is_last_step_in_component() { - // The RoT and SP components each have two steps in - // them. If the second step ("Updating RoT/SP") is + // The RoT (and bootloader) and SP components each + // have two steps in them. If the second step + // ("Updating RoT Bootloader/RoT/SP") is // skipped, then treat the component as skipped. if matches!( step.info.component, - UpdateComponent::Sp | UpdateComponent::Rot + UpdateComponent::Sp + | UpdateComponent::Rot + | UpdateComponent::RotBootloader ) { assert_eq!( step.info.id, From 18f2520b3a70c51d28b4230a9c595960bd515095 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Wed, 24 Jul 2024 14:13:43 -0700 Subject: [PATCH 3/3] Define switch data link timeseries in TOML (#6120) Note that this renames and reorganizes the timeseries a good deal. We want to include more of the switch / sled identifiers, and make the name of the timeseries more consistent with the existing sled data link and planned instance data link timeseries. --- openapi/nexus.json | 23 +- oximeter/impl/src/schema/codegen.rs | 1 + oximeter/impl/src/schema/mod.rs | 2 + .../oximeter/schema/switch-data-link.toml | 213 ++++++++++++++++++ 4 files changed, 233 insertions(+), 6 deletions(-) create mode 100644 oximeter/oximeter/schema/switch-data-link.toml diff --git a/openapi/nexus.json b/openapi/nexus.json index 9e46a92039..a4baa8124f 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -19797,12 +19797,23 @@ }, "Units": { "description": "Measurement units for timeseries samples.", - "type": "string", - "enum": [ - "count", - "bytes", - "seconds", - "nanoseconds" + "oneOf": [ + { + "type": "string", + "enum": [ + "count", + "bytes", + "seconds", + "nanoseconds" + ] + }, + { + "description": "No meaningful units, e.g. a dimensionless quanity.", + "type": "string", + "enum": [ + "none" + ] + } ] }, "User": { diff --git a/oximeter/impl/src/schema/codegen.rs b/oximeter/impl/src/schema/codegen.rs index cde67439de..ef686c3cdd 100644 --- a/oximeter/impl/src/schema/codegen.rs +++ b/oximeter/impl/src/schema/codegen.rs @@ -515,6 +515,7 @@ fn quote_creation_time(created: DateTime) -> TokenStream { impl quote::ToTokens for Units { fn to_tokens(&self, tokens: &mut TokenStream) { let toks = match self { + Units::None => quote! { ::oximeter::schema::Units::None }, Units::Count => quote! 
{ ::oximeter::schema::Units::Count }, Units::Bytes => quote! { ::oximeter::schema::Units::Bytes }, Units::Seconds => quote! { ::oximeter::schema::Units::Seconds }, diff --git a/oximeter/impl/src/schema/mod.rs b/oximeter/impl/src/schema/mod.rs index 83a83e95b2..7743034e31 100644 --- a/oximeter/impl/src/schema/mod.rs +++ b/oximeter/impl/src/schema/mod.rs @@ -183,6 +183,8 @@ pub struct TimeseriesDescription { // TODO-completeness: Decide whether and how to handle dimensional analysis // during queries, if needed. pub enum Units { + /// No meaningful units, e.g. a dimensionless quanity. + None, Count, Bytes, Seconds, diff --git a/oximeter/oximeter/schema/switch-data-link.toml b/oximeter/oximeter/schema/switch-data-link.toml new file mode 100644 index 0000000000..fa10759ca9 --- /dev/null +++ b/oximeter/oximeter/schema/switch-data-link.toml @@ -0,0 +1,213 @@ +format_version = 1 + +[target] +name = "switch_data_link" +description = "A network data link on an Oxide switch" +authz_scope = "fleet" +versions = [ + { version = 1, fields = [ "rack_id", "sled_id", "sled_model", "sled_revision", "sled_serial", "switch_id", "switch_model", "switch_revision", "switch_serial" ] }, +] + +[[metrics]] +name = "bytes_sent" +description = "Total number of bytes sent on the data link" +units = "bytes" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "bytes_received" +description = "Total number of bytes received on the data link" +units = "bytes" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "errors_sent" +description = "Total number of errors when sending on the data link" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "errors_received" +description = "Total number of packets for the data link dropped due to any error" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "receive_crc_error_drops" +description = "Total number of packets for the data link dropped due to CRC errors" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "receive_buffer_full_drops" +description = "Total number of packets for the data link dropped due to ASIC buffer congestion" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "packets_sent" +description = "Total number of packets sent on the data link" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "packets_received" +description = "Total number of packets received on the data link" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "link_up" +description = "Reports whether the link is currently up" +units = "none" +datum_type = "bool" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "link_fsm" +description = """\ +Total entries into each state of the autonegotation / \ +link-training finite state machine\ +""" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id", "state" ] } +] + 
+[[metrics]] +name = "pcs_bad_sync_headers" +description = "Total number of bad PCS sync headers on the data link" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "pcs_errored_blocks" +description = "Total number of PCS error blocks on the data link" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "pcs_block_lock_loss" +description = "Total number of detected losses of block-lock on the data link" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "pcs_high_ber" +description = "Total number of high bit-error-rate events on the data link" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "pcs_valid_errors" +description = "Total number of valid error events on the data link" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "pcs_invalid_errors" +description = "Total number of invalid error events on the data link" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "pcs_unknown_errors" +description = "Total number of unknown error events on the data link" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[fields.rack_id] +type = "uuid" +description = "ID of the rack the link's switch is in" + +[fields.sled_id] +type = "uuid" +description = "ID of the sled managing the link's switch" + +[fields.sled_model] +type = "string" +description = "Model number of the sled managing the link's switch" + +[fields.sled_revision] +type = "u32" +description = "Revision number of the sled managing the link's switch" + +[fields.sled_serial] +type = "string" +description = "Serial number of the sled managing the link's switch" + +[fields.switch_id] +type = "uuid" +description = "ID of the switch the link is on" + +[fields.switch_model] +type = "string" +description = "The model number switch the link is on" + +[fields.switch_revision] +type = "u32" +description = "Revision number of the switch the link is on" + +[fields.switch_serial] +type = "string" +description = "Serial number of the switch the link is on" + +[fields.port_id] +type = "string" +description = "Physical switch port the link is on" + +[fields.link_id] +type = "u8" +description = "ID of the link within its switch port" + +[fields.state] +type = "string" +description = "Name of the data link FSM state"