From ab29a029ba612722cc062f0d691a7b63deb0afe7 Mon Sep 17 00:00:00 2001
From: Rain
Date: Tue, 20 Aug 2024 21:58:12 -0700
Subject: [PATCH] [reconfigurator] a couple of type improvements (#6395)

* Use a typestate for the physical disk dependency.

Just want to make sure the ordering is right once this moves to being
expressed via the update engine.

* Add strong typing around the output of `realize_blueprint`.
---
 .../execution/src/cockroachdb.rs              |  22 ++--
 nexus/reconfigurator/execution/src/dns.rs     | 106 +++++++++---------
 nexus/reconfigurator/execution/src/lib.rs     |  28 +++--
 .../execution/src/omicron_physical_disks.rs   |  51 ++++++---
 .../background/tasks/blueprint_execution.rs   |  41 +++++--
 5 files changed, 156 insertions(+), 92 deletions(-)

diff --git a/nexus/reconfigurator/execution/src/cockroachdb.rs b/nexus/reconfigurator/execution/src/cockroachdb.rs
index 277f5f91c4..01baebfb57 100644
--- a/nexus/reconfigurator/execution/src/cockroachdb.rs
+++ b/nexus/reconfigurator/execution/src/cockroachdb.rs
@@ -34,6 +34,7 @@ pub(crate) async fn ensure_settings(
 mod test {
     use super::*;
     use crate::overridables::Overridables;
+    use crate::RealizeBlueprintOutput;
     use nexus_db_queries::authn;
     use nexus_db_queries::authz;
     use nexus_test_utils_macros::nexus_test;
@@ -97,16 +98,17 @@ mod test {
         .await;
         // Execute the initial blueprint.
         let overrides = Overridables::for_test(cptestctx);
-        crate::realize_blueprint_with_overrides(
-            &opctx,
-            datastore,
-            resolver,
-            &blueprint,
-            Uuid::new_v4(),
-            &overrides,
-        )
-        .await
-        .expect("failed to execute initial blueprint");
+        let _: RealizeBlueprintOutput =
+            crate::realize_blueprint_with_overrides(
+                &opctx,
+                datastore,
+                resolver,
+                &blueprint,
+                Uuid::new_v4(),
+                &overrides,
+            )
+            .await
+            .expect("failed to execute initial blueprint");
         // The CockroachDB settings should not have changed.
         assert_eq!(
             settings,
diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs
index 4395944b25..9ca14f8e24 100644
--- a/nexus/reconfigurator/execution/src/dns.rs
+++ b/nexus/reconfigurator/execution/src/dns.rs
@@ -458,6 +458,7 @@ pub fn blueprint_nexus_external_ips(blueprint: &Blueprint) -> Vec<IpAddr> {
 mod test {
     use super::*;
     use crate::overridables::Overridables;
+    use crate::RealizeBlueprintOutput;
     use crate::Sled;
     use dns_service_client::DnsDiff;
     use internal_dns::config::Host;
@@ -1245,16 +1246,17 @@ mod test {

         // Now, execute the initial blueprint.
         let overrides = Overridables::for_test(cptestctx);
-        crate::realize_blueprint_with_overrides(
-            &opctx,
-            datastore,
-            resolver,
-            &blueprint,
-            Uuid::new_v4(),
-            &overrides,
-        )
-        .await
-        .expect("failed to execute initial blueprint");
+        let _: RealizeBlueprintOutput =
+            crate::realize_blueprint_with_overrides(
+                &opctx,
+                datastore,
+                resolver,
+                &blueprint,
+                Uuid::new_v4(),
+                &overrides,
+            )
+            .await
+            .expect("failed to execute initial blueprint");

         // DNS ought not to have changed.
         verify_dns_unchanged(
@@ -1385,16 +1387,17 @@ mod test {
         .await
         .expect("failed to set blueprint as target");

-        crate::realize_blueprint_with_overrides(
-            &opctx,
-            datastore,
-            resolver,
-            &blueprint2,
-            Uuid::new_v4(),
-            &overrides,
-        )
-        .await
-        .expect("failed to execute second blueprint");
+        let _: RealizeBlueprintOutput =
+            crate::realize_blueprint_with_overrides(
+                &opctx,
+                datastore,
+                resolver,
+                &blueprint2,
+                Uuid::new_v4(),
+                &overrides,
+            )
+            .await
+            .expect("failed to execute second blueprint");

         // Now fetch DNS again. Both should have changed this time.
         let dns_latest_internal = datastore
@@ -1459,16 +1462,17 @@ mod test {
         }

         // If we execute it again, we should see no more changes.
-        crate::realize_blueprint_with_overrides(
-            &opctx,
-            datastore,
-            resolver,
-            &blueprint2,
-            Uuid::new_v4(),
-            &overrides,
-        )
-        .await
-        .expect("failed to execute second blueprint again");
+        let _: RealizeBlueprintOutput =
+            crate::realize_blueprint_with_overrides(
+                &opctx,
+                datastore,
+                resolver,
+                &blueprint2,
+                Uuid::new_v4(),
+                &overrides,
+            )
+            .await
+            .expect("failed to execute second blueprint again");
         verify_dns_unchanged(
             &opctx,
             datastore,
@@ -1495,16 +1499,17 @@ mod test {

         // One more time, make sure that executing the blueprint does not do
         // anything.
-        crate::realize_blueprint_with_overrides(
-            &opctx,
-            datastore,
-            resolver,
-            &blueprint2,
-            Uuid::new_v4(),
-            &overrides,
-        )
-        .await
-        .expect("failed to execute second blueprint again");
+        let _: RealizeBlueprintOutput =
+            crate::realize_blueprint_with_overrides(
+                &opctx,
+                datastore,
+                resolver,
+                &blueprint2,
+                Uuid::new_v4(),
+                &overrides,
+            )
+            .await
+            .expect("failed to execute second blueprint again");
         verify_dns_unchanged(
             &opctx,
             datastore,
@@ -1589,16 +1594,17 @@ mod test {
         );

         // If we execute the blueprint, DNS should not be changed.
-        crate::realize_blueprint_with_overrides(
-            &opctx,
-            datastore,
-            resolver,
-            &blueprint,
-            Uuid::new_v4(),
-            &overrides,
-        )
-        .await
-        .expect("failed to execute blueprint");
+        let _: RealizeBlueprintOutput =
+            crate::realize_blueprint_with_overrides(
+                &opctx,
+                datastore,
+                resolver,
+                &blueprint,
+                Uuid::new_v4(),
+                &overrides,
+            )
+            .await
+            .expect("failed to execute blueprint");
         let dns_latest_internal = datastore
             .dns_config_read(&opctx, DnsGroup::Internal)
             .await
diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs
index 8606187762..2c70c7acbb 100644
--- a/nexus/reconfigurator/execution/src/lib.rs
+++ b/nexus/reconfigurator/execution/src/lib.rs
@@ -70,6 +70,15 @@ impl From<nexus_db_model::Sled> for Sled {
     }
 }

+/// The result of calling [`realize_blueprint`] or
+/// [`realize_blueprint_with_overrides`].
+#[derive(Debug)]
+#[must_use = "the output of realize_blueprint should probably be used"]
+pub struct RealizeBlueprintOutput {
+    /// Whether any sagas need to be reassigned to a new Nexus.
+    pub needs_saga_recovery: bool,
+}
+
 /// Make one attempt to realize the given blueprint, meaning to take actions to
 /// alter the real system to match the blueprint
 ///
@@ -81,7 +90,7 @@ pub async fn realize_blueprint(
     resolver: &Resolver,
     blueprint: &Blueprint,
     nexus_id: Uuid,
-) -> Result<bool, Vec<anyhow::Error>> {
+) -> Result<RealizeBlueprintOutput, Vec<anyhow::Error>> {
     realize_blueprint_with_overrides(
         opctx,
         datastore,
@@ -100,7 +109,7 @@ pub async fn realize_blueprint_with_overrides(
     blueprint: &Blueprint,
     nexus_id: Uuid,
     overrides: &Overridables,
-) -> Result<bool, Vec<anyhow::Error>> {
+) -> Result<RealizeBlueprintOutput, Vec<anyhow::Error>> {
     let opctx = opctx.child(BTreeMap::from([(
         "comment".to_string(),
         blueprint.comment.clone(),
@@ -132,7 +141,7 @@ pub async fn realize_blueprint_with_overrides(
         })
         .collect();

-    omicron_physical_disks::deploy_disks(
+    let deploy_disks_done = omicron_physical_disks::deploy_disks(
         &opctx,
         &sleds_by_id,
         &blueprint.blueprint_disks,
@@ -205,11 +214,12 @@ pub async fn realize_blueprint_with_overrides(
     )
     .await?;

-    // This depends on the "deploy_disks" call earlier -- disk expungement is a
-    // statement of policy, but we need to be assured that the Sled Agent has
-    // stopped using that disk before we can mark its state as decommissioned.
-    omicron_physical_disks::decommission_expunged_disks(&opctx, datastore)
-        .await?;
+    omicron_physical_disks::decommission_expunged_disks(
+        &opctx,
+        datastore,
+        deploy_disks_done,
+    )
+    .await?;

     // From this point on, we'll assume that any errors that we encounter do
     // *not* require stopping execution. We'll just accumulate them and return
@@ -244,7 +254,7 @@ pub async fn realize_blueprint_with_overrides(
     }

     if errors.is_empty() {
-        Ok(needs_saga_recovery)
+        Ok(RealizeBlueprintOutput { needs_saga_recovery })
     } else {
         Err(errors)
     }
diff --git a/nexus/reconfigurator/execution/src/omicron_physical_disks.rs b/nexus/reconfigurator/execution/src/omicron_physical_disks.rs
index 7adc41213e..af95eb8e77 100644
--- a/nexus/reconfigurator/execution/src/omicron_physical_disks.rs
+++ b/nexus/reconfigurator/execution/src/omicron_physical_disks.rs
@@ -25,7 +25,7 @@ pub(crate) async fn deploy_disks(
     opctx: &OpContext,
     sleds_by_id: &BTreeMap<SledUuid, Sled>,
     sled_configs: &BTreeMap<SledUuid, BlueprintPhysicalDisksConfig>,
-) -> Result<(), Vec<anyhow::Error>> {
+) -> Result<DeployDisksDone, Vec<anyhow::Error>> {
     let errors: Vec<_> = stream::iter(sled_configs)
         .filter_map(|(sled_id, config)| async move {
             let log = opctx.log.new(o!(
@@ -92,16 +92,26 @@ pub(crate) async fn deploy_disks(
         .await;

     if errors.is_empty() {
-        Ok(())
+        Ok(DeployDisksDone {})
     } else {
         Err(errors)
     }
 }

-/// Decommissions all disks which are currently expunged
+/// Typestate indicating that the deploy disks step was performed.
+#[derive(Debug)]
+#[must_use = "this should be passed into decommission_expunged_disks"]
+pub(crate) struct DeployDisksDone {}
+
+/// Decommissions all disks which are currently expunged.
 pub(crate) async fn decommission_expunged_disks(
     opctx: &OpContext,
     datastore: &DataStore,
+    // This is taken as a parameter to ensure that this depends on a
+    // "deploy_disks" call made earlier. Disk expungement is a statement of
+    // policy, but we need to be assured that the Sled Agent has stopped using
+    // that disk before we can mark its state as decommissioned.
+    _deploy_disks_done: DeployDisksDone,
 ) -> Result<(), Vec<anyhow::Error>> {
     datastore
         .physical_disk_decommission_all_expunged(&opctx)
@@ -113,6 +123,7 @@

 #[cfg(test)]
 mod test {
     use super::deploy_disks;
+    use super::DeployDisksDone;
     use crate::DataStore;
     use crate::Sled;
@@ -217,9 +228,13 @@ mod test {
         // Get a success result back when the blueprint has an empty set of
         // disks.
         let (_, blueprint) = create_blueprint(BTreeMap::new());
-        deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks)
-            .await
-            .expect("failed to deploy no disks");
+        // Use an explicit type here because not doing so can cause errors to
+        // be ignored (this behavior is genuinely terrible). Instead, ensure
+        // that the type has the right result.
+        let _: DeployDisksDone =
+            deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks)
+                .await
+                .expect("failed to deploy no disks");

         // Disks are updated in a particular order, but each request contains
         // the full set of disks that must be running.
@@ -272,9 +287,10 @@ mod test {
         }

         // Execute it.
-        deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks)
-            .await
-            .expect("failed to deploy initial disks");
+        let _: DeployDisksDone =
+            deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks)
+                .await
+                .expect("failed to deploy initial disks");

         s1.verify_and_clear();
         s2.verify_and_clear();
@@ -293,9 +309,10 @@ mod test {
                 )),
             );
         }
-        deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks)
-            .await
-            .expect("failed to deploy same disks");
+        let _: DeployDisksDone =
+            deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks)
+                .await
+                .expect("failed to deploy same disks");

         s1.verify_and_clear();
         s2.verify_and_clear();
@@ -567,7 +584,15 @@ mod test {
         assert_eq!(d.disk_state, PhysicalDiskState::Active);
         assert_eq!(d.disk_policy, PhysicalDiskPolicy::InService);

-        super::decommission_expunged_disks(&opctx, &datastore).await.unwrap();
+        super::decommission_expunged_disks(
+            &opctx,
+            &datastore,
+            // This is an internal test, and we're testing decommissioning in
+            // isolation, so it's okay to create the typestate here.
+            DeployDisksDone {},
+        )
+        .await
+        .unwrap();

         // After decommissioning, we see the expunged disk become
         // decommissioned. The other disk remains in-service.
diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs
index b430270ec9..d13e5428f8 100644
--- a/nexus/src/app/background/tasks/blueprint_execution.rs
+++ b/nexus/src/app/background/tasks/blueprint_execution.rs
@@ -10,6 +10,7 @@ use futures::FutureExt;
 use internal_dns::resolver::Resolver;
 use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db::DataStore;
+use nexus_reconfigurator_execution::RealizeBlueprintOutput;
 use nexus_types::deployment::{Blueprint, BlueprintTarget};
 use serde_json::json;
 use std::sync::Arc;
@@ -98,16 +99,21 @@ impl BlueprintExecutor {
         // Trigger anybody waiting for this to finish.
         self.tx.send_modify(|count| *count = *count + 1);

-        // If executing the blueprint requires activating the saga recovery
-        // background task, do that now.
-        info!(&opctx.log, "activating saga recovery task");
-        if let Ok(true) = result {
-            self.saga_recovery.activate();
-        }
-
         // Return the result as a `serde_json::Value`
         match result {
-            Ok(_) => json!({}),
+            Ok(RealizeBlueprintOutput { needs_saga_recovery }) => {
+                // If executing the blueprint requires activating the saga
+                // recovery background task, do that now.
+                if needs_saga_recovery {
+                    info!(&opctx.log, "activating saga recovery task");
+                    self.saga_recovery.activate();
+                }
+
+                json!({
+                    "target_id": blueprint.id.to_string(),
+                    "needs_saga_recovery": needs_saga_recovery,
+                })
+            }
             Err(errors) => {
                 let errors: Vec<_> =
                     errors.into_iter().map(|e| format!("{:#}", e)).collect();
@@ -302,10 +310,17 @@ mod test {
             )
             .await,
         );
+        let blueprint_id = blueprint.1.id;
         blueprint_tx.send(Some(blueprint)).unwrap();
         let value = task.activate(&opctx).await;
         println!("activating with no zones: {:?}", value);
-        assert_eq!(value, json!({}));
+        assert_eq!(
+            value,
+            json!({
+                "target_id": blueprint_id,
+                "needs_saga_recovery": false,
+            })
+        );

         // Create a non-empty blueprint describing two servers and verify that
         // the task correctly winds up making requests to both of them and
@@ -393,7 +408,13 @@ mod test {
         // Activate the task to trigger zone configuration on the sled-agents
         let value = task.activate(&opctx).await;
         println!("activating two sled agents: {:?}", value);
-        assert_eq!(value, json!({}));
+        assert_eq!(
+            value,
+            json!({
+                "target_id": blueprint.1.id.to_string(),
+                "needs_saga_recovery": false,
+            })
+        );

         s1.verify_and_clear();
         s2.verify_and_clear();
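
A minimal sketch of the typestate idea the first bullet of the commit message
describes, separate from the patch itself: one step returns a proof value, and
the dependent step consumes that value, so the compiler enforces the ordering.
The names below (DeployDone, deploy_step, decommission_step) are illustrative
only, not the actual omicron identifiers.

    // Proof that the deploy step ran. Code outside this module cannot
    // construct it, because the inner unit field is private.
    #[derive(Debug)]
    #[must_use = "pass this into the dependent step"]
    pub struct DeployDone(());

    pub fn deploy_step() -> DeployDone {
        // ... do the deploy work here ...
        DeployDone(())
    }

    // Taking `DeployDone` by value means a caller must have obtained it from
    // `deploy_step` first, so the ordering is checked at compile time.
    pub fn decommission_step(_proof: DeployDone) {
        // ... do the dependent work here ...
    }

    fn main() {
        let proof = deploy_step();
        decommission_step(proof);
    }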