From ab29a029ba612722cc062f0d691a7b63deb0afe7 Mon Sep 17 00:00:00 2001
From: Rain
Date: Tue, 20 Aug 2024 21:58:12 -0700
Subject: [PATCH] [reconfigurator] a couple of type improvements (#6395)

* Use a typestate for the physical disk dependency.

Just want to make sure the ordering is right once this moves to being
expressed via the update engine.

* Add strong typing around the output of `realize_blueprint`.
---
 .../execution/src/cockroachdb.rs              |  22 ++--
 nexus/reconfigurator/execution/src/dns.rs     | 106 +++++++++---------
 nexus/reconfigurator/execution/src/lib.rs     |  28 +++--
 .../execution/src/omicron_physical_disks.rs   |  51 ++++++---
 .../background/tasks/blueprint_execution.rs   |  41 +++++--
 5 files changed, 156 insertions(+), 92 deletions(-)

diff --git a/nexus/reconfigurator/execution/src/cockroachdb.rs b/nexus/reconfigurator/execution/src/cockroachdb.rs
index 277f5f91c4..01baebfb57 100644
--- a/nexus/reconfigurator/execution/src/cockroachdb.rs
+++ b/nexus/reconfigurator/execution/src/cockroachdb.rs
@@ -34,6 +34,7 @@ pub(crate) async fn ensure_settings(
 mod test {
     use super::*;
     use crate::overridables::Overridables;
+    use crate::RealizeBlueprintOutput;
     use nexus_db_queries::authn;
     use nexus_db_queries::authz;
     use nexus_test_utils_macros::nexus_test;
@@ -97,16 +98,17 @@ mod test {
         .await;
         // Execute the initial blueprint.
         let overrides = Overridables::for_test(cptestctx);
-        crate::realize_blueprint_with_overrides(
-            &opctx,
-            datastore,
-            resolver,
-            &blueprint,
-            Uuid::new_v4(),
-            &overrides,
-        )
-        .await
-        .expect("failed to execute initial blueprint");
+        let _: RealizeBlueprintOutput =
+            crate::realize_blueprint_with_overrides(
+                &opctx,
+                datastore,
+                resolver,
+                &blueprint,
+                Uuid::new_v4(),
+                &overrides,
+            )
+            .await
+            .expect("failed to execute initial blueprint");
         // The CockroachDB settings should not have changed.
         assert_eq!(
             settings,
diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs
index 4395944b25..9ca14f8e24 100644
--- a/nexus/reconfigurator/execution/src/dns.rs
+++ b/nexus/reconfigurator/execution/src/dns.rs
@@ -458,6 +458,7 @@ pub fn blueprint_nexus_external_ips(blueprint: &Blueprint) -> Vec<IpAddr> {
 mod test {
     use super::*;
     use crate::overridables::Overridables;
+    use crate::RealizeBlueprintOutput;
     use crate::Sled;
     use dns_service_client::DnsDiff;
     use internal_dns::config::Host;
@@ -1245,16 +1246,17 @@ mod test {

         // Now, execute the initial blueprint.
         let overrides = Overridables::for_test(cptestctx);
-        crate::realize_blueprint_with_overrides(
-            &opctx,
-            datastore,
-            resolver,
-            &blueprint,
-            Uuid::new_v4(),
-            &overrides,
-        )
-        .await
-        .expect("failed to execute initial blueprint");
+        let _: RealizeBlueprintOutput =
+            crate::realize_blueprint_with_overrides(
+                &opctx,
+                datastore,
+                resolver,
+                &blueprint,
+                Uuid::new_v4(),
+                &overrides,
+            )
+            .await
+            .expect("failed to execute initial blueprint");

         // DNS ought not to have changed.
         verify_dns_unchanged(
@@ -1385,16 +1387,17 @@ mod test {
         .await
         .expect("failed to set blueprint as target");

-        crate::realize_blueprint_with_overrides(
-            &opctx,
-            datastore,
-            resolver,
-            &blueprint2,
-            Uuid::new_v4(),
-            &overrides,
-        )
-        .await
-        .expect("failed to execute second blueprint");
+        let _: RealizeBlueprintOutput =
+            crate::realize_blueprint_with_overrides(
+                &opctx,
+                datastore,
+                resolver,
+                &blueprint2,
+                Uuid::new_v4(),
+                &overrides,
+            )
+            .await
+            .expect("failed to execute second blueprint");

         // Now fetch DNS again. Both should have changed this time.
         let dns_latest_internal = datastore
@@ -1459,16 +1462,17 @@ mod test {
         }

         // If we execute it again, we should see no more changes.
-        crate::realize_blueprint_with_overrides(
-            &opctx,
-            datastore,
-            resolver,
-            &blueprint2,
-            Uuid::new_v4(),
-            &overrides,
-        )
-        .await
-        .expect("failed to execute second blueprint again");
+        let _: RealizeBlueprintOutput =
+            crate::realize_blueprint_with_overrides(
+                &opctx,
+                datastore,
+                resolver,
+                &blueprint2,
+                Uuid::new_v4(),
+                &overrides,
+            )
+            .await
+            .expect("failed to execute second blueprint again");
         verify_dns_unchanged(
             &opctx,
             datastore,
@@ -1495,16 +1499,17 @@ mod test {

         // One more time, make sure that executing the blueprint does not do
         // anything.
-        crate::realize_blueprint_with_overrides(
-            &opctx,
-            datastore,
-            resolver,
-            &blueprint2,
-            Uuid::new_v4(),
-            &overrides,
-        )
-        .await
-        .expect("failed to execute second blueprint again");
+        let _: RealizeBlueprintOutput =
+            crate::realize_blueprint_with_overrides(
+                &opctx,
+                datastore,
+                resolver,
+                &blueprint2,
+                Uuid::new_v4(),
+                &overrides,
+            )
+            .await
+            .expect("failed to execute second blueprint again");
         verify_dns_unchanged(
             &opctx,
             datastore,
@@ -1589,16 +1594,17 @@ mod test {
         );

         // If we execute the blueprint, DNS should not be changed.
-        crate::realize_blueprint_with_overrides(
-            &opctx,
-            datastore,
-            resolver,
-            &blueprint,
-            Uuid::new_v4(),
-            &overrides,
-        )
-        .await
-        .expect("failed to execute blueprint");
+        let _: RealizeBlueprintOutput =
+            crate::realize_blueprint_with_overrides(
+                &opctx,
+                datastore,
+                resolver,
+                &blueprint,
+                Uuid::new_v4(),
+                &overrides,
+            )
+            .await
+            .expect("failed to execute blueprint");
         let dns_latest_internal = datastore
             .dns_config_read(&opctx, DnsGroup::Internal)
             .await
diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs
index 8606187762..2c70c7acbb 100644
--- a/nexus/reconfigurator/execution/src/lib.rs
+++ b/nexus/reconfigurator/execution/src/lib.rs
@@ -70,6 +70,15 @@ impl From<nexus_db_model::Sled> for Sled {
     }
 }

+/// The result of calling [`realize_blueprint`] or
+/// [`realize_blueprint_with_overrides`].
+#[derive(Debug)]
+#[must_use = "the output of realize_blueprint should probably be used"]
+pub struct RealizeBlueprintOutput {
+    /// Whether any sagas need to be reassigned to a new Nexus.
+    pub needs_saga_recovery: bool,
+}
+
 /// Make one attempt to realize the given blueprint, meaning to take actions to
 /// alter the real system to match the blueprint
 ///
@@ -81,7 +90,7 @@ pub async fn realize_blueprint(
     resolver: &Resolver,
     blueprint: &Blueprint,
     nexus_id: Uuid,
-) -> Result<bool, Vec<anyhow::Error>> {
+) -> Result<RealizeBlueprintOutput, Vec<anyhow::Error>> {
     realize_blueprint_with_overrides(
         opctx,
         datastore,
@@ -100,7 +109,7 @@ pub async fn realize_blueprint_with_overrides(
     blueprint: &Blueprint,
     nexus_id: Uuid,
     overrides: &Overridables,
-) -> Result<bool, Vec<anyhow::Error>> {
+) -> Result<RealizeBlueprintOutput, Vec<anyhow::Error>> {
     let opctx = opctx.child(BTreeMap::from([(
         "comment".to_string(),
         blueprint.comment.clone(),
@@ -132,7 +141,7 @@ pub async fn realize_blueprint_with_overrides(
         })
         .collect();

-    omicron_physical_disks::deploy_disks(
+    let deploy_disks_done = omicron_physical_disks::deploy_disks(
         &opctx,
         &sleds_by_id,
         &blueprint.blueprint_disks,
@@ -205,11 +214,12 @@ pub async fn realize_blueprint_with_overrides(
     )
     .await?;

-    // This depends on the "deploy_disks" call earlier -- disk expungement is a
-    // statement of policy, but we need to be assured that the Sled Agent has
-    // stopped using that disk before we can mark its state as decommissioned.
-    omicron_physical_disks::decommission_expunged_disks(&opctx, datastore)
-        .await?;
+    omicron_physical_disks::decommission_expunged_disks(
+        &opctx,
+        datastore,
+        deploy_disks_done,
+    )
+    .await?;

     // From this point on, we'll assume that any errors that we encounter do
     // *not* require stopping execution. We'll just accumulate them and return
@@ -244,7 +254,7 @@ pub async fn realize_blueprint_with_overrides(
     }

     if errors.is_empty() {
-        Ok(needs_saga_recovery)
+        Ok(RealizeBlueprintOutput { needs_saga_recovery })
     } else {
         Err(errors)
     }
diff --git a/nexus/reconfigurator/execution/src/omicron_physical_disks.rs b/nexus/reconfigurator/execution/src/omicron_physical_disks.rs
index 7adc41213e..af95eb8e77 100644
--- a/nexus/reconfigurator/execution/src/omicron_physical_disks.rs
+++ b/nexus/reconfigurator/execution/src/omicron_physical_disks.rs
@@ -25,7 +25,7 @@ pub(crate) async fn deploy_disks(
     opctx: &OpContext,
     sleds_by_id: &BTreeMap<SledUuid, Sled>,
     sled_configs: &BTreeMap<SledUuid, BlueprintPhysicalDisksConfig>,
-) -> Result<(), Vec<anyhow::Error>> {
+) -> Result<DeployDisksDone, Vec<anyhow::Error>> {
     let errors: Vec<_> = stream::iter(sled_configs)
         .filter_map(|(sled_id, config)| async move {
             let log = opctx.log.new(o!(
@@ -92,16 +92,26 @@ pub(crate) async fn deploy_disks(
         .await;

     if errors.is_empty() {
-        Ok(())
+        Ok(DeployDisksDone {})
     } else {
         Err(errors)
     }
 }

-/// Decommissions all disks which are currently expunged
+/// Typestate indicating that the deploy disks step was performed.
+#[derive(Debug)]
+#[must_use = "this should be passed into decommission_expunged_disks"]
+pub(crate) struct DeployDisksDone {}
+
+/// Decommissions all disks which are currently expunged.
 pub(crate) async fn decommission_expunged_disks(
     opctx: &OpContext,
     datastore: &DataStore,
+    // This is taken as a parameter to ensure that this depends on a
+    // "deploy_disks" call made earlier. Disk expungement is a statement of
+    // policy, but we need to be assured that the Sled Agent has stopped using
+    // that disk before we can mark its state as decommissioned.
+    _deploy_disks_done: DeployDisksDone,
 ) -> Result<(), Vec<anyhow::Error>> {
     datastore
         .physical_disk_decommission_all_expunged(&opctx)
@@ -113,6 +123,7 @@

 #[cfg(test)]
 mod test {
     use super::deploy_disks;
+    use super::DeployDisksDone;
     use crate::DataStore;
     use crate::Sled;
@@ -217,9 +228,13 @@ mod test {
         // Get a success result back when the blueprint has an empty set of
         // disks.
         let (_, blueprint) = create_blueprint(BTreeMap::new());
-        deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks)
-            .await
-            .expect("failed to deploy no disks");
+        // Use an explicit type here because not doing so can cause errors to
+        // be ignored (this behavior is genuinely terrible). Instead, ensure
+        // that the type has the right result.
+        let _: DeployDisksDone =
+            deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks)
+                .await
+                .expect("failed to deploy no disks");

         // Disks are updated in a particular order, but each request contains
         // the full set of disks that must be running.
@@ -272,9 +287,10 @@ mod test {
         }

         // Execute it.
-        deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks)
-            .await
-            .expect("failed to deploy initial disks");
+        let _: DeployDisksDone =
+            deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks)
+                .await
+                .expect("failed to deploy initial disks");

         s1.verify_and_clear();
         s2.verify_and_clear();
@@ -293,9 +309,10 @@ mod test {
                 )),
             );
         }
-        deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks)
-            .await
-            .expect("failed to deploy same disks");
+        let _: DeployDisksDone =
+            deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks)
+                .await
+                .expect("failed to deploy same disks");

         s1.verify_and_clear();
         s2.verify_and_clear();
@@ -567,7 +584,15 @@ mod test {
         assert_eq!(d.disk_state, PhysicalDiskState::Active);
         assert_eq!(d.disk_policy, PhysicalDiskPolicy::InService);

-        super::decommission_expunged_disks(&opctx, &datastore).await.unwrap();
+        super::decommission_expunged_disks(
+            &opctx,
+            &datastore,
+            // This is an internal test, and we're testing decommissioning in
+            // isolation, so it's okay to create the typestate here.
+            DeployDisksDone {},
+        )
+        .await
+        .unwrap();

         // After decommissioning, we see the expunged disk become
         // decommissioned. The other disk remains in-service.
diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs
index b430270ec9..d13e5428f8 100644
--- a/nexus/src/app/background/tasks/blueprint_execution.rs
+++ b/nexus/src/app/background/tasks/blueprint_execution.rs
@@ -10,6 +10,7 @@ use futures::FutureExt;
 use internal_dns::resolver::Resolver;
 use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db::DataStore;
+use nexus_reconfigurator_execution::RealizeBlueprintOutput;
 use nexus_types::deployment::{Blueprint, BlueprintTarget};
 use serde_json::json;
 use std::sync::Arc;
@@ -98,16 +99,21 @@ impl BlueprintExecutor {
         // Trigger anybody waiting for this to finish.
         self.tx.send_modify(|count| *count = *count + 1);

-        // If executing the blueprint requires activating the saga recovery
-        // background task, do that now.
-        info!(&opctx.log, "activating saga recovery task");
-        if let Ok(true) = result {
-            self.saga_recovery.activate();
-        }
-
         // Return the result as a `serde_json::Value`
         match result {
-            Ok(_) => json!({}),
+            Ok(RealizeBlueprintOutput { needs_saga_recovery }) => {
+                // If executing the blueprint requires activating the saga
+                // recovery background task, do that now.
+                if needs_saga_recovery {
+                    info!(&opctx.log, "activating saga recovery task");
+                    self.saga_recovery.activate();
+                }
+
+                json!({
+                    "target_id": blueprint.id.to_string(),
+                    "needs_saga_recovery": needs_saga_recovery,
+                })
+            }
             Err(errors) => {
                 let errors: Vec<_> =
                     errors.into_iter().map(|e| format!("{:#}", e)).collect();
@@ -302,10 +310,17 @@ mod test {
             )
             .await,
         );
+        let blueprint_id = blueprint.1.id;
         blueprint_tx.send(Some(blueprint)).unwrap();
         let value = task.activate(&opctx).await;
         println!("activating with no zones: {:?}", value);
-        assert_eq!(value, json!({}));
+        assert_eq!(
+            value,
+            json!({
+                "target_id": blueprint_id,
+                "needs_saga_recovery": false,
+            })
+        );

         // Create a non-empty blueprint describing two servers and verify that
         // the task correctly winds up making requests to both of them and
@@ -393,7 +408,13 @@ mod test {
         // Activate the task to trigger zone configuration on the sled-agents
         let value = task.activate(&opctx).await;
         println!("activating two sled agents: {:?}", value);
-        assert_eq!(value, json!({}));
+        assert_eq!(
+            value,
+            json!({
+                "target_id": blueprint.1.id.to_string(),
+                "needs_saga_recovery": false,
+            })
+        );

         s1.verify_and_clear();
         s2.verify_and_clear();
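
A minimal sketch of the typestate idea the first bullet of the commit message
describes, separate from the patch itself: one step returns a proof value, and
the dependent step consumes that value, so the compiler enforces the ordering.
The names below (DeployDone, deploy_step, decommission_step) are illustrative
only, not the actual omicron identifiers.

    // Proof that the deploy step ran. Code outside this module cannot
    // construct it, because the inner unit field is private.
    #[derive(Debug)]
    #[must_use = "pass this into the dependent step"]
    pub struct DeployDone(());

    pub fn deploy_step() -> DeployDone {
        // ... do the deploy work here ...
        DeployDone(())
    }

    // Taking `DeployDone` by value means a caller must have obtained it from
    // `deploy_step` first, so the ordering is checked at compile time.
    pub fn decommission_step(_proof: DeployDone) {
        // ... do the dependent work here ...
    }

    fn main() {
        let proof = deploy_step();
        decommission_step(proof);
    }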