From 2d2465207884e75e35d2ba30948de7fafade2821 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 14 Oct 2024 14:38:08 -0400 Subject: [PATCH 1/2] Saga pantry attach/detach: fail if pantry is removed from DNS Removal from DNS should correspond with the pantry zone's expungement, which should result in a saga node failure. --- common/src/progenitor_operation_retry.rs | 9 +- nexus/src/app/sagas/common_storage.rs | 82 +++++++++++++++---- nexus/src/app/sagas/disk_create.rs | 8 +- nexus/src/app/sagas/finalize_disk.rs | 8 +- .../src/app/sagas/region_replacement_drive.rs | 11 +-- nexus/src/app/sagas/snapshot_create.rs | 18 +++- 6 files changed, 105 insertions(+), 31 deletions(-) diff --git a/common/src/progenitor_operation_retry.rs b/common/src/progenitor_operation_retry.rs index 6007e117b3..4864cc90b7 100644 --- a/common/src/progenitor_operation_retry.rs +++ b/common/src/progenitor_operation_retry.rs @@ -11,17 +11,20 @@ use crate::backoff::retry_notify; use crate::backoff::retry_policy_internal_service; use crate::backoff::BackoffError; -#[derive(Debug)] +#[derive(Debug, thiserror::Error)] pub enum ProgenitorOperationRetryError { /// Nexus determined that the operation will never return a known result /// because the remote server is gone. + #[error("remote server is gone")] Gone, /// Attempting to check if the retry loop should be stopped failed - GoneCheckError(Error), + #[error("failed to determine whether remote server is gone")] + GoneCheckError(#[source] Error), /// The retry loop progenitor operation saw a permanent client error - ProgenitorError(progenitor_client::Error), + #[error("permanent error")] + ProgenitorError(#[source] progenitor_client::Error), } impl ProgenitorOperationRetryError { diff --git a/nexus/src/app/sagas/common_storage.rs b/nexus/src/app/sagas/common_storage.rs index 28e0bc4771..dc25589b37 100644 --- a/nexus/src/app/sagas/common_storage.rs +++ b/nexus/src/app/sagas/common_storage.rs @@ -8,12 +8,13 @@ use super::*; use crate::Nexus; use crucible_pantry_client::types::VolumeConstructionRequest; +use internal_dns_types::names::ServiceName; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; use nexus_db_queries::db::lookup::LookupPath; use omicron_common::api::external::Error; -use omicron_common::retry_until_known_result; +use omicron_common::progenitor_operation_retry::ProgenitorOperationRetry; use slog::Logger; use slog_error_chain::InlineErrorChain; use std::net::SocketAddrV6; @@ -26,7 +27,7 @@ pub(crate) use pantry_pool::PooledPantryClient; // Common Pantry operations pub(crate) async fn get_pantry_address( - nexus: &Arc, + nexus: &Nexus, ) -> Result { let client = nexus.pantry_connection_pool().claim().await.map_err(|e| { ActionError::action_failed(format!( @@ -37,10 +38,42 @@ pub(crate) async fn get_pantry_address( Ok(client.address()) } +// Helper function for attach/detach below: we retry as long as the pantry isn't +// gone, and we detect "gone" by seeing whether the pantry address we've chosen +// is still present when we resolve all the crucible pantry records in DNS. +// +// This function never returns an error because it's expected to be used with +// `ProgenitorOperationRetry`, which treats an error in the "gone check" as a +// fatal error. We don't want to amplify failures: if something is wrong with +// DNS, we can't go back and choose another pantry anyway, so we'll just keep +// retrying until DNS comes back. All that to say: a failure to resolve DNS is +// treated as "the pantry is not gone". +async fn is_pantry_gone( + nexus: &Nexus, + pantry_address: SocketAddrV6, + log: &Logger, +) -> bool { + let all_pantry_dns_entries = match nexus + .resolver() + .lookup_all_socket_v6(ServiceName::CruciblePantry) + .await + { + Ok(entries) => entries, + Err(err) => { + warn!( + log, "Failed to resolve Crucible pantry in DNS"; + InlineErrorChain::new(&err), + ); + return false; + } + }; + !all_pantry_dns_entries.contains(&pantry_address) +} + pub(crate) async fn call_pantry_attach_for_disk( log: &slog::Logger, opctx: &OpContext, - nexus: &Arc, + nexus: &Nexus, disk_id: Uuid, pantry_address: SocketAddrV6, ) -> Result<(), ActionError> { @@ -82,18 +115,26 @@ pub(crate) async fn call_pantry_attach_for_disk( volume_construction_request, }; - retry_until_known_result(log, || async { - client.attach(&disk_id.to_string(), &attach_request).await - }) - .await - .map_err(|e| { - ActionError::action_failed(format!("pantry attach failed with {:?}", e)) - })?; + let attach_operation = + || async { client.attach(&disk_id.to_string(), &attach_request).await }; + let gone_check = + || async { Ok(is_pantry_gone(nexus, pantry_address, log).await) }; + + ProgenitorOperationRetry::new(attach_operation, gone_check) + .run(log) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "pantry attach failed: {}", + InlineErrorChain::new(&e) + )) + })?; Ok(()) } pub(crate) async fn call_pantry_detach_for_disk( + nexus: &Nexus, log: &slog::Logger, disk_id: Uuid, pantry_address: SocketAddrV6, @@ -104,13 +145,20 @@ pub(crate) async fn call_pantry_detach_for_disk( let client = crucible_pantry_client::Client::new(&endpoint); - retry_until_known_result(log, || async { - client.detach(&disk_id.to_string()).await - }) - .await - .map_err(|e| { - ActionError::action_failed(format!("pantry detach failed with {:?}", e)) - })?; + let detach_operation = + || async { client.detach(&disk_id.to_string()).await }; + let gone_check = + || async { Ok(is_pantry_gone(nexus, pantry_address, log).await) }; + + ProgenitorOperationRetry::new(detach_operation, gone_check) + .run(log) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "pantry detach failed: {}", + InlineErrorChain::new(&e) + )) + })?; Ok(()) } diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index d2e3053668..844669a7e0 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -769,7 +769,13 @@ async fn sdc_call_pantry_attach_for_disk_undo( let pantry_address = sagactx.lookup::("pantry_address")?; - call_pantry_detach_for_disk(&log, disk_id, pantry_address).await?; + call_pantry_detach_for_disk( + sagactx.user_data().nexus(), + &log, + disk_id, + pantry_address, + ) + .await?; Ok(()) } diff --git a/nexus/src/app/sagas/finalize_disk.rs b/nexus/src/app/sagas/finalize_disk.rs index 89893fb703..997914df45 100644 --- a/nexus/src/app/sagas/finalize_disk.rs +++ b/nexus/src/app/sagas/finalize_disk.rs @@ -286,7 +286,13 @@ async fn sfd_call_pantry_detach_for_disk( let params = sagactx.saga_params::()?; let pantry_address = sagactx.lookup::("pantry_address")?; - call_pantry_detach_for_disk(&log, params.disk_id, pantry_address).await + call_pantry_detach_for_disk( + sagactx.user_data().nexus(), + &log, + params.disk_id, + pantry_address, + ) + .await } async fn sfd_clear_pantry_address( diff --git a/nexus/src/app/sagas/region_replacement_drive.rs b/nexus/src/app/sagas/region_replacement_drive.rs index 4e6335435f..7c90c4d65d 100644 --- a/nexus/src/app/sagas/region_replacement_drive.rs +++ b/nexus/src/app/sagas/region_replacement_drive.rs @@ -1284,11 +1284,12 @@ async fn execute_pantry_drive_action( volume_id: Uuid, job_id: Uuid, ) -> Result<(), ActionError> { - // Importantly, _do not use `call_pantry_attach_for_disk`_! That uses - // `retry_until_known_result`, which we _do not want here_. The Pantry - // attach can fail if there's a racing Volume checkout to be sent to - // Propolis. Additionally, that call uses `attach` instead of - // `attach_activate_background`, which means it will hang on the activation. + // Importantly, _do not use `call_pantry_attach_for_disk`_! That retries + // as long as `pantry_address` is still resolvable in DNS, which we _do not + // want here_. The Pantry attach can fail if there's a racing Volume + // checkout to be sent to Propolis. Additionally, that call uses `attach` + // instead of `attach_activate_background`, which means it will hang on the + // activation. let endpoint = format!("http://{}", pantry_address); let client = crucible_pantry_client::Client::new(&endpoint); diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index b47d036d9c..6a5862f883 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -1145,8 +1145,13 @@ async fn ssc_call_pantry_attach_for_disk_undo( pantry_address ); - call_pantry_detach_for_disk(&log, params.disk_id, pantry_address) - .await?; + call_pantry_detach_for_disk( + sagactx.user_data().nexus(), + &log, + params.disk_id, + pantry_address, + ) + .await?; } else { info!( log, @@ -1248,8 +1253,13 @@ async fn ssc_call_pantry_detach_for_disk( params.disk_id, pantry_address ); - call_pantry_detach_for_disk(&log, params.disk_id, pantry_address) - .await?; + call_pantry_detach_for_disk( + sagactx.user_data().nexus(), + &log, + params.disk_id, + pantry_address, + ) + .await?; } Ok(()) From 401229643f80a4994c7c704932a06f685cb2d505 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 14 Oct 2024 16:17:58 -0400 Subject: [PATCH 2/2] missed a pantry retry_until_known_result --- nexus/src/app/sagas/common_storage.rs | 2 +- nexus/src/app/sagas/snapshot_create.rs | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/nexus/src/app/sagas/common_storage.rs b/nexus/src/app/sagas/common_storage.rs index dc25589b37..b5bc4ee33e 100644 --- a/nexus/src/app/sagas/common_storage.rs +++ b/nexus/src/app/sagas/common_storage.rs @@ -48,7 +48,7 @@ pub(crate) async fn get_pantry_address( // DNS, we can't go back and choose another pantry anyway, so we'll just keep // retrying until DNS comes back. All that to say: a failure to resolve DNS is // treated as "the pantry is not gone". -async fn is_pantry_gone( +pub(super) async fn is_pantry_gone( nexus: &Nexus, pantry_address: SocketAddrV6, log: &Logger, diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index 6a5862f883..dd10fa267b 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -91,7 +91,7 @@ use super::{ common_storage::{ call_pantry_attach_for_disk, call_pantry_detach_for_disk, - get_pantry_address, + get_pantry_address, is_pantry_gone, }, ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, ACTION_GENERATE_ID, @@ -103,9 +103,11 @@ use anyhow::anyhow; use nexus_db_model::Generation; use nexus_db_queries::db::identity::{Asset, Resource}; use nexus_db_queries::db::lookup::LookupPath; -use omicron_common::api::external; use omicron_common::api::external::Error; use omicron_common::retry_until_known_result; +use omicron_common::{ + api::external, progenitor_operation_retry::ProgenitorOperationRetry, +}; use omicron_uuid_kinds::{GenericUuid, PropolisUuid, SledUuid}; use rand::{rngs::StdRng, RngCore, SeedableRng}; use serde::Deserialize; @@ -1167,6 +1169,7 @@ async fn ssc_call_pantry_snapshot_for_disk( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let log = sagactx.user_data().log(); + let nexus = sagactx.user_data().nexus(); let params = sagactx.saga_params::()?; let (pantry_address, _) = @@ -1185,7 +1188,7 @@ async fn ssc_call_pantry_snapshot_for_disk( let client = crucible_pantry_client::Client::new(&endpoint); - retry_until_known_result(log, || async { + let snapshot_operation = || async { client .snapshot( ¶ms.disk_id.to_string(), @@ -1194,11 +1197,16 @@ async fn ssc_call_pantry_snapshot_for_disk( }, ) .await - }) - .await - .map_err(|e| { - ActionError::action_failed(Error::internal_error(&e.to_string())) - })?; + }; + let gone_check = + || async { Ok(is_pantry_gone(nexus, pantry_address, log).await) }; + + ProgenitorOperationRetry::new(snapshot_operation, gone_check) + .run(log) + .await + .map_err(|e| { + ActionError::action_failed(Error::internal_error(&e.to_string())) + })?; Ok(()) }