diff --git a/common/src/progenitor_operation_retry.rs b/common/src/progenitor_operation_retry.rs index 6007e117b3..4864cc90b7 100644 --- a/common/src/progenitor_operation_retry.rs +++ b/common/src/progenitor_operation_retry.rs @@ -11,17 +11,20 @@ use crate::backoff::retry_notify; use crate::backoff::retry_policy_internal_service; use crate::backoff::BackoffError; -#[derive(Debug)] +#[derive(Debug, thiserror::Error)] pub enum ProgenitorOperationRetryError { /// Nexus determined that the operation will never return a known result /// because the remote server is gone. + #[error("remote server is gone")] Gone, /// Attempting to check if the retry loop should be stopped failed - GoneCheckError(Error), + #[error("failed to determine whether remote server is gone")] + GoneCheckError(#[source] Error), /// The retry loop progenitor operation saw a permanent client error - ProgenitorError(progenitor_client::Error), + #[error("permanent error")] + ProgenitorError(#[source] progenitor_client::Error), } impl ProgenitorOperationRetryError { diff --git a/nexus/src/app/sagas/common_storage.rs b/nexus/src/app/sagas/common_storage.rs index 28e0bc4771..dc25589b37 100644 --- a/nexus/src/app/sagas/common_storage.rs +++ b/nexus/src/app/sagas/common_storage.rs @@ -8,12 +8,13 @@ use super::*; use crate::Nexus; use crucible_pantry_client::types::VolumeConstructionRequest; +use internal_dns_types::names::ServiceName; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; use nexus_db_queries::db::lookup::LookupPath; use omicron_common::api::external::Error; -use omicron_common::retry_until_known_result; +use omicron_common::progenitor_operation_retry::ProgenitorOperationRetry; use slog::Logger; use slog_error_chain::InlineErrorChain; use std::net::SocketAddrV6; @@ -26,7 +27,7 @@ pub(crate) use pantry_pool::PooledPantryClient; // Common Pantry operations pub(crate) async fn get_pantry_address( - nexus: &Arc, + nexus: &Nexus, ) -> Result { let client = nexus.pantry_connection_pool().claim().await.map_err(|e| { ActionError::action_failed(format!( @@ -37,10 +38,42 @@ pub(crate) async fn get_pantry_address( Ok(client.address()) } +// Helper function for attach/detach below: we retry as long as the pantry isn't +// gone, and we detect "gone" by seeing whether the pantry address we've chosen +// is still present when we resolve all the crucible pantry records in DNS. +// +// This function never returns an error because it's expected to be used with +// `ProgenitorOperationRetry`, which treats an error in the "gone check" as a +// fatal error. We don't want to amplify failures: if something is wrong with +// DNS, we can't go back and choose another pantry anyway, so we'll just keep +// retrying until DNS comes back. All that to say: a failure to resolve DNS is +// treated as "the pantry is not gone". +async fn is_pantry_gone( + nexus: &Nexus, + pantry_address: SocketAddrV6, + log: &Logger, +) -> bool { + let all_pantry_dns_entries = match nexus + .resolver() + .lookup_all_socket_v6(ServiceName::CruciblePantry) + .await + { + Ok(entries) => entries, + Err(err) => { + warn!( + log, "Failed to resolve Crucible pantry in DNS"; + InlineErrorChain::new(&err), + ); + return false; + } + }; + !all_pantry_dns_entries.contains(&pantry_address) +} + pub(crate) async fn call_pantry_attach_for_disk( log: &slog::Logger, opctx: &OpContext, - nexus: &Arc, + nexus: &Nexus, disk_id: Uuid, pantry_address: SocketAddrV6, ) -> Result<(), ActionError> { @@ -82,18 +115,26 @@ pub(crate) async fn call_pantry_attach_for_disk( volume_construction_request, }; - retry_until_known_result(log, || async { - client.attach(&disk_id.to_string(), &attach_request).await - }) - .await - .map_err(|e| { - ActionError::action_failed(format!("pantry attach failed with {:?}", e)) - })?; + let attach_operation = + || async { client.attach(&disk_id.to_string(), &attach_request).await }; + let gone_check = + || async { Ok(is_pantry_gone(nexus, pantry_address, log).await) }; + + ProgenitorOperationRetry::new(attach_operation, gone_check) + .run(log) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "pantry attach failed: {}", + InlineErrorChain::new(&e) + )) + })?; Ok(()) } pub(crate) async fn call_pantry_detach_for_disk( + nexus: &Nexus, log: &slog::Logger, disk_id: Uuid, pantry_address: SocketAddrV6, @@ -104,13 +145,20 @@ pub(crate) async fn call_pantry_detach_for_disk( let client = crucible_pantry_client::Client::new(&endpoint); - retry_until_known_result(log, || async { - client.detach(&disk_id.to_string()).await - }) - .await - .map_err(|e| { - ActionError::action_failed(format!("pantry detach failed with {:?}", e)) - })?; + let detach_operation = + || async { client.detach(&disk_id.to_string()).await }; + let gone_check = + || async { Ok(is_pantry_gone(nexus, pantry_address, log).await) }; + + ProgenitorOperationRetry::new(detach_operation, gone_check) + .run(log) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "pantry detach failed: {}", + InlineErrorChain::new(&e) + )) + })?; Ok(()) } diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index d2e3053668..844669a7e0 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -769,7 +769,13 @@ async fn sdc_call_pantry_attach_for_disk_undo( let pantry_address = sagactx.lookup::("pantry_address")?; - call_pantry_detach_for_disk(&log, disk_id, pantry_address).await?; + call_pantry_detach_for_disk( + sagactx.user_data().nexus(), + &log, + disk_id, + pantry_address, + ) + .await?; Ok(()) } diff --git a/nexus/src/app/sagas/finalize_disk.rs b/nexus/src/app/sagas/finalize_disk.rs index 89893fb703..997914df45 100644 --- a/nexus/src/app/sagas/finalize_disk.rs +++ b/nexus/src/app/sagas/finalize_disk.rs @@ -286,7 +286,13 @@ async fn sfd_call_pantry_detach_for_disk( let params = sagactx.saga_params::()?; let pantry_address = sagactx.lookup::("pantry_address")?; - call_pantry_detach_for_disk(&log, params.disk_id, pantry_address).await + call_pantry_detach_for_disk( + sagactx.user_data().nexus(), + &log, + params.disk_id, + pantry_address, + ) + .await } async fn sfd_clear_pantry_address( diff --git a/nexus/src/app/sagas/region_replacement_drive.rs b/nexus/src/app/sagas/region_replacement_drive.rs index 4e6335435f..7c90c4d65d 100644 --- a/nexus/src/app/sagas/region_replacement_drive.rs +++ b/nexus/src/app/sagas/region_replacement_drive.rs @@ -1284,11 +1284,12 @@ async fn execute_pantry_drive_action( volume_id: Uuid, job_id: Uuid, ) -> Result<(), ActionError> { - // Importantly, _do not use `call_pantry_attach_for_disk`_! That uses - // `retry_until_known_result`, which we _do not want here_. The Pantry - // attach can fail if there's a racing Volume checkout to be sent to - // Propolis. Additionally, that call uses `attach` instead of - // `attach_activate_background`, which means it will hang on the activation. + // Importantly, _do not use `call_pantry_attach_for_disk`_! That retries + // as long as `pantry_address` is still resolvable in DNS, which we _do not + // want here_. The Pantry attach can fail if there's a racing Volume + // checkout to be sent to Propolis. Additionally, that call uses `attach` + // instead of `attach_activate_background`, which means it will hang on the + // activation. let endpoint = format!("http://{}", pantry_address); let client = crucible_pantry_client::Client::new(&endpoint); diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index b47d036d9c..6a5862f883 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -1145,8 +1145,13 @@ async fn ssc_call_pantry_attach_for_disk_undo( pantry_address ); - call_pantry_detach_for_disk(&log, params.disk_id, pantry_address) - .await?; + call_pantry_detach_for_disk( + sagactx.user_data().nexus(), + &log, + params.disk_id, + pantry_address, + ) + .await?; } else { info!( log, @@ -1248,8 +1253,13 @@ async fn ssc_call_pantry_detach_for_disk( params.disk_id, pantry_address ); - call_pantry_detach_for_disk(&log, params.disk_id, pantry_address) - .await?; + call_pantry_detach_for_disk( + sagactx.user_data().nexus(), + &log, + params.disk_id, + pantry_address, + ) + .await?; } Ok(())