Skip to content

Commit

Permalink
Saga pantry attach/detach: fail if pantry is removed from DNS
Browse files Browse the repository at this point in the history
Removal from DNS should correspond with the pantry zone's expungement,
which should result in a saga node failure.
  • Loading branch information
jgallagher committed Oct 14, 2024
1 parent cf4b8df commit 2d24652
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 31 deletions.
9 changes: 6 additions & 3 deletions common/src/progenitor_operation_retry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,20 @@ use crate::backoff::retry_notify;
use crate::backoff::retry_policy_internal_service;
use crate::backoff::BackoffError;

#[derive(Debug)]
#[derive(Debug, thiserror::Error)]
pub enum ProgenitorOperationRetryError<E> {
/// Nexus determined that the operation will never return a known result
/// because the remote server is gone.
#[error("remote server is gone")]
Gone,

/// Attempting to check if the retry loop should be stopped failed
GoneCheckError(Error),
#[error("failed to determine whether remote server is gone")]
GoneCheckError(#[source] Error),

/// The retry loop progenitor operation saw a permanent client error
ProgenitorError(progenitor_client::Error<E>),
#[error("permanent error")]
ProgenitorError(#[source] progenitor_client::Error<E>),
}

impl<E> ProgenitorOperationRetryError<E> {
Expand Down
82 changes: 65 additions & 17 deletions nexus/src/app/sagas/common_storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@ use super::*;

use crate::Nexus;
use crucible_pantry_client::types::VolumeConstructionRequest;
use internal_dns_types::names::ServiceName;
use nexus_db_queries::authz;
use nexus_db_queries::context::OpContext;
use nexus_db_queries::db;
use nexus_db_queries::db::lookup::LookupPath;
use omicron_common::api::external::Error;
use omicron_common::retry_until_known_result;
use omicron_common::progenitor_operation_retry::ProgenitorOperationRetry;
use slog::Logger;
use slog_error_chain::InlineErrorChain;
use std::net::SocketAddrV6;
Expand All @@ -26,7 +27,7 @@ pub(crate) use pantry_pool::PooledPantryClient;
// Common Pantry operations

pub(crate) async fn get_pantry_address(
nexus: &Arc<Nexus>,
nexus: &Nexus,
) -> Result<SocketAddrV6, ActionError> {
let client = nexus.pantry_connection_pool().claim().await.map_err(|e| {
ActionError::action_failed(format!(
Expand All @@ -37,10 +38,42 @@ pub(crate) async fn get_pantry_address(
Ok(client.address())
}

// Helper function for attach/detach below: we retry as long as the pantry isn't
// gone, and we detect "gone" by seeing whether the pantry address we've chosen
// is still present when we resolve all the crucible pantry records in DNS.
//
// This function never returns an error because it's expected to be used with
// `ProgenitorOperationRetry`, which treats an error in the "gone check" as a
// fatal error. We don't want to amplify failures: if something is wrong with
// DNS, we can't go back and choose another pantry anyway, so we'll just keep
// retrying until DNS comes back. All that to say: a failure to resolve DNS is
// treated as "the pantry is not gone".
async fn is_pantry_gone(
nexus: &Nexus,
pantry_address: SocketAddrV6,
log: &Logger,
) -> bool {
let all_pantry_dns_entries = match nexus
.resolver()
.lookup_all_socket_v6(ServiceName::CruciblePantry)
.await
{
Ok(entries) => entries,
Err(err) => {
warn!(
log, "Failed to resolve Crucible pantry in DNS";
InlineErrorChain::new(&err),
);
return false;
}
};
!all_pantry_dns_entries.contains(&pantry_address)
}

pub(crate) async fn call_pantry_attach_for_disk(
log: &slog::Logger,
opctx: &OpContext,
nexus: &Arc<Nexus>,
nexus: &Nexus,
disk_id: Uuid,
pantry_address: SocketAddrV6,
) -> Result<(), ActionError> {
Expand Down Expand Up @@ -82,18 +115,26 @@ pub(crate) async fn call_pantry_attach_for_disk(
volume_construction_request,
};

retry_until_known_result(log, || async {
client.attach(&disk_id.to_string(), &attach_request).await
})
.await
.map_err(|e| {
ActionError::action_failed(format!("pantry attach failed with {:?}", e))
})?;
let attach_operation =
|| async { client.attach(&disk_id.to_string(), &attach_request).await };
let gone_check =
|| async { Ok(is_pantry_gone(nexus, pantry_address, log).await) };

ProgenitorOperationRetry::new(attach_operation, gone_check)
.run(log)
.await
.map_err(|e| {
ActionError::action_failed(format!(
"pantry attach failed: {}",
InlineErrorChain::new(&e)
))
})?;

Ok(())
}

pub(crate) async fn call_pantry_detach_for_disk(
nexus: &Nexus,
log: &slog::Logger,
disk_id: Uuid,
pantry_address: SocketAddrV6,
Expand All @@ -104,13 +145,20 @@ pub(crate) async fn call_pantry_detach_for_disk(

let client = crucible_pantry_client::Client::new(&endpoint);

retry_until_known_result(log, || async {
client.detach(&disk_id.to_string()).await
})
.await
.map_err(|e| {
ActionError::action_failed(format!("pantry detach failed with {:?}", e))
})?;
let detach_operation =
|| async { client.detach(&disk_id.to_string()).await };
let gone_check =
|| async { Ok(is_pantry_gone(nexus, pantry_address, log).await) };

ProgenitorOperationRetry::new(detach_operation, gone_check)
.run(log)
.await
.map_err(|e| {
ActionError::action_failed(format!(
"pantry detach failed: {}",
InlineErrorChain::new(&e)
))
})?;

Ok(())
}
Expand Down
8 changes: 7 additions & 1 deletion nexus/src/app/sagas/disk_create.rs
Original file line number Diff line number Diff line change
Expand Up @@ -769,7 +769,13 @@ async fn sdc_call_pantry_attach_for_disk_undo(

let pantry_address = sagactx.lookup::<SocketAddrV6>("pantry_address")?;

call_pantry_detach_for_disk(&log, disk_id, pantry_address).await?;
call_pantry_detach_for_disk(
sagactx.user_data().nexus(),
&log,
disk_id,
pantry_address,
)
.await?;

Ok(())
}
Expand Down
8 changes: 7 additions & 1 deletion nexus/src/app/sagas/finalize_disk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,13 @@ async fn sfd_call_pantry_detach_for_disk(
let params = sagactx.saga_params::<Params>()?;
let pantry_address = sagactx.lookup::<SocketAddrV6>("pantry_address")?;

call_pantry_detach_for_disk(&log, params.disk_id, pantry_address).await
call_pantry_detach_for_disk(
sagactx.user_data().nexus(),
&log,
params.disk_id,
pantry_address,
)
.await
}

async fn sfd_clear_pantry_address(
Expand Down
11 changes: 6 additions & 5 deletions nexus/src/app/sagas/region_replacement_drive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1284,11 +1284,12 @@ async fn execute_pantry_drive_action(
volume_id: Uuid,
job_id: Uuid,
) -> Result<(), ActionError> {
// Importantly, _do not use `call_pantry_attach_for_disk`_! That uses
// `retry_until_known_result`, which we _do not want here_. The Pantry
// attach can fail if there's a racing Volume checkout to be sent to
// Propolis. Additionally, that call uses `attach` instead of
// `attach_activate_background`, which means it will hang on the activation.
// Importantly, _do not use `call_pantry_attach_for_disk`_! That retries
// as long as `pantry_address` is still resolvable in DNS, which we _do not
// want here_. The Pantry attach can fail if there's a racing Volume
// checkout to be sent to Propolis. Additionally, that call uses `attach`
// instead of `attach_activate_background`, which means it will hang on the
// activation.

let endpoint = format!("http://{}", pantry_address);
let client = crucible_pantry_client::Client::new(&endpoint);
Expand Down
18 changes: 14 additions & 4 deletions nexus/src/app/sagas/snapshot_create.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1145,8 +1145,13 @@ async fn ssc_call_pantry_attach_for_disk_undo(
pantry_address
);

call_pantry_detach_for_disk(&log, params.disk_id, pantry_address)
.await?;
call_pantry_detach_for_disk(
sagactx.user_data().nexus(),
&log,
params.disk_id,
pantry_address,
)
.await?;
} else {
info!(
log,
Expand Down Expand Up @@ -1248,8 +1253,13 @@ async fn ssc_call_pantry_detach_for_disk(
params.disk_id,
pantry_address
);
call_pantry_detach_for_disk(&log, params.disk_id, pantry_address)
.await?;
call_pantry_detach_for_disk(
sagactx.user_data().nexus(),
&log,
params.disk_id,
pantry_address,
)
.await?;
}

Ok(())
Expand Down

0 comments on commit 2d24652

Please sign in to comment.