Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Saga pantry attach/detach: fail if pantry is removed from DNS #6866

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions common/src/progenitor_operation_retry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,20 @@ use crate::backoff::retry_notify;
use crate::backoff::retry_policy_internal_service;
use crate::backoff::BackoffError;

#[derive(Debug)]
#[derive(Debug, thiserror::Error)]
pub enum ProgenitorOperationRetryError<E> {
/// Nexus determined that the operation will never return a known result
/// because the remote server is gone.
#[error("remote server is gone")]
Gone,

/// Attempting to check if the retry loop should be stopped failed
GoneCheckError(Error),
#[error("failed to determine whether remote server is gone")]
GoneCheckError(#[source] Error),

/// The retry loop progenitor operation saw a permanent client error
ProgenitorError(progenitor_client::Error<E>),
#[error("permanent error")]
ProgenitorError(#[source] progenitor_client::Error<E>),
}

impl<E> ProgenitorOperationRetryError<E> {
Expand Down
82 changes: 65 additions & 17 deletions nexus/src/app/sagas/common_storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@ use super::*;

use crate::Nexus;
use crucible_pantry_client::types::VolumeConstructionRequest;
use internal_dns_types::names::ServiceName;
use nexus_db_queries::authz;
use nexus_db_queries::context::OpContext;
use nexus_db_queries::db;
use nexus_db_queries::db::lookup::LookupPath;
use omicron_common::api::external::Error;
use omicron_common::retry_until_known_result;
use omicron_common::progenitor_operation_retry::ProgenitorOperationRetry;
use slog::Logger;
use slog_error_chain::InlineErrorChain;
use std::net::SocketAddrV6;
Expand All @@ -26,7 +27,7 @@ pub(crate) use pantry_pool::PooledPantryClient;
// Common Pantry operations

pub(crate) async fn get_pantry_address(
nexus: &Arc<Nexus>,
nexus: &Nexus,
) -> Result<SocketAddrV6, ActionError> {
let client = nexus.pantry_connection_pool().claim().await.map_err(|e| {
ActionError::action_failed(format!(
Expand All @@ -37,10 +38,42 @@ pub(crate) async fn get_pantry_address(
Ok(client.address())
}

// Helper function for attach/detach below: we retry as long as the pantry isn't
// gone, and we detect "gone" by seeing whether the pantry address we've chosen
// is still present when we resolve all the crucible pantry records in DNS.
//
// This function never returns an error because it's expected to be used with
// `ProgenitorOperationRetry`, which treats an error in the "gone check" as a
// fatal error. We don't want to amplify failures: if something is wrong with
// DNS, we can't go back and choose another pantry anyway, so we'll just keep
// retrying until DNS comes back. All that to say: a failure to resolve DNS is
// treated as "the pantry is not gone".
pub(super) async fn is_pantry_gone(
nexus: &Nexus,
pantry_address: SocketAddrV6,
log: &Logger,
) -> bool {
let all_pantry_dns_entries = match nexus
.resolver()
.lookup_all_socket_v6(ServiceName::CruciblePantry)
.await
{
Ok(entries) => entries,
Err(err) => {
warn!(
log, "Failed to resolve Crucible pantry in DNS";
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
log, "Failed to resolve Crucible pantry in DNS";
log, "Failed to query DNS for Crucible pantry";

super nitty but sometimes "failed to resolve X in DNS" means "I successfully reached DNS and X wasn't there", which isn't the case here

InlineErrorChain::new(&err),
);
return false;
}
};
!all_pantry_dns_entries.contains(&pantry_address)
}

pub(crate) async fn call_pantry_attach_for_disk(
log: &slog::Logger,
opctx: &OpContext,
nexus: &Arc<Nexus>,
nexus: &Nexus,
disk_id: Uuid,
pantry_address: SocketAddrV6,
) -> Result<(), ActionError> {
Expand Down Expand Up @@ -82,18 +115,26 @@ pub(crate) async fn call_pantry_attach_for_disk(
volume_construction_request,
};

retry_until_known_result(log, || async {
client.attach(&disk_id.to_string(), &attach_request).await
})
.await
.map_err(|e| {
ActionError::action_failed(format!("pantry attach failed with {:?}", e))
})?;
let attach_operation =
|| async { client.attach(&disk_id.to_string(), &attach_request).await };
let gone_check =
|| async { Ok(is_pantry_gone(nexus, pantry_address, log).await) };

ProgenitorOperationRetry::new(attach_operation, gone_check)
.run(log)
.await
.map_err(|e| {
ActionError::action_failed(format!(
"pantry attach failed: {}",
InlineErrorChain::new(&e)
))
})?;

Ok(())
}

pub(crate) async fn call_pantry_detach_for_disk(
nexus: &Nexus,
log: &slog::Logger,
disk_id: Uuid,
pantry_address: SocketAddrV6,
Expand All @@ -104,13 +145,20 @@ pub(crate) async fn call_pantry_detach_for_disk(

let client = crucible_pantry_client::Client::new(&endpoint);

retry_until_known_result(log, || async {
client.detach(&disk_id.to_string()).await
})
.await
.map_err(|e| {
ActionError::action_failed(format!("pantry detach failed with {:?}", e))
})?;
let detach_operation =
|| async { client.detach(&disk_id.to_string()).await };
let gone_check =
|| async { Ok(is_pantry_gone(nexus, pantry_address, log).await) };

ProgenitorOperationRetry::new(detach_operation, gone_check)
.run(log)
.await
.map_err(|e| {
ActionError::action_failed(format!(
"pantry detach failed: {}",
InlineErrorChain::new(&e)
))
})?;

Ok(())
}
Expand Down
8 changes: 7 additions & 1 deletion nexus/src/app/sagas/disk_create.rs
Original file line number Diff line number Diff line change
Expand Up @@ -769,7 +769,13 @@ async fn sdc_call_pantry_attach_for_disk_undo(

let pantry_address = sagactx.lookup::<SocketAddrV6>("pantry_address")?;

call_pantry_detach_for_disk(&log, disk_id, pantry_address).await?;
call_pantry_detach_for_disk(
sagactx.user_data().nexus(),
&log,
disk_id,
pantry_address,
)
.await?;

Ok(())
}
Expand Down
8 changes: 7 additions & 1 deletion nexus/src/app/sagas/finalize_disk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,13 @@ async fn sfd_call_pantry_detach_for_disk(
let params = sagactx.saga_params::<Params>()?;
let pantry_address = sagactx.lookup::<SocketAddrV6>("pantry_address")?;

call_pantry_detach_for_disk(&log, params.disk_id, pantry_address).await
call_pantry_detach_for_disk(
sagactx.user_data().nexus(),
&log,
params.disk_id,
pantry_address,
)
.await
}

async fn sfd_clear_pantry_address(
Expand Down
11 changes: 6 additions & 5 deletions nexus/src/app/sagas/region_replacement_drive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1284,11 +1284,12 @@ async fn execute_pantry_drive_action(
volume_id: Uuid,
job_id: Uuid,
) -> Result<(), ActionError> {
// Importantly, _do not use `call_pantry_attach_for_disk`_! That uses
// `retry_until_known_result`, which we _do not want here_. The Pantry
// attach can fail if there's a racing Volume checkout to be sent to
// Propolis. Additionally, that call uses `attach` instead of
// `attach_activate_background`, which means it will hang on the activation.
// Importantly, _do not use `call_pantry_attach_for_disk`_! That retries
// as long as `pantry_address` is still resolvable in DNS, which we _do not
// want here_. The Pantry attach can fail if there's a racing Volume
// checkout to be sent to Propolis. Additionally, that call uses `attach`
// instead of `attach_activate_background`, which means it will hang on the
// activation.
Comment on lines +1287 to +1292
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would update this message to only

Importantly, _do not use `call_pantry_attach_for_disk`_! That call uses `attach` instead of `attach_activate_background`, which means it will hang on the activation.

At some point, some sort of retry should be added here, but that shouldn't block this PR.


let endpoint = format!("http://{}", pantry_address);
let client = crucible_pantry_client::Client::new(&endpoint);
Expand Down
42 changes: 30 additions & 12 deletions nexus/src/app/sagas/snapshot_create.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@
use super::{
common_storage::{
call_pantry_attach_for_disk, call_pantry_detach_for_disk,
get_pantry_address,
get_pantry_address, is_pantry_gone,
},
ActionRegistry, NexusActionContext, NexusSaga, SagaInitError,
ACTION_GENERATE_ID,
Expand All @@ -103,9 +103,11 @@ use anyhow::anyhow;
use nexus_db_model::Generation;
use nexus_db_queries::db::identity::{Asset, Resource};
use nexus_db_queries::db::lookup::LookupPath;
use omicron_common::api::external;
use omicron_common::api::external::Error;
use omicron_common::retry_until_known_result;
use omicron_common::{
api::external, progenitor_operation_retry::ProgenitorOperationRetry,
};
use omicron_uuid_kinds::{GenericUuid, PropolisUuid, SledUuid};
use rand::{rngs::StdRng, RngCore, SeedableRng};
use serde::Deserialize;
Expand Down Expand Up @@ -1145,8 +1147,13 @@ async fn ssc_call_pantry_attach_for_disk_undo(
pantry_address
);

call_pantry_detach_for_disk(&log, params.disk_id, pantry_address)
.await?;
call_pantry_detach_for_disk(
sagactx.user_data().nexus(),
&log,
params.disk_id,
pantry_address,
)
.await?;
} else {
info!(
log,
Expand All @@ -1162,6 +1169,7 @@ async fn ssc_call_pantry_snapshot_for_disk(
sagactx: NexusActionContext,
) -> Result<(), ActionError> {
let log = sagactx.user_data().log();
let nexus = sagactx.user_data().nexus();
let params = sagactx.saga_params::<Params>()?;

let (pantry_address, _) =
Expand All @@ -1180,7 +1188,7 @@ async fn ssc_call_pantry_snapshot_for_disk(

let client = crucible_pantry_client::Client::new(&endpoint);

retry_until_known_result(log, || async {
let snapshot_operation = || async {
client
.snapshot(
&params.disk_id.to_string(),
Expand All @@ -1189,11 +1197,16 @@ async fn ssc_call_pantry_snapshot_for_disk(
},
)
.await
})
.await
.map_err(|e| {
ActionError::action_failed(Error::internal_error(&e.to_string()))
})?;
};
let gone_check =
|| async { Ok(is_pantry_gone(nexus, pantry_address, log).await) };

ProgenitorOperationRetry::new(snapshot_operation, gone_check)
.run(log)
.await
.map_err(|e| {
ActionError::action_failed(Error::internal_error(&e.to_string()))
})?;

Ok(())
}
Expand Down Expand Up @@ -1248,8 +1261,13 @@ async fn ssc_call_pantry_detach_for_disk(
params.disk_id,
pantry_address
);
call_pantry_detach_for_disk(&log, params.disk_id, pantry_address)
.await?;
call_pantry_detach_for_disk(
sagactx.user_data().nexus(),
&log,
params.disk_id,
pantry_address,
)
.await?;
}

Ok(())
Expand Down
Loading