Skip to content

Commit

Permalink
Remove retry_until_known_result()
Browse files Browse the repository at this point in the history
This builds on #6866. After its changes, there was only caller of
`retry_until_known_result` left; this PR removes it. We keep the retry
loop, but instead of retrying for ever, we bail out if the sled we're
trying to reach is "gone", as determined by "is it no longer
in-service", which in practice means it's been expunged.
  • Loading branch information
jgallagher committed Oct 14, 2024
1 parent 4012296 commit 2794c3e
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 44 deletions.
38 changes: 0 additions & 38 deletions common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,44 +61,6 @@ impl slog::KV for FileKv {

pub const OMICRON_DPD_TAG: &str = "omicron";

use crate::api::external::Error;
use crate::progenitor_operation_retry::ProgenitorOperationRetry;
use crate::progenitor_operation_retry::ProgenitorOperationRetryError;
use std::future::Future;

/// Retry a progenitor client operation until a known result is returned.
///
/// See [`ProgenitorOperationRetry`] for more information.
// TODO mark this deprecated, `never_bail` is a bad idea
pub async fn retry_until_known_result<F, T, E, Fut>(
log: &slog::Logger,
f: F,
) -> Result<T, progenitor_client::Error<E>>
where
F: FnMut() -> Fut,
Fut: Future<Output = Result<T, progenitor_client::Error<E>>>,
E: std::fmt::Debug,
{
match ProgenitorOperationRetry::new(f, never_bail).run(log).await {
Ok(v) => Ok(v),

Err(e) => match e {
ProgenitorOperationRetryError::ProgenitorError(e) => Err(e),

ProgenitorOperationRetryError::Gone
| ProgenitorOperationRetryError::GoneCheckError(_) => {
// ProgenitorOperationRetry::new called with `never_bail` as the
// bail check should never return these variants!
unreachable!();
}
},
}
}

async fn never_bail() -> Result<bool, Error> {
Ok(false)
}

/// A wrapper struct that does nothing other than elide the inner value from
/// [`std::fmt::Debug`] output.
///
Expand Down
12 changes: 12 additions & 0 deletions nexus/db-queries/src/db/datastore/sled.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,18 @@ impl DataStore {
Ok((sled, was_modified))
}

/// Confirms that a sled exists and is in-service.
pub async fn check_sled_in_service(
&self,
opctx: &OpContext,
sled_id: Uuid,
) -> Result<(), Error> {
let conn = &*self.pool_connection_authorized(&opctx).await?;
Self::check_sled_in_service_on_connection(conn, sled_id)
.await
.map_err(From::from)
}

/// Confirms that a sled exists and is in-service.
///
/// This function may be called from a transaction context.
Expand Down
25 changes: 19 additions & 6 deletions nexus/src/app/sagas/snapshot_create.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ use nexus_db_model::Generation;
use nexus_db_queries::db::identity::{Asset, Resource};
use nexus_db_queries::db::lookup::LookupPath;
use omicron_common::api::external::Error;
use omicron_common::retry_until_known_result;
use omicron_common::{
api::external, progenitor_operation_retry::ProgenitorOperationRetry,
};
Expand All @@ -116,6 +115,7 @@ use sled_agent_client::types::CrucibleOpts;
use sled_agent_client::types::VmmIssueDiskSnapshotRequestBody;
use sled_agent_client::types::VolumeConstructionRequest;
use slog::info;
use slog_error_chain::InlineErrorChain;
use std::collections::BTreeMap;
use std::collections::VecDeque;
use std::net::SocketAddrV6;
Expand Down Expand Up @@ -860,18 +860,31 @@ async fn ssc_send_snapshot_request_to_sled_agent(
.await
.map_err(ActionError::action_failed)?;

retry_until_known_result(log, || async {
let snapshot_operation = || async {
sled_agent_client
.vmm_issue_disk_snapshot_request(
&PropolisUuid::from_untyped_uuid(propolis_id),
&params.disk_id,
&VmmIssueDiskSnapshotRequestBody { snapshot_id },
)
.await
})
.await
.map_err(|e| e.to_string())
.map_err(ActionError::action_failed)?;
};
let gone_check = || async {
osagactx.datastore().check_sled_in_service(&opctx, sled_id).await?;
// `check_sled_in_service` returns an error if the sled is no longer in
// service; if it succeeds, the sled is not gone.
Ok(false)
};

ProgenitorOperationRetry::new(snapshot_operation, gone_check)
.run(log)
.await
.map_err(|e| {
ActionError::action_failed(format!(
"failed to issue VMM disk snapshot request: {}",
InlineErrorChain::new(&e)
))
})?;

Ok(())
}
Expand Down

0 comments on commit 2794c3e

Please sign in to comment.