From 64c533280c8d2ed7e6a25fa2600497e9f9b4d0e1 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 5 Jul 2024 17:31:28 -0700 Subject: [PATCH] breadcrumbs for next time (and it compiles) --- .../src/app/background/tasks/saga_recovery.rs | 49 +++++++++++++++---- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/nexus/src/app/background/tasks/saga_recovery.rs b/nexus/src/app/background/tasks/saga_recovery.rs index a251770a28..7cfd3ab458 100644 --- a/nexus/src/app/background/tasks/saga_recovery.rs +++ b/nexus/src/app/background/tasks/saga_recovery.rs @@ -2,6 +2,34 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +// XXX-dap working here +// As I write this now, there's one compile error left: we're incorrectly trying +// to store a Future into a map that's specified to hold timestamps. This is an +// artifact of combining two pre-existing hunks of code: one tracked all sagas +// recovered and stored timestamps; the other tracked the sagas recovered *in +// one pass* and stored the completion Futures. The completion Futures are only +// used for the test suite. So the answer here is probably to refactor some of +// this stuff to expose smaller pieces for use by the test suite. +// +// That's an important next step anyway. I need to take a pass through +// `activate()` and all the functions that it calls to figure out how to +// decompose it into smaller pieces, particularly for testing, but also in hopes +// that it'll be more obviously correct. Some ideas: +// - separate out planning from execution? i.e., compute which sagas are +// to be recovered/skipped/etc. with a helper that we can test in isolation. +// This is a little tricky because recovery *is* one of those steps needed to +// determine if something succeeded or failed. +// - create a StatusBuilder where we can report what we're doing? This seems +// kind of hard though +// +// I think maybe the very next step is to change `self.sagas_recovered` to +// `self.recent_sagas_recovered` and make it look like `self.recent_failures`. +// +// XXX-dap at the end, verify: +// - counters (maybe plumb these into Oximeter?) +// - task status reported by omdb +// - log entries + //! Saga recovery //! //! ## Review of distributed sagas @@ -349,7 +377,6 @@ impl SagaRecovery { bgtask_log: &slog::Logger, candidate_sagas: Vec, ) -> LastPassSuccess { - let datastore = &self.datastore; let nfound = candidate_sagas.len(); let mut nskipped = 0; let mut nrecovered = 0; @@ -358,11 +385,13 @@ impl SagaRecovery { let mut nremoved = 0; let time = Utc::now(); - let candidate_saga_ids: BTreeMap = - candidate_sagas - .into_iter() - .map(|saga| (saga.id.into(), saga)) - .collect(); + let mut candidate_saga_ids: BTreeMap< + steno::SagaId, + nexus_db_model::Saga, + > = candidate_sagas + .into_iter() + .map(|saga| (saga.id.into(), saga)) + .collect(); // First of all, remove finished sagas from our "ignore" set. // @@ -466,9 +495,9 @@ impl SagaRecovery { Ok(completion_future) => { info!(&saga_log, "recovered saga"); nrecovered += 1; - // XXX-dap where to store these completion futures - self.sagas_recovered - .insert(saga_id, completion_future.boxed()); + // XXX-dap what to do with the completion future (besides + // boxing it) + self.sagas_recovered.insert(saga_id, time); } Err(error) => { // It's essential that we not bail out early just because we @@ -493,7 +522,7 @@ impl SagaRecovery { "nfailed" => nfailed, "nskipped" => nskipped, "nambiguous" => nambiguous, - "nremove_next" => remove_next.len(), + "nremove_next" => self.remove_next.len(), ); assert_eq!(nrecovered + nfailed + nskipped + nambiguous, nfound);