diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs
index f008b04819b..f7d35090d2c 100644
--- a/nexus/db-queries/src/db/datastore/instance.rs
+++ b/nexus/db-queries/src/db/datastore/instance.rs
@@ -518,6 +518,164 @@ impl DataStore {
         Ok(updated)
     }
 
+    /// Updates an instance record by setting the instance's migration ID.
+    pub async fn instance_set_migration_ids(
+        &self,
+        opctx: &OpContext,
+        instance_id: InstanceUuid,
+        src_propolis_id: PropolisUuid,
+        migration_id: Uuid,
+        target_propolis_id: PropolisUuid,
+    ) -> Result<bool, Error> {
+        use db::schema::instance::dsl;
+
+        let instance_id = instance_id.into_untyped_uuid();
+        let target_propolis_id = target_propolis_id.into_untyped_uuid();
+        let src_propolis_id = src_propolis_id.into_untyped_uuid();
+        let updated = diesel::update(dsl::instance)
+            .filter(dsl::time_deleted.is_null())
+            .filter(dsl::id.eq(instance_id))
+            .filter(dsl::migration_id.is_null())
+            .filter(dsl::target_propolis_id.is_null())
+            .filter(dsl::active_propolis_id.eq(src_propolis_id))
+            .set((
+                dsl::migration_id.eq(Some(migration_id)),
+                dsl::target_propolis_id.eq(Some(target_propolis_id)),
+                // advance the generation
+                dsl::state_generation.eq(dsl::state_generation + 1),
+                dsl::time_state_updated.eq(Utc::now()),
+            ))
+            .check_if_exists::<Instance>(instance_id.into_untyped_uuid())
+            .execute_and_check(&*self.pool_connection_authorized(&opctx).await?)
+            .await
+            .map_err(|e| {
+                public_error_from_diesel(
+                    e,
+                    ErrorHandler::NotFoundByLookup(
+                        ResourceType::Instance,
+                        LookupType::ById(instance_id),
+                    ),
+                )
+            })?;
+
+        match updated {
+            // If we updated the instance, that's great! Good job team!
+            UpdateAndQueryResult { status: UpdateStatus::Updated, .. } => {
+                Ok(true)
+            }
+            // No update was performed because the migration ID has already been
+            // set to the ID we were trying to set it to. That's fine, count it
+            // as a success.
+            UpdateAndQueryResult {
+                found: Instance { runtime_state, .. },
+                ..
+            } if runtime_state.migration_id == Some(migration_id) => {
+                debug_assert_eq!(
+                    runtime_state.dst_propolis_id,
+                    Some(target_propolis_id)
+                );
+                debug_assert_eq!(
+                    runtime_state.propolis_id,
+                    Some(src_propolis_id)
+                );
+                Ok(false)
+            }
+
+            // On the other hand, if there was already a different migration ID,
+            // that means another migrate saga has already started a migration.
+            // Guess I'll die!
+            UpdateAndQueryResult {
+                found:
+                    Instance {
+                        runtime_state:
+                            InstanceRuntimeState {
+                                migration_id: Some(actual_migration_id),
+                                ..
+                            },
+                        ..
+                    },
+                ..
+            } => {
+                slog::info!(
+                    opctx.log,
+                    "failed to set instance migration IDs: a different migration ID was already set";
+                    "instance_id" => %instance_id,
+                    "desired_migration_id" => %migration_id,
+                    "actual_migration_id" => %actual_migration_id,
+                );
+                Err(Error::conflict("instance is already migrating"))
+            }
+            // If one of the other filters didn't match, our understanding of
+            // the instance's state is clearly pretty wrong.
+            UpdateAndQueryResult {
+                found: Instance { runtime_state, .. },
+                ..
+            } => {
+                slog::warn!(
+                    opctx.log,
+                    "failed to set instance migration IDs: one of its Propolis IDs was not what we anticipated!";
+                    "instance_id" => %instance_id,
+                    "desired_migration_id" => %migration_id,
+                    "desired_active_propolis_id" => %src_propolis_id,
+                    "desired_target_propolis_id" => %target_propolis_id,
+                    "actual_migration_id" => ?runtime_state.migration_id,
+                    "actual_active_propolis_id" => ?runtime_state.propolis_id,
+                    "actual_target_propolis_id" => ?runtime_state.dst_propolis_id,
+                );
+                Err(Error::conflict(
+                    "instance snapshot didn't match actual state",
+                ))
+            }
+        }
+    }
+
+    /// Unsets the migration IDs set by
+    /// [`DataStore::instance_set_migration_ids`].
+    ///
+    /// This method will only unset the instance's migration IDs if they match
+    /// the provided ones.
+    pub async fn instance_unset_migration_ids(
+        &self,
+        opctx: &OpContext,
+        instance_id: InstanceUuid,
+        migration_id: Uuid,
+        target_propolis_id: PropolisUuid,
+    ) -> Result<bool, Error> {
+        use db::schema::instance::dsl;
+
+        let instance_id = instance_id.into_untyped_uuid();
+        let target_propolis_id = target_propolis_id.into_untyped_uuid();
+        let updated = diesel::update(dsl::instance)
+            .filter(dsl::time_deleted.is_null())
+            .filter(dsl::id.eq(instance_id))
+            .filter(dsl::migration_id.eq(migration_id))
+            .filter(dsl::target_propolis_id.eq(target_propolis_id))
+            .set((
+                dsl::migration_id.eq(None::<Uuid>),
+                dsl::target_propolis_id.eq(None::<Uuid>),
+                // advance the generation
+                dsl::state_generation.eq(dsl::state_generation + 1),
+                dsl::time_state_updated.eq(Utc::now()),
+            ))
+            .check_if_exists::<Instance>(instance_id.into_untyped_uuid())
+            .execute_and_check(&*self.pool_connection_authorized(&opctx).await?)
+            .await
+            .map(|r| match r.status {
+                UpdateStatus::Updated => true,
+                UpdateStatus::NotUpdatedButExists => false,
+            })
+            .map_err(|e| {
+                public_error_from_diesel(
+                    e,
+                    ErrorHandler::NotFoundByLookup(
+                        ResourceType::Instance,
+                        LookupType::ById(instance_id),
+                    ),
+                )
+            })?;
+        Ok(updated)
+    }
+
     /// Updates an instance record and a VMM record with a single database
     /// command.
     ///
diff --git a/nexus/db-queries/src/db/datastore/migration.rs b/nexus/db-queries/src/db/datastore/migration.rs
index ba8a4e03921..23f0776d59d 100644
--- a/nexus/db-queries/src/db/datastore/migration.rs
+++ b/nexus/db-queries/src/db/datastore/migration.rs
@@ -38,24 +38,24 @@ impl DataStore {
             .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
     }
 
-    /// Marks a migration record as deleted if and only if both sides of the
-    /// migration are in a terminal state.
-    pub async fn migration_terminate(
+    /// Marks a migration record as failed.
+    pub async fn migration_mark_failed(
         &self,
         opctx: &OpContext,
         migration_id: Uuid,
     ) -> UpdateResult<bool> {
-        const TERMINAL_STATES: &[MigrationState] = &[
-            MigrationState(nexus::MigrationState::Completed),
-            MigrationState(nexus::MigrationState::Failed),
-        ];
-
+        let failed = MigrationState(nexus::MigrationState::Failed);
         diesel::update(dsl::migration)
             .filter(dsl::id.eq(migration_id))
             .filter(dsl::time_deleted.is_null())
-            .filter(dsl::source_state.eq_any(TERMINAL_STATES))
-            .filter(dsl::target_state.eq_any(TERMINAL_STATES))
-            .set(dsl::time_deleted.eq(Utc::now()))
+            .set((
+                dsl::source_state.eq(failed),
+                dsl::source_gen.eq(dsl::source_gen + 1),
+                dsl::time_source_updated.eq(Utc::now()),
+                dsl::target_state.eq(failed),
+                dsl::target_gen.eq(dsl::target_gen + 1),
+                dsl::time_target_updated.eq(Utc::now()),
+            ))
             .check_if_exists::<Migration>(migration_id)
             .execute_and_check(&*self.pool_connection_authorized(opctx).await?)
             .await
diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs
index 232f59df879..546c2dbaa6e 100644
--- a/nexus/src/app/instance.rs
+++ b/nexus/src/app/instance.rs
@@ -57,10 +57,8 @@ use propolis_client::support::InstanceSerialConsoleHelper;
 use propolis_client::support::WSClientOffset;
 use propolis_client::support::WebSocketStream;
 use sagas::instance_common::ExternalIpAttach;
-use sled_agent_client::types::InstanceMigrationSourceParams;
 use sled_agent_client::types::InstanceMigrationTargetParams;
 use sled_agent_client::types::InstanceProperties;
-use sled_agent_client::types::InstancePutMigrationIdsBody;
 use sled_agent_client::types::InstancePutStateBody;
 use std::matches;
 use std::net::SocketAddr;
@@ -519,77 +517,6 @@ impl super::Nexus {
         self.db_datastore.instance_fetch_with_vmm(opctx, &authz_instance).await
     }
 
-    /// Attempts to set the migration IDs for the supplied instance via the
-    /// sled specified in `db_instance`.
- /// - /// The caller is assumed to have fetched the current instance record from - /// the DB and verified that the record has no migration IDs. - /// - /// Returns `Ok` and the updated instance record if this call successfully - /// updated the instance with the sled agent and that update was - /// successfully reflected into CRDB. Returns `Err` with an appropriate - /// error otherwise. - /// - /// # Panics - /// - /// Asserts that `db_instance` has no migration ID or destination Propolis - /// ID set. - pub(crate) async fn instance_set_migration_ids( - &self, - opctx: &OpContext, - instance_id: InstanceUuid, - src_propolis_id: PropolisUuid, - prev_instance_runtime: &db::model::InstanceRuntimeState, - migration_params: InstanceMigrationSourceParams, - ) -> UpdateResult { - assert!(prev_instance_runtime.migration_id.is_none()); - assert!(prev_instance_runtime.dst_propolis_id.is_none()); - - let (.., authz_instance) = LookupPath::new(opctx, &self.db_datastore) - .instance_id(instance_id.into_untyped_uuid()) - .lookup_for(authz::Action::Modify) - .await?; - - let instance_updated = todo!("eliza: do this transition purely in nexus rather than in sled-agent..."); - - if instance_updated { - Ok(self - .db_datastore - .instance_refetch(opctx, &authz_instance) - .await?) - } else { - Err(Error::conflict( - "instance is already migrating, or underwent an operation that \ - prevented this migration from proceeding" - )) - } - } - - /// Attempts to clear the migration IDs for the supplied instance via the - /// sled specified in `db_instance`. - /// - /// The supplied instance record must contain valid migration IDs. - /// - /// Returns `Ok` if sled agent accepted the request to clear migration IDs - /// and the resulting attempt to write instance runtime state back to CRDB - /// succeeded. This routine returns `Ok` even if the update was not actually - /// applied (due to a separate generation number change). 
- /// - /// # Panics - /// - /// Asserts that `db_instance` has a migration ID and destination Propolis - /// ID set. - pub(crate) async fn instance_clear_migration_ids( - &self, - instance_id: InstanceUuid, - prev_instance_runtime: &db::model::InstanceRuntimeState, - ) -> Result<(), Error> { - assert!(prev_instance_runtime.migration_id.is_some()); - assert!(prev_instance_runtime.dst_propolis_id.is_some()); - - todo!("eliza: do this transition in the DB rather than in sled-agent") - } - /// Reboot the specified instance. pub(crate) async fn instance_reboot( &self, diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index c393209bc72..8ebafe3dae1 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -16,9 +16,7 @@ use nexus_db_queries::{authn, authz, db}; use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid, SledUuid}; use serde::Deserialize; use serde::Serialize; -use sled_agent_client::types::{ - InstanceMigrationSourceParams, InstanceMigrationTargetParams, -}; +use sled_agent_client::types::InstanceMigrationTargetParams; use slog::warn; use std::net::{Ipv6Addr, SocketAddr}; use steno::ActionError; @@ -72,12 +70,27 @@ declare_saga_actions! { CREATE_MIGRATION_RECORD -> "migration_record" { + sim_create_migration_record - - sim_delete_migration_record + - sim_fail_migration_record } - // This step the instance's migration ID and destination Propolis ID - // fields. + // fields in the database. + // + // If the instance's migration ID has already been set when we attempt to + // set ours, that means we have probably raced with another migrate saga for + // the same instance. If this is the case, this action will fail and the + // saga will unwind. + // + // Yes, it's a bit unfortunate that our attempt to compare-and-swap in a + // migration ID happens only after we've created VMM and migration records, + // and that we'll have to destroy them as we unwind. 
However, the + // alternative, setting the migration IDs *before* records for the target + // VMM and the migration are created, would mean that there is a period of + // time during which the instance record contains foreign keys into the + // `vmm` and `migration` tables that don't have corresponding records to + // those tables. Because the `instance` table is queried in the public API, + // we take care to ensure that it doesn't have "dangling pointers" to + // records in the `vmm` and `migration` tables that don't exist yet. SET_MIGRATION_IDS -> "set_migration_ids" { + sim_set_migration_ids - sim_clear_migration_ids @@ -231,7 +244,7 @@ async fn sim_create_migration_record( .map_err(ActionError::action_failed) } -async fn sim_delete_migration_record( +async fn sim_fail_migration_record( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let osagactx: &std::sync::Arc = @@ -243,9 +256,24 @@ async fn sim_delete_migration_record( ); let migration_id = sagactx.lookup::("migrate_id")?; - info!(osagactx.log(), "deleting migration record"; - "migration_id" => %migration_id); - osagactx.datastore().migration_mark_deleted(&opctx, migration_id).await?; + info!( + osagactx.log(), + "migration saga unwinding, marking migration record as failed"; + "instance_id" => %params.instance.id(), + "migration_id" => %migration_id, + ); + // If the migration record wasn't updated, this means it's already deleted, + // which...seems weird, but isn't worth getting the whole saga unwind stuck over. 
+    if let Err(e) =
+        osagactx.datastore().migration_mark_failed(&opctx, migration_id).await
+    {
+        warn!(osagactx.log(),
+              "Error marking migration record as failed during rollback";
+              "instance_id" => %params.instance.id(),
+              "migration_id" => %migration_id,
+              "error" => ?e);
+    }
+
     Ok(())
 }
 
@@ -305,7 +333,7 @@ async fn sim_destroy_vmm_record(
 
 async fn sim_set_migration_ids(
     sagactx: NexusActionContext,
-) -> Result<db::model::Instance, ActionError> {
+) -> Result<(), ActionError> {
     let osagactx = sagactx.user_data();
     let params = sagactx.saga_params::<Params>()?;
     let opctx = crate::context::op_context_for_saga_action(
@@ -327,19 +355,19 @@ async fn sim_set_migration_ids(
         "dst_propolis_id" => %dst_propolis_id,
         "prev_runtime_state" => ?db_instance.runtime());
 
-    let updated_record = osagactx
-        .nexus()
+    osagactx
+        .datastore()
         .instance_set_migration_ids(
             &opctx,
             instance_id,
             src_propolis_id,
-            db_instance.runtime(),
-            InstanceMigrationSourceParams { dst_propolis_id, migration_id },
+            migration_id,
+            dst_propolis_id,
         )
         .await
         .map_err(ActionError::action_failed)?;
 
-    Ok(updated_record)
+    Ok(())
 }
 
 async fn sim_clear_migration_ids(
@@ -347,31 +375,31 @@ async fn sim_clear_migration_ids(
 ) -> Result<(), anyhow::Error> {
     let osagactx = sagactx.user_data();
     let params = sagactx.saga_params::<Params>()?;
+    let opctx = crate::context::op_context_for_saga_action(
+        &sagactx,
+        &params.serialized_authn,
+    );
     let src_sled_id = SledUuid::from_untyped_uuid(params.src_vmm.sled_id);
-    let db_instance =
-        sagactx.lookup::<db::model::Instance>("set_migration_ids")?;
+    let db_instance = params.instance;
     let instance_id = InstanceUuid::from_untyped_uuid(db_instance.id());
+    let migration_id = sagactx.lookup::<Uuid>("migrate_id")?;
+    let dst_propolis_id = sagactx.lookup::<PropolisUuid>("dst_propolis_id")?;
+
     info!(osagactx.log(), "clearing migration IDs for saga unwind";
           "instance_id" => %db_instance.id(),
           "sled_id" => %src_sled_id,
-          "prev_runtime_state" => ?db_instance.runtime());
+          "migration_id" => %migration_id,
+          "dst_propolis_id" => %dst_propolis_id);
 
-    // Because the
migration never actually started (and thus didn't finish), - // the instance should be at the same Propolis generation as it was when - // migration IDs were set, which means sled agent should accept a request to - // clear them. The only exception is if the instance stopped, but that also - // clears its migration IDs; in that case there is no work to do here. - // - // Other failures to clear migration IDs are handled like any other failure - // to update an instance's state: the callee attempts to mark the instance - // as failed; if the failure occurred because the instance changed state - // such that sled agent could not fulfill the request, the callee will - // produce a stale generation number and will not actually mark the instance - // as failed. if let Err(e) = osagactx - .nexus() - .instance_clear_migration_ids(instance_id, db_instance.runtime()) + .datastore() + .instance_unset_migration_ids( + &opctx, + instance_id, + migration_id, + dst_propolis_id, + ) .await { warn!(osagactx.log(), diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 4ad12046af2..fd4a05e85cb 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -975,7 +975,7 @@ async fn test_instance_migrate_v2p(cptestctx: &ControlPlaneTestContext) { instance_simulate_migration_source( cptestctx, nexus, - original_sled, + original_sled_id, instance_id, migration_id, ) @@ -4868,7 +4868,7 @@ async fn instance_simulate_migration_source( "migration_id" => %migration_id, ); let sa = nexus.sled_client(&sled_id).await.unwrap(); - sa.instance_simulate_migrationSource( + sa.instance_simulate_migration_source( instance_id.into_untyped_uuid(), sled_agent_client::SimulateMigrationSource { migration_id,