From 370b73ccd21694f228a29e794ad7f9361d21d343 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 5 Jun 2024 11:08:55 -0700 Subject: [PATCH 01/32] add migration table --- nexus/db-model/src/lib.rs | 4 +++ nexus/db-model/src/migration.rs | 34 +++++++++++++++++++ nexus/db-model/src/migration_state.rs | 42 ++++++++++++++++++++++++ nexus/db-model/src/schema.rs | 13 ++++++++ nexus/db-model/src/schema_versions.rs | 3 +- schema/crdb/add-migration-table/up01.sql | 5 +++ schema/crdb/add-migration-table/up02.sql | 7 ++++ schema/crdb/dbinit.sql | 16 ++++++++- 8 files changed, 122 insertions(+), 2 deletions(-) create mode 100644 nexus/db-model/src/migration.rs create mode 100644 nexus/db-model/src/migration_state.rs create mode 100644 schema/crdb/add-migration-table/up01.sql create mode 100644 schema/crdb/add-migration-table/up02.sql diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 040882a8f0..adec410e42 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -42,6 +42,8 @@ pub mod ipv6; mod ipv6net; mod l4_port_range; mod macaddr; +mod migration; +mod migration_state; mod name; mod network_interface; mod oximeter_info; @@ -152,6 +154,8 @@ pub use ipv4net::*; pub use ipv6::*; pub use ipv6net::*; pub use l4_port_range::*; +pub use migration::*; +pub use migration_state::*; pub use name::*; pub use network_interface::*; pub use oximeter_info::*; diff --git a/nexus/db-model/src/migration.rs b/nexus/db-model/src/migration.rs new file mode 100644 index 0000000000..2146ab3be8 --- /dev/null +++ b/nexus/db-model/src/migration.rs @@ -0,0 +1,34 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use crate::schema::migration; +use crate::MigrationState; +use serde::Deserialize; +use serde::Serialize; +use uuid::Uuid; + +/// The state of a migration as understood by Nexus. +#[derive( + Clone, Debug, Queryable, Insertable, Selectable, Serialize, Deserialize, +)] +#[diesel(table_name = migration)] +pub struct Migration { + /// The migration's UUID. + /// + /// This is the primary key of the migration table and is referenced by the + /// `instance` table's `migration_id` field. + pub id: Uuid, + + /// The state of the migration source VMM. + pub source_state: MigrationState, + + /// The ID of the migration source VMM. + pub source_propolis_id: Uuid, + + /// The state of the migration target VMM. + pub target_state: MigrationState, + + /// The ID of the migration target VMM. + pub target_propolis_id: Uuid, +} diff --git a/nexus/db-model/src/migration_state.rs b/nexus/db-model/src/migration_state.rs new file mode 100644 index 0000000000..41ed66b3a5 --- /dev/null +++ b/nexus/db-model/src/migration_state.rs @@ -0,0 +1,42 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Database representation of a migration's state as understood by Nexus. + +use super::impl_enum_type; +use serde::{Deserialize, Serialize}; +use std::fmt; + +impl_enum_type!( + #[derive(Clone, SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "migration_state", schema = "public"))] + pub struct MigrationStateEnum; + + #[derive(Clone, Copy, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq, Eq)] + #[diesel(sql_type = MigrationStateEnum)] + // match the database representation when serializing + #[serde(rename_all = "snake_case")] + pub enum MigrationState; + + // Enum values + InProgress => b"in_progress" + Completed => b"completed" + Failed => b"failed" +); + +impl MigrationState { + pub fn label(&self) -> &'static str { + match self { + Self::InProgress => "in_progress", + Self::Completed => "completed", + Self::Failed => "failed", + } + } +} + +impl fmt::Display for MigrationState { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.label()) + } +} diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index dedb0efc62..e0626e9dec 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1758,6 +1758,19 @@ table! { } } +table! { + migration (id) { + id -> Uuid, + source_state -> crate::MigrationStateEnum, + source_propolis_id -> Uuid, + target_state -> crate::MigrationStateEnum, + target_propolis_id -> Uuid, + } +} + +allow_tables_to_appear_in_same_query!(instance, migration); +joinable!(instance -> migration (migration_id)); + allow_tables_to_appear_in_same_query!( ip_pool_range, ip_pool, diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 09039c952b..9e1dd21165 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(70, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(71, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(71, "add-migration-state-enum"), KnownVersion::new(70, "separate-instance-and-vmm-states"), KnownVersion::new(69, "expose-stage0"), KnownVersion::new(68, "filter-v2p-mapping-by-instance-state"), diff --git a/schema/crdb/add-migration-table/up01.sql b/schema/crdb/add-migration-table/up01.sql new file mode 100644 index 0000000000..4de18e0641 --- /dev/null +++ b/schema/crdb/add-migration-table/up01.sql @@ -0,0 +1,5 @@ +CREATE TYPE IF NOT EXISTS omicron.public.migration_state AS ENUM ( + 'in_progress', + 'failed', + 'completed' +); diff --git a/schema/crdb/add-migration-table/up02.sql b/schema/crdb/add-migration-table/up02.sql new file mode 100644 index 0000000000..ef3695b838 --- /dev/null +++ b/schema/crdb/add-migration-table/up02.sql @@ -0,0 +1,7 @@ +CREATE TABLE IF NOT EXISTS omicron.public.migration ( + id UUID PRIMARY KEY, + source_state omicron.public.migration_state NOT NULL, + source_propolis_id UUID NOT NULL, + target_state omicron.public.migration_state NOT NULL, + target_propolis_id UUID NOT NULL +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 7bda66c5f2..06adf21142 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4063,6 +4063,20 @@ VALUES ( ON CONFLICT (id) DO NOTHING; +CREATE TYPE IF NOT EXISTS omicron.public.migration_state AS ENUM ( + 'in_progress', + 'failed', + 'completed' +); + +-- A table of the states of current migrations. +CREATE TABLE IF NOT EXISTS omicron.public.migration ( + id UUID PRIMARY KEY, + source_state omicron.public.migration_state NOT NULL, + source_propolis_id UUID NOT NULL, + target_state omicron.public.migration_state NOT NULL, + target_propolis_id UUID NOT NULL +); /* * Keep this at the end of file so that the database does not contain a version @@ -4075,7 +4089,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '70.0.0', NULL) + (TRUE, NOW(), NOW(), '71.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From d017eb08b1fd13cd84b9bad173957f467c0ecbd9 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 5 Jun 2024 11:42:21 -0700 Subject: [PATCH 02/32] add migration state to nexus API --- clients/sled-agent-client/src/lib.rs | 1 + common/src/api/internal/nexus.rs | 42 +++++++++++++++++++++++++++ nexus/db-model/src/migration_state.rs | 29 +++++++++--------- sled-agent/src/common/instance.rs | 1 + sled-agent/src/sim/collection.rs | 1 + sled-agent/src/sim/sled_agent.rs | 1 + 6 files changed, 59 insertions(+), 16 deletions(-) diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 862ae00cc9..48ae59704f 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -327,6 +327,7 @@ impl From instance_state: s.instance_state.into(), propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), + migration_state: None, } } } diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index b569437f43..2ff052259b 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -16,6 +16,7 @@ use omicron_uuid_kinds::UpstairsSessionKind; use parse_display::{Display, FromStr}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use std::fmt; use std::net::SocketAddr; use std::time::Duration; use strum::{EnumIter, IntoEnumIterator}; @@ -108,6 +109,47 @@ pub struct SledInstanceState { /// The most recent state of the sled's VMM process. pub vmm_state: VmmRuntimeState, + + /// The current state of any in-progress migration for this instance, as + /// understood by this sled. + pub migration_state: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct MigrationRuntimeState { + pub migration_id: Uuid, + + pub state: MigrationState, +} + +/// The state of an instance's live migration. +#[derive( + Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema, +)] +#[serde(rename_all = "snake_case")] +pub enum MigrationState { + /// The migration is in progress. + InProgress, + /// The migration has failed. + Failed, + /// The migration has completed. + Completed, +} + +impl MigrationState { + pub fn label(&self) -> &'static str { + match self { + Self::InProgress => "in_progress", + Self::Completed => "completed", + Self::Failed => "failed", + } + } +} + +impl fmt::Display for MigrationState { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.label()) + } } // Oximeter producer/collector objects. diff --git a/nexus/db-model/src/migration_state.rs b/nexus/db-model/src/migration_state.rs index 41ed66b3a5..d60b8e552d 100644 --- a/nexus/db-model/src/migration_state.rs +++ b/nexus/db-model/src/migration_state.rs @@ -4,20 +4,21 @@ //! Database representation of a migration's state as understood by Nexus. -use super::impl_enum_type; -use serde::{Deserialize, Serialize}; +use super::impl_enum_wrapper; +use omicron_common::api::internal::nexus; +use serde::Deserialize; +use serde::Serialize; use std::fmt; +use std::io::Write; -impl_enum_type!( +impl_enum_wrapper!( #[derive(Clone, SqlType, Debug, QueryId)] #[diesel(postgres_type(name = "migration_state", schema = "public"))] pub struct MigrationStateEnum; #[derive(Clone, Copy, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq, Eq)] #[diesel(sql_type = MigrationStateEnum)] - // match the database representation when serializing - #[serde(rename_all = "snake_case")] - pub enum MigrationState; + pub struct MigrationState(pub nexus::MigrationState); // Enum values InProgress => b"in_progress" @@ -25,18 +26,14 @@ impl_enum_type!( Failed => b"failed" ); -impl MigrationState { - pub fn label(&self) -> &'static str { - match self { - Self::InProgress => "in_progress", - Self::Completed => "completed", - Self::Failed => "failed", - } +impl fmt::Display for MigrationState { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.0, f) } } -impl fmt::Display for MigrationState { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(self.label()) +impl From for MigrationState { + fn from(s: nexus::MigrationState) -> Self { + Self(s) } } diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 62af337c4c..e4935e9cbb 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -231,6 +231,7 @@ impl InstanceStates { instance_state: self.instance.clone(), vmm_state: self.vmm.clone(), propolis_id: self.propolis_id, + migration_state: None, // TODO(eliza): add this } } diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index f5be31bd37..12e17cc3de 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -451,6 +451,7 @@ mod test { instance_state: instance_vmm, vmm_state, propolis_id, + migration_state: None, }; SimObject::new_simulated_auto(&state, logctx.log.new(o!())) diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index e2a52bf983..7f07dc199a 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -341,6 +341,7 @@ impl SledAgent { instance_state: instance_runtime, vmm_state: vmm_runtime, propolis_id, + migration_state: None, }, None, ) From 9362651a1087cdb28c6f7c4b50cfe66191404b01 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 5 Jun 2024 13:48:51 -0700 Subject: [PATCH 03/32] okay i really hopep the CTE works --- clients/nexus-client/src/lib.rs | 1 + common/src/api/internal/nexus.rs | 30 +++- nexus/db-queries/src/db/datastore/instance.rs | 1 + nexus/db-queries/src/db/queries/instance.rs | 146 ++++++++++++++++-- openapi/nexus-internal.json | 74 +++++++++ openapi/sled-agent.json | 74 +++++++++ 6 files changed, 314 insertions(+), 12 deletions(-) diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index acf282a1f9..976d0e5d46 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -149,6 +149,7 @@ impl From instance_state: s.instance_state.into(), propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), + migration_state: None, } } } diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 2ff052259b..9865e7de99 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -115,11 +115,13 @@ pub struct SledInstanceState { pub migration_state: Option, } +/// An update from a sled regarding the state of a migration, indicating the +/// role of the VMM whose migration state was updated. #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] pub struct MigrationRuntimeState { pub migration_id: Uuid, - pub state: MigrationState, + pub role: MigrationRole, } /// The state of an instance's live migration. @@ -152,6 +154,32 @@ impl fmt::Display for MigrationState { } } +#[derive( + Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema, +)] +#[serde(rename_all = "snake_case")] +pub enum MigrationRole { + /// This update concerns the source VMM of a migration. + Source, + /// This update concerns the target VMM of a migration. + Target, +} + +impl MigrationRole { + pub fn label(&self) -> &'static str { + match self { + Self::Source => "source", + Self::Target => "target", + } + } +} + +impl fmt::Display for MigrationRole { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.label()) + } +} + // Oximeter producer/collector objects. /// The kind of metric producer this is. diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index b9989fe31c..2611da9b3e 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -391,6 +391,7 @@ impl DataStore { new_instance.clone(), *vmm_id, new_vmm.clone(), + None, // TODO: ELIZA ADD THIS ); // The InstanceAndVmmUpdate query handles and indicates failure to find diff --git a/nexus/db-queries/src/db/queries/instance.rs b/nexus/db-queries/src/db/queries/instance.rs index ea40877450..821885dc05 100644 --- a/nexus/db-queries/src/db/queries/instance.rs +++ b/nexus/db-queries/src/db/queries/instance.rs @@ -12,8 +12,14 @@ use diesel::sql_types::{Nullable, Uuid as SqlUuid}; use diesel::{pg::Pg, query_builder::AstPass}; use diesel::{Column, ExpressionMethods, QueryDsl, RunQueryDsl}; use nexus_db_model::{ - schema::{instance::dsl as instance_dsl, vmm::dsl as vmm_dsl}, - InstanceRuntimeState, VmmRuntimeState, + schema::{ + instance::dsl as instance_dsl, migration::dsl as migration_dsl, + vmm::dsl as vmm_dsl, + }, + InstanceRuntimeState, MigrationState, VmmRuntimeState, +}; +use omicron_common::api::internal::nexus::{ + MigrationRole, MigrationRuntimeState, }; use uuid::Uuid; @@ -76,6 +82,12 @@ pub struct InstanceAndVmmUpdate { vmm_find: Box + Send>, instance_update: Box + Send>, vmm_update: Box + Send>, + migration: Option, +} + +struct MigrationUpdate { + find: Box + Send>, + update: Box + Send>, } /// Contains the result of a combined instance-and-VMM update operation. @@ -89,6 +101,11 @@ pub struct InstanceAndVmmUpdateResult { /// `Some(status)` if the target VMM was found; the wrapped `UpdateStatus` /// indicates whether the row was updated. `None` if the VMM was not found. pub vmm_status: Option, + + /// `Some(status)` if the target migration was found; the wrapped `UpdateStatus` + /// indicates whether the row was updated. `None` if the migration was not + /// found, or no migration update was performed. + pub migration_status: Option, } /// Computes the update status to return from the results of queries that find @@ -135,6 +152,7 @@ impl InstanceAndVmmUpdate { new_instance_runtime_state: InstanceRuntimeState, vmm_id: Uuid, new_vmm_runtime_state: VmmRuntimeState, + migration: Option, ) -> Self { let instance_find = Box::new( instance_dsl::instance @@ -165,24 +183,92 @@ impl InstanceAndVmmUpdate { .set(new_vmm_runtime_state), ); - Self { instance_find, vmm_find, instance_update, vmm_update } + let migration = migration.map( + |MigrationRuntimeState { role, migration_id, state }| { + let state = MigrationState::from(state); + match role { + MigrationRole::Target => { + let find = Box::new( + migration_dsl::migration + .filter(migration_dsl::id.eq(migration_id)) + .filter( + migration_dsl::target_propolis_id + .eq(vmm_id), + ) + .select(migration_dsl::id), + ); + let update = Box::new( + diesel::update(migration_dsl::migration) + .filter(migration_dsl::id.eq(migration_id)) + .filter( + migration_dsl::target_propolis_id + .eq(vmm_id), + ) + .set(migration_dsl::target_state.eq(state)), + ); + MigrationUpdate { find, update } + } + MigrationRole::Source => { + let find = Box::new( + migration_dsl::migration + .filter(migration_dsl::id.eq(migration_id)) + .filter( + migration_dsl::source_propolis_id + .eq(vmm_id), + ) + .select(migration_dsl::id), + ); + let update = Box::new( + diesel::update(migration_dsl::migration) + .filter(migration_dsl::id.eq(migration_id)) + .filter( + migration_dsl::source_propolis_id + .eq(vmm_id), + ) + .set(migration_dsl::source_state.eq(state)), + ); + MigrationUpdate { find, update } + } + } + }, + ); + + Self { instance_find, vmm_find, instance_update, vmm_update, migration } } pub async fn execute_and_check( self, conn: &(impl async_bb8_diesel::AsyncConnection + Sync), ) -> Result { - let (vmm_found, vmm_updated, instance_found, instance_updated) = - self.get_result_async::<(Option, - Option, - Option, - Option)>(conn).await?; + let ( + vmm_found, + vmm_updated, + instance_found, + instance_updated, + migration_found, + migration_updated, + ) = self + .get_result_async::<( + Option, + Option, + Option, + Option, + Option, + Option, + )>(conn) + .await?; let instance_status = compute_update_status(instance_found, instance_updated); let vmm_status = compute_update_status(vmm_found, vmm_updated); + let migration_status = + compute_update_status(migration_found, migration_updated); - Ok(InstanceAndVmmUpdateResult { instance_status, vmm_status }) + Ok(InstanceAndVmmUpdateResult { + instance_status, + vmm_status, + migration_status, + }) } } @@ -197,6 +283,8 @@ impl Query for InstanceAndVmmUpdate { Nullable, Nullable, Nullable, + Nullable, + Nullable, ); } @@ -212,6 +300,12 @@ impl QueryFragment for InstanceAndVmmUpdate { self.vmm_find.walk_ast(out.reborrow())?; out.push_sql(") AS id), "); + if let Some(MigrationUpdate { ref find, .. }) = self.migration { + out.push_sql("migration_found AS (SELECT ("); + find.walk_ast(out.reborrow())?; + out.push_sql(") AS id), "); + } + out.push_sql("instance_updated AS ("); self.instance_update.walk_ast(out.reborrow())?; out.push_sql(" RETURNING id), "); @@ -220,6 +314,12 @@ impl QueryFragment for InstanceAndVmmUpdate { self.vmm_update.walk_ast(out.reborrow())?; out.push_sql(" RETURNING id), "); + if let Some(MigrationUpdate { ref update, .. }) = self.migration { + out.push_sql("migration_updated AS ("); + update.walk_ast(out.reborrow())?; + out.push_sql(" RETURNING id), "); + } + out.push_sql("vmm_result AS ("); out.push_sql("SELECT vmm_found."); out.push_identifier(vmm_dsl::id::NAME)?; @@ -246,9 +346,33 @@ impl QueryFragment for InstanceAndVmmUpdate { out.push_identifier(instance_dsl::id::NAME)?; out.push_sql(") "); + if self.migration.is_some() { + out.push_sql("migration_result AS ("); + out.push_sql("SELECT migration_found."); + out.push_identifier(migration_dsl::id::NAME)?; + out.push_sql(" AS found, migration_updated."); + out.push_identifier(migration_dsl::id::NAME)?; + out.push_sql(" AS updated"); + out.push_sql( + " FROM migration_found LEFT JOIN migration_updated ON migration_found.", + ); + out.push_identifier(migration_dsl::id::NAME)?; + out.push_sql(" = migration_updated."); + out.push_identifier(migration_dsl::id::NAME)?; + out.push_sql(") "); + } + out.push_sql("SELECT vmm_result.found, vmm_result.updated, "); - out.push_sql("instance_result.found, instance_result.updated "); - out.push_sql("FROM vmm_result, instance_result;"); + out.push_sql("instance_result.found, instance_result.updated, "); + if self.migration.is_some() { + out.push_sql("migration_result.found, migration_result.updated"); + } else { + out.push_sql("NULL, NULL"); + } + out.push_sql("FROM vmm_result, instance_result"); + if self.migration.is_some() { + out.push_sql(", migration_result"); + } Ok(()) } diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 637334483d..494cd58823 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -3428,6 +3428,71 @@ "minLength": 5, "maxLength": 17 }, + "MigrationRole": { + "oneOf": [ + { + "description": "This update concerns the source VMM of a migration.", + "type": "string", + "enum": [ + "source" + ] + }, + { + "description": "This update concerns the target VMM of a migration.", + "type": "string", + "enum": [ + "target" + ] + } + ] + }, + "MigrationRuntimeState": { + "description": "An update from a sled regarding the state of a migration, indicating the role of the VMM whose migration state was updated.", + "type": "object", + "properties": { + "migration_id": { + "type": "string", + "format": "uuid" + }, + "role": { + "$ref": "#/components/schemas/MigrationRole" + }, + "state": { + "$ref": "#/components/schemas/MigrationState" + } + }, + "required": [ + "migration_id", + "role", + "state" + ] + }, + "MigrationState": { + "description": "The state of an instance's live migration.", + "oneOf": [ + { + "description": "The migration is in progress.", + "type": "string", + "enum": [ + "in_progress" + ] + }, + { + "description": "The migration has failed.", + "type": "string", + "enum": [ + "failed" + ] + }, + { + "description": "The migration has completed.", + "type": "string", + "enum": [ + "completed" + ] + } + ] + }, "Name": { "title": "A name unique within the parent collection", "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. Names cannot be a UUID though they may contain a UUID.", @@ -4572,6 +4637,15 @@ } ] }, + "migration_state": { + "nullable": true, + "description": "The current state of any in-progress migration for this instance, as understood by this sled.", + "allOf": [ + { + "$ref": "#/components/schemas/MigrationRuntimeState" + } + ] + }, "propolis_id": { "description": "The ID of the VMM whose state is being reported.", "type": "string", diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 68513345e2..2b4482cab1 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -3406,6 +3406,71 @@ "minLength": 5, "maxLength": 17 }, + "MigrationRole": { + "oneOf": [ + { + "description": "This update concerns the source VMM of a migration.", + "type": "string", + "enum": [ + "source" + ] + }, + { + "description": "This update concerns the target VMM of a migration.", + "type": "string", + "enum": [ + "target" + ] + } + ] + }, + "MigrationRuntimeState": { + "description": "An update from a sled regarding the state of a migration, indicating the role of the VMM whose migration state was updated.", + "type": "object", + "properties": { + "migration_id": { + "type": "string", + "format": "uuid" + }, + "role": { + "$ref": "#/components/schemas/MigrationRole" + }, + "state": { + "$ref": "#/components/schemas/MigrationState" + } + }, + "required": [ + "migration_id", + "role", + "state" + ] + }, + "MigrationState": { + "description": "The state of an instance's live migration.", + "oneOf": [ + { + "description": "The migration is in progress.", + "type": "string", + "enum": [ + "in_progress" + ] + }, + { + "description": "The migration has failed.", + "type": "string", + "enum": [ + "failed" + ] + }, + { + "description": "The migration has completed.", + "type": "string", + "enum": [ + "completed" + ] + } + ] + }, "Name": { "title": "A name unique within the parent collection", "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. Names cannot be a UUID though they may contain a UUID.", @@ -4187,6 +4252,15 @@ } ] }, + "migration_state": { + "nullable": true, + "description": "The current state of any in-progress migration for this instance, as understood by this sled.", + "allOf": [ + { + "$ref": "#/components/schemas/MigrationRuntimeState" + } + ] + }, "propolis_id": { "description": "The ID of the VMM whose state is being reported.", "type": "string", From e6e5e14c85cc0e5a57cb43e0fe63948da10c537c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 5 Jun 2024 14:45:09 -0700 Subject: [PATCH 04/32] unbreak CTE, et cetera --- nexus/db-queries/src/db/datastore/instance.rs | 19 ++++++++++++++++--- nexus/db-queries/src/db/queries/instance.rs | 4 ++-- nexus/src/app/instance.rs | 19 +++++++++++++------ 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 2611da9b3e..07d09ebb5c 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -46,6 +46,7 @@ use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; +use omicron_common::api::internal::nexus::MigrationRuntimeState; use omicron_common::bail_unless; use ref_cast::RefCast; use uuid::Uuid; @@ -385,13 +386,14 @@ impl DataStore { new_instance: &InstanceRuntimeState, vmm_id: &Uuid, new_vmm: &VmmRuntimeState, - ) -> Result<(bool, bool), Error> { + migration: &Option, + ) -> Result<(bool, bool, Option), Error> { let query = crate::db::queries::instance::InstanceAndVmmUpdate::new( *instance_id, new_instance.clone(), *vmm_id, new_vmm.clone(), - None, // TODO: ELIZA ADD THIS + migration.clone(), ); // The InstanceAndVmmUpdate query handles and indicates failure to find @@ -414,7 +416,18 @@ impl DataStore { None => false, }; - Ok((instance_updated, vmm_updated)) + let migration_updated = if migration.is_some() { + Some(match result.migration_status { + Some(UpdateStatus::Updated) => true, + Some(UpdateStatus::NotUpdatedButExists) => false, + None => false, + }) + } else { + debug_assert_eq!(result.migration_status, None); + None + }; + + Ok((instance_updated, vmm_updated, migration_updated)) } /// Lists all instances on in-service sleds with active Propolis VMM diff --git a/nexus/db-queries/src/db/queries/instance.rs b/nexus/db-queries/src/db/queries/instance.rs index 821885dc05..4bdc9ea2fd 100644 --- a/nexus/db-queries/src/db/queries/instance.rs +++ b/nexus/db-queries/src/db/queries/instance.rs @@ -365,9 +365,9 @@ impl QueryFragment for InstanceAndVmmUpdate { out.push_sql("SELECT vmm_result.found, vmm_result.updated, "); out.push_sql("instance_result.found, instance_result.updated, "); if self.migration.is_some() { - out.push_sql("migration_result.found, migration_result.updated"); + out.push_sql("migration_result.found, migration_result.updated "); } else { - out.push_sql("NULL, NULL"); + out.push_sql("NULL, NULL "); } out.push_sql("FROM vmm_result, instance_result"); if self.migration.is_some() { diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 1132f1f5b8..5ec72a618b 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -563,9 +563,12 @@ impl super::Nexus { // outright fails, this operation fails. If the operation nominally // succeeds but nothing was updated, this action is outdated and the // caller should not proceed with migration. - let (updated, _) = match instance_put_result { + let (updated, _, _) = match instance_put_result { Ok(state) => { self.write_returned_instance_state(&instance_id, state).await? + + // TODO(eliza): this is where we would also want to insert to the + // migration table... } Err(e) => { if e.instance_unhealthy() { @@ -1321,7 +1324,7 @@ impl super::Nexus { &self, instance_id: &Uuid, state: Option, - ) -> Result<(bool, bool), Error> { + ) -> Result<(bool, bool, Option), Error> { slog::debug!(&self.log, "writing instance state returned from sled agent"; "instance_id" => %instance_id, @@ -1335,6 +1338,7 @@ impl super::Nexus { &state.instance_state.into(), &state.propolis_id, &state.vmm_state.into(), + &state.migration_state, ) .await; @@ -1346,7 +1350,7 @@ impl super::Nexus { update_result } else { - Ok((false, false)) + Ok((false, false, None)) } } @@ -1983,7 +1987,8 @@ pub(crate) async fn notify_instance_updated( "instance_id" => %instance_id, "instance_state" => ?new_runtime_state.instance_state, "propolis_id" => %propolis_id, - "vmm_state" => ?new_runtime_state.vmm_state); + "vmm_state" => ?new_runtime_state.vmm_state, + "migration_state" => ?new_runtime_state.migration_state); // Grab the current state of the instance in the DB to reason about // whether this update is stale or not. @@ -2071,6 +2076,7 @@ pub(crate) async fn notify_instance_updated( &db::model::VmmRuntimeState::from( new_runtime_state.vmm_state.clone(), ), + &new_runtime_state.migration_state, ) .await; @@ -2112,12 +2118,13 @@ pub(crate) async fn notify_instance_updated( } match result { - Ok((instance_updated, vmm_updated)) => { + Ok((instance_updated, vmm_updated, migration_updated)) => { info!(log, "instance and vmm updated by sled agent"; "instance_id" => %instance_id, "propolis_id" => %propolis_id, "instance_updated" => instance_updated, - "vmm_updated" => vmm_updated); + "vmm_updated" => vmm_updated, + "migration_updated" => ?migration_updated); Ok(Some(InstanceUpdated { instance_updated, vmm_updated })) } From c8a0c7cc820349e7f8a3dc39f87426f61e44422a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 5 Jun 2024 15:00:00 -0700 Subject: [PATCH 05/32] fix migration name --- nexus/db-model/src/schema_versions.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 9e1dd21165..6b36aa8993 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -29,7 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), - KnownVersion::new(71, "add-migration-state-enum"), + KnownVersion::new(71, "add-migration-table"), KnownVersion::new(70, "separate-instance-and-vmm-states"), KnownVersion::new(69, "expose-stage0"), KnownVersion::new(68, "filter-v2p-mapping-by-instance-state"), From 00c2c9377af08e9fb7b9fd079581e36c1fd7250b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 5 Jun 2024 15:28:33 -0700 Subject: [PATCH 06/32] add generation numbers --- common/src/api/internal/nexus.rs | 1 + nexus/db-model/src/migration.rs | 7 ++ nexus/db-model/src/schema.rs | 2 + nexus/db-queries/src/db/queries/instance.rs | 75 ++++++++------------- schema/crdb/add-migration-table/up02.sql | 4 +- schema/crdb/dbinit.sql | 16 ++++- 6 files changed, 57 insertions(+), 48 deletions(-) diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 9865e7de99..31b248f889 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -122,6 +122,7 @@ pub struct MigrationRuntimeState { pub migration_id: Uuid, pub state: MigrationState, pub role: MigrationRole, + pub gen: Generation, } /// The state of an instance's live migration. diff --git a/nexus/db-model/src/migration.rs b/nexus/db-model/src/migration.rs index 2146ab3be8..a098c6a4b8 100644 --- a/nexus/db-model/src/migration.rs +++ b/nexus/db-model/src/migration.rs @@ -2,6 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use super::Generation; use crate::schema::migration; use crate::MigrationState; use serde::Deserialize; @@ -26,9 +27,15 @@ pub struct Migration { /// The ID of the migration source VMM. pub source_propolis_id: Uuid, + /// The generation number for the source state. + pub source_gen: Generation, + /// The state of the migration target VMM. pub target_state: MigrationState, /// The ID of the migration target VMM. pub target_propolis_id: Uuid, + + /// The generation number for the target state. + pub target_gen: Generation, } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index e0626e9dec..8eb2ed4844 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1763,8 +1763,10 @@ table! { id -> Uuid, source_state -> crate::MigrationStateEnum, source_propolis_id -> Uuid, + source_gen -> Int8, target_state -> crate::MigrationStateEnum, target_propolis_id -> Uuid, + target_gen -> Int8, } } diff --git a/nexus/db-queries/src/db/queries/instance.rs b/nexus/db-queries/src/db/queries/instance.rs index 4bdc9ea2fd..453cfd7012 100644 --- a/nexus/db-queries/src/db/queries/instance.rs +++ b/nexus/db-queries/src/db/queries/instance.rs @@ -16,7 +16,7 @@ use nexus_db_model::{ instance::dsl as instance_dsl, migration::dsl as migration_dsl, vmm::dsl as vmm_dsl, }, - InstanceRuntimeState, MigrationState, VmmRuntimeState, + Generation, InstanceRuntimeState, MigrationState, VmmRuntimeState, }; use omicron_common::api::internal::nexus::{ MigrationRole, MigrationRuntimeState, @@ -184,52 +184,35 @@ impl InstanceAndVmmUpdate { ); let migration = migration.map( - |MigrationRuntimeState { role, migration_id, state }| { + |MigrationRuntimeState { role, migration_id, state, gen }| { let state = MigrationState::from(state); - match role { - MigrationRole::Target => { - let find = Box::new( - migration_dsl::migration - .filter(migration_dsl::id.eq(migration_id)) - .filter( - migration_dsl::target_propolis_id - .eq(vmm_id), - ) - .select(migration_dsl::id), - ); - let update = Box::new( - diesel::update(migration_dsl::migration) - .filter(migration_dsl::id.eq(migration_id)) - .filter( - migration_dsl::target_propolis_id - .eq(vmm_id), - ) - .set(migration_dsl::target_state.eq(state)), - ); - MigrationUpdate { find, update } - } - MigrationRole::Source => { - let find = Box::new( - migration_dsl::migration - .filter(migration_dsl::id.eq(migration_id)) - .filter( - migration_dsl::source_propolis_id - .eq(vmm_id), - ) - .select(migration_dsl::id), - ); - let update = Box::new( - diesel::update(migration_dsl::migration) - .filter(migration_dsl::id.eq(migration_id)) - .filter( - migration_dsl::source_propolis_id - .eq(vmm_id), - ) - .set(migration_dsl::source_state.eq(state)), - ); - MigrationUpdate { find, update } - } - } + let find = Box::new( + migration_dsl::migration + .filter(migration_dsl::id.eq(migration_id)) + .select(migration_dsl::id), + ); + let gen = Generation::from(gen); + let update: Box + Send> = match role { + MigrationRole::Target => Box::new( + diesel::update(migration_dsl::migration) + .filter(migration_dsl::id.eq(migration_id)) + .filter( + migration_dsl::target_propolis_id.eq(vmm_id), + ) + .filter(migration_dsl::target_gen.lt(gen)) + .set(migration_dsl::target_state.eq(state)), + ), + MigrationRole::Source => Box::new( + diesel::update(migration_dsl::migration) + .filter(migration_dsl::id.eq(migration_id)) + .filter( + migration_dsl::source_propolis_id.eq(vmm_id), + ) + .filter(migration_dsl::source_gen.lt(gen)) + .set(migration_dsl::source_state.eq(state)), + ), + }; + MigrationUpdate { find, update } }, ); diff --git a/schema/crdb/add-migration-table/up02.sql b/schema/crdb/add-migration-table/up02.sql index ef3695b838..b846cc655e 100644 --- a/schema/crdb/add-migration-table/up02.sql +++ b/schema/crdb/add-migration-table/up02.sql @@ -2,6 +2,8 @@ CREATE TABLE IF NOT EXISTS omicron.public.migration ( id UUID PRIMARY KEY, source_state omicron.public.migration_state NOT NULL, source_propolis_id UUID NOT NULL, + source_gen INT8 NOT NULL DEFAULT 1, target_state omicron.public.migration_state NOT NULL, - target_propolis_id UUID NOT NULL + target_propolis_id UUID NOT NULL, + target_gen INT8 NOT NULL DEFAULT 1 ); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 06adf21142..d794ef5de5 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4072,10 +4072,24 @@ CREATE TYPE IF NOT EXISTS omicron.public.migration_state AS ENUM ( -- A table of the states of current migrations. CREATE TABLE IF NOT EXISTS omicron.public.migration ( id UUID PRIMARY KEY, + + /* The state of the migration source */ source_state omicron.public.migration_state NOT NULL, + + /* The ID of the migration source Propolis */ source_propolis_id UUID NOT NULL, + + /* Generation number owned and incremented by the source sled-agent */ + source_gen INT8 NOT NULL DEFAULT 1, + + /* The state of the migration target */ target_state omicron.public.migration_state NOT NULL, - target_propolis_id UUID NOT NULL + + /* The ID of the migration target Propolis */ + target_propolis_id UUID NOT NULL, + + /* Generation number owned and incremented by the target sled-agent */ + target_gen INT8 NOT NULL DEFAULT 1 ); /* From 7f952554c5175958d097819c2037988e363eb102 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 5 Jun 2024 17:13:39 -0700 Subject: [PATCH 07/32] regen openapi again --- openapi/nexus-internal.json | 4 ++++ openapi/sled-agent.json | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 494cd58823..5ed1a2cf4b 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -3450,6 +3450,9 @@ "description": "An update from a sled regarding the state of a migration, indicating the role of the VMM whose migration state was updated.", "type": "object", "properties": { + "gen": { + "$ref": "#/components/schemas/Generation" + }, "migration_id": { "type": "string", "format": "uuid" @@ -3462,6 +3465,7 @@ } }, "required": [ + "gen", "migration_id", "role", "state" diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 2b4482cab1..90d7acdfa0 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -3428,6 +3428,9 @@ "description": "An update from a sled regarding the state of a migration, indicating the role of the VMM whose migration state was updated.", "type": "object", "properties": { + "gen": { + "$ref": "#/components/schemas/Generation" + }, "migration_id": { "type": "string", "format": "uuid" @@ -3440,6 +3443,7 @@ } }, "required": [ + "gen", "migration_id", "role", "state" From 63048175e73b4f41b105f2f73259780d8a2964e8 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 6 Jun 2024 10:21:43 -0700 Subject: [PATCH 08/32] add timestamps --- common/src/api/internal/nexus.rs | 4 ++++ nexus/db-model/src/migration.rs | 13 ++++++++++++ nexus/db-model/src/schema.rs | 4 ++++ nexus/db-queries/src/db/queries/instance.rs | 21 +++++++++++++++++--- openapi/nexus-internal.json | 8 +++++++- openapi/sled-agent.json | 8 +++++++- schema/crdb/add-migration-table/up02.sql | 6 +++++- schema/crdb/dbinit.sql | 22 ++++++++++++++++++++- 8 files changed, 79 insertions(+), 7 deletions(-) diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 31b248f889..d71f1c62ef 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -123,6 +123,10 @@ pub struct MigrationRuntimeState { pub state: MigrationState, pub role: MigrationRole, pub gen: Generation, + + /// Timestamp for the migration state update. + // TODO(eliza): could this just be the VMM state timestamp? + pub time_updated: DateTime, } /// The state of an instance's live migration. diff --git a/nexus/db-model/src/migration.rs b/nexus/db-model/src/migration.rs index a098c6a4b8..cfc973574e 100644 --- a/nexus/db-model/src/migration.rs +++ b/nexus/db-model/src/migration.rs @@ -5,6 +5,8 @@ use super::Generation; use crate::schema::migration; use crate::MigrationState; +use chrono::DateTime; +use chrono::Utc; use serde::Deserialize; use serde::Serialize; use uuid::Uuid; @@ -21,6 +23,11 @@ pub struct Migration { /// `instance` table's `migration_id` field. pub id: Uuid, + /// The time at which this migration record was created. + pub time_created: DateTime, + + /// The time at which the source VMM state was last updated. + /// The state of the migration source VMM. pub source_state: MigrationState, @@ -30,6 +37,9 @@ pub struct Migration { /// The generation number for the source state. pub source_gen: Generation, + /// The time the source VMM state was most recently updated. + pub time_source_updated: Option>, + /// The state of the migration target VMM. pub target_state: MigrationState, @@ -38,4 +48,7 @@ pub struct Migration { /// The generation number for the target state. pub target_gen: Generation, + + /// The time the target VMM state was most recently updated. + pub time_target_updated: Option>, } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 8eb2ed4844..477553b846 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1761,12 +1761,16 @@ table! { table! { migration (id) { id -> Uuid, + time_created -> Timestamptz, + time_deleted -> Nullable, source_state -> crate::MigrationStateEnum, source_propolis_id -> Uuid, source_gen -> Int8, + time_source_updated -> Nullable, target_state -> crate::MigrationStateEnum, target_propolis_id -> Uuid, target_gen -> Int8, + time_target_updated -> Nullable, } } diff --git a/nexus/db-queries/src/db/queries/instance.rs b/nexus/db-queries/src/db/queries/instance.rs index 453cfd7012..a5e5d86794 100644 --- a/nexus/db-queries/src/db/queries/instance.rs +++ b/nexus/db-queries/src/db/queries/instance.rs @@ -184,11 +184,18 @@ impl InstanceAndVmmUpdate { ); let migration = migration.map( - |MigrationRuntimeState { role, migration_id, state, gen }| { + |MigrationRuntimeState { + role, + migration_id, + state, + gen, + time_updated, + }| { let state = MigrationState::from(state); let find = Box::new( migration_dsl::migration .filter(migration_dsl::id.eq(migration_id)) + .filter(migration_dsl::time_deleted.is_null()) .select(migration_dsl::id), ); let gen = Generation::from(gen); @@ -200,7 +207,11 @@ impl InstanceAndVmmUpdate { migration_dsl::target_propolis_id.eq(vmm_id), ) .filter(migration_dsl::target_gen.lt(gen)) - .set(migration_dsl::target_state.eq(state)), + .set(( + migration_dsl::target_state.eq(state), + migration_dsl::time_target_updated + .eq(time_updated), + )), ), MigrationRole::Source => Box::new( diesel::update(migration_dsl::migration) @@ -209,7 +220,11 @@ impl InstanceAndVmmUpdate { migration_dsl::source_propolis_id.eq(vmm_id), ) .filter(migration_dsl::source_gen.lt(gen)) - .set(migration_dsl::source_state.eq(state)), + .set(( + migration_dsl::source_state.eq(state), + migration_dsl::time_target_updated + .eq(time_updated), + )), ), }; MigrationUpdate { find, update } diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 5ed1a2cf4b..0d9e7679c4 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -3462,13 +3462,19 @@ }, "state": { "$ref": "#/components/schemas/MigrationState" + }, + "time_updated": { + "description": "Timestamp for the migration state update.", + "type": "string", + "format": "date-time" } }, "required": [ "gen", "migration_id", "role", - "state" + "state", + "time_updated" ] }, "MigrationState": { diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 90d7acdfa0..3b0d5952e5 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -3440,13 +3440,19 @@ }, "state": { "$ref": "#/components/schemas/MigrationState" + }, + "time_updated": { + "description": "Timestamp for the migration state update.", + "type": "string", + "format": "date-time" } }, "required": [ "gen", "migration_id", "role", - "state" + "state", + "time_updated" ] }, "MigrationState": { diff --git a/schema/crdb/add-migration-table/up02.sql b/schema/crdb/add-migration-table/up02.sql index b846cc655e..9e0654c1bc 100644 --- a/schema/crdb/add-migration-table/up02.sql +++ b/schema/crdb/add-migration-table/up02.sql @@ -1,9 +1,13 @@ CREATE TABLE IF NOT EXISTS omicron.public.migration ( id UUID PRIMARY KEY, + time_created TIMESTAMPTZ NOT NULL, + time_deleted TIMESTAMPTZ, source_state omicron.public.migration_state NOT NULL, source_propolis_id UUID NOT NULL, source_gen INT8 NOT NULL DEFAULT 1, + time_source_updated TIMESTAMPTZ, target_state omicron.public.migration_state NOT NULL, target_propolis_id UUID NOT NULL, - target_gen INT8 NOT NULL DEFAULT 1 + target_gen INT8 NOT NULL DEFAULT 1, + time_target_updated TIMESTAMPTZ ); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index d794ef5de5..c52045f6d6 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4073,6 +4073,12 @@ CREATE TYPE IF NOT EXISTS omicron.public.migration_state AS ENUM ( CREATE TABLE IF NOT EXISTS omicron.public.migration ( id UUID PRIMARY KEY, + /* The time this migration record was created. */ + time_created TIMESTAMPTZ NOT NULL, + + /* The time this migration record was deleted. */ + time_deleted TIMESTAMPTZ, + /* The state of the migration source */ source_state omicron.public.migration_state NOT NULL, @@ -4082,6 +4088,13 @@ CREATE TABLE IF NOT EXISTS omicron.public.migration ( /* Generation number owned and incremented by the source sled-agent */ source_gen INT8 NOT NULL DEFAULT 1, + /* Timestamp of when the source field was last updated. + * + * This is provided by the sled-agent when publishing a migration state + * update. + */ + time_source_updated TIMESTAMPTZ, + /* The state of the migration target */ target_state omicron.public.migration_state NOT NULL, @@ -4089,7 +4102,14 @@ CREATE TABLE IF NOT EXISTS omicron.public.migration ( target_propolis_id UUID NOT NULL, /* Generation number owned and incremented by the target sled-agent */ - target_gen INT8 NOT NULL DEFAULT 1 + target_gen INT8 NOT NULL DEFAULT 1, + + /* Timestamp of when the source field was last updated. + * + * This is provided by the sled-agent when publishing a migration state + * update. + */ + time_target_updated TIMESTAMPTZ ); /* From d57060f0dabb4bc133b3a201edd588a81822476a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 6 Jun 2024 11:28:05 -0700 Subject: [PATCH 09/32] actually create and delete migration records --- nexus/db-model/src/migration.rs | 21 +++++++ .../db-queries/src/db/datastore/migration.rs | 57 +++++++++++++++++++ nexus/src/app/sagas/instance_migrate.rs | 54 ++++++++++++++++++ 3 files changed, 132 insertions(+) create mode 100644 nexus/db-queries/src/db/datastore/migration.rs diff --git a/nexus/db-model/src/migration.rs b/nexus/db-model/src/migration.rs index cfc973574e..5160f44782 100644 --- a/nexus/db-model/src/migration.rs +++ b/nexus/db-model/src/migration.rs @@ -52,3 +52,24 @@ pub struct Migration { /// The time the target VMM state was most recently updated. pub time_target_updated: Option>, } + +impl Migration { + pub fn new( + migration_id: Uuid, + source_propolis_id: Uuid, + target_propolis_id: Uuid, + ) -> Self { + Self { + id: migration_id, + time_created: Utc::now(), + source_state: MigrationState::InProgress, + source_propolis_id, + source_gen: Generation::new(), + time_source_updated: None, + target_state: MigrationState::InProgress, + target_propolis_id, + target_gen: Generation::new(), + time_target_updated: None, + } + } +} diff --git a/nexus/db-queries/src/db/datastore/migration.rs b/nexus/db-queries/src/db/datastore/migration.rs new file mode 100644 index 0000000000..46f94b609c --- /dev/null +++ b/nexus/db-queries/src/db/datastore/migration.rs @@ -0,0 +1,57 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! [`DataStore`] methods on [`Migration`]s. + +use super::DataStore; +use crate::context::OpContext; +use crate::db::error::public_error_from_diesel; +use crate::db::error::ErrorHandler; +use crate::db::model::Migration; +use crate::db::schema::migration::dsl; +use crate::db::update_and_check::UpdateAndCheck; +use crate::db::update_and_check::UpdateStatus; +use async_bb8_diesel::AsyncRunQueryDsl; +use chrono::Utc; +use omicron_common::api::external::CreateResult; +use omicron_common::api::external::Error; +use omicron_common::api::external::UpdateResult; +use uuid::Uuid; + +impl DataStore { + /// Insert a database record for a migration. + pub async fn migration_insert( + &self, + opctx: &OpContext, + migration: Migration, + ) -> CreateResult { + diesel::insert_into(dsl::migration) + .values(migration) + .on_conflict(dsl::id) + .do_update() + .returning(Migration::as_returning()) + .get_result_async(&*self.pool_connection_authorized(opctx).await?) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + pub async fn migration_mark_deleted( + &self, + opctx: &OpContext, + migration_id: Uuid, + ) -> UpdateResult { + diesel::update(dsl::migration) + .filter(dsl::id.eq(migration_id)) + .filter(dsl::time_deleted.is_null()) + .set(dsl::time_deleted.eq(Utc::now())) + .check_if_exists::(migration_id) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await + .map(|r| match r.status { + UpdateStatus::Updated => true, + UpdateStatus::NotUpdatedButExists => false, + }) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } +} diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index db1d838014..7ec6d9b61b 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -60,6 +60,11 @@ declare_saga_actions! { + sim_allocate_propolis_ip } + CREATE_MIGRATION_RECORD -> "migration_record" { + + sim_create_migration_record + - sim_delete_migration_record + } + CREATE_VMM_RECORD -> "dst_vmm_record" { + sim_create_vmm_record - sim_destroy_vmm_record @@ -127,6 +132,7 @@ impl NexusSaga for SagaInstanceMigrate { builder.append(reserve_resources_action()); builder.append(allocate_propolis_ip_action()); + builder.append(create_migration_record_action()); builder.append(create_vmm_record_action()); builder.append(set_migration_ids_action()); builder.append(ensure_destination_propolis_action()); @@ -189,6 +195,54 @@ async fn sim_allocate_propolis_ip( .await } +async fn sim_create_migration_record( + sagactx: NexusActionContext, +) -> Result { + let params = sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let source_propolis_id = params.src_vmm.id; + let migration_id = sagactx.lookup::("migrate_id")?; + let target_propolis_id = sagactx.lookup::("dst_propolis_id")?; + + info!(osagactx.log(), "creating migration record"; + "migration_id" => %migration_id, + "source_propolis_id" => %source_propolis_id, + "target_propolis_id" => %target_propolis_id); + + osagactx + .datastore() + .migration_insert(db::model::Migration::new( + migration_id, + source_propolis_id, + target_propolis_id, + )) + .await + .map_err(ActionError::action_failed) +} + +async fn sim_delete_migration_record( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx: &std::sync::Arc = + sagactx.user_data(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + let migration_id = sagactx.lookup::("migrate_id")?; + + info!(osagactx.log(), "deleting migration record"; + "migration_id" => %migration_id); + osagactx.datastore().migration_mark_deleted(migration_id).await?; + Ok(()) +} + async fn sim_create_vmm_record( sagactx: NexusActionContext, ) -> Result { From 77e308007412660dd07c2a10e2de6456564a0184 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 6 Jun 2024 11:59:27 -0700 Subject: [PATCH 10/32] sled-agent api plumbing --- clients/nexus-client/src/lib.rs | 43 ++++- clients/sled-agent-client/src/lib.rs | 41 ++++- nexus/db-model/src/migration.rs | 5 +- .../db-queries/src/db/datastore/migration.rs | 4 +- nexus/db-queries/src/db/datastore/mod.rs | 1 + nexus/src/app/sagas/instance_migrate.rs | 15 +- sled-agent/src/common/instance.rs | 152 ++++++++++++------ 7 files changed, 197 insertions(+), 64 deletions(-) diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 976d0e5d46..ed0dab9556 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -149,7 +149,48 @@ impl From instance_state: s.instance_state.into(), propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), - migration_state: None, + migration_state: s.migration_state.map(Into::into), + } + } +} + +impl From + for types::MigrationRuntimeState +{ + fn from( + s: omicron_common::api::internal::nexus::MigrationRuntimeState, + ) -> Self { + Self { + migration_id: s.migration_id, + role: s.role.into(), + state: s.state.into(), + gen: s.gen.into(), + time_updated: s.time_updated, + } + } +} + +impl From + for types::MigrationRole +{ + fn from(s: omicron_common::api::internal::nexus::MigrationRole) -> Self { + use omicron_common::api::internal::nexus::MigrationRole as Input; + match s { + Input::Source => Self::Source, + Input::Target => Self::Target, + } + } +} + +impl From + for types::MigrationState +{ + fn from(s: omicron_common::api::internal::nexus::MigrationState) -> Self { + use omicron_common::api::internal::nexus::MigrationState as Input; + match s { + Input::InProgress => Self::InProgress, + Input::Completed => Self::Completed, + Input::Failed => Self::Failed, } } } diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 48ae59704f..7d72cab76a 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -327,7 +327,46 @@ impl From instance_state: s.instance_state.into(), propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), - migration_state: None, + migration_state: s.migration_state.map(Into::into), + } + } +} + +impl From + for omicron_common::api::internal::nexus::MigrationRuntimeState +{ + fn from(s: types::MigrationRuntimeState) -> Self { + Self { + migration_id: s.migration_id, + state: s.state.into(), + role: s.role.into(), + gen: s.gen, + time_updated: s.time_updated, + } + } +} + +impl From + for omicron_common::api::internal::nexus::MigrationRole +{ + fn from(r: types::MigrationRole) -> Self { + use omicron_common::api::internal::nexus::MigrationRole as Output; + match r { + types::MigrationRole::Source => Output::Source, + types::MigrationRole::Target => Output::Target, + } + } +} + +impl From + for omicron_common::api::internal::nexus::MigrationState +{ + fn from(s: types::MigrationState) -> Self { + use omicron_common::api::internal::nexus::MigrationState as Output; + match s { + types::MigrationState::InProgress => Output::InProgress, + types::MigrationState::Failed => Output::Failed, + types::MigrationState::Completed => Output::Completed, } } } diff --git a/nexus/db-model/src/migration.rs b/nexus/db-model/src/migration.rs index 5160f44782..ecfb0756ba 100644 --- a/nexus/db-model/src/migration.rs +++ b/nexus/db-model/src/migration.rs @@ -7,6 +7,7 @@ use crate::schema::migration; use crate::MigrationState; use chrono::DateTime; use chrono::Utc; +use omicron_common::api::internal::nexus; use serde::Deserialize; use serde::Serialize; use uuid::Uuid; @@ -62,11 +63,11 @@ impl Migration { Self { id: migration_id, time_created: Utc::now(), - source_state: MigrationState::InProgress, + source_state: nexus::MigrationState::InProgress.into(), source_propolis_id, source_gen: Generation::new(), time_source_updated: None, - target_state: MigrationState::InProgress, + target_state: nexus::MigrationState::InProgress.into(), target_propolis_id, target_gen: Generation::new(), time_target_updated: None, diff --git a/nexus/db-queries/src/db/datastore/migration.rs b/nexus/db-queries/src/db/datastore/migration.rs index 46f94b609c..5aa404b5e4 100644 --- a/nexus/db-queries/src/db/datastore/migration.rs +++ b/nexus/db-queries/src/db/datastore/migration.rs @@ -14,8 +14,8 @@ use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; +use diesel::prelude::*; use omicron_common::api::external::CreateResult; -use omicron_common::api::external::Error; use omicron_common::api::external::UpdateResult; use uuid::Uuid; @@ -30,6 +30,8 @@ impl DataStore { .values(migration) .on_conflict(dsl::id) .do_update() + // I don't know what this does but it somehow + .set(dsl::time_created.eq(dsl::time_created)) .returning(Migration::as_returning()) .get_result_async(&*self.pool_connection_authorized(opctx).await?) .await diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 9ec3575860..df85d62c69 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -69,6 +69,7 @@ pub mod instance; mod inventory; mod ip_pool; mod ipv4_nat_entry; +mod migration; mod network_interface; mod oximeter; mod physical_disk; diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 7ec6d9b61b..86454dd27f 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -216,11 +216,14 @@ async fn sim_create_migration_record( osagactx .datastore() - .migration_insert(db::model::Migration::new( - migration_id, - source_propolis_id, - target_propolis_id, - )) + .migration_insert( + &opctx, + db::model::Migration::new( + migration_id, + source_propolis_id, + target_propolis_id, + ), + ) .await .map_err(ActionError::action_failed) } @@ -239,7 +242,7 @@ async fn sim_delete_migration_record( info!(osagactx.log(), "deleting migration record"; "migration_id" => %migration_id); - osagactx.datastore().migration_mark_deleted(migration_id).await?; + osagactx.datastore().migration_mark_deleted(&opctx, migration_id).await?; Ok(()) } diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index e4935e9cbb..e7e49919d5 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -6,12 +6,14 @@ use crate::params::InstanceMigrationSourceParams; use chrono::{DateTime, Utc}; +use omicron_common::api::external::Generation; use omicron_common::api::internal::nexus::{ - InstanceRuntimeState, SledInstanceState, VmmRuntimeState, VmmState, + InstanceRuntimeState, MigrationRole, MigrationRuntimeState, MigrationState, + SledInstanceState, VmmRuntimeState, VmmState, }; use propolis_client::types::{ InstanceState as PropolisApiState, InstanceStateMonitorResponse, - MigrationState, + MigrationState as PropolisMigrationState, }; use uuid::Uuid; @@ -21,6 +23,7 @@ pub struct InstanceStates { instance: InstanceRuntimeState, vmm: VmmRuntimeState, propolis_id: Uuid, + migration: Option, } /// Newtype to allow conversion from Propolis API states (returned by the @@ -123,10 +126,10 @@ impl ObservedPropolisState { if this_id == propolis_migration.migration_id => { match propolis_migration.state { - MigrationState::Finish => { + PropolisMigrationState::Finish => { ObservedMigrationStatus::Succeeded } - MigrationState::Error => { + PropolisMigrationState::Error => { ObservedMigrationStatus::Failed } _ => ObservedMigrationStatus::InProgress, @@ -208,7 +211,7 @@ impl InstanceStates { vmm: VmmRuntimeState, propolis_id: Uuid, ) -> Self { - InstanceStates { instance, vmm, propolis_id } + InstanceStates { instance, vmm, propolis_id, migration: None } } pub fn instance(&self) -> &InstanceRuntimeState { @@ -231,7 +234,7 @@ impl InstanceStates { instance_state: self.instance.clone(), vmm_state: self.vmm.clone(), propolis_id: self.propolis_id, - migration_state: None, // TODO(eliza): add this + migration_state: self.migration.clone(), } } @@ -257,56 +260,82 @@ impl InstanceStates { // Update the instance record to reflect the result of any completed // migration. match observed.migration_status { - ObservedMigrationStatus::Succeeded => match self.propolis_role() { - // This is a successful migration out. Point the instance to the - // target VMM, but don't clear migration IDs; let the target do - // that so that the instance will continue to appear to be - // migrating until it is safe to migrate again. - PropolisRole::Active => { - self.switch_propolis_id_to_target(observed.time); - - assert_eq!(self.propolis_role(), PropolisRole::Retired); + ObservedMigrationStatus::Succeeded => { + if let Some(ref mut migration) = self.migration { + migration.state = MigrationState::Completed; + migration.gen = migration.gen.next(); + migration.time_updated = observed.time; + } else { + // XXX(eliza): we don't have an active migration state, but + // we saw a migration succeed. this is almost certainly a + // bug, but i don't *really* want to panic here...but, we + // also don't have a `slog` logger in this function, so ... + // what do. } + match self.propolis_role() { + // This is a successful migration out. Point the instance to the + // target VMM, but don't clear migration IDs; let the target do + // that so that the instance will continue to appear to be + // migrating until it is safe to migrate again. + PropolisRole::Active => { + self.switch_propolis_id_to_target(observed.time); + + assert_eq!(self.propolis_role(), PropolisRole::Retired); + } - // This is a successful migration in. Point the instance to the - // target VMM and clear migration IDs so that another migration - // in can begin. Propolis will continue reporting that this - // migration was successful, but because its ID has been - // discarded the observed migration status will change from - // Succeeded to NoMigration. - // - // Note that these calls increment the instance's generation - // number twice. This is by design and allows the target's - // migration-ID-clearing update to overtake the source's update. - PropolisRole::MigrationTarget => { - self.switch_propolis_id_to_target(observed.time); - self.clear_migration_ids(observed.time); - - assert_eq!(self.propolis_role(), PropolisRole::Active); - } + // This is a successful migration in. Point the instance to the + // target VMM and clear migration IDs so that another migration + // in can begin. Propolis will continue reporting that this + // migration was successful, but because its ID has been + // discarded the observed migration status will change from + // Succeeded to NoMigration. + // + // Note that these calls increment the instance's generation + // number twice. This is by design and allows the target's + // migration-ID-clearing update to overtake the source's update. + PropolisRole::MigrationTarget => { + self.switch_propolis_id_to_target(observed.time); + self.clear_migration_ids(observed.time); + + assert_eq!(self.propolis_role(), PropolisRole::Active); + } - // This is a migration source that previously reported success - // and removed itself from the active Propolis position. Don't - // touch the instance. - PropolisRole::Retired => {} - }, - ObservedMigrationStatus::Failed => match self.propolis_role() { - // This is a failed migration out. CLear migration IDs so that - // Nexus can try again. - PropolisRole::Active => { - self.clear_migration_ids(observed.time); + // This is a migration source that previously reported success + // and removed itself from the active Propolis position. Don't + // touch the instance. + PropolisRole::Retired => {} + } + } + ObservedMigrationStatus::Failed => { + if let Some(ref mut migration) = self.migration { + migration.state = MigrationState::Failed; + migration.gen = migration.gen.next(); + migration.time_updated = observed.time; + } else { + // XXX(eliza): we don't have an active migration state, but + // we saw a migration succeed. this is almost certainly a + // bug, but i don't *really* want to panic here...but, we + // also don't have a `slog` logger in this function, so ... + // what do. } + match self.propolis_role() { + // This is a failed migration out. CLear migration IDs so that + // Nexus can try again. + PropolisRole::Active => { + self.clear_migration_ids(observed.time); + } - // This is a failed migration in. Leave the migration IDs alone - // so that the migration won't appear to have concluded until - // the source is ready to start a new one. - PropolisRole::MigrationTarget => {} + // This is a failed migration in. Leave the migration IDs alone + // so that the migration won't appear to have concluded until + // the source is ready to start a new one. + PropolisRole::MigrationTarget => {} - // This VMM was part of a failed migration and was subsequently - // removed from the instance record entirely. There's nothing to - // update. - PropolisRole::Retired => {} - }, + // This VMM was part of a failed migration and was subsequently + // removed from the instance record entirely. There's nothing to + // update. + PropolisRole::Retired => {} + } + } ObservedMigrationStatus::NoMigration | ObservedMigrationStatus::InProgress | ObservedMigrationStatus::Pending => {} @@ -432,12 +461,29 @@ impl InstanceStates { ids: &Option, now: DateTime, ) { - if let Some(ids) = ids { - self.instance.migration_id = Some(ids.migration_id); - self.instance.dst_propolis_id = Some(ids.dst_propolis_id); + if let Some(InstanceMigrationSourceParams { + migration_id, + dst_propolis_id, + }) = *ids + { + self.instance.migration_id = Some(migration_id); + self.instance.dst_propolis_id = Some(dst_propolis_id); + let role = if dst_propolis_id == self.propolis_id { + MigrationRole::Target + } else { + MigrationRole::Source + }; + self.migration = Some(MigrationRuntimeState { + migration_id, + state: MigrationState::InProgress, + role, + gen: Generation::new(), + time_updated: now, + }) } else { self.instance.migration_id = None; self.instance.dst_propolis_id = None; + self.migration = None; } self.instance.gen = self.instance.gen.next(); From 6b63d22c9e306057697a8780c7480db851be7702 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 6 Jun 2024 13:37:40 -0700 Subject: [PATCH 11/32] fix CTE syntax error --- nexus/db-queries/src/db/queries/instance.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nexus/db-queries/src/db/queries/instance.rs b/nexus/db-queries/src/db/queries/instance.rs index a5e5d86794..6cde96fce8 100644 --- a/nexus/db-queries/src/db/queries/instance.rs +++ b/nexus/db-queries/src/db/queries/instance.rs @@ -342,9 +342,10 @@ impl QueryFragment for InstanceAndVmmUpdate { out.push_identifier(instance_dsl::id::NAME)?; out.push_sql(" = instance_updated."); out.push_identifier(instance_dsl::id::NAME)?; - out.push_sql(") "); + out.push_sql(")"); if self.migration.is_some() { + out.push_sql(", "); out.push_sql("migration_result AS ("); out.push_sql("SELECT migration_found."); out.push_identifier(migration_dsl::id::NAME)?; @@ -357,8 +358,9 @@ impl QueryFragment for InstanceAndVmmUpdate { out.push_identifier(migration_dsl::id::NAME)?; out.push_sql(" = migration_updated."); out.push_identifier(migration_dsl::id::NAME)?; - out.push_sql(") "); + out.push_sql(")"); } + out.push_sql(" "); out.push_sql("SELECT vmm_result.found, vmm_result.updated, "); out.push_sql("instance_result.found, instance_result.updated, "); From 7f3c7faafa1a3ef77484c4eed4f699d5fbe9db58 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 6 Jun 2024 14:29:11 -0700 Subject: [PATCH 12/32] add test that migration state updates --- nexus/tests/integration_tests/instances.rs | 60 +++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 565e2fbafb..9ced72e2c7 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -673,6 +673,33 @@ async fn test_instance_start_creates_networking_state( #[nexus_test] async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { + use nexus_db_model::Migration; + use omicron_common::api::internal::nexus::MigrationState; + async fn migration_fetch( + cptestctx: &ControlPlaneTestContext, + migration_id: Uuid, + ) -> Migration { + use async_bb8_diesel::AsyncRunQueryDsl; + use diesel::prelude::*; + use nexus_db_queries::db::schema::migration::dsl; + + let datastore = + cptestctx.server.server_context().nexus.datastore().clone(); + let db_state = dsl::migration + .filter(dsl::id.eq(migration_id)) + .select(Migration::as_select()) + .get_results_async::( + &*datastore.pool_connection_for_tests().await.unwrap(), + ) + .await + .unwrap(); + + info!(&cptestctx.logctx.log, "refetched migration info from db"; + "migration" => ?db_state); + + db_state.into_iter().next().unwrap() + } + let client = &cptestctx.external_client; let apictx = &cptestctx.server.server_context(); let nexus = &apictx.nexus; @@ -729,7 +756,7 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { let migrate_url = format!("/v1/instances/{}/migrate", &instance_id.to_string()); - let _ = NexusRequest::new( + let instance = NexusRequest::new( RequestBuilder::new(client, Method::POST, &migrate_url) .body(Some(¶ms::InstanceMigrate { dst_sled_id })) .expect_status(Some(StatusCode::OK)), @@ -749,6 +776,30 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { assert_eq!(current_sled, original_sled); + // Ensure that both sled agents report that the migration is in progress. + let migration_id = { + let datastore = apictx.nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + let (.., authz_instance) = LookupPath::new(&opctx, &datastore) + .instance_id(instance.identity.id) + .lookup_for(nexus_db_queries::authz::Action::Read) + .await + .unwrap(); + datastore + .instance_refetch(&opctx, &authz_instance) + .await + .unwrap() + .runtime_state + .migration_id + .expect("since we've started a migration, the instance record must have a migration id!") + }; + let migration = dbg!(migration_fetch(cptestctx, migration_id).await); + assert_eq!(migration.target_state, MigrationState::InProgress.into()); + assert_eq!(migration.source_state, MigrationState::InProgress.into()); + // Explicitly simulate the migration action on the target. Simulated // migrations always succeed. The state transition on the target is // sufficient to move the instance back into a Running state (strictly @@ -765,6 +816,13 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { .expect("migrated instance should still have a sled"); assert_eq!(current_sled, dst_sled_id); + + // Ensure that both sled agents report that the migration has completed. + instance_simulate_on_sled(cptestctx, nexus, default_sled_id, instance_id) + .await; + let migration = dbg!(migration_fetch(cptestctx, migration_id).await); + assert_eq!(migration.target_state, MigrationState::Completed.into()); + assert_eq!(migration.source_state, MigrationState::Completed.into()); } #[nexus_test] From 45d7844070abf5d45bf11ebcdc72af8e06f27107 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 6 Jun 2024 14:29:22 -0700 Subject: [PATCH 13/32] clippy --- clients/nexus-client/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index ed0dab9556..a0893747a1 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -164,7 +164,7 @@ impl From migration_id: s.migration_id, role: s.role.into(), state: s.state.into(), - gen: s.gen.into(), + gen: s.gen, time_updated: s.time_updated, } } From c6311fb593e097de4b7490239b50878b7505e5f9 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 7 Jun 2024 10:28:04 -0700 Subject: [PATCH 14/32] make simulated sled agent complete src migration --- sled-agent/src/common/instance.rs | 21 ++++++++++++++++++++- sled-agent/src/sim/instance.rs | 24 +++++++++++++++++++++++- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index e7e49919d5..b16bdf64f5 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -211,7 +211,22 @@ impl InstanceStates { vmm: VmmRuntimeState, propolis_id: Uuid, ) -> Self { - InstanceStates { instance, vmm, propolis_id, migration: None } + let migration = instance.migration_id.map(|migration_id| { + let dst_propolis_id = instance.dst_propolis_id.expect("if an instance has a migration ID, it should also have a target VMM ID"); + let role = if dst_propolis_id == propolis_id { + MigrationRole::Target + } else { + MigrationRole::Source + }; + MigrationRuntimeState { + migration_id, + state: MigrationState::InProgress, + role, + gen: Generation::new(), + time_updated: Utc::now(), + } + }); + InstanceStates { instance, vmm, propolis_id, migration } } pub fn instance(&self) -> &InstanceRuntimeState { @@ -226,6 +241,10 @@ impl InstanceStates { self.propolis_id } + pub(crate) fn migration(&self) -> Option<&MigrationRuntimeState> { + self.migration.as_ref() + } + /// Creates a `SledInstanceState` structure containing the entirety of this /// structure's runtime state. This requires cloning; for simple read access /// use the `instance` or `vmm` accessors instead. diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index ed88dbcc6f..f28e162458 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -16,7 +16,7 @@ use omicron_common::api::external::Error; use omicron_common::api::external::Generation; use omicron_common::api::external::ResourceType; use omicron_common::api::internal::nexus::{ - InstanceRuntimeState, SledInstanceState, VmmState, + InstanceRuntimeState, MigrationRole, SledInstanceState, VmmState, }; use propolis_client::types::{ InstanceMigrateStatusResponse as PropolisMigrateStatus, @@ -360,6 +360,28 @@ impl SimInstanceInner { } self.state.set_migration_ids(ids, Utc::now()); + let role = self.state.migration().expect("we just got a `put_migration_ids` request, so we should have a migration").role; + if role == MigrationRole::Source { + // Propolis transitions to the Migrating state once before + // actually starting migration. + self.queue_propolis_state(PropolisInstanceState::Migrating); + let migration_id = + self.state.instance().migration_id.unwrap_or_else(|| { + panic!( + "should have migration ID set before getting request to + migrate in (current state: {:?})", + self + ) + }); + self.queue_migration_status(PropolisMigrateStatus { + migration_id, + state: propolis_client::types::MigrationState::Sync, + }); + self.queue_migration_status(PropolisMigrateStatus { + migration_id, + state: propolis_client::types::MigrationState::Finish, + }); + } Ok(self.state.sled_instance_state()) } } From 43254ad67bac0d418c3e03ff92db95a68fa56880 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 7 Jun 2024 10:55:46 -0700 Subject: [PATCH 15/32] do the actual correct simulated migration --- nexus/tests/integration_tests/instances.rs | 2 +- sled-agent/src/sim/instance.rs | 103 ++++++++++----------- 2 files changed, 49 insertions(+), 56 deletions(-) diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 9ced72e2c7..3377375eac 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -818,7 +818,7 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { assert_eq!(current_sled, dst_sled_id); // Ensure that both sled agents report that the migration has completed. - instance_simulate_on_sled(cptestctx, nexus, default_sled_id, instance_id) + instance_simulate_on_sled(cptestctx, nexus, original_sled, instance_id) .await; let migration = dbg!(migration_fetch(cptestctx, migration_id).await); assert_eq!(migration.target_state, MigrationState::Completed.into()); diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index f28e162458..13385cb531 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -78,6 +78,46 @@ impl SimInstanceInner { self.queue.push_back(MonitorChange::MigrateStatus(migrate_status)) } + /// Queue a successful simulated migration. + /// + fn queue_successful_migration(&mut self, role: MigrationRole) { + // Propolis transitions to the Migrating state once before + // actually starting migration. + self.queue_propolis_state(PropolisInstanceState::Migrating); + let migration_id = + self.state.instance().migration_id.unwrap_or_else(|| { + panic!( + "should have migration ID set before getting request to + migrate in (current state: {:?})", + self + ) + }); + self.queue_migration_status(PropolisMigrateStatus { + migration_id, + state: propolis_client::types::MigrationState::Sync, + }); + self.queue_migration_status(PropolisMigrateStatus { + migration_id, + state: propolis_client::types::MigrationState::Finish, + }); + + // The state we transition to after the migration completes will depend + // on whether we are the source or destination. + match role { + MigrationRole::Target => { + self.queue_propolis_state(PropolisInstanceState::Running) + } + MigrationRole::Source => self.queue_graceful_stop(), + } + } + + fn queue_graceful_stop(&mut self) { + self.state.transition_vmm(PublishedVmmState::Stopping, Utc::now()); + self.queue_propolis_state(PropolisInstanceState::Stopping); + self.queue_propolis_state(PropolisInstanceState::Stopped); + self.queue_propolis_state(PropolisInstanceState::Destroyed); + } + /// Searches the queue for its last Propolis state change transition. If /// one exists, returns the associated Propolis state. fn last_queued_instance_state(&self) -> Option { @@ -118,26 +158,7 @@ impl SimInstanceInner { ))); } - // Propolis transitions to the Migrating state once before - // actually starting migration. - self.queue_propolis_state(PropolisInstanceState::Migrating); - let migration_id = - self.state.instance().migration_id.unwrap_or_else(|| { - panic!( - "should have migration ID set before getting request to - migrate in (current state: {:?})", - self - ) - }); - self.queue_migration_status(PropolisMigrateStatus { - migration_id, - state: propolis_client::types::MigrationState::Sync, - }); - self.queue_migration_status(PropolisMigrateStatus { - migration_id, - state: propolis_client::types::MigrationState::Finish, - }); - self.queue_propolis_state(PropolisInstanceState::Running); + self.queue_successful_migration(MigrationRole::Target) } InstanceStateRequested::Running => { match self.next_resting_state() { @@ -171,21 +192,7 @@ impl SimInstanceInner { VmmState::Starting => { self.state.terminate_rudely(); } - VmmState::Running => { - self.state.transition_vmm( - PublishedVmmState::Stopping, - Utc::now(), - ); - self.queue_propolis_state( - PropolisInstanceState::Stopping, - ); - self.queue_propolis_state( - PropolisInstanceState::Stopped, - ); - self.queue_propolis_state( - PropolisInstanceState::Destroyed, - ); - } + VmmState::Running => self.queue_graceful_stop(), // Idempotently allow requests to stop an instance that is // already stopping. VmmState::Stopping @@ -360,27 +367,13 @@ impl SimInstanceInner { } self.state.set_migration_ids(ids, Utc::now()); - let role = self.state.migration().expect("we just got a `put_migration_ids` request, so we should have a migration").role; + let role = self + .state + .migration() + .expect("we just got a `put_migration_ids` request, so we should have a migration") + .role; if role == MigrationRole::Source { - // Propolis transitions to the Migrating state once before - // actually starting migration. - self.queue_propolis_state(PropolisInstanceState::Migrating); - let migration_id = - self.state.instance().migration_id.unwrap_or_else(|| { - panic!( - "should have migration ID set before getting request to - migrate in (current state: {:?})", - self - ) - }); - self.queue_migration_status(PropolisMigrateStatus { - migration_id, - state: propolis_client::types::MigrationState::Sync, - }); - self.queue_migration_status(PropolisMigrateStatus { - migration_id, - state: propolis_client::types::MigrationState::Finish, - }); + self.queue_successful_migration(MigrationRole::Target) } Ok(self.state.sled_instance_state()) } From fd5e49c43c91fb1fccac01627badd26f0cb27643 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 7 Jun 2024 11:25:30 -0700 Subject: [PATCH 16/32] Update common/src/api/internal/nexus.rs --- common/src/api/internal/nexus.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index d71f1c62ef..433a25b811 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -125,7 +125,6 @@ pub struct MigrationRuntimeState { pub gen: Generation, /// Timestamp for the migration state update. - // TODO(eliza): could this just be the VMM state timestamp? pub time_updated: DateTime, } From 631ba0f18edd9f1ad83e8d64073bcc243b9bac3f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 7 Jun 2024 12:42:58 -0700 Subject: [PATCH 17/32] add pending state --- clients/nexus-client/src/lib.rs | 1 + clients/sled-agent-client/src/lib.rs | 1 + common/src/api/internal/nexus.rs | 14 +++++++++++++- nexus/db-model/src/migration.rs | 4 ++-- nexus/db-model/src/migration_state.rs | 1 + nexus/tests/integration_tests/instances.rs | 11 ++++++----- openapi/nexus-internal.json | 7 +++++++ openapi/sled-agent.json | 7 +++++++ schema/crdb/add-migration-table/up01.sql | 1 + schema/crdb/dbinit.sql | 1 + sled-agent/src/common/instance.rs | 8 +++++++- 11 files changed, 47 insertions(+), 9 deletions(-) diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index a0893747a1..767d8f53d5 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -188,6 +188,7 @@ impl From fn from(s: omicron_common::api::internal::nexus::MigrationState) -> Self { use omicron_common::api::internal::nexus::MigrationState as Input; match s { + Input::Pending => Self::Pending, Input::InProgress => Self::InProgress, Input::Completed => Self::Completed, Input::Failed => Self::Failed, diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 7d72cab76a..d5f777ac83 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -364,6 +364,7 @@ impl From fn from(s: types::MigrationState) -> Self { use omicron_common::api::internal::nexus::MigrationState as Output; match s { + types::MigrationState::Pending => Output::Pending, types::MigrationState::InProgress => Output::InProgress, types::MigrationState::Failed => Output::Failed, types::MigrationState::Completed => Output::Completed, diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 433a25b811..13ee8fe394 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -130,10 +130,21 @@ pub struct MigrationRuntimeState { /// The state of an instance's live migration. #[derive( - Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema, + Clone, + Copy, + Debug, + Default, + PartialEq, + Eq, + Deserialize, + Serialize, + JsonSchema, )] #[serde(rename_all = "snake_case")] pub enum MigrationState { + /// The migration has not started for this VMM. + #[default] + Pending, /// The migration is in progress. InProgress, /// The migration has failed. @@ -145,6 +156,7 @@ pub enum MigrationState { impl MigrationState { pub fn label(&self) -> &'static str { match self { + Self::Pending => "pending", Self::InProgress => "in_progress", Self::Completed => "completed", Self::Failed => "failed", diff --git a/nexus/db-model/src/migration.rs b/nexus/db-model/src/migration.rs index ecfb0756ba..e7635549af 100644 --- a/nexus/db-model/src/migration.rs +++ b/nexus/db-model/src/migration.rs @@ -63,11 +63,11 @@ impl Migration { Self { id: migration_id, time_created: Utc::now(), - source_state: nexus::MigrationState::InProgress.into(), + source_state: nexus::MigrationState::Pending.into(), source_propolis_id, source_gen: Generation::new(), time_source_updated: None, - target_state: nexus::MigrationState::InProgress.into(), + target_state: nexus::MigrationState::Pending.into(), target_propolis_id, target_gen: Generation::new(), time_target_updated: None, diff --git a/nexus/db-model/src/migration_state.rs b/nexus/db-model/src/migration_state.rs index d60b8e552d..5a532708a6 100644 --- a/nexus/db-model/src/migration_state.rs +++ b/nexus/db-model/src/migration_state.rs @@ -21,6 +21,7 @@ impl_enum_wrapper!( pub struct MigrationState(pub nexus::MigrationState); // Enum values + Pending => b"pending" InProgress => b"in_progress" Completed => b"completed" Failed => b"failed" diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 3377375eac..c0aa3b61fc 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -797,8 +797,8 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { .expect("since we've started a migration, the instance record must have a migration id!") }; let migration = dbg!(migration_fetch(cptestctx, migration_id).await); - assert_eq!(migration.target_state, MigrationState::InProgress.into()); - assert_eq!(migration.source_state, MigrationState::InProgress.into()); + assert_eq!(migration.target_state, MigrationState::Pending.into()); + assert_eq!(migration.source_state, MigrationState::Pending.into()); // Explicitly simulate the migration action on the target. Simulated // migrations always succeed. The state transition on the target is @@ -806,6 +806,10 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { // speaking no further updates from the source are required if the target // successfully takes over). instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; + // Ensure that both sled agents report that the migration has completed. + instance_simulate_on_sled(cptestctx, nexus, original_sled, instance_id) + .await; + let instance = instance_get(&client, &instance_url).await; assert_eq!(instance.runtime.run_state, InstanceState::Running); @@ -817,9 +821,6 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { assert_eq!(current_sled, dst_sled_id); - // Ensure that both sled agents report that the migration has completed. - instance_simulate_on_sled(cptestctx, nexus, original_sled, instance_id) - .await; let migration = dbg!(migration_fetch(cptestctx, migration_id).await); assert_eq!(migration.target_state, MigrationState::Completed.into()); assert_eq!(migration.source_state, MigrationState::Completed.into()); diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 0d9e7679c4..59e2d128cc 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -3480,6 +3480,13 @@ "MigrationState": { "description": "The state of an instance's live migration.", "oneOf": [ + { + "description": "The migration has not started for this VMM.", + "type": "string", + "enum": [ + "pending" + ] + }, { "description": "The migration is in progress.", "type": "string", diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 3b0d5952e5..056c94a56a 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -3458,6 +3458,13 @@ "MigrationState": { "description": "The state of an instance's live migration.", "oneOf": [ + { + "description": "The migration has not started for this VMM.", + "type": "string", + "enum": [ + "pending" + ] + }, { "description": "The migration is in progress.", "type": "string", diff --git a/schema/crdb/add-migration-table/up01.sql b/schema/crdb/add-migration-table/up01.sql index 4de18e0641..7659d7dca3 100644 --- a/schema/crdb/add-migration-table/up01.sql +++ b/schema/crdb/add-migration-table/up01.sql @@ -1,4 +1,5 @@ CREATE TYPE IF NOT EXISTS omicron.public.migration_state AS ENUM ( + 'pending', 'in_progress', 'failed', 'completed' diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index bf1ccab98c..8b74858134 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4065,6 +4065,7 @@ ON CONFLICT (id) DO NOTHING; CREATE TYPE IF NOT EXISTS omicron.public.migration_state AS ENUM ( + 'pending', 'in_progress', 'failed', 'completed' diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index b16bdf64f5..09c45173df 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -355,8 +355,14 @@ impl InstanceStates { PropolisRole::Retired => {} } } + ObservedMigrationStatus::InProgress => { + if let Some(ref mut migration) = self.migration { + migration.state = MigrationState::InProgress; + migration.gen = migration.gen.next(); + migration.time_updated = observed.time; + } + } ObservedMigrationStatus::NoMigration - | ObservedMigrationStatus::InProgress | ObservedMigrationStatus::Pending => {} } From 1aecc88ba5488619a374d5f4deb60e2fbaa223fa Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 7 Jun 2024 12:46:27 -0700 Subject: [PATCH 18/32] fix sled-agent sim panicking when unsetting migration --- sled-agent/src/sim/instance.rs | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index 13385cb531..d5b022625b 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -367,14 +367,24 @@ impl SimInstanceInner { } self.state.set_migration_ids(ids, Utc::now()); - let role = self + + // If we set migration IDs and are the migration source, ensure that we + // will perform the correct state transitions to simulate a successful + // migration. + if ids.is_some() { + let role = self .state .migration() - .expect("we just got a `put_migration_ids` request, so we should have a migration") + .expect( + "we just got a `put_migration_ids` request with `Some` IDs, \ + so we should have a migration" + ) .role; - if role == MigrationRole::Source { - self.queue_successful_migration(MigrationRole::Target) + if role == MigrationRole::Source { + self.queue_successful_migration(MigrationRole::Target) + } } + Ok(self.state.sled_instance_state()) } } From 270368f0ab8141b2e385d9fd48f4f50228a25c0b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 7 Jun 2024 13:34:44 -0700 Subject: [PATCH 19/32] fix unexpected null --- nexus/db-model/src/migration.rs | 4 ++++ nexus/db-queries/src/db/datastore/migration.rs | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/nexus/db-model/src/migration.rs b/nexus/db-model/src/migration.rs index e7635549af..79eb20708c 100644 --- a/nexus/db-model/src/migration.rs +++ b/nexus/db-model/src/migration.rs @@ -27,6 +27,9 @@ pub struct Migration { /// The time at which this migration record was created. pub time_created: DateTime, + /// The time at which this migration record was deleted, + pub time_deleted: Option>, + /// The time at which the source VMM state was last updated. /// The state of the migration source VMM. @@ -63,6 +66,7 @@ impl Migration { Self { id: migration_id, time_created: Utc::now(), + time_deleted: None, source_state: nexus::MigrationState::Pending.into(), source_propolis_id, source_gen: Generation::new(), diff --git a/nexus/db-queries/src/db/datastore/migration.rs b/nexus/db-queries/src/db/datastore/migration.rs index 5aa404b5e4..7876db3d5b 100644 --- a/nexus/db-queries/src/db/datastore/migration.rs +++ b/nexus/db-queries/src/db/datastore/migration.rs @@ -30,7 +30,6 @@ impl DataStore { .values(migration) .on_conflict(dsl::id) .do_update() - // I don't know what this does but it somehow .set(dsl::time_created.eq(dsl::time_created)) .returning(Migration::as_returning()) .get_result_async(&*self.pool_connection_authorized(opctx).await?) From 47cfd550baa891a02b694e1a262012beac479a81 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 10 Jun 2024 14:25:41 -0700 Subject: [PATCH 20/32] fix places where i got source/target backwards --- nexus/db-queries/src/db/queries/instance.rs | 2 +- sled-agent/src/sim/instance.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/db-queries/src/db/queries/instance.rs b/nexus/db-queries/src/db/queries/instance.rs index 6cde96fce8..282e5a6a50 100644 --- a/nexus/db-queries/src/db/queries/instance.rs +++ b/nexus/db-queries/src/db/queries/instance.rs @@ -222,7 +222,7 @@ impl InstanceAndVmmUpdate { .filter(migration_dsl::source_gen.lt(gen)) .set(( migration_dsl::source_state.eq(state), - migration_dsl::time_target_updated + migration_dsl::time_source_updated .eq(time_updated), )), ), diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index d5b022625b..2ac8618399 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -381,7 +381,7 @@ impl SimInstanceInner { ) .role; if role == MigrationRole::Source { - self.queue_successful_migration(MigrationRole::Target) + self.queue_successful_migration(MigrationRole::Source) } } From f399eb1b036a0d6846004ef3d93883bca11d44d3 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 10 Jun 2024 14:39:00 -0700 Subject: [PATCH 21/32] misc. @gjcolombo review feedback --- nexus/src/app/sagas/instance_migrate.rs | 11 +++---- sled-agent/src/common/instance.rs | 39 ++++++++++--------------- 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index ffc5cc0723..cbcad41a4f 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -60,15 +60,16 @@ declare_saga_actions! { + sim_allocate_propolis_ip } + CREATE_VMM_RECORD -> "dst_vmm_record" { + + sim_create_vmm_record + - sim_destroy_vmm_record + } + CREATE_MIGRATION_RECORD -> "migration_record" { + sim_create_migration_record - sim_delete_migration_record } - CREATE_VMM_RECORD -> "dst_vmm_record" { - + sim_create_vmm_record - - sim_destroy_vmm_record - } // This step the instance's migration ID and destination Propolis ID // fields. Because the instance is active, its current sled agent maintains @@ -132,8 +133,8 @@ impl NexusSaga for SagaInstanceMigrate { builder.append(reserve_resources_action()); builder.append(allocate_propolis_ip_action()); - builder.append(create_migration_record_action()); builder.append(create_vmm_record_action()); + builder.append(create_migration_record_action()); builder.append(set_migration_ids_action()); builder.append(ensure_destination_propolis_action()); builder.append(instance_migrate_action()); diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 09c45173df..c302ac2646 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -280,17 +280,13 @@ impl InstanceStates { // migration. match observed.migration_status { ObservedMigrationStatus::Succeeded => { - if let Some(ref mut migration) = self.migration { - migration.state = MigrationState::Completed; - migration.gen = migration.gen.next(); - migration.time_updated = observed.time; - } else { - // XXX(eliza): we don't have an active migration state, but - // we saw a migration succeed. this is almost certainly a - // bug, but i don't *really* want to panic here...but, we - // also don't have a `slog` logger in this function, so ... - // what do. - } + let migration = self.migration.as_mut().expect( + "an ObservedMigrationStatus::Completed is only \ + constructed if a migration state exists", + ); + migration.state = MigrationState::Completed; + migration.gen = migration.gen.next(); + migration.time_updated = observed.time; match self.propolis_role() { // This is a successful migration out. Point the instance to the // target VMM, but don't clear migration IDs; let the target do @@ -321,22 +317,19 @@ impl InstanceStates { // This is a migration source that previously reported success // and removed itself from the active Propolis position. Don't - // touch the instance. + // touch the instance.' PropolisRole::Retired => {} } } ObservedMigrationStatus::Failed => { - if let Some(ref mut migration) = self.migration { - migration.state = MigrationState::Failed; - migration.gen = migration.gen.next(); - migration.time_updated = observed.time; - } else { - // XXX(eliza): we don't have an active migration state, but - // we saw a migration succeed. this is almost certainly a - // bug, but i don't *really* want to panic here...but, we - // also don't have a `slog` logger in this function, so ... - // what do. - } + let migration = self.migration.as_mut().expect( + "an ObservedMigrationStatus::Failed is only \ + constructed if a migration state exists", + ); + migration.state = MigrationState::Failed; + migration.gen = migration.gen.next(); + migration.time_updated = observed.time; + match self.propolis_role() { // This is a failed migration out. CLear migration IDs so that // Nexus can try again. From 8e756586d275b6f537f7f0a1500d02ca953eb936 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 10 Jun 2024 15:49:12 -0700 Subject: [PATCH 22/32] @smklein's review feedback --- nexus/db-model/src/migration.rs | 2 - nexus/db-queries/src/db/datastore/instance.rs | 38 +++++++--- nexus/src/app/instance.rs | 72 +++++++++---------- 3 files changed, 64 insertions(+), 48 deletions(-) diff --git a/nexus/db-model/src/migration.rs b/nexus/db-model/src/migration.rs index 79eb20708c..2abb6e3ee0 100644 --- a/nexus/db-model/src/migration.rs +++ b/nexus/db-model/src/migration.rs @@ -30,8 +30,6 @@ pub struct Migration { /// The time at which this migration record was deleted, pub time_deleted: Option>, - /// The time at which the source VMM state was last updated. - /// The state of the migration source VMM. pub source_state: MigrationState, diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 07d09ebb5c..e090210cbe 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -142,6 +142,25 @@ pub enum UpdaterLockError { Query(#[from] Error), } +/// The result of an [`DataStore::instance_and_vmm_update_runtime`] call, +/// indicating which records were updated. +#[derive(Copy, Clone, Debug)] +pub struct InstanceUpdateResult { + /// `true` if the instance record was updated, `false` otherwise. + pub instance_updated: bool, + /// `true` if the VMM record was updated, `false` otherwise. + pub vmm_updated: bool, + /// Indicates whether a migration record for this instance was updated, if a + /// [`MigrationRuntimeState`] was provided to + /// [`DataStore::instance_and_vmm_update_runtime`]. + /// + /// - `Some(true)` if a migration record was updated + /// - `Some(false)` if a [`MigrationRuntimeState`] was provided, but the + /// migration record was not updated + /// - `None` if no [`MigrationRuntimeState`] was provided + pub migration_updated: Option, +} + impl DataStore { /// Idempotently insert a database record for an Instance /// @@ -373,12 +392,11 @@ impl DataStore { /// /// # Return value /// - /// - `Ok((instance_updated, vmm_updated))` if the query was issued - /// successfully. `instance_updated` and `vmm_updated` are each true if - /// the relevant item was updated and false otherwise. Note that an update - /// can fail because it was inapplicable (i.e. the database has state with - /// a newer generation already) or because the relevant record was not - /// found. + /// - `Ok(`[`InstanceUpdateResult`]`)` if the query was issued + /// successfully. The returned [`InstanceUpdateResult`] indicates which + /// database record(s) were updated. Note that an update can fail because + /// it was inapplicable (i.e. the database has state with a newer + /// generation already) or because the relevant record was not found. /// - `Err` if another error occurred while accessing the database. pub async fn instance_and_vmm_update_runtime( &self, @@ -387,7 +405,7 @@ impl DataStore { vmm_id: &Uuid, new_vmm: &VmmRuntimeState, migration: &Option, - ) -> Result<(bool, bool, Option), Error> { + ) -> Result { let query = crate::db::queries::instance::InstanceAndVmmUpdate::new( *instance_id, new_instance.clone(), @@ -427,7 +445,11 @@ impl DataStore { None }; - Ok((instance_updated, vmm_updated, migration_updated)) + Ok(InstanceUpdateResult { + instance_updated, + vmm_updated, + migration_updated, + }) } /// Lists all instances on in-service sleds with active Propolis VMM diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index ef90925421..98b4ea91e2 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -25,6 +25,7 @@ use nexus_db_queries::authn; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; +use nexus_db_queries::db::datastore::instance::InstanceUpdateResult; use nexus_db_queries::db::datastore::InstanceAndActiveVmm; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup; @@ -563,28 +564,27 @@ impl super::Nexus { // outright fails, this operation fails. If the operation nominally // succeeds but nothing was updated, this action is outdated and the // caller should not proceed with migration. - let (updated, _, _) = match instance_put_result { - Ok(state) => { - self.write_returned_instance_state(&instance_id, state).await? - - // TODO(eliza): this is where we would also want to insert to the - // migration table... - } - Err(e) => { - if e.instance_unhealthy() { - let _ = self - .mark_instance_failed( - &instance_id, - &prev_instance_runtime, - &e, - ) - .await; + let InstanceUpdateResult { instance_updated, .. } = + match instance_put_result { + Ok(state) => { + self.write_returned_instance_state(&instance_id, state) + .await? } - return Err(e.into()); - } - }; + Err(e) => { + if e.instance_unhealthy() { + let _ = self + .mark_instance_failed( + &instance_id, + &prev_instance_runtime, + &e, + ) + .await; + } + return Err(e.into()); + } + }; - if updated { + if instance_updated { Ok(self .db_datastore .instance_refetch(opctx, &authz_instance) @@ -1324,7 +1324,7 @@ impl super::Nexus { &self, instance_id: &Uuid, state: Option, - ) -> Result<(bool, bool, Option), Error> { + ) -> Result { slog::debug!(&self.log, "writing instance state returned from sled agent"; "instance_id" => %instance_id, @@ -1350,7 +1350,13 @@ impl super::Nexus { update_result } else { - Ok((false, false, None)) + // There was no instance state to write back, so --- perhaps + // obviously --- nothing happened. + Ok(InstanceUpdateResult { + instance_updated: false, + vmm_updated: false, + migration_updated: None, + }) } } @@ -1958,16 +1964,6 @@ impl super::Nexus { } } -/// Records what aspects of an instance's state were actually changed in a -/// [`notify_instance_updated`] call. -/// -/// This is (presently) used for debugging purposes only. -#[derive(Copy, Clone)] -pub(crate) struct InstanceUpdated { - pub instance_updated: bool, - pub vmm_updated: bool, -} - /// Invoked by a sled agent to publish an updated runtime state for an /// Instance. #[allow(clippy::too_many_arguments)] // :( @@ -1980,7 +1976,7 @@ pub(crate) async fn notify_instance_updated( instance_id: &Uuid, new_runtime_state: &nexus::SledInstanceState, v2p_notification_tx: tokio::sync::watch::Sender<()>, -) -> Result, Error> { +) -> Result, Error> { let propolis_id = new_runtime_state.propolis_id; info!(log, "received new runtime state from sled agent"; @@ -2118,14 +2114,14 @@ pub(crate) async fn notify_instance_updated( } match result { - Ok((instance_updated, vmm_updated, migration_updated)) => { + Ok(result) => { info!(log, "instance and vmm updated by sled agent"; "instance_id" => %instance_id, "propolis_id" => %propolis_id, - "instance_updated" => instance_updated, - "vmm_updated" => vmm_updated, - "migration_updated" => ?migration_updated); - Ok(Some(InstanceUpdated { instance_updated, vmm_updated })) + "instance_updated" => result.instance_updated, + "vmm_updated" => result.vmm_updated, + "migration_updated" => ?result.migration_updated); + Ok(Some(result)) } // The update command should swallow object-not-found errors and From 762febc22645279fb347e549da398ae2f486ffdf Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 11 Jun 2024 09:32:16 -0700 Subject: [PATCH 23/32] update tests to expect migration states --- sled-agent/src/common/instance.rs | 87 +++++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 3 deletions(-) diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index c302ac2646..3d90b72288 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -603,17 +603,37 @@ mod test { fn make_migration_source_instance() -> InstanceStates { let mut state = make_instance(); state.vmm.state = VmmState::Migrating; - state.instance.migration_id = Some(Uuid::new_v4()); + let migration_id = Uuid::new_v4(); + state.instance.migration_id = Some(migration_id); state.instance.dst_propolis_id = Some(Uuid::new_v4()); + state.migration = Some(MigrationRuntimeState { + migration_id, + state: MigrationState::InProgress, + role: MigrationRole::Source, + // advance the generation once, since we are starting out in the + // `InProgress` state. + gen: Generation::new().next(), + time_updated: Utc::now(), + }); state } fn make_migration_target_instance() -> InstanceStates { let mut state = make_instance(); state.vmm.state = VmmState::Migrating; - state.instance.migration_id = Some(Uuid::new_v4()); + let migration_id = Uuid::new_v4(); + state.instance.migration_id = Some(migration_id); state.propolis_id = Uuid::new_v4(); state.instance.dst_propolis_id = Some(state.propolis_id); + state.migration = Some(MigrationRuntimeState { + migration_id, + state: MigrationState::InProgress, + role: MigrationRole::Target, + // advance the generation once, since we are starting out in the + // `InProgress` state. + gen: Generation::new().next(), + time_updated: Utc::now(), + }); state } @@ -741,6 +761,14 @@ mod test { assert_eq!(state.instance.gen, prev.instance.gen); assert_eq!(state.vmm.state, VmmState::Destroyed); assert!(state.vmm.gen > prev.vmm.gen); + + // The migration state should transition. + let migration = + state.migration.expect("instance must have a migration state"); + let prev_migration = + prev.migration.expect("previous state must have a migration"); + assert_eq!(migration.state, MigrationState::Failed); + assert!(migration.gen > prev_migration.gen); } #[test] @@ -764,6 +792,14 @@ mod test { assert_eq!(state.instance.gen, prev.instance.gen); assert_eq!(state.vmm.state, VmmState::Failed); assert!(state.vmm.gen > prev.vmm.gen); + + // The migration state should transition. + let migration = + state.migration.expect("instance must have a migration state"); + let prev_migration = + prev.migration.expect("previous state must have a migration"); + assert_eq!(migration.state, MigrationState::Failed); + assert!(migration.gen > prev_migration.gen); } // Verifies that the rude-termination state change doesn't update the @@ -782,6 +818,14 @@ mod test { assert_state_change_has_gen_change(&prev, &state); assert_eq!(state.instance.gen, prev.instance.gen); + + // The migration state should transition. + let migration = + state.migration.expect("instance must have a migration state"); + let prev_migration = + prev.migration.expect("previous state must have a migration"); + assert_eq!(migration.state, MigrationState::Failed); + assert!(migration.gen > prev_migration.gen); } #[test] @@ -804,11 +848,22 @@ mod test { assert_eq!(state.vmm.state, VmmState::Running); assert!(state.vmm.gen > prev.vmm.gen); + // The migration state should transition to completed. + let migration = state + .migration + .clone() + .expect("instance must have a migration state"); + let prev_migration = + prev.migration.expect("previous state must have a migration"); + assert_eq!(migration.state, MigrationState::Completed); + assert!(migration.gen > prev_migration.gen); + // Pretend Nexus set some new migration IDs. + let migration_id = Uuid::new_v4(); let prev = state.clone(); state.set_migration_ids( &Some(InstanceMigrationSourceParams { - migration_id: Uuid::new_v4(), + migration_id, dst_propolis_id: Uuid::new_v4(), }), Utc::now(), @@ -817,6 +872,15 @@ mod test { assert!(state.instance.gen > prev.instance.gen); assert_eq!(state.vmm.gen, prev.vmm.gen); + // There should be a new, pending migration state. + let migration = state + .migration + .clone() + .expect("instance must have a migration state"); + assert_eq!(migration.state, MigrationState::Pending); + assert_eq!(migration.migration_id, migration_id); + let prev_migration = migration; + // Mark that the new migration out is in progress. This doesn't change // anything in the instance runtime state, but does update the VMM state // generation. @@ -837,6 +901,15 @@ mod test { assert!(state.vmm.gen > prev.vmm.gen); assert_eq!(state.instance.gen, prev.instance.gen); + // The migration state should transition to in progress. + let migration = state + .migration + .clone() + .expect("instance must have a migration state"); + assert_eq!(migration.state, MigrationState::InProgress); + assert!(migration.gen > prev_migration.gen); + let prev_migration = migration; + // Propolis will publish that the migration succeeds before changing any // state. This should transfer control to the target but should not // touch the migration ID (that is the new target's job). @@ -855,6 +928,14 @@ mod test { assert_eq!(state.instance.propolis_id, state.instance.dst_propolis_id); assert!(state.instance.gen > prev.instance.gen); + // The migration state should transition to completed. + let migration = state + .migration + .clone() + .expect("instance must have a migration state"); + assert_eq!(migration.state, MigrationState::Completed); + assert!(migration.gen > prev_migration.gen); + // The rest of the destruction sequence is covered by other tests. } From 7634307bc906bef484ce338d8a31c30499e00bf3 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 11 Jun 2024 09:43:14 -0700 Subject: [PATCH 24/32] update tests, start out in pending --- sled-agent/src/common/instance.rs | 35 ++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 3d90b72288..009f765721 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -493,7 +493,7 @@ impl InstanceStates { }; self.migration = Some(MigrationRuntimeState { migration_id, - state: MigrationState::InProgress, + state: MigrationState::Pending, role, gen: Generation::new(), time_updated: now, @@ -736,6 +736,17 @@ mod test { assert_eq!(state.instance.propolis_id, state.instance.dst_propolis_id); assert!(state.instance.migration_id.is_some()); + // The migration state should transition to "completed" + let migration = state + .migration + .clone() + .expect("instance must have a migration state"); + let prev_migration = + prev.migration.expect("previous state must have a migration"); + assert_eq!(migration.state, MigrationState::Completed); + assert!(migration.gen > prev_migration.gen); + let prev_migration = migration; + // Once a successful migration is observed, the VMM's state should // continue to update, but the instance's state shouldn't change // anymore. @@ -751,6 +762,15 @@ mod test { assert_eq!(state.vmm.state, VmmState::Stopping); assert!(state.vmm.gen > prev.vmm.gen); + // Now that the migration has completed, it should not transition again. + let migration = state + .migration + .clone() + .expect("instance must have a migration state"); + assert_eq!(migration.state, MigrationState::Completed); + assert_eq!(migration.gen, prev_migration.gen); + let prev_migration = migration; + let prev = state.clone(); observed.vmm_state = PropolisInstanceState(Observed::Destroyed); assert!(matches!( @@ -762,13 +782,12 @@ mod test { assert_eq!(state.vmm.state, VmmState::Destroyed); assert!(state.vmm.gen > prev.vmm.gen); - // The migration state should transition. - let migration = - state.migration.expect("instance must have a migration state"); - let prev_migration = - prev.migration.expect("previous state must have a migration"); - assert_eq!(migration.state, MigrationState::Failed); - assert!(migration.gen > prev_migration.gen); + let migration = state + .migration + .clone() + .expect("instance must have a migration state"); + assert_eq!(migration.state, MigrationState::Completed); + assert_eq!(migration.gen, prev_migration.gen); } #[test] From e4f7c6b7352ddc803571e9205d0efc03b5a9749b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 11 Jun 2024 09:51:47 -0700 Subject: [PATCH 25/32] don't generate spurious state transitions --- sled-agent/src/common/instance.rs | 45 +++++++++++++++++++------------ 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 009f765721..f3270112ff 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -257,6 +257,24 @@ impl InstanceStates { } } + fn transition_migration( + &mut self, + state: MigrationState, + time_updated: DateTime, + ) { + let migration = self.migration.as_mut().expect( + "an ObservedMigrationState should only be constructed when the \ + VMM has an active migration", + ); + // Don't generate spurious state updates if the migration is already in + // the state we're transitioning to. + if migration.state != state { + migration.state = state; + migration.time_updated = time_updated; + migration.gen = migration.gen.next(); + } + } + /// Update the known state of an instance based on an observed state from /// Propolis. pub(crate) fn apply_propolis_observation( @@ -280,13 +298,10 @@ impl InstanceStates { // migration. match observed.migration_status { ObservedMigrationStatus::Succeeded => { - let migration = self.migration.as_mut().expect( - "an ObservedMigrationStatus::Completed is only \ - constructed if a migration state exists", + self.transition_migration( + MigrationState::Completed, + observed.time, ); - migration.state = MigrationState::Completed; - migration.gen = migration.gen.next(); - migration.time_updated = observed.time; match self.propolis_role() { // This is a successful migration out. Point the instance to the // target VMM, but don't clear migration IDs; let the target do @@ -322,13 +337,10 @@ impl InstanceStates { } } ObservedMigrationStatus::Failed => { - let migration = self.migration.as_mut().expect( - "an ObservedMigrationStatus::Failed is only \ - constructed if a migration state exists", + self.transition_migration( + MigrationState::Failed, + observed.time, ); - migration.state = MigrationState::Failed; - migration.gen = migration.gen.next(); - migration.time_updated = observed.time; match self.propolis_role() { // This is a failed migration out. CLear migration IDs so that @@ -349,11 +361,10 @@ impl InstanceStates { } } ObservedMigrationStatus::InProgress => { - if let Some(ref mut migration) = self.migration { - migration.state = MigrationState::InProgress; - migration.gen = migration.gen.next(); - migration.time_updated = observed.time; - } + self.transition_migration( + MigrationState::InProgress, + observed.time, + ); } ObservedMigrationStatus::NoMigration | ObservedMigrationStatus::Pending => {} From 9d0de2b763848a56df102dde8039016ef12e5eec Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 11 Jun 2024 12:06:37 -0700 Subject: [PATCH 26/32] delete records when a migration completes --- common/src/api/internal/nexus.rs | 6 ++++ nexus/db-model/src/migration_state.rs | 9 +++++ .../db-queries/src/db/datastore/migration.rs | 36 ++++++++++++++++++- nexus/src/app/instance.rs | 34 ++++++++++++++++++ nexus/tests/integration_tests/instances.rs | 4 +++ 5 files changed, 88 insertions(+), 1 deletion(-) diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 13ee8fe394..4f990c56e1 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -162,6 +162,12 @@ impl MigrationState { Self::Failed => "failed", } } + /// Returns `true` if this migration state means that the migration is no + /// longer in progress (it has either succeeded or failed). + #[must_use] + pub fn is_terminal(&self) -> bool { + matches!(self, MigrationState::Completed | MigrationState::Failed) + } } impl fmt::Display for MigrationState { diff --git a/nexus/db-model/src/migration_state.rs b/nexus/db-model/src/migration_state.rs index 5a532708a6..694198eb56 100644 --- a/nexus/db-model/src/migration_state.rs +++ b/nexus/db-model/src/migration_state.rs @@ -27,6 +27,15 @@ impl_enum_wrapper!( Failed => b"failed" ); +impl MigrationState { + /// Returns `true` if this migration state means that the migration is no + /// longer in progress (it has either succeeded or failed). + #[must_use] + pub fn is_terminal(&self) -> bool { + self.0.is_terminal() + } +} + impl fmt::Display for MigrationState { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(&self.0, f) diff --git a/nexus/db-queries/src/db/datastore/migration.rs b/nexus/db-queries/src/db/datastore/migration.rs index 7876db3d5b..ba8a4e0392 100644 --- a/nexus/db-queries/src/db/datastore/migration.rs +++ b/nexus/db-queries/src/db/datastore/migration.rs @@ -8,7 +8,7 @@ use super::DataStore; use crate::context::OpContext; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::model::Migration; +use crate::db::model::{Migration, MigrationState}; use crate::db::schema::migration::dsl; use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; @@ -17,6 +17,7 @@ use chrono::Utc; use diesel::prelude::*; use omicron_common::api::external::CreateResult; use omicron_common::api::external::UpdateResult; +use omicron_common::api::internal::nexus; use uuid::Uuid; impl DataStore { @@ -37,6 +38,39 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + /// Marks a migration record as deleted if and only if both sides of the + /// migration are in a terminal state. + pub async fn migration_terminate( + &self, + opctx: &OpContext, + migration_id: Uuid, + ) -> UpdateResult { + const TERMINAL_STATES: &[MigrationState] = &[ + MigrationState(nexus::MigrationState::Completed), + MigrationState(nexus::MigrationState::Failed), + ]; + + diesel::update(dsl::migration) + .filter(dsl::id.eq(migration_id)) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::source_state.eq_any(TERMINAL_STATES)) + .filter(dsl::target_state.eq_any(TERMINAL_STATES)) + .set(dsl::time_deleted.eq(Utc::now())) + .check_if_exists::(migration_id) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await + .map(|r| match r.status { + UpdateStatus::Updated => true, + UpdateStatus::NotUpdatedButExists => false, + }) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Unconditionally mark a migration record as deleted. + /// + /// This is distinct from [`DataStore::migration_terminate`], as it will + /// mark a migration as deleted regardless of the states of the source and + /// target VMMs. pub async fn migration_mark_deleted( &self, opctx: &OpContext, diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 98b4ea91e2..943665cab3 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -2076,6 +2076,40 @@ pub(crate) async fn notify_instance_updated( ) .await; + // Has a migration terminated? If so,mark the migration record as deleted if + // and only if both sides of the migration are in a terminal state. + if let Some(nexus::MigrationRuntimeState { + migration_id, + state, + role, + .. + }) = new_runtime_state.migration_state + { + if state.is_terminal() { + info!( + log, + "migration has terminated, trying to delete it..."; + "instance_id" => %instance_id, + "propolis_id" => %propolis_id, + "migration_id" => %propolis_id, + "migration_state" => %state, + "migration_role" => %role, + ); + if !datastore.migration_terminate(opctx, migration_id).await? { + info!( + log, + "did not mark migration record as deleted (the other half \ + may not yet have reported termination)"; + "instance_id" => %instance_id, + "propolis_id" => %propolis_id, + "migration_id" => %propolis_id, + "migration_state" => %state, + "migration_role" => %role, + ); + } + } + } + // If the VMM is now in a terminal state, make sure its resources get // cleaned up. // diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index c0aa3b61fc..948f8a18f3 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -686,6 +686,9 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { let datastore = cptestctx.server.server_context().nexus.datastore().clone(); let db_state = dsl::migration + // N.B. that for the purposes of this test, we explicitly should + // *not* filter out migrations that are marked as deleted, as the + // migration record is marked as deleted once the migration completes. .filter(dsl::id.eq(migration_id)) .select(Migration::as_select()) .get_results_async::( @@ -824,6 +827,7 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { let migration = dbg!(migration_fetch(cptestctx, migration_id).await); assert_eq!(migration.target_state, MigrationState::Completed.into()); assert_eq!(migration.source_state, MigrationState::Completed.into()); + assert!(migration.time_deleted.is_some()); } #[nexus_test] From f6c98753e4020ed285310b65fde203c44e1fbec5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 11 Jun 2024 13:14:19 -0700 Subject: [PATCH 27/32] report migration failure if an in-progress vmm vanishes --- sled-agent/src/common/instance.rs | 41 +++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index f3270112ff..8f6ab2c4cb 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -386,6 +386,16 @@ impl InstanceStates { self.clear_migration_ids(observed.time); self.retire_active_propolis(observed.time); } + // If there's an active migration and the VMM is suddenly gone, + // that should constitute a migration failure! + if let Some(MigrationState::Pending | MigrationState::InProgress) = + self.migration.as_ref().map(|m| m.state) + { + self.transition_migration( + MigrationState::Failed, + observed.time, + ); + } Some(Action::Destroy) } else { None @@ -719,6 +729,37 @@ mod test { } } + fn test_termination_fails_in_progress_migration( + mk_instance: impl Fn() -> InstanceStates, + ) { + for state in [Observed::Destroyed, Observed::Failed] { + let mut instance_state = mk_instance(); + let original_migration = instance_state.clone().migration.unwrap(); + let requested_action = instance_state + .apply_propolis_observation(&make_observed_state(state.into())); + + let migration = + instance_state.migration.expect("state must have a migration"); + assert_eq!(migration.state, MigrationState::Failed); + assert!(migration.gen > original_migration.gen); + assert!(matches!(requested_action, Some(Action::Destroy))); + } + } + + #[test] + fn source_termination_fails_in_progress_migration() { + test_termination_fails_in_progress_migration( + make_migration_source_instance, + ) + } + + #[test] + fn target_termination_fails_in_progress_migration() { + test_termination_fails_in_progress_migration( + make_migration_target_instance, + ) + } + #[test] fn destruction_after_migration_out_does_not_transition() { let mut state = make_migration_source_instance(); From 77a6dfebee8f7c3202b1026fdd2574ad5cf3a10e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 11 Jun 2024 13:18:09 -0700 Subject: [PATCH 28/32] update comment --- nexus/db-queries/src/db/queries/instance.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/nexus/db-queries/src/db/queries/instance.rs b/nexus/db-queries/src/db/queries/instance.rs index 282e5a6a50..491dab7403 100644 --- a/nexus/db-queries/src/db/queries/instance.rs +++ b/nexus/db-queries/src/db/queries/instance.rs @@ -70,6 +70,12 @@ use crate::db::update_and_check::UpdateStatus; // SELECT vmm_result.found, vmm_result.updated, instance_result.found, // instance_result.updated // FROM vmm_result, instance_result; +/// +/// If a [`MigrationUpdate`] is provided, similar "found" and "update" clauses +/// are also added to join the `migration` record for the isntance's active +/// migration, if one exists, and update the migration record. If no migration +/// record is provided, this part of the query is skipped, and the +/// `migration_found` and `migration_updated` portions are always `false`. // // The "wrapper" SELECTs when finding instances and VMMs are used to get a NULL // result in the final output instead of failing the entire query if the target From 29eae363c038000e8a23ccd5ae14198a8a96170e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 11 Jun 2024 15:42:50 -0700 Subject: [PATCH 29/32] fix comment --- nexus/db-queries/src/db/datastore/instance.rs | 65 +++++++++++++++++++ nexus/db-queries/src/db/queries/instance.rs | 8 +-- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index e090210cbe..234158f094 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -21,6 +21,7 @@ use crate::db::lookup::LookupPath; use crate::db::model::Generation; use crate::db::model::Instance; use crate::db::model::InstanceRuntimeState; +use crate::db::model::Migration; use crate::db::model::Name; use crate::db::model::Project; use crate::db::model::Sled; @@ -119,6 +120,16 @@ impl From for omicron_common::api::external::Instance { } } +/// Wraps a record of an [`Instance`] along with its active [`Vmm`], migration +/// target [`Vmm`], and [`Migration`] record, if a migration exists. +#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] +pub struct InstanceSnapshot { + pub instance: Instance, + pub active_vmm: Option, + pub target_vmm: Option, + pub migration: Option, +} + /// A token which represents that a saga holds the instance-updater lock on a /// particular instance. /// @@ -330,6 +341,60 @@ impl DataStore { Ok(InstanceAndActiveVmm { instance, vmm }) } + pub async fn instance_fetch_all( + &self, + opctx: &OpContext, + authz_instance: &authz::Instance, + ) -> LookupResult { + opctx.authorize(authz::Action::Read, authz_instance).await?; + + use db::schema::instance::dsl as instance_dsl; + use db::schema::migration::dsl as migration_dsl; + use db::schema::vmm::dsl as vmm_dsl; + + let (instance, active_vmm, target_vmm, migration) = instance_dsl::instance + .filter(instance_dsl::id.eq(authz_instance.id())) + .filter(instance_dsl::time_deleted.is_null()) + .left_join( + vmm_dsl::vmm.on((vmm_dsl::id + .nullable() + .eq(instance_dsl::active_propolis_id)) + .and(vmm_dsl::time_deleted.is_null())), + ) + .left_join( + vmm_dsl::vmm.on((vmm_dsl::id + .nullable() + .eq(instance_dsl::target_propolis_id)) + .and(vmm_dsl::time_deleted.is_null())), + ) + .left_join( + migration_dsl::migration + .on(migration_dsl::id + .nullable() + .eq(instance_dsl::migration_id)), + ) + .select(( + Instance::as_select(), + Option::::as_select(), + Option::::as_select(), + Option::::as_select(), + )) + .get_result_async::<(Instance, Option, Option, Option)>( + &*self.pool_connection_authorized(opctx).await?, + ) + .await + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByLookup( + ResourceType::Instance, + LookupType::ById(authz_instance.id()), + ), + ) + })?; + Ok(InstanceSnapshot { instance, active_vmm, target_vmm, migration }) + } + // TODO-design It's tempting to return the updated state of the Instance // here because it's convenient for consumers and by using a RETURNING // clause, we could ensure that the "update" and "fetch" are atomic. diff --git a/nexus/db-queries/src/db/queries/instance.rs b/nexus/db-queries/src/db/queries/instance.rs index 491dab7403..ed584c6ce6 100644 --- a/nexus/db-queries/src/db/queries/instance.rs +++ b/nexus/db-queries/src/db/queries/instance.rs @@ -71,10 +71,10 @@ use crate::db::update_and_check::UpdateStatus; // instance_result.updated // FROM vmm_result, instance_result; /// -/// If a [`MigrationUpdate`] is provided, similar "found" and "update" clauses -/// are also added to join the `migration` record for the isntance's active -/// migration, if one exists, and update the migration record. If no migration -/// record is provided, this part of the query is skipped, and the +/// If a [`MigrationRuntimeState`] is provided, similar "found" and "update" +/// clauses are also added to join the `migration` record for the instance's +/// active migration, if one exists, and update the migration record. If no +/// migration record is provided, this part of the query is skipped, and the /// `migration_found` and `migration_updated` portions are always `false`. // // The "wrapper" SELECTs when finding instances and VMMs are used to get a NULL From 18ccc7e0bb376c8074eb22bf1a9a971602a6b42d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 11 Jun 2024 16:02:24 -0700 Subject: [PATCH 30/32] rm code from other branch --- nexus/db-queries/src/db/datastore/instance.rs | 64 ------------------- 1 file changed, 64 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 234158f094..5d4b7ad483 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -120,16 +120,6 @@ impl From for omicron_common::api::external::Instance { } } -/// Wraps a record of an [`Instance`] along with its active [`Vmm`], migration -/// target [`Vmm`], and [`Migration`] record, if a migration exists. -#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] -pub struct InstanceSnapshot { - pub instance: Instance, - pub active_vmm: Option, - pub target_vmm: Option, - pub migration: Option, -} - /// A token which represents that a saga holds the instance-updater lock on a /// particular instance. /// @@ -341,60 +331,6 @@ impl DataStore { Ok(InstanceAndActiveVmm { instance, vmm }) } - pub async fn instance_fetch_all( - &self, - opctx: &OpContext, - authz_instance: &authz::Instance, - ) -> LookupResult { - opctx.authorize(authz::Action::Read, authz_instance).await?; - - use db::schema::instance::dsl as instance_dsl; - use db::schema::migration::dsl as migration_dsl; - use db::schema::vmm::dsl as vmm_dsl; - - let (instance, active_vmm, target_vmm, migration) = instance_dsl::instance - .filter(instance_dsl::id.eq(authz_instance.id())) - .filter(instance_dsl::time_deleted.is_null()) - .left_join( - vmm_dsl::vmm.on((vmm_dsl::id - .nullable() - .eq(instance_dsl::active_propolis_id)) - .and(vmm_dsl::time_deleted.is_null())), - ) - .left_join( - vmm_dsl::vmm.on((vmm_dsl::id - .nullable() - .eq(instance_dsl::target_propolis_id)) - .and(vmm_dsl::time_deleted.is_null())), - ) - .left_join( - migration_dsl::migration - .on(migration_dsl::id - .nullable() - .eq(instance_dsl::migration_id)), - ) - .select(( - Instance::as_select(), - Option::::as_select(), - Option::::as_select(), - Option::::as_select(), - )) - .get_result_async::<(Instance, Option, Option, Option)>( - &*self.pool_connection_authorized(opctx).await?, - ) - .await - .map_err(|e| { - public_error_from_diesel( - e, - ErrorHandler::NotFoundByLookup( - ResourceType::Instance, - LookupType::ById(authz_instance.id()), - ), - ) - })?; - Ok(InstanceSnapshot { instance, active_vmm, target_vmm, migration }) - } - // TODO-design It's tempting to return the updated state of the Instance // here because it's convenient for consumers and by using a RETURNING // clause, we could ensure that the "update" and "fetch" are atomic. From 00ad1912692da605b29c49983ccf85676030b760 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 11 Jun 2024 16:15:32 -0700 Subject: [PATCH 31/32] oops --- nexus/db-queries/src/db/datastore/instance.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 5d4b7ad483..e090210cbe 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -21,7 +21,6 @@ use crate::db::lookup::LookupPath; use crate::db::model::Generation; use crate::db::model::Instance; use crate::db::model::InstanceRuntimeState; -use crate::db::model::Migration; use crate::db::model::Name; use crate::db::model::Project; use crate::db::model::Sled; From a1ca9460aaf5bb552c5883ec8ebba8a12044fa2d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 12 Jun 2024 10:42:41 -0700 Subject: [PATCH 32/32] Update sled-agent/src/common/instance.rs --- sled-agent/src/common/instance.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 8f6ab2c4cb..b2c135fcf8 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -332,7 +332,7 @@ impl InstanceStates { // This is a migration source that previously reported success // and removed itself from the active Propolis position. Don't - // touch the instance.' + // touch the instance. PropolisRole::Retired => {} } }