Skip to content

Commit

Permalink
remove most of instance_set_migration_ids
Browse files Browse the repository at this point in the history
  • Loading branch information
hawkw committed Jun 20, 2024
1 parent 8cc15a5 commit aba9f19
Show file tree
Hide file tree
Showing 12 changed files with 294 additions and 356 deletions.
40 changes: 40 additions & 0 deletions clients/sled-agent-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
use anyhow::Context;
use async_trait::async_trait;
use omicron_common::api::internal::shared::NetworkInterface;
use schemars::JsonSchema;
use serde::Deserialize;
use serde::Serialize;
use std::convert::TryFrom;
use std::fmt;
use std::hash::Hash;
Expand Down Expand Up @@ -571,6 +574,11 @@ impl From<omicron_common::api::internal::shared::NetworkInterfaceKind>
#[async_trait]
pub trait TestInterfaces {
async fn instance_finish_transition(&self, id: Uuid);
async fn instance_simulate_migration_source(
&self,
id: Uuid,
params: SimulateMigrationSource,
);
async fn disk_finish_transition(&self, id: Uuid);
}

Expand All @@ -597,4 +605,36 @@ impl TestInterfaces for Client {
.await
.expect("disk_finish_transition() failed unexpectedly");
}

async fn instance_simulate_migration_source(
&self,
id: Uuid,
params: SimulateMigrationSource,
) {
let baseurl = self.baseurl();
let client = self.client();
let url = format!("{baseurl}/instances/{id}/sim-migration-source");
client
.post(url)
.send()
.await
.expect("instance_simulate_migration_source() failed unexpectedly");
}
}

// N.B. that this needs to be kept in sync with the types defined in
// `sled_agent::sim`! AFAICT this is the first simulated-only interface that has
// a body, so I wasn't sure whether there was a nice way to do this without
// creating a cyclic dependency or taking a giant pile of query params instead
// of JSON...
#[derive(Serialize, Deserialize, JsonSchema)]
pub struct SimulateMigrationSource {
pub migration_id: Uuid,
pub result: SimulatedMigrationResult,
}

#[derive(Serialize, Deserialize, JsonSchema)]
pub enum SimulatedMigrationResult {
Success,
Failure,
}
73 changes: 3 additions & 70 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -538,7 +538,7 @@ impl super::Nexus {
&self,
opctx: &OpContext,
instance_id: InstanceUuid,
sled_id: SledUuid,
src_propolis_id: PropolisUuid,
prev_instance_runtime: &db::model::InstanceRuntimeState,
migration_params: InstanceMigrationSourceParams,
) -> UpdateResult<db::model::Instance> {
Expand All @@ -550,42 +550,7 @@ impl super::Nexus {
.lookup_for(authz::Action::Modify)
.await?;

let sa = self.sled_client(&sled_id).await?;
let instance_put_result = sa
.instance_put_migration_ids(
&instance_id,
&InstancePutMigrationIdsBody {
old_runtime: prev_instance_runtime.clone().into(),
migration_params: Some(migration_params),
},
)
.await
.map(|res| Some(res.into_inner().into()))
.map_err(|e| SledAgentInstancePutError(e));

// Write the updated instance runtime state back to CRDB. If this
// outright fails, this operation fails. If the operation nominally
// succeeds but nothing was updated, this action is outdated and the
// caller should not proceed with migration.
let InstanceUpdateResult { instance_updated, .. } =
match instance_put_result {
Ok(state) => {
self.write_returned_instance_state(&instance_id, state)
.await?
}
Err(e) => {
if e.instance_unhealthy() {
let _ = self
.mark_instance_failed(
&instance_id,
&prev_instance_runtime,
&e,
)
.await;
}
return Err(e.into());
}
};
let instance_updated = todo!("eliza: do this transition purely in nexus rather than in sled-agent...");

if instance_updated {
Ok(self
Expand Down Expand Up @@ -617,44 +582,12 @@ impl super::Nexus {
pub(crate) async fn instance_clear_migration_ids(
&self,
instance_id: InstanceUuid,
sled_id: SledUuid,
prev_instance_runtime: &db::model::InstanceRuntimeState,
) -> Result<(), Error> {
assert!(prev_instance_runtime.migration_id.is_some());
assert!(prev_instance_runtime.dst_propolis_id.is_some());

let sa = self.sled_client(&sled_id).await?;
let instance_put_result = sa
.instance_put_migration_ids(
&instance_id,
&InstancePutMigrationIdsBody {
old_runtime: prev_instance_runtime.clone().into(),
migration_params: None,
},
)
.await
.map(|res| Some(res.into_inner().into()))
.map_err(|e| SledAgentInstancePutError(e));

match instance_put_result {
Ok(state) => {
self.write_returned_instance_state(&instance_id, state).await?;
}
Err(e) => {
if e.instance_unhealthy() {
let _ = self
.mark_instance_failed(
&instance_id,
&prev_instance_runtime,
&e,
)
.await;
}
return Err(e.into());
}
}

Ok(())
todo!("eliza: do this transition in the DB rather than in sled-agent")
}

/// Reboot the specified instance.
Expand Down
22 changes: 6 additions & 16 deletions nexus/src/app/sagas/instance_migrate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,7 @@ declare_saga_actions! {


// This step the instance's migration ID and destination Propolis ID
// fields. Because the instance is active, its current sled agent maintains
// its most recent runtime state, so to update it, the saga calls into the
// sled and asks it to produce an updated instance record with the
// appropriate migration IDs and a new generation number.
//
// The source sled agent synchronizes concurrent attempts to set these IDs.
// Setting a new migration ID and re-setting an existing ID are allowed, but
// trying to set an ID when a different ID is already present fails.
// fields.
SET_MIGRATION_IDS -> "set_migration_ids" {
+ sim_set_migration_ids
- sim_clear_migration_ids
Expand Down Expand Up @@ -322,14 +315,15 @@ async fn sim_set_migration_ids(

let db_instance = &params.instance;
let instance_id = InstanceUuid::from_untyped_uuid(db_instance.id());
let src_sled_id = SledUuid::from_untyped_uuid(params.src_vmm.sled_id);
let src_propolis_id =
PropolisUuid::from_untyped_uuid(params.src_vmm.sled_id);
let migration_id = sagactx.lookup::<Uuid>("migrate_id")?;
let dst_propolis_id = sagactx.lookup::<PropolisUuid>("dst_propolis_id")?;

info!(osagactx.log(), "setting migration IDs on migration source sled";
"instance_id" => %db_instance.id(),
"sled_id" => %src_sled_id,
"migration_id" => %migration_id,
"src_propolis_id" => %src_propolis_id,
"dst_propolis_id" => %dst_propolis_id,
"prev_runtime_state" => ?db_instance.runtime());

Expand All @@ -338,7 +332,7 @@ async fn sim_set_migration_ids(
.instance_set_migration_ids(
&opctx,
instance_id,
src_sled_id,
src_propolis_id,
db_instance.runtime(),
InstanceMigrationSourceParams { dst_propolis_id, migration_id },
)
Expand Down Expand Up @@ -377,11 +371,7 @@ async fn sim_clear_migration_ids(
// as failed.
if let Err(e) = osagactx
.nexus()
.instance_clear_migration_ids(
instance_id,
src_sled_id,
db_instance.runtime(),
)
.instance_clear_migration_ids(instance_id, db_instance.runtime())
.await
{
warn!(osagactx.log(),
Expand Down
75 changes: 74 additions & 1 deletion nexus/tests/integration_tests/instances.rs
Original file line number Diff line number Diff line change
Expand Up @@ -813,10 +813,18 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) {
// sufficient to move the instance back into a Running state (strictly
// speaking no further updates from the source are required if the target
// successfully takes over).
instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await;
instance_simulate_migration_source(
cptestctx,
nexus,
original_sled,
instance_id,
migration_id,
)
.await;
// Ensure that both sled agents report that the migration has completed.
instance_simulate_on_sled(cptestctx, nexus, original_sled, instance_id)
.await;
instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await;

let instance = instance_get(&client, &instance_url).await;
assert_eq!(instance.runtime.run_state, InstanceState::Running);
Expand Down Expand Up @@ -943,8 +951,40 @@ async fn test_instance_migrate_v2p(cptestctx: &ControlPlaneTestContext) {
.parsed_body::<Instance>()
.unwrap();

let migration_id = {
let datastore = apictx.nexus.datastore();
let opctx = OpContext::for_tests(
cptestctx.logctx.log.new(o!()),
datastore.clone(),
);
let (.., authz_instance) = LookupPath::new(&opctx, &datastore)
.instance_id(instance.identity.id)
.lookup_for(nexus_db_queries::authz::Action::Read)
.await
.unwrap();
datastore
.instance_refetch(&opctx, &authz_instance)
.await
.unwrap()
.runtime_state
.migration_id
.expect("since we've started a migration, the instance record must have a migration id!")
};

// Tell both sled-agents to pretend to do the migration.
instance_simulate_migration_source(
cptestctx,
nexus,
original_sled,
instance_id,
migration_id,
)
.await;
instance_simulate_on_sled(cptestctx, nexus, original_sled_id, instance_id)
.await;
instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await;
let instance = instance_get(&client, &instance_url).await;

assert_eq!(instance.runtime.run_state, InstanceState::Running);
let current_sled = nexus
.instance_sled_id(&instance_id)
Expand Down Expand Up @@ -4804,3 +4844,36 @@ async fn instance_simulate_on_sled(
let sa = nexus.sled_client(&sled_id).await.unwrap();
sa.instance_finish_transition(instance_id.into_untyped_uuid()).await;
}

/// Simulates a migration source for the provided instance ID, sled ID, and
/// migration ID.
//
// XXX(eliza): I had really wanted to have the migration target's simulated
// sled-agent do this automagically when it's told to start a migration in, but
// unfortunately, I wasn't able to figure out a way for it to get the simulated
// *sled-agent*'s IP --- it just gets the Propolis IP in the migration target
// params, and the propolis doesn't actually exist...
async fn instance_simulate_migration_source(
cptestctx: &ControlPlaneTestContext,
nexus: &Arc<Nexus>,
sled_id: SledUuid,
instance_id: InstanceUuid,
migration_id: Uuid,
) {
info!(
&cptestctx.logctx.log,
"Simulating migration source sled";
"instance_id" => %instance_id,
"sled_id" => %sled_id,
"migration_id" => %migration_id,
);
let sa = nexus.sled_client(&sled_id).await.unwrap();
sa.instance_simulate_migrationSource(
instance_id.into_untyped_uuid(),
sled_agent_client::SimulateMigrationSource {
migration_id,
result: sled_agent_client::SimulatedMigrationResult::Success,
},
)
.await;
}
30 changes: 8 additions & 22 deletions sled-agent/src/common/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

//! Describes the states of VM instances.

use crate::params::InstanceMigrationSourceParams;
use chrono::{DateTime, Utc};
use omicron_common::api::external::Generation;
use omicron_common::api::internal::nexus::{
Expand Down Expand Up @@ -191,6 +190,14 @@ impl InstanceStates {
self.propolis_id
}

pub fn migration_in(&self) -> Option<&MigrationRuntimeState> {
self.migration_in.as_ref()
}

pub fn migration_out(&self) -> Option<&MigrationRuntimeState> {
self.migration_out.as_ref()
}

/// Creates a `SledInstanceState` structure containing the entirety of this
/// structure's runtime state. This requires cloning; for simple read access
/// use the `instance` or `vmm` accessors instead.
Expand Down Expand Up @@ -342,27 +349,6 @@ impl InstanceStates {

self.apply_propolis_observation(&fake_observed);
}

/// Sets or clears this instance's migration IDs and advances its Propolis
/// generation number.
#[deprecated(note = "eliza get rid of this")]
pub(crate) fn set_migration_ids(
&mut self,
ids: &Option<InstanceMigrationSourceParams>,
now: DateTime<Utc>,
) {
}

/// Returns true if the migration IDs in this instance are already set as they
/// would be on a successful transition from the migration IDs in
/// `old_runtime` to the ones in `migration_ids`.
#[deprecated(note = "eliza get rid of this")]
pub(crate) fn migration_ids_already_set(
&self,
migration_ids: &Option<InstanceMigrationSourceParams>,
) -> bool {
false
}
}

#[cfg(test)]
Expand Down
19 changes: 6 additions & 13 deletions sled-agent/src/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -492,20 +492,13 @@ async fn instance_get_state(
path = "/instances/{instance_id}/migration-ids",
}]
async fn instance_put_migration_ids(
rqctx: RequestContext<SledAgent>,
path_params: Path<InstancePathParam>,
body: TypedBody<InstancePutMigrationIdsBody>,
_: RequestContext<SledAgent>,
_: Path<InstancePathParam>,
_: TypedBody<InstancePutMigrationIdsBody>,
) -> Result<HttpResponseOk<SledInstanceState>, HttpError> {
let sa = rqctx.context();
let instance_id = path_params.into_inner().instance_id;
let body_args = body.into_inner();
Ok(HttpResponseOk(
sa.instance_put_migration_ids(
instance_id,
&body_args.old_runtime,
&body_args.migration_params,
)
.await?,
Err(HttpError::for_bad_request(
None,
"operation no longer supported".to_string(),
))
}

Expand Down
Loading

0 comments on commit aba9f19

Please sign in to comment.