diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index acf282a1f9..767d8f53d5 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -149,6 +149,49 @@ impl From instance_state: s.instance_state.into(), propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), + migration_state: s.migration_state.map(Into::into), + } + } +} + +impl From + for types::MigrationRuntimeState +{ + fn from( + s: omicron_common::api::internal::nexus::MigrationRuntimeState, + ) -> Self { + Self { + migration_id: s.migration_id, + role: s.role.into(), + state: s.state.into(), + gen: s.gen, + time_updated: s.time_updated, + } + } +} + +impl From + for types::MigrationRole +{ + fn from(s: omicron_common::api::internal::nexus::MigrationRole) -> Self { + use omicron_common::api::internal::nexus::MigrationRole as Input; + match s { + Input::Source => Self::Source, + Input::Target => Self::Target, + } + } +} + +impl From + for types::MigrationState +{ + fn from(s: omicron_common::api::internal::nexus::MigrationState) -> Self { + use omicron_common::api::internal::nexus::MigrationState as Input; + match s { + Input::Pending => Self::Pending, + Input::InProgress => Self::InProgress, + Input::Completed => Self::Completed, + Input::Failed => Self::Failed, } } } diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 1edd464cd5..aa42429089 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -328,6 +328,47 @@ impl From instance_state: s.instance_state.into(), propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), + migration_state: s.migration_state.map(Into::into), + } + } +} + +impl From + for omicron_common::api::internal::nexus::MigrationRuntimeState +{ + fn from(s: types::MigrationRuntimeState) -> Self { + Self { + migration_id: s.migration_id, + state: s.state.into(), + role: s.role.into(), + gen: s.gen, + time_updated: s.time_updated, + } + } +} + 
+impl From + for omicron_common::api::internal::nexus::MigrationRole +{ + fn from(r: types::MigrationRole) -> Self { + use omicron_common::api::internal::nexus::MigrationRole as Output; + match r { + types::MigrationRole::Source => Output::Source, + types::MigrationRole::Target => Output::Target, + } + } +} + +impl From + for omicron_common::api::internal::nexus::MigrationState +{ + fn from(s: types::MigrationState) -> Self { + use omicron_common::api::internal::nexus::MigrationState as Output; + match s { + types::MigrationState::Pending => Output::Pending, + types::MigrationState::InProgress => Output::InProgress, + types::MigrationState::Failed => Output::Failed, + types::MigrationState::Completed => Output::Completed, } } } diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index b569437f43..4f990c56e1 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -16,6 +16,7 @@ use omicron_uuid_kinds::UpstairsSessionKind; use parse_display::{Display, FromStr}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use std::fmt; use std::net::SocketAddr; use std::time::Duration; use strum::{EnumIter, IntoEnumIterator}; @@ -108,6 +109,97 @@ pub struct SledInstanceState { /// The most recent state of the sled's VMM process. pub vmm_state: VmmRuntimeState, + + /// The current state of any in-progress migration for this instance, as + /// understood by this sled. + pub migration_state: Option, +} + +/// An update from a sled regarding the state of a migration, indicating the +/// role of the VMM whose migration state was updated. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct MigrationRuntimeState { + pub migration_id: Uuid, + pub state: MigrationState, + pub role: MigrationRole, + pub gen: Generation, + + /// Timestamp for the migration state update. + pub time_updated: DateTime, +} + +/// The state of an instance's live migration. 
+#[derive( + Clone, + Copy, + Debug, + Default, + PartialEq, + Eq, + Deserialize, + Serialize, + JsonSchema, +)] +#[serde(rename_all = "snake_case")] +pub enum MigrationState { + /// The migration has not started for this VMM. + #[default] + Pending, + /// The migration is in progress. + InProgress, + /// The migration has failed. + Failed, + /// The migration has completed. + Completed, +} + +impl MigrationState { + pub fn label(&self) -> &'static str { + match self { + Self::Pending => "pending", + Self::InProgress => "in_progress", + Self::Completed => "completed", + Self::Failed => "failed", + } + } + /// Returns `true` if this migration state means that the migration is no + /// longer in progress (it has either succeeded or failed). + #[must_use] + pub fn is_terminal(&self) -> bool { + matches!(self, MigrationState::Completed | MigrationState::Failed) + } +} + +impl fmt::Display for MigrationState { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.label()) + } +} + +#[derive( + Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema, +)] +#[serde(rename_all = "snake_case")] +pub enum MigrationRole { + /// This update concerns the source VMM of a migration. + Source, + /// This update concerns the target VMM of a migration. + Target, +} + +impl MigrationRole { + pub fn label(&self) -> &'static str { + match self { + Self::Source => "source", + Self::Target => "target", + } + } +} + +impl fmt::Display for MigrationRole { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.label()) + } } // Oximeter producer/collector objects. 
diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 040882a8f0..adec410e42 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -42,6 +42,8 @@ pub mod ipv6; mod ipv6net; mod l4_port_range; mod macaddr; +mod migration; +mod migration_state; mod name; mod network_interface; mod oximeter_info; @@ -152,6 +154,8 @@ pub use ipv4net::*; pub use ipv6::*; pub use ipv6net::*; pub use l4_port_range::*; +pub use migration::*; +pub use migration_state::*; pub use name::*; pub use network_interface::*; pub use oximeter_info::*; diff --git a/nexus/db-model/src/migration.rs b/nexus/db-model/src/migration.rs new file mode 100644 index 0000000000..2abb6e3ee0 --- /dev/null +++ b/nexus/db-model/src/migration.rs @@ -0,0 +1,78 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::Generation; +use crate::schema::migration; +use crate::MigrationState; +use chrono::DateTime; +use chrono::Utc; +use omicron_common::api::internal::nexus; +use serde::Deserialize; +use serde::Serialize; +use uuid::Uuid; + +/// The state of a migration as understood by Nexus. +#[derive( + Clone, Debug, Queryable, Insertable, Selectable, Serialize, Deserialize, +)] +#[diesel(table_name = migration)] +pub struct Migration { + /// The migration's UUID. + /// + /// This is the primary key of the migration table and is referenced by the + /// `instance` table's `migration_id` field. + pub id: Uuid, + + /// The time at which this migration record was created. + pub time_created: DateTime, + + /// The time at which this migration record was deleted, + pub time_deleted: Option>, + + /// The state of the migration source VMM. + pub source_state: MigrationState, + + /// The ID of the migration source VMM. + pub source_propolis_id: Uuid, + + /// The generation number for the source state. 
+ pub source_gen: Generation, + + /// The time the source VMM state was most recently updated. + pub time_source_updated: Option>, + + /// The state of the migration target VMM. + pub target_state: MigrationState, + + /// The ID of the migration target VMM. + pub target_propolis_id: Uuid, + + /// The generation number for the target state. + pub target_gen: Generation, + + /// The time the target VMM state was most recently updated. + pub time_target_updated: Option>, +} + +impl Migration { + pub fn new( + migration_id: Uuid, + source_propolis_id: Uuid, + target_propolis_id: Uuid, + ) -> Self { + Self { + id: migration_id, + time_created: Utc::now(), + time_deleted: None, + source_state: nexus::MigrationState::Pending.into(), + source_propolis_id, + source_gen: Generation::new(), + time_source_updated: None, + target_state: nexus::MigrationState::Pending.into(), + target_propolis_id, + target_gen: Generation::new(), + time_target_updated: None, + } + } +} diff --git a/nexus/db-model/src/migration_state.rs b/nexus/db-model/src/migration_state.rs new file mode 100644 index 0000000000..694198eb56 --- /dev/null +++ b/nexus/db-model/src/migration_state.rs @@ -0,0 +1,49 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Database representation of a migration's state as understood by Nexus. 
+ +use super::impl_enum_wrapper; +use omicron_common::api::internal::nexus; +use serde::Deserialize; +use serde::Serialize; +use std::fmt; +use std::io::Write; + +impl_enum_wrapper!( + #[derive(Clone, SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "migration_state", schema = "public"))] + pub struct MigrationStateEnum; + + #[derive(Clone, Copy, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq, Eq)] + #[diesel(sql_type = MigrationStateEnum)] + pub struct MigrationState(pub nexus::MigrationState); + + // Enum values + Pending => b"pending" + InProgress => b"in_progress" + Completed => b"completed" + Failed => b"failed" +); + +impl MigrationState { + /// Returns `true` if this migration state means that the migration is no + /// longer in progress (it has either succeeded or failed). + #[must_use] + pub fn is_terminal(&self) -> bool { + self.0.is_terminal() + } +} + +impl fmt::Display for MigrationState { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.0, f) + } +} + +impl From for MigrationState { + fn from(s: nexus::MigrationState) -> Self { + Self(s) + } +} diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index b91fa72aff..4c07f6065b 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1759,6 +1759,25 @@ table! { } } +table! 
{ + migration (id) { + id -> Uuid, + time_created -> Timestamptz, + time_deleted -> Nullable, + source_state -> crate::MigrationStateEnum, + source_propolis_id -> Uuid, + source_gen -> Int8, + time_source_updated -> Nullable, + target_state -> crate::MigrationStateEnum, + target_propolis_id -> Uuid, + target_gen -> Int8, + time_target_updated -> Nullable, + } +} + +allow_tables_to_appear_in_same_query!(instance, migration); +joinable!(instance -> migration (migration_id)); + allow_tables_to_appear_in_same_query!( ip_pool_range, ip_pool, diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index b1b2697bb3..b174809fae 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(73, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(74, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(74, "add-migration-table"), KnownVersion::new(73, "add-vlan-to-uplink"), KnownVersion::new(72, "fix-provisioning-counters"), KnownVersion::new(71, "add-saga-unwound-vmm-state"), diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index b9989fe31c..e090210cbe 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -46,6 +46,7 @@ use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; +use omicron_common::api::internal::nexus::MigrationRuntimeState; use omicron_common::bail_unless; use ref_cast::RefCast; use uuid::Uuid; @@ -141,6 +142,25 @@ pub enum UpdaterLockError { Query(#[from] Error), } +/// The result of an [`DataStore::instance_and_vmm_update_runtime`] call, +/// indicating which records were updated. +#[derive(Copy, Clone, Debug)] +pub struct InstanceUpdateResult { + /// `true` if the instance record was updated, `false` otherwise. + pub instance_updated: bool, + /// `true` if the VMM record was updated, `false` otherwise. + pub vmm_updated: bool, + /// Indicates whether a migration record for this instance was updated, if a + /// [`MigrationRuntimeState`] was provided to + /// [`DataStore::instance_and_vmm_update_runtime`]. 
+ /// + /// - `Some(true)` if a migration record was updated + /// - `Some(false)` if a [`MigrationRuntimeState`] was provided, but the + /// migration record was not updated + /// - `None` if no [`MigrationRuntimeState`] was provided + pub migration_updated: Option, +} + impl DataStore { /// Idempotently insert a database record for an Instance /// @@ -372,12 +392,11 @@ impl DataStore { /// /// # Return value /// - /// - `Ok((instance_updated, vmm_updated))` if the query was issued - /// successfully. `instance_updated` and `vmm_updated` are each true if - /// the relevant item was updated and false otherwise. Note that an update - /// can fail because it was inapplicable (i.e. the database has state with - /// a newer generation already) or because the relevant record was not - /// found. + /// - `Ok(`[`InstanceUpdateResult`]`)` if the query was issued + /// successfully. The returned [`InstanceUpdateResult`] indicates which + /// database record(s) were updated. Note that an update can fail because + /// it was inapplicable (i.e. the database has state with a newer + /// generation already) or because the relevant record was not found. /// - `Err` if another error occurred while accessing the database. 
pub async fn instance_and_vmm_update_runtime( &self, @@ -385,12 +404,14 @@ impl DataStore { new_instance: &InstanceRuntimeState, vmm_id: &Uuid, new_vmm: &VmmRuntimeState, - ) -> Result<(bool, bool), Error> { + migration: &Option, + ) -> Result { let query = crate::db::queries::instance::InstanceAndVmmUpdate::new( *instance_id, new_instance.clone(), *vmm_id, new_vmm.clone(), + migration.clone(), ); // The InstanceAndVmmUpdate query handles and indicates failure to find @@ -413,7 +434,22 @@ impl DataStore { None => false, }; - Ok((instance_updated, vmm_updated)) + let migration_updated = if migration.is_some() { + Some(match result.migration_status { + Some(UpdateStatus::Updated) => true, + Some(UpdateStatus::NotUpdatedButExists) => false, + None => false, + }) + } else { + debug_assert_eq!(result.migration_status, None); + None + }; + + Ok(InstanceUpdateResult { + instance_updated, + vmm_updated, + migration_updated, + }) } /// Lists all instances on in-service sleds with active Propolis VMM diff --git a/nexus/db-queries/src/db/datastore/migration.rs b/nexus/db-queries/src/db/datastore/migration.rs new file mode 100644 index 0000000000..ba8a4e0392 --- /dev/null +++ b/nexus/db-queries/src/db/datastore/migration.rs @@ -0,0 +1,92 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! [`DataStore`] methods on [`Migration`]s. 
+ +use super::DataStore; +use crate::context::OpContext; +use crate::db::error::public_error_from_diesel; +use crate::db::error::ErrorHandler; +use crate::db::model::{Migration, MigrationState}; +use crate::db::schema::migration::dsl; +use crate::db::update_and_check::UpdateAndCheck; +use crate::db::update_and_check::UpdateStatus; +use async_bb8_diesel::AsyncRunQueryDsl; +use chrono::Utc; +use diesel::prelude::*; +use omicron_common::api::external::CreateResult; +use omicron_common::api::external::UpdateResult; +use omicron_common::api::internal::nexus; +use uuid::Uuid; + +impl DataStore { + /// Insert a database record for a migration. + pub async fn migration_insert( + &self, + opctx: &OpContext, + migration: Migration, + ) -> CreateResult { + diesel::insert_into(dsl::migration) + .values(migration) + .on_conflict(dsl::id) + .do_update() + .set(dsl::time_created.eq(dsl::time_created)) + .returning(Migration::as_returning()) + .get_result_async(&*self.pool_connection_authorized(opctx).await?) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Marks a migration record as deleted if and only if both sides of the + /// migration are in a terminal state. + pub async fn migration_terminate( + &self, + opctx: &OpContext, + migration_id: Uuid, + ) -> UpdateResult { + const TERMINAL_STATES: &[MigrationState] = &[ + MigrationState(nexus::MigrationState::Completed), + MigrationState(nexus::MigrationState::Failed), + ]; + + diesel::update(dsl::migration) + .filter(dsl::id.eq(migration_id)) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::source_state.eq_any(TERMINAL_STATES)) + .filter(dsl::target_state.eq_any(TERMINAL_STATES)) + .set(dsl::time_deleted.eq(Utc::now())) + .check_if_exists::(migration_id) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) 
+ .await + .map(|r| match r.status { + UpdateStatus::Updated => true, + UpdateStatus::NotUpdatedButExists => false, + }) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Unconditionally mark a migration record as deleted. + /// + /// This is distinct from [`DataStore::migration_terminate`], as it will + /// mark a migration as deleted regardless of the states of the source and + /// target VMMs. + pub async fn migration_mark_deleted( + &self, + opctx: &OpContext, + migration_id: Uuid, + ) -> UpdateResult { + diesel::update(dsl::migration) + .filter(dsl::id.eq(migration_id)) + .filter(dsl::time_deleted.is_null()) + .set(dsl::time_deleted.eq(Utc::now())) + .check_if_exists::(migration_id) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await + .map(|r| match r.status { + UpdateStatus::Updated => true, + UpdateStatus::NotUpdatedButExists => false, + }) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } +} diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 9ec3575860..df85d62c69 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -69,6 +69,7 @@ pub mod instance; mod inventory; mod ip_pool; mod ipv4_nat_entry; +mod migration; mod network_interface; mod oximeter; mod physical_disk; diff --git a/nexus/db-queries/src/db/queries/instance.rs b/nexus/db-queries/src/db/queries/instance.rs index ea40877450..ed584c6ce6 100644 --- a/nexus/db-queries/src/db/queries/instance.rs +++ b/nexus/db-queries/src/db/queries/instance.rs @@ -12,8 +12,14 @@ use diesel::sql_types::{Nullable, Uuid as SqlUuid}; use diesel::{pg::Pg, query_builder::AstPass}; use diesel::{Column, ExpressionMethods, QueryDsl, RunQueryDsl}; use nexus_db_model::{ - schema::{instance::dsl as instance_dsl, vmm::dsl as vmm_dsl}, - InstanceRuntimeState, VmmRuntimeState, + schema::{ + instance::dsl as instance_dsl, migration::dsl as migration_dsl, + 
vmm::dsl as vmm_dsl, + }, + Generation, InstanceRuntimeState, MigrationState, VmmRuntimeState, +}; +use omicron_common::api::internal::nexus::{ + MigrationRole, MigrationRuntimeState, }; use uuid::Uuid; @@ -64,6 +70,12 @@ use crate::db::update_and_check::UpdateStatus; // SELECT vmm_result.found, vmm_result.updated, instance_result.found, // instance_result.updated // FROM vmm_result, instance_result; +/// +/// If a [`MigrationRuntimeState`] is provided, similar "found" and "update" +/// clauses are also added to join the `migration` record for the instance's +/// active migration, if one exists, and update the migration record. If no +/// migration record is provided, this part of the query is skipped, and the +/// `migration_found` and `migration_updated` portions are always `false`. // // The "wrapper" SELECTs when finding instances and VMMs are used to get a NULL // result in the final output instead of failing the entire query if the target @@ -76,6 +88,12 @@ pub struct InstanceAndVmmUpdate { vmm_find: Box + Send>, instance_update: Box + Send>, vmm_update: Box + Send>, + migration: Option, +} + +struct MigrationUpdate { + find: Box + Send>, + update: Box + Send>, } /// Contains the result of a combined instance-and-VMM update operation. @@ -89,6 +107,11 @@ pub struct InstanceAndVmmUpdateResult { /// `Some(status)` if the target VMM was found; the wrapped `UpdateStatus` /// indicates whether the row was updated. `None` if the VMM was not found. pub vmm_status: Option, + + /// `Some(status)` if the target migration was found; the wrapped `UpdateStatus` + /// indicates whether the row was updated. `None` if the migration was not + /// found, or no migration update was performed. 
+ pub migration_status: Option, } /// Computes the update status to return from the results of queries that find @@ -135,6 +158,7 @@ impl InstanceAndVmmUpdate { new_instance_runtime_state: InstanceRuntimeState, vmm_id: Uuid, new_vmm_runtime_state: VmmRuntimeState, + migration: Option, ) -> Self { let instance_find = Box::new( instance_dsl::instance @@ -165,24 +189,90 @@ impl InstanceAndVmmUpdate { .set(new_vmm_runtime_state), ); - Self { instance_find, vmm_find, instance_update, vmm_update } + let migration = migration.map( + |MigrationRuntimeState { + role, + migration_id, + state, + gen, + time_updated, + }| { + let state = MigrationState::from(state); + let find = Box::new( + migration_dsl::migration + .filter(migration_dsl::id.eq(migration_id)) + .filter(migration_dsl::time_deleted.is_null()) + .select(migration_dsl::id), + ); + let gen = Generation::from(gen); + let update: Box + Send> = match role { + MigrationRole::Target => Box::new( + diesel::update(migration_dsl::migration) + .filter(migration_dsl::id.eq(migration_id)) + .filter( + migration_dsl::target_propolis_id.eq(vmm_id), + ) + .filter(migration_dsl::target_gen.lt(gen)) + .set(( + migration_dsl::target_state.eq(state), + migration_dsl::time_target_updated + .eq(time_updated), + )), + ), + MigrationRole::Source => Box::new( + diesel::update(migration_dsl::migration) + .filter(migration_dsl::id.eq(migration_id)) + .filter( + migration_dsl::source_propolis_id.eq(vmm_id), + ) + .filter(migration_dsl::source_gen.lt(gen)) + .set(( + migration_dsl::source_state.eq(state), + migration_dsl::time_source_updated + .eq(time_updated), + )), + ), + }; + MigrationUpdate { find, update } + }, + ); + + Self { instance_find, vmm_find, instance_update, vmm_update, migration } } pub async fn execute_and_check( self, conn: &(impl async_bb8_diesel::AsyncConnection + Sync), ) -> Result { - let (vmm_found, vmm_updated, instance_found, instance_updated) = - self.get_result_async::<(Option, - Option, - Option, - 
Option)>(conn).await?; + let ( + vmm_found, + vmm_updated, + instance_found, + instance_updated, + migration_found, + migration_updated, + ) = self + .get_result_async::<( + Option, + Option, + Option, + Option, + Option, + Option, + )>(conn) + .await?; let instance_status = compute_update_status(instance_found, instance_updated); let vmm_status = compute_update_status(vmm_found, vmm_updated); + let migration_status = + compute_update_status(migration_found, migration_updated); - Ok(InstanceAndVmmUpdateResult { instance_status, vmm_status }) + Ok(InstanceAndVmmUpdateResult { + instance_status, + vmm_status, + migration_status, + }) } } @@ -197,6 +287,8 @@ impl Query for InstanceAndVmmUpdate { Nullable, Nullable, Nullable, + Nullable, + Nullable, ); } @@ -212,6 +304,12 @@ impl QueryFragment for InstanceAndVmmUpdate { self.vmm_find.walk_ast(out.reborrow())?; out.push_sql(") AS id), "); + if let Some(MigrationUpdate { ref find, .. }) = self.migration { + out.push_sql("migration_found AS (SELECT ("); + find.walk_ast(out.reborrow())?; + out.push_sql(") AS id), "); + } + out.push_sql("instance_updated AS ("); self.instance_update.walk_ast(out.reborrow())?; out.push_sql(" RETURNING id), "); @@ -220,6 +318,12 @@ impl QueryFragment for InstanceAndVmmUpdate { self.vmm_update.walk_ast(out.reborrow())?; out.push_sql(" RETURNING id), "); + if let Some(MigrationUpdate { ref update, .. 
}) = self.migration { + out.push_sql("migration_updated AS ("); + update.walk_ast(out.reborrow())?; + out.push_sql(" RETURNING id), "); + } + out.push_sql("vmm_result AS ("); out.push_sql("SELECT vmm_found."); out.push_identifier(vmm_dsl::id::NAME)?; @@ -244,11 +348,37 @@ impl QueryFragment for InstanceAndVmmUpdate { out.push_identifier(instance_dsl::id::NAME)?; out.push_sql(" = instance_updated."); out.push_identifier(instance_dsl::id::NAME)?; - out.push_sql(") "); + out.push_sql(")"); + + if self.migration.is_some() { + out.push_sql(", "); + out.push_sql("migration_result AS ("); + out.push_sql("SELECT migration_found."); + out.push_identifier(migration_dsl::id::NAME)?; + out.push_sql(" AS found, migration_updated."); + out.push_identifier(migration_dsl::id::NAME)?; + out.push_sql(" AS updated"); + out.push_sql( + " FROM migration_found LEFT JOIN migration_updated ON migration_found.", + ); + out.push_identifier(migration_dsl::id::NAME)?; + out.push_sql(" = migration_updated."); + out.push_identifier(migration_dsl::id::NAME)?; + out.push_sql(")"); + } + out.push_sql(" "); out.push_sql("SELECT vmm_result.found, vmm_result.updated, "); - out.push_sql("instance_result.found, instance_result.updated "); - out.push_sql("FROM vmm_result, instance_result;"); + out.push_sql("instance_result.found, instance_result.updated, "); + if self.migration.is_some() { + out.push_sql("migration_result.found, migration_result.updated "); + } else { + out.push_sql("NULL, NULL "); + } + out.push_sql("FROM vmm_result, instance_result"); + if self.migration.is_some() { + out.push_sql(", migration_result"); + } Ok(()) } diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 27f62036b1..943665cab3 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -25,6 +25,7 @@ use nexus_db_queries::authn; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; +use 
nexus_db_queries::db::datastore::instance::InstanceUpdateResult; use nexus_db_queries::db::datastore::InstanceAndActiveVmm; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup; @@ -563,25 +564,27 @@ impl super::Nexus { // outright fails, this operation fails. If the operation nominally // succeeds but nothing was updated, this action is outdated and the // caller should not proceed with migration. - let (updated, _) = match instance_put_result { - Ok(state) => { - self.write_returned_instance_state(&instance_id, state).await? - } - Err(e) => { - if e.instance_unhealthy() { - let _ = self - .mark_instance_failed( - &instance_id, - &prev_instance_runtime, - &e, - ) - .await; + let InstanceUpdateResult { instance_updated, .. } = + match instance_put_result { + Ok(state) => { + self.write_returned_instance_state(&instance_id, state) + .await? } - return Err(e.into()); - } - }; + Err(e) => { + if e.instance_unhealthy() { + let _ = self + .mark_instance_failed( + &instance_id, + &prev_instance_runtime, + &e, + ) + .await; + } + return Err(e.into()); + } + }; - if updated { + if instance_updated { Ok(self .db_datastore .instance_refetch(opctx, &authz_instance) @@ -1321,7 +1324,7 @@ impl super::Nexus { &self, instance_id: &Uuid, state: Option, - ) -> Result<(bool, bool), Error> { + ) -> Result { slog::debug!(&self.log, "writing instance state returned from sled agent"; "instance_id" => %instance_id, @@ -1335,6 +1338,7 @@ impl super::Nexus { &state.instance_state.into(), &state.propolis_id, &state.vmm_state.into(), + &state.migration_state, ) .await; @@ -1346,7 +1350,13 @@ impl super::Nexus { update_result } else { - Ok((false, false)) + // There was no instance state to write back, so --- perhaps + // obviously --- nothing happened. 
+ Ok(InstanceUpdateResult { + instance_updated: false, + vmm_updated: false, + migration_updated: None, + }) } } @@ -1954,16 +1964,6 @@ impl super::Nexus { } } -/// Records what aspects of an instance's state were actually changed in a -/// [`notify_instance_updated`] call. -/// -/// This is (presently) used for debugging purposes only. -#[derive(Copy, Clone)] -pub(crate) struct InstanceUpdated { - pub instance_updated: bool, - pub vmm_updated: bool, -} - /// Invoked by a sled agent to publish an updated runtime state for an /// Instance. #[allow(clippy::too_many_arguments)] // :( @@ -1976,14 +1976,15 @@ pub(crate) async fn notify_instance_updated( instance_id: &Uuid, new_runtime_state: &nexus::SledInstanceState, v2p_notification_tx: tokio::sync::watch::Sender<()>, -) -> Result, Error> { +) -> Result, Error> { let propolis_id = new_runtime_state.propolis_id; info!(log, "received new runtime state from sled agent"; "instance_id" => %instance_id, "instance_state" => ?new_runtime_state.instance_state, "propolis_id" => %propolis_id, - "vmm_state" => ?new_runtime_state.vmm_state); + "vmm_state" => ?new_runtime_state.vmm_state, + "migration_state" => ?new_runtime_state.migration_state); // Grab the current state of the instance in the DB to reason about // whether this update is stale or not. @@ -2071,9 +2072,44 @@ pub(crate) async fn notify_instance_updated( &db::model::VmmRuntimeState::from( new_runtime_state.vmm_state.clone(), ), + &new_runtime_state.migration_state, ) .await; + // Has a migration terminated? If so, mark the migration record as deleted if + // and only if both sides of the migration are in a terminal state. + if let Some(nexus::MigrationRuntimeState { + migration_id, + state, + role, + ..
+ }) = new_runtime_state.migration_state + { + if state.is_terminal() { + info!( + log, + "migration has terminated, trying to delete it..."; + "instance_id" => %instance_id, + "propolis_id" => %propolis_id, + "migration_id" => %migration_id, + "migration_state" => %state, + "migration_role" => %role, + ); + if !datastore.migration_terminate(opctx, migration_id).await? { + info!( + log, + "did not mark migration record as deleted (the other half \ + may not yet have reported termination)"; + "instance_id" => %instance_id, + "propolis_id" => %propolis_id, + "migration_id" => %migration_id, + "migration_state" => %state, + "migration_role" => %role, + ); + } + } + } + // If the VMM is now in a terminal state, make sure its resources get // cleaned up. // @@ -2112,13 +2148,14 @@ } match result { - Ok((instance_updated, vmm_updated)) => { + Ok(result) => { info!(log, "instance and vmm updated by sled agent"; "instance_id" => %instance_id, "propolis_id" => %propolis_id, - "instance_updated" => instance_updated, - "vmm_updated" => vmm_updated); - Ok(Some(InstanceUpdated { instance_updated, vmm_updated })) + "instance_updated" => result.instance_updated, + "vmm_updated" => result.vmm_updated, + "migration_updated" => ?result.migration_updated); + Ok(Some(result)) } // The update command should swallow object-not-found errors and diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 1434064666..cbcad41a4f 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -65,6 +65,12 @@ declare_saga_actions! { - sim_destroy_vmm_record } + CREATE_MIGRATION_RECORD -> "migration_record" { + + sim_create_migration_record + - sim_delete_migration_record + } + + // This step the instance's migration ID and destination Propolis ID + // fields.
Because the instance is active, its current sled agent maintains // its most recent runtime state, so to update it, the saga calls into the @@ -128,6 +134,7 @@ impl NexusSaga for SagaInstanceMigrate { builder.append(reserve_resources_action()); builder.append(allocate_propolis_ip_action()); builder.append(create_vmm_record_action()); + builder.append(create_migration_record_action()); builder.append(set_migration_ids_action()); builder.append(ensure_destination_propolis_action()); builder.append(instance_migrate_action()); @@ -189,6 +196,57 @@ async fn sim_allocate_propolis_ip( .await } +async fn sim_create_migration_record( + sagactx: NexusActionContext, +) -> Result { + let params = sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let source_propolis_id = params.src_vmm.id; + let migration_id = sagactx.lookup::("migrate_id")?; + let target_propolis_id = sagactx.lookup::("dst_propolis_id")?; + + info!(osagactx.log(), "creating migration record"; + "migration_id" => %migration_id, + "source_propolis_id" => %source_propolis_id, + "target_propolis_id" => %target_propolis_id); + + osagactx + .datastore() + .migration_insert( + &opctx, + db::model::Migration::new( + migration_id, + source_propolis_id, + target_propolis_id, + ), + ) + .await + .map_err(ActionError::action_failed) +} + +async fn sim_delete_migration_record( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx: &std::sync::Arc = + sagactx.user_data(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + let migration_id = sagactx.lookup::("migrate_id")?; + + info!(osagactx.log(), "deleting migration record"; + "migration_id" => %migration_id); + osagactx.datastore().migration_mark_deleted(&opctx, migration_id).await?; + Ok(()) +} + async fn sim_create_vmm_record( sagactx: 
NexusActionContext, ) -> Result { diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 565e2fbafb..948f8a18f3 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -673,6 +673,36 @@ async fn test_instance_start_creates_networking_state( #[nexus_test] async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { + use nexus_db_model::Migration; + use omicron_common::api::internal::nexus::MigrationState; + async fn migration_fetch( + cptestctx: &ControlPlaneTestContext, + migration_id: Uuid, + ) -> Migration { + use async_bb8_diesel::AsyncRunQueryDsl; + use diesel::prelude::*; + use nexus_db_queries::db::schema::migration::dsl; + + let datastore = + cptestctx.server.server_context().nexus.datastore().clone(); + let db_state = dsl::migration + // N.B. that for the purposes of this test, we explicitly should + // *not* filter out migrations that are marked as deleted, as the + // migration record is marked as deleted once the migration completes. 
+ .filter(dsl::id.eq(migration_id)) + .select(Migration::as_select()) + .get_results_async::( + &*datastore.pool_connection_for_tests().await.unwrap(), + ) + .await + .unwrap(); + + info!(&cptestctx.logctx.log, "refetched migration info from db"; + "migration" => ?db_state); + + db_state.into_iter().next().unwrap() + } + let client = &cptestctx.external_client; let apictx = &cptestctx.server.server_context(); let nexus = &apictx.nexus; @@ -729,7 +759,7 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { let migrate_url = format!("/v1/instances/{}/migrate", &instance_id.to_string()); - let _ = NexusRequest::new( + let instance = NexusRequest::new( RequestBuilder::new(client, Method::POST, &migrate_url) .body(Some(¶ms::InstanceMigrate { dst_sled_id })) .expect_status(Some(StatusCode::OK)), @@ -749,12 +779,40 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { assert_eq!(current_sled, original_sled); + // Ensure that both sled agents report that the migration is in progress. + let migration_id = { + let datastore = apictx.nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + let (.., authz_instance) = LookupPath::new(&opctx, &datastore) + .instance_id(instance.identity.id) + .lookup_for(nexus_db_queries::authz::Action::Read) + .await + .unwrap(); + datastore + .instance_refetch(&opctx, &authz_instance) + .await + .unwrap() + .runtime_state + .migration_id + .expect("since we've started a migration, the instance record must have a migration id!") + }; + let migration = dbg!(migration_fetch(cptestctx, migration_id).await); + assert_eq!(migration.target_state, MigrationState::Pending.into()); + assert_eq!(migration.source_state, MigrationState::Pending.into()); + // Explicitly simulate the migration action on the target. Simulated // migrations always succeed. 
The state transition on the target is // sufficient to move the instance back into a Running state (strictly // speaking no further updates from the source are required if the target // successfully takes over). instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; + // Ensure that both sled agents report that the migration has completed. + instance_simulate_on_sled(cptestctx, nexus, original_sled, instance_id) + .await; + let instance = instance_get(&client, &instance_url).await; assert_eq!(instance.runtime.run_state, InstanceState::Running); @@ -765,6 +823,11 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { .expect("migrated instance should still have a sled"); assert_eq!(current_sled, dst_sled_id); + + let migration = dbg!(migration_fetch(cptestctx, migration_id).await); + assert_eq!(migration.target_state, MigrationState::Completed.into()); + assert_eq!(migration.source_state, MigrationState::Completed.into()); + assert!(migration.time_deleted.is_some()); } #[nexus_test] diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 17330b0974..b27dc6ce00 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -3428,6 +3428,88 @@ "minLength": 5, "maxLength": 17 }, + "MigrationRole": { + "oneOf": [ + { + "description": "This update concerns the source VMM of a migration.", + "type": "string", + "enum": [ + "source" + ] + }, + { + "description": "This update concerns the target VMM of a migration.", + "type": "string", + "enum": [ + "target" + ] + } + ] + }, + "MigrationRuntimeState": { + "description": "An update from a sled regarding the state of a migration, indicating the role of the VMM whose migration state was updated.", + "type": "object", + "properties": { + "gen": { + "$ref": "#/components/schemas/Generation" + }, + "migration_id": { + "type": "string", + "format": "uuid" + }, + "role": { + "$ref": "#/components/schemas/MigrationRole" + }, + "state": { + "$ref": 
"#/components/schemas/MigrationState" + }, + "time_updated": { + "description": "Timestamp for the migration state update.", + "type": "string", + "format": "date-time" + } + }, + "required": [ + "gen", + "migration_id", + "role", + "state", + "time_updated" + ] + }, + "MigrationState": { + "description": "The state of an instance's live migration.", + "oneOf": [ + { + "description": "The migration has not started for this VMM.", + "type": "string", + "enum": [ + "pending" + ] + }, + { + "description": "The migration is in progress.", + "type": "string", + "enum": [ + "in_progress" + ] + }, + { + "description": "The migration has failed.", + "type": "string", + "enum": [ + "failed" + ] + }, + { + "description": "The migration has completed.", + "type": "string", + "enum": [ + "completed" + ] + } + ] + }, "Name": { "title": "A name unique within the parent collection", "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. 
Names cannot be a UUID though they may contain a UUID.", @@ -4572,6 +4654,15 @@ } ] }, + "migration_state": { + "nullable": true, + "description": "The current state of any in-progress migration for this instance, as understood by this sled.", + "allOf": [ + { + "$ref": "#/components/schemas/MigrationRuntimeState" + } + ] + }, "propolis_id": { "description": "The ID of the VMM whose state is being reported.", "type": "string", diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 79e3bac727..25e8d1c5da 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -3406,6 +3406,88 @@ "minLength": 5, "maxLength": 17 }, + "MigrationRole": { + "oneOf": [ + { + "description": "This update concerns the source VMM of a migration.", + "type": "string", + "enum": [ + "source" + ] + }, + { + "description": "This update concerns the target VMM of a migration.", + "type": "string", + "enum": [ + "target" + ] + } + ] + }, + "MigrationRuntimeState": { + "description": "An update from a sled regarding the state of a migration, indicating the role of the VMM whose migration state was updated.", + "type": "object", + "properties": { + "gen": { + "$ref": "#/components/schemas/Generation" + }, + "migration_id": { + "type": "string", + "format": "uuid" + }, + "role": { + "$ref": "#/components/schemas/MigrationRole" + }, + "state": { + "$ref": "#/components/schemas/MigrationState" + }, + "time_updated": { + "description": "Timestamp for the migration state update.", + "type": "string", + "format": "date-time" + } + }, + "required": [ + "gen", + "migration_id", + "role", + "state", + "time_updated" + ] + }, + "MigrationState": { + "description": "The state of an instance's live migration.", + "oneOf": [ + { + "description": "The migration has not started for this VMM.", + "type": "string", + "enum": [ + "pending" + ] + }, + { + "description": "The migration is in progress.", + "type": "string", + "enum": [ + "in_progress" + ] + }, + { + "description": "The 
migration has failed.", + "type": "string", + "enum": [ + "failed" + ] + }, + { + "description": "The migration has completed.", + "type": "string", + "enum": [ + "completed" + ] + } + ] + }, "Name": { "title": "A name unique within the parent collection", "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. Names cannot be a UUID though they may contain a UUID.", @@ -4187,6 +4269,15 @@ } ] }, + "migration_state": { + "nullable": true, + "description": "The current state of any in-progress migration for this instance, as understood by this sled.", + "allOf": [ + { + "$ref": "#/components/schemas/MigrationRuntimeState" + } + ] + }, "propolis_id": { "description": "The ID of the VMM whose state is being reported.", "type": "string", diff --git a/schema/crdb/add-migration-table/up01.sql b/schema/crdb/add-migration-table/up01.sql new file mode 100644 index 0000000000..7659d7dca3 --- /dev/null +++ b/schema/crdb/add-migration-table/up01.sql @@ -0,0 +1,6 @@ +CREATE TYPE IF NOT EXISTS omicron.public.migration_state AS ENUM ( + 'pending', + 'in_progress', + 'failed', + 'completed' +); diff --git a/schema/crdb/add-migration-table/up02.sql b/schema/crdb/add-migration-table/up02.sql new file mode 100644 index 0000000000..9e0654c1bc --- /dev/null +++ b/schema/crdb/add-migration-table/up02.sql @@ -0,0 +1,13 @@ +CREATE TABLE IF NOT EXISTS omicron.public.migration ( + id UUID PRIMARY KEY, + time_created TIMESTAMPTZ NOT NULL, + time_deleted TIMESTAMPTZ, + source_state omicron.public.migration_state NOT NULL, + source_propolis_id UUID NOT NULL, + source_gen INT8 NOT NULL DEFAULT 1, + time_source_updated TIMESTAMPTZ, + target_state omicron.public.migration_state NOT NULL, + target_propolis_id UUID NOT NULL, + target_gen INT8 NOT NULL DEFAULT 1, + time_target_updated TIMESTAMPTZ +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 
ddbe0dcd44..8bc7e190d8 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4065,6 +4065,55 @@ VALUES ( ON CONFLICT (id) DO NOTHING; +CREATE TYPE IF NOT EXISTS omicron.public.migration_state AS ENUM ( + 'pending', + 'in_progress', + 'failed', + 'completed' +); + +-- A table of the states of current migrations. +CREATE TABLE IF NOT EXISTS omicron.public.migration ( + id UUID PRIMARY KEY, + + /* The time this migration record was created. */ + time_created TIMESTAMPTZ NOT NULL, + + /* The time this migration record was deleted. */ + time_deleted TIMESTAMPTZ, + + /* The state of the migration source */ + source_state omicron.public.migration_state NOT NULL, + + /* The ID of the migration source Propolis */ + source_propolis_id UUID NOT NULL, + + /* Generation number owned and incremented by the source sled-agent */ + source_gen INT8 NOT NULL DEFAULT 1, + + /* Timestamp of when the source field was last updated. + * + * This is provided by the sled-agent when publishing a migration state + * update. + */ + time_source_updated TIMESTAMPTZ, + + /* The state of the migration target */ + target_state omicron.public.migration_state NOT NULL, + + /* The ID of the migration target Propolis */ + target_propolis_id UUID NOT NULL, + + /* Generation number owned and incremented by the target sled-agent */ + target_gen INT8 NOT NULL DEFAULT 1, + + /* Timestamp of when the target field was last updated. + * + * This is provided by the sled-agent when publishing a migration state + * update.
+ */ + time_target_updated TIMESTAMPTZ +); /* * Keep this at the end of file so that the database does not contain a version @@ -4077,7 +4126,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '73.0.0', NULL) + (TRUE, NOW(), NOW(), '74.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 62af337c4c..b2c135fcf8 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -6,12 +6,14 @@ use crate::params::InstanceMigrationSourceParams; use chrono::{DateTime, Utc}; +use omicron_common::api::external::Generation; use omicron_common::api::internal::nexus::{ - InstanceRuntimeState, SledInstanceState, VmmRuntimeState, VmmState, + InstanceRuntimeState, MigrationRole, MigrationRuntimeState, MigrationState, + SledInstanceState, VmmRuntimeState, VmmState, }; use propolis_client::types::{ InstanceState as PropolisApiState, InstanceStateMonitorResponse, - MigrationState, + MigrationState as PropolisMigrationState, }; use uuid::Uuid; @@ -21,6 +23,7 @@ pub struct InstanceStates { instance: InstanceRuntimeState, vmm: VmmRuntimeState, propolis_id: Uuid, + migration: Option, } /// Newtype to allow conversion from Propolis API states (returned by the @@ -123,10 +126,10 @@ impl ObservedPropolisState { if this_id == propolis_migration.migration_id => { match propolis_migration.state { - MigrationState::Finish => { + PropolisMigrationState::Finish => { ObservedMigrationStatus::Succeeded } - MigrationState::Error => { + PropolisMigrationState::Error => { ObservedMigrationStatus::Failed } _ => ObservedMigrationStatus::InProgress, @@ -208,7 +211,22 @@ impl InstanceStates { vmm: VmmRuntimeState, propolis_id: Uuid, ) -> Self { - InstanceStates { instance, vmm, propolis_id } + let migration = instance.migration_id.map(|migration_id| { + let dst_propolis_id = instance.dst_propolis_id.expect("if an instance has a migration ID, it should 
also have a target VMM ID"); + let role = if dst_propolis_id == propolis_id { + MigrationRole::Target + } else { + MigrationRole::Source + }; + MigrationRuntimeState { + migration_id, + state: MigrationState::InProgress, + role, + gen: Generation::new(), + time_updated: Utc::now(), + } + }); + InstanceStates { instance, vmm, propolis_id, migration } } pub fn instance(&self) -> &InstanceRuntimeState { @@ -223,6 +241,10 @@ impl InstanceStates { self.propolis_id } + pub(crate) fn migration(&self) -> Option<&MigrationRuntimeState> { + self.migration.as_ref() + } + /// Creates a `SledInstanceState` structure containing the entirety of this /// structure's runtime state. This requires cloning; for simple read access /// use the `instance` or `vmm` accessors instead. @@ -231,6 +253,25 @@ impl InstanceStates { instance_state: self.instance.clone(), vmm_state: self.vmm.clone(), propolis_id: self.propolis_id, + migration_state: self.migration.clone(), + } + } + + fn transition_migration( + &mut self, + state: MigrationState, + time_updated: DateTime, + ) { + let migration = self.migration.as_mut().expect( + "an ObservedMigrationState should only be constructed when the \ + VMM has an active migration", + ); + // Don't generate spurious state updates if the migration is already in + // the state we're transitioning to. + if migration.state != state { + migration.state = state; + migration.time_updated = time_updated; + migration.gen = migration.gen.next(); } } @@ -256,58 +297,76 @@ impl InstanceStates { // Update the instance record to reflect the result of any completed // migration. match observed.migration_status { - ObservedMigrationStatus::Succeeded => match self.propolis_role() { - // This is a successful migration out. Point the instance to the - // target VMM, but don't clear migration IDs; let the target do - // that so that the instance will continue to appear to be - // migrating until it is safe to migrate again. 
- PropolisRole::Active => { - self.switch_propolis_id_to_target(observed.time); - - assert_eq!(self.propolis_role(), PropolisRole::Retired); - } + ObservedMigrationStatus::Succeeded => { + self.transition_migration( + MigrationState::Completed, + observed.time, + ); + match self.propolis_role() { + // This is a successful migration out. Point the instance to the + // target VMM, but don't clear migration IDs; let the target do + // that so that the instance will continue to appear to be + // migrating until it is safe to migrate again. + PropolisRole::Active => { + self.switch_propolis_id_to_target(observed.time); + + assert_eq!(self.propolis_role(), PropolisRole::Retired); + } - // This is a successful migration in. Point the instance to the - // target VMM and clear migration IDs so that another migration - // in can begin. Propolis will continue reporting that this - // migration was successful, but because its ID has been - // discarded the observed migration status will change from - // Succeeded to NoMigration. - // - // Note that these calls increment the instance's generation - // number twice. This is by design and allows the target's - // migration-ID-clearing update to overtake the source's update. - PropolisRole::MigrationTarget => { - self.switch_propolis_id_to_target(observed.time); - self.clear_migration_ids(observed.time); - - assert_eq!(self.propolis_role(), PropolisRole::Active); - } + // This is a successful migration in. Point the instance to the + // target VMM and clear migration IDs so that another migration + // in can begin. Propolis will continue reporting that this + // migration was successful, but because its ID has been + // discarded the observed migration status will change from + // Succeeded to NoMigration. + // + // Note that these calls increment the instance's generation + // number twice. This is by design and allows the target's + // migration-ID-clearing update to overtake the source's update. 
+ PropolisRole::MigrationTarget => { + self.switch_propolis_id_to_target(observed.time); + self.clear_migration_ids(observed.time); + + assert_eq!(self.propolis_role(), PropolisRole::Active); + } - // This is a migration source that previously reported success - // and removed itself from the active Propolis position. Don't - // touch the instance. - PropolisRole::Retired => {} - }, - ObservedMigrationStatus::Failed => match self.propolis_role() { - // This is a failed migration out. CLear migration IDs so that - // Nexus can try again. - PropolisRole::Active => { - self.clear_migration_ids(observed.time); + // This is a migration source that previously reported success + // and removed itself from the active Propolis position. Don't + // touch the instance. + PropolisRole::Retired => {} } + } + ObservedMigrationStatus::Failed => { + self.transition_migration( + MigrationState::Failed, + observed.time, + ); - // This is a failed migration in. Leave the migration IDs alone - // so that the migration won't appear to have concluded until - // the source is ready to start a new one. - PropolisRole::MigrationTarget => {} + match self.propolis_role() { + // This is a failed migration out. Clear migration IDs so that + // Nexus can try again. + PropolisRole::Active => { + self.clear_migration_ids(observed.time); + } - // This VMM was part of a failed migration and was subsequently - // removed from the instance record entirely. There's nothing to - // update. - PropolisRole::Retired => {} - }, + // This is a failed migration in. Leave the migration IDs alone + // so that the migration won't appear to have concluded until + // the source is ready to start a new one. + PropolisRole::MigrationTarget => {} + + // This VMM was part of a failed migration and was subsequently + // removed from the instance record entirely. There's nothing to + // update. 
+ PropolisRole::Retired => {} + } + } + ObservedMigrationStatus::InProgress => { + self.transition_migration( + MigrationState::InProgress, + observed.time, + ); + } ObservedMigrationStatus::NoMigration - | ObservedMigrationStatus::InProgress | ObservedMigrationStatus::Pending => {} } @@ -327,6 +386,16 @@ impl InstanceStates { self.clear_migration_ids(observed.time); self.retire_active_propolis(observed.time); } + // If there's an active migration and the VMM is suddenly gone, + // that should constitute a migration failure! + if let Some(MigrationState::Pending | MigrationState::InProgress) = + self.migration.as_ref().map(|m| m.state) + { + self.transition_migration( + MigrationState::Failed, + observed.time, + ); + } Some(Action::Destroy) } else { None @@ -431,12 +500,29 @@ impl InstanceStates { ids: &Option, now: DateTime, ) { - if let Some(ids) = ids { - self.instance.migration_id = Some(ids.migration_id); - self.instance.dst_propolis_id = Some(ids.dst_propolis_id); + if let Some(InstanceMigrationSourceParams { + migration_id, + dst_propolis_id, + }) = *ids + { + self.instance.migration_id = Some(migration_id); + self.instance.dst_propolis_id = Some(dst_propolis_id); + let role = if dst_propolis_id == self.propolis_id { + MigrationRole::Target + } else { + MigrationRole::Source + }; + self.migration = Some(MigrationRuntimeState { + migration_id, + state: MigrationState::Pending, + role, + gen: Generation::new(), + time_updated: now, + }) } else { self.instance.migration_id = None; self.instance.dst_propolis_id = None; + self.migration = None; } self.instance.gen = self.instance.gen.next(); @@ -538,17 +624,37 @@ mod test { fn make_migration_source_instance() -> InstanceStates { let mut state = make_instance(); state.vmm.state = VmmState::Migrating; - state.instance.migration_id = Some(Uuid::new_v4()); + let migration_id = Uuid::new_v4(); + state.instance.migration_id = Some(migration_id); state.instance.dst_propolis_id = Some(Uuid::new_v4()); + state.migration = 
Some(MigrationRuntimeState { + migration_id, + state: MigrationState::InProgress, + role: MigrationRole::Source, + // advance the generation once, since we are starting out in the + // `InProgress` state. + gen: Generation::new().next(), + time_updated: Utc::now(), + }); state } fn make_migration_target_instance() -> InstanceStates { let mut state = make_instance(); state.vmm.state = VmmState::Migrating; - state.instance.migration_id = Some(Uuid::new_v4()); + let migration_id = Uuid::new_v4(); + state.instance.migration_id = Some(migration_id); state.propolis_id = Uuid::new_v4(); state.instance.dst_propolis_id = Some(state.propolis_id); + state.migration = Some(MigrationRuntimeState { + migration_id, + state: MigrationState::InProgress, + role: MigrationRole::Target, + // advance the generation once, since we are starting out in the + // `InProgress` state. + gen: Generation::new().next(), + time_updated: Utc::now(), + }); state } @@ -623,6 +729,37 @@ mod test { } } + fn test_termination_fails_in_progress_migration( + mk_instance: impl Fn() -> InstanceStates, + ) { + for state in [Observed::Destroyed, Observed::Failed] { + let mut instance_state = mk_instance(); + let original_migration = instance_state.clone().migration.unwrap(); + let requested_action = instance_state + .apply_propolis_observation(&make_observed_state(state.into())); + + let migration = + instance_state.migration.expect("state must have a migration"); + assert_eq!(migration.state, MigrationState::Failed); + assert!(migration.gen > original_migration.gen); + assert!(matches!(requested_action, Some(Action::Destroy))); + } + } + + #[test] + fn source_termination_fails_in_progress_migration() { + test_termination_fails_in_progress_migration( + make_migration_source_instance, + ) + } + + #[test] + fn target_termination_fails_in_progress_migration() { + test_termination_fails_in_progress_migration( + make_migration_target_instance, + ) + } + #[test] fn 
destruction_after_migration_out_does_not_transition() { let mut state = make_migration_source_instance(); @@ -651,6 +788,17 @@ mod test { assert_eq!(state.instance.propolis_id, state.instance.dst_propolis_id); assert!(state.instance.migration_id.is_some()); + // The migration state should transition to "completed" + let migration = state + .migration + .clone() + .expect("instance must have a migration state"); + let prev_migration = + prev.migration.expect("previous state must have a migration"); + assert_eq!(migration.state, MigrationState::Completed); + assert!(migration.gen > prev_migration.gen); + let prev_migration = migration; + // Once a successful migration is observed, the VMM's state should // continue to update, but the instance's state shouldn't change // anymore. @@ -666,6 +814,15 @@ mod test { assert_eq!(state.vmm.state, VmmState::Stopping); assert!(state.vmm.gen > prev.vmm.gen); + // Now that the migration has completed, it should not transition again. + let migration = state + .migration + .clone() + .expect("instance must have a migration state"); + assert_eq!(migration.state, MigrationState::Completed); + assert_eq!(migration.gen, prev_migration.gen); + let prev_migration = migration; + let prev = state.clone(); observed.vmm_state = PropolisInstanceState(Observed::Destroyed); assert!(matches!( @@ -676,6 +833,13 @@ mod test { assert_eq!(state.instance.gen, prev.instance.gen); assert_eq!(state.vmm.state, VmmState::Destroyed); assert!(state.vmm.gen > prev.vmm.gen); + + let migration = state + .migration + .clone() + .expect("instance must have a migration state"); + assert_eq!(migration.state, MigrationState::Completed); + assert_eq!(migration.gen, prev_migration.gen); } #[test] @@ -699,6 +863,14 @@ mod test { assert_eq!(state.instance.gen, prev.instance.gen); assert_eq!(state.vmm.state, VmmState::Failed); assert!(state.vmm.gen > prev.vmm.gen); + + // The migration state should transition. 
+ let migration = + state.migration.expect("instance must have a migration state"); + let prev_migration = + prev.migration.expect("previous state must have a migration"); + assert_eq!(migration.state, MigrationState::Failed); + assert!(migration.gen > prev_migration.gen); } // Verifies that the rude-termination state change doesn't update the @@ -717,6 +889,14 @@ mod test { assert_state_change_has_gen_change(&prev, &state); assert_eq!(state.instance.gen, prev.instance.gen); + + // The migration state should transition. + let migration = + state.migration.expect("instance must have a migration state"); + let prev_migration = + prev.migration.expect("previous state must have a migration"); + assert_eq!(migration.state, MigrationState::Failed); + assert!(migration.gen > prev_migration.gen); } #[test] @@ -739,11 +919,22 @@ mod test { assert_eq!(state.vmm.state, VmmState::Running); assert!(state.vmm.gen > prev.vmm.gen); + // The migration state should transition to completed. + let migration = state + .migration + .clone() + .expect("instance must have a migration state"); + let prev_migration = + prev.migration.expect("previous state must have a migration"); + assert_eq!(migration.state, MigrationState::Completed); + assert!(migration.gen > prev_migration.gen); + // Pretend Nexus set some new migration IDs. + let migration_id = Uuid::new_v4(); let prev = state.clone(); state.set_migration_ids( &Some(InstanceMigrationSourceParams { - migration_id: Uuid::new_v4(), + migration_id, dst_propolis_id: Uuid::new_v4(), }), Utc::now(), @@ -752,6 +943,15 @@ mod test { assert!(state.instance.gen > prev.instance.gen); assert_eq!(state.vmm.gen, prev.vmm.gen); + // There should be a new, pending migration state. 
+ let migration = state + .migration + .clone() + .expect("instance must have a migration state"); + assert_eq!(migration.state, MigrationState::Pending); + assert_eq!(migration.migration_id, migration_id); + let prev_migration = migration; + // Mark that the new migration out is in progress. This doesn't change // anything in the instance runtime state, but does update the VMM state // generation. @@ -772,6 +972,15 @@ mod test { assert!(state.vmm.gen > prev.vmm.gen); assert_eq!(state.instance.gen, prev.instance.gen); + // The migration state should transition to in progress. + let migration = state + .migration + .clone() + .expect("instance must have a migration state"); + assert_eq!(migration.state, MigrationState::InProgress); + assert!(migration.gen > prev_migration.gen); + let prev_migration = migration; + // Propolis will publish that the migration succeeds before changing any // state. This should transfer control to the target but should not // touch the migration ID (that is the new target's job). @@ -790,6 +999,14 @@ mod test { assert_eq!(state.instance.propolis_id, state.instance.dst_propolis_id); assert!(state.instance.gen > prev.instance.gen); + // The migration state should transition to completed. + let migration = state + .migration + .clone() + .expect("instance must have a migration state"); + assert_eq!(migration.state, MigrationState::Completed); + assert!(migration.gen > prev_migration.gen); + // The rest of the destruction sequence is covered by other tests. 
} diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index f5be31bd37..12e17cc3de 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -451,6 +451,7 @@ mod test { instance_state: instance_vmm, vmm_state, propolis_id, + migration_state: None, }; SimObject::new_simulated_auto(&state, logctx.log.new(o!())) diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index ed88dbcc6f..2ac8618399 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -16,7 +16,7 @@ use omicron_common::api::external::Error; use omicron_common::api::external::Generation; use omicron_common::api::external::ResourceType; use omicron_common::api::internal::nexus::{ - InstanceRuntimeState, SledInstanceState, VmmState, + InstanceRuntimeState, MigrationRole, SledInstanceState, VmmState, }; use propolis_client::types::{ InstanceMigrateStatusResponse as PropolisMigrateStatus, @@ -78,6 +78,46 @@ impl SimInstanceInner { self.queue.push_back(MonitorChange::MigrateStatus(migrate_status)) } + /// Queue a successful simulated migration. + /// + fn queue_successful_migration(&mut self, role: MigrationRole) { + // Propolis transitions to the Migrating state once before + // actually starting migration. + self.queue_propolis_state(PropolisInstanceState::Migrating); + let migration_id = + self.state.instance().migration_id.unwrap_or_else(|| { + panic!( + "should have migration ID set before getting request to + migrate in (current state: {:?})", + self + ) + }); + self.queue_migration_status(PropolisMigrateStatus { + migration_id, + state: propolis_client::types::MigrationState::Sync, + }); + self.queue_migration_status(PropolisMigrateStatus { + migration_id, + state: propolis_client::types::MigrationState::Finish, + }); + + // The state we transition to after the migration completes will depend + // on whether we are the source or destination. 
+ match role { + MigrationRole::Target => { + self.queue_propolis_state(PropolisInstanceState::Running) + } + MigrationRole::Source => self.queue_graceful_stop(), + } + } + + fn queue_graceful_stop(&mut self) { + self.state.transition_vmm(PublishedVmmState::Stopping, Utc::now()); + self.queue_propolis_state(PropolisInstanceState::Stopping); + self.queue_propolis_state(PropolisInstanceState::Stopped); + self.queue_propolis_state(PropolisInstanceState::Destroyed); + } + /// Searches the queue for its last Propolis state change transition. If /// one exists, returns the associated Propolis state. fn last_queued_instance_state(&self) -> Option { @@ -118,26 +158,7 @@ impl SimInstanceInner { ))); } - // Propolis transitions to the Migrating state once before - // actually starting migration. - self.queue_propolis_state(PropolisInstanceState::Migrating); - let migration_id = - self.state.instance().migration_id.unwrap_or_else(|| { - panic!( - "should have migration ID set before getting request to - migrate in (current state: {:?})", - self - ) - }); - self.queue_migration_status(PropolisMigrateStatus { - migration_id, - state: propolis_client::types::MigrationState::Sync, - }); - self.queue_migration_status(PropolisMigrateStatus { - migration_id, - state: propolis_client::types::MigrationState::Finish, - }); - self.queue_propolis_state(PropolisInstanceState::Running); + self.queue_successful_migration(MigrationRole::Target) } InstanceStateRequested::Running => { match self.next_resting_state() { @@ -171,21 +192,7 @@ impl SimInstanceInner { VmmState::Starting => { self.state.terminate_rudely(); } - VmmState::Running => { - self.state.transition_vmm( - PublishedVmmState::Stopping, - Utc::now(), - ); - self.queue_propolis_state( - PropolisInstanceState::Stopping, - ); - self.queue_propolis_state( - PropolisInstanceState::Stopped, - ); - self.queue_propolis_state( - PropolisInstanceState::Destroyed, - ); - } + VmmState::Running => self.queue_graceful_stop(), // Idempotently 
allow requests to stop an instance that is // already stopping. VmmState::Stopping @@ -360,6 +367,24 @@ impl SimInstanceInner { } self.state.set_migration_ids(ids, Utc::now()); + + // If we set migration IDs and are the migration source, ensure that we + // will perform the correct state transitions to simulate a successful + // migration. + if ids.is_some() { + let role = self + .state + .migration() + .expect( + "we just got a `put_migration_ids` request with `Some` IDs, \ + so we should have a migration" + ) + .role; + if role == MigrationRole::Source { + self.queue_successful_migration(MigrationRole::Source) + } + } + Ok(self.state.sled_instance_state()) } } diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index e2a52bf983..7f07dc199a 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -341,6 +341,7 @@ impl SledAgent { instance_state: instance_runtime, vmm_state: vmm_runtime, propolis_id, + migration_state: None, }, None, )