From 486103b0037c506ed402d190676570282301bd62 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 31 May 2024 11:08:32 -0700 Subject: [PATCH] [nexus] add `InstanceState::SagaUnwound` This branch is part of ongoing work on the `instance-update` saga (see PR #5749); I've factored it out into a separate PR. This is largely because this branch makes mechanical changes to a bunch of different files that aren't directly related to the core change of #5749, and I'd like to avoid the additional merge conflicts that are likely when these changes remain un-merged for a long time. --- nexus/db-model/src/instance_state.rs | 113 +++++++-- nexus/db-model/src/sled_instance.rs | 2 +- nexus/db-queries/src/db/datastore/disk.rs | 10 +- nexus/db-queries/src/db/datastore/instance.rs | 62 +++-- .../src/db/datastore/network_interface.rs | 2 - nexus/db-queries/src/db/datastore/vmm.rs | 12 +- .../db-queries/src/db/queries/external_ip.rs | 23 +- .../src/db/queries/network_interface.rs | 10 +- nexus/src/app/instance.rs | 133 +++++------ nexus/src/app/sagas/instance_common.rs | 17 +- nexus/src/app/sagas/instance_migrate.rs | 6 +- nexus/src/app/sagas/instance_start.rs | 21 +- nexus/src/app/sagas/instance_update/mod.rs | 216 ++++++++++++++++++ nexus/src/app/sagas/snapshot_create.rs | 9 +- 14 files changed, 463 insertions(+), 173 deletions(-) create mode 100644 nexus/src/app/sagas/instance_update/mod.rs diff --git a/nexus/db-model/src/instance_state.rs b/nexus/db-model/src/instance_state.rs index dca809758f..416eb1b43b 100644 --- a/nexus/db-model/src/instance_state.rs +++ b/nexus/db-model/src/instance_state.rs @@ -2,21 +2,20 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use super::impl_enum_wrapper; +use super::impl_enum_type; use omicron_common::api::external; use serde::Deserialize; use serde::Serialize; use std::fmt; -use std::io::Write; -impl_enum_wrapper!( +impl_enum_type!( #[derive(SqlType, Debug)] #[diesel(postgres_type(name = "instance_state", schema = "public"))] pub struct InstanceStateEnum; - #[derive(Clone, Debug, PartialEq, AsExpression, FromSqlRow, Serialize, Deserialize)] + #[derive(Copy, Clone, Debug, PartialEq, AsExpression, FromSqlRow, Serialize, Deserialize)] #[diesel(sql_type = InstanceStateEnum)] - pub struct InstanceState(pub external::InstanceState); + pub enum InstanceState; // Enum values Creating => b"creating" @@ -29,45 +28,115 @@ impl_enum_wrapper!( Repairing => b"repairing" Failed => b"failed" Destroyed => b"destroyed" + SagaUnwound => b"saga_unwound" ); impl InstanceState { pub fn new(state: external::InstanceState) -> Self { - Self(state) + Self::from(state) } - pub fn state(&self) -> &external::InstanceState { - &self.0 + pub fn state(&self) -> external::InstanceState { + external::InstanceState::from(*self) + } + + pub fn label(&self) -> &'static str { + match self { + InstanceState::Creating => "creating", + InstanceState::Starting => "starting", + InstanceState::Running => "running", + InstanceState::Stopping => "stopping", + InstanceState::Stopped => "stopped", + InstanceState::Rebooting => "rebooting", + InstanceState::Migrating => "migrating", + InstanceState::Repairing => "repairing", + InstanceState::Failed => "failed", + InstanceState::Destroyed => "destroyed", + InstanceState::SagaUnwound => "saga_unwound", + } } } impl fmt::Display for InstanceState { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", self.0) + write!(f, "{}", self.label()) } } impl From for sled_agent_client::types::InstanceState { fn from(s: InstanceState) -> Self { - use external::InstanceState::*; use sled_agent_client::types::InstanceState as Output; - match s.0 { - Creating => Output::Creating, - Starting => Output::Starting, - Running => Output::Running, - Stopping => Output::Stopping, - Stopped => Output::Stopped, - Rebooting => Output::Rebooting, - Migrating => Output::Migrating, - Repairing => Output::Repairing, - Failed => Output::Failed, - Destroyed => Output::Destroyed, + match s { + InstanceState::Creating => Output::Creating, + InstanceState::Starting => Output::Starting, + InstanceState::Running => Output::Running, + InstanceState::Stopping => Output::Stopping, + InstanceState::Stopped => Output::Stopped, + InstanceState::Rebooting => Output::Rebooting, + InstanceState::Migrating => Output::Migrating, + InstanceState::Repairing => Output::Repairing, + InstanceState::Failed => Output::Failed, + InstanceState::Destroyed | InstanceState::SagaUnwound => { + Output::Destroyed + } } } } impl From for InstanceState { fn from(state: external::InstanceState) -> Self { - Self::new(state) + match state { + external::InstanceState::Creating => InstanceState::Creating, + external::InstanceState::Starting => InstanceState::Starting, + external::InstanceState::Running => InstanceState::Running, + external::InstanceState::Stopping => InstanceState::Stopping, + external::InstanceState::Stopped => InstanceState::Stopped, + external::InstanceState::Rebooting => InstanceState::Rebooting, + external::InstanceState::Migrating => InstanceState::Migrating, + external::InstanceState::Repairing => InstanceState::Repairing, + external::InstanceState::Failed => InstanceState::Failed, + external::InstanceState::Destroyed => InstanceState::Destroyed, + } + } +} + +impl From for external::InstanceState { + fn from(state: InstanceState) -> Self { + match state { + InstanceState::Creating => external::InstanceState::Creating, + InstanceState::Starting => external::InstanceState::Starting, + InstanceState::Running => external::InstanceState::Running, + InstanceState::Stopping => external::InstanceState::Stopping, + InstanceState::Stopped => external::InstanceState::Stopped, + InstanceState::Rebooting => external::InstanceState::Rebooting, + InstanceState::Migrating => external::InstanceState::Migrating, + InstanceState::Repairing => external::InstanceState::Repairing, + InstanceState::Failed => external::InstanceState::Failed, + InstanceState::Destroyed | InstanceState::SagaUnwound => { + external::InstanceState::Destroyed + } + } + } +} + +impl PartialEq for InstanceState { + fn eq(&self, other: &external::InstanceState) -> bool { + match (self, other) { + (InstanceState::Creating, external::InstanceState::Creating) + | (InstanceState::Starting, external::InstanceState::Starting) + | (InstanceState::Running, external::InstanceState::Running) + | (InstanceState::Stopping, external::InstanceState::Stopping) + | (InstanceState::Stopped, external::InstanceState::Stopped) + | (InstanceState::Rebooting, external::InstanceState::Rebooting) + | (InstanceState::Migrating, external::InstanceState::Migrating) + | (InstanceState::Repairing, external::InstanceState::Repairing) + | (InstanceState::Failed, external::InstanceState::Failed) + | (InstanceState::Destroyed, external::InstanceState::Destroyed) + | ( + InstanceState::SagaUnwound, + external::InstanceState::Destroyed, + ) => true, + _ => false, + } } } diff --git a/nexus/db-model/src/sled_instance.rs b/nexus/db-model/src/sled_instance.rs index bbc92ddf18..47e4ee4175 100644 --- a/nexus/db-model/src/sled_instance.rs +++ b/nexus/db-model/src/sled_instance.rs @@ -34,7 +34,7 @@ impl From for views::SledInstance { active_sled_id: sled_instance.active_sled_id, silo_name: sled_instance.silo_name.into(), project_name: sled_instance.project_name.into(), - state: *sled_instance.state.state(), + state: sled_instance.state.state(), migration_id: sled_instance.migration_id, ncpus: sled_instance.ncpus, memory: sled_instance.memory, diff --git a/nexus/db-queries/src/db/datastore/disk.rs b/nexus/db-queries/src/db/datastore/disk.rs index e1d504761c..40b151b527 100644 --- a/nexus/db-queries/src/db/datastore/disk.rs +++ b/nexus/db-queries/src/db/datastore/disk.rs @@ -183,8 +183,8 @@ impl DataStore { // // We currently only permit attaching disks to stopped instances. let ok_to_attach_instance_states = vec![ - db::model::InstanceState(api::external::InstanceState::Creating), - db::model::InstanceState(api::external::InstanceState::Stopped), + db::model::InstanceState::Creating, + db::model::InstanceState::Stopped, ]; let attach_update = DiskSetClauseForAttach::new(authz_instance.id()); @@ -321,9 +321,9 @@ impl DataStore { // // We currently only permit detaching disks from stopped instances. let ok_to_detach_instance_states = vec![ - db::model::InstanceState(api::external::InstanceState::Creating), - db::model::InstanceState(api::external::InstanceState::Stopped), - db::model::InstanceState(api::external::InstanceState::Failed), + db::model::InstanceState::Creating, + db::model::InstanceState::Stopped, + db::model::InstanceState::Failed, ]; let detached_label = api::external::DiskState::Detached.label(); diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 60fd5c9dc3..a402590115 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -21,6 +21,7 @@ use crate::db::lookup::LookupPath; use crate::db::model::Generation; use crate::db::model::Instance; use crate::db::model::InstanceRuntimeState; +use crate::db::model::InstanceState; use crate::db::model::Name; use crate::db::model::Project; use crate::db::model::Sled; @@ -30,13 +31,14 @@ use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateAndQueryResult; use crate::db::update_and_check::UpdateStatus; use async_bb8_diesel::AsyncRunQueryDsl; -use chrono::Utc; +use chrono::{DateTime, Utc}; use diesel::prelude::*; use nexus_db_model::ApplySledFilterExt; use nexus_db_model::Disk; use nexus_db_model::VmmRuntimeState; use nexus_types::deployment::SledFilter; use omicron_common::api; +use omicron_common::api::external; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; @@ -70,13 +72,39 @@ impl InstanceAndActiveVmm { self.vmm.as_ref().map(|v| v.sled_id) } - pub fn effective_state( - &self, - ) -> omicron_common::api::external::InstanceState { - if let Some(vmm) = &self.vmm { - vmm.runtime.state.0 - } else { - self.instance.runtime().nexus_state.0 + pub fn effective_state(&self) -> external::InstanceState { + Self::determine_effective_state(&self.instance, &self.vmm).0 + } + + pub fn determine_effective_state( + instance: &Instance, + vmm: &Option, + ) -> (external::InstanceState, DateTime) { + match vmm.as_ref().map(|v| &v.runtime) { + // If an active VMM is `Stopped`, the instance must be recast as + // `Stopping`, as no start saga can proceed until the active VMM has + // been `Destroyed`. + Some(VmmRuntimeState { + state: InstanceState::Stopped, + time_state_updated, + .. + }) => (external::InstanceState::Stopping, *time_state_updated), + // If the active VMM is `SagaUnwound`, then the instance should be + // treated as `Stopped`. + Some(VmmRuntimeState { + state: InstanceState::SagaUnwound, + time_state_updated, + .. + }) => (external::InstanceState::Stopped, *time_state_updated), + // If the active VMM is `SagaUnwound`, then the instance should be + // treated as `Stopped`. + Some(VmmRuntimeState { state, time_state_updated, .. }) => { + (external::InstanceState::from(*state), *time_state_updated) + } + None => ( + external::InstanceState::Stopped, + instance.runtime_state.time_updated, + ), } } } @@ -89,15 +117,11 @@ impl From<(Instance, Option)> for InstanceAndActiveVmm { impl From for omicron_common::api::external::Instance { fn from(value: InstanceAndActiveVmm) -> Self { - let (run_state, time_run_state_updated) = if let Some(vmm) = value.vmm { - (vmm.runtime.state, vmm.runtime.time_state_updated) - } else { - ( - value.instance.runtime_state.nexus_state.clone(), - value.instance.runtime_state.time_updated, - ) - }; - + let (run_state, time_run_state_updated) = + InstanceAndActiveVmm::determine_effective_state( + &value.instance, + &value.vmm, + ); Self { identity: value.instance.identity(), project_id: value.instance.project_id, @@ -109,7 +133,7 @@ impl From for omicron_common::api::external::Instance { .parse() .expect("found invalid hostname in the database"), runtime: omicron_common::api::external::InstanceRuntimeState { - run_state: *run_state.state(), + run_state, time_run_state_updated, }, } @@ -197,7 +221,7 @@ impl DataStore { bail_unless!( instance.runtime().nexus_state.state() - == &api::external::InstanceState::Creating, + == api::external::InstanceState::Creating, "newly-created Instance has unexpected state: {:?}", instance.runtime().nexus_state ); diff --git a/nexus/db-queries/src/db/datastore/network_interface.rs b/nexus/db-queries/src/db/datastore/network_interface.rs index af3f832e35..d6ac971cc5 100644 --- a/nexus/db-queries/src/db/datastore/network_interface.rs +++ b/nexus/db-queries/src/db/datastore/network_interface.rs @@ -713,7 +713,6 @@ impl DataStore { self.transaction_retry_wrapper("instance_update_network_interface") .transaction(&conn, |conn| { let err = err.clone(); - let stopped = stopped.clone(); let update_target_query = update_target_query.clone(); async move { let instance_runtime = @@ -759,7 +758,6 @@ impl DataStore { self.transaction_retry_wrapper("instance_update_network_interface") .transaction(&conn, |conn| { let err = err.clone(); - let stopped = stopped.clone(); let update_target_query = update_target_query.clone(); async move { let instance_state = diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index b8fb47de26..a7018bf365 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -22,7 +22,6 @@ use diesel::prelude::*; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; -use omicron_common::api::external::InstanceState as ApiInstanceState; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; @@ -55,14 +54,15 @@ impl DataStore { opctx: &OpContext, vmm_id: &Uuid, ) -> UpdateResult { - let valid_states = vec![ - DbInstanceState::new(ApiInstanceState::Destroyed), - DbInstanceState::new(ApiInstanceState::Failed), + const VALID_STATES: &[DbInstanceState] = &[ + DbInstanceState::Destroyed, + DbInstanceState::Failed, + DbInstanceState::SagaUnwound, ]; let updated = diesel::update(dsl::vmm) .filter(dsl::id.eq(*vmm_id)) - .filter(dsl::state.eq_any(valid_states)) + .filter(dsl::state.eq_any(VALID_STATES)) .filter(dsl::time_deleted.is_null()) .set(dsl::time_deleted.eq(Utc::now())) .check_if_exists::(*vmm_id) @@ -190,7 +190,7 @@ impl DataStore { pagparams: &DataPageParams<'_, Uuid>, ) -> ListResultVec { use crate::db::schema::instance::dsl as instance_dsl; - let destroyed = DbInstanceState::new(ApiInstanceState::Destroyed); + let destroyed = DbInstanceState::Destroyed; paginated(dsl::vmm, dsl::id, pagparams) // In order to be considered "abandoned", a VMM must be: // - in the `Destroyed` state diff --git a/nexus/db-queries/src/db/queries/external_ip.rs b/nexus/db-queries/src/db/queries/external_ip.rs index 7d5e254aac..4de3c926be 100644 --- a/nexus/db-queries/src/db/queries/external_ip.rs +++ b/nexus/db-queries/src/db/queries/external_ip.rs @@ -31,20 +31,17 @@ use nexus_db_model::IpAttachState; use nexus_db_model::IpAttachStateEnum; use omicron_common::address::NUM_SOURCE_NAT_PORTS; use omicron_common::api::external; -use omicron_common::api::external::InstanceState as ApiInstanceState; use uuid::Uuid; // Broadly, we want users to be able to attach/detach at will // once an instance is created and functional. pub const SAFE_TO_ATTACH_INSTANCE_STATES_CREATING: [DbInstanceState; 3] = [ - DbInstanceState(ApiInstanceState::Stopped), - DbInstanceState(ApiInstanceState::Running), - DbInstanceState(ApiInstanceState::Creating), -]; -pub const SAFE_TO_ATTACH_INSTANCE_STATES: [DbInstanceState; 2] = [ - DbInstanceState(ApiInstanceState::Stopped), - DbInstanceState(ApiInstanceState::Running), + DbInstanceState::Stopped, + DbInstanceState::Running, + DbInstanceState::Creating, ]; +pub const SAFE_TO_ATTACH_INSTANCE_STATES: [DbInstanceState; 2] = + [DbInstanceState::Stopped, DbInstanceState::Running]; // If we're in a state which will naturally resolve to either // stopped/running, we want users to know that the request can be // retried safely via Error::unavail. @@ -52,11 +49,11 @@ pub const SAFE_TO_ATTACH_INSTANCE_STATES: [DbInstanceState; 2] = [ // There may be a good case for RPWing // external_ip_state -> { NAT RPW, sled-agent } in future. pub const SAFE_TRANSIENT_INSTANCE_STATES: [DbInstanceState; 5] = [ - DbInstanceState(ApiInstanceState::Starting), - DbInstanceState(ApiInstanceState::Stopping), - DbInstanceState(ApiInstanceState::Creating), - DbInstanceState(ApiInstanceState::Rebooting), - DbInstanceState(ApiInstanceState::Migrating), + DbInstanceState::Starting, + DbInstanceState::Stopping, + DbInstanceState::Creating, + DbInstanceState::Rebooting, + DbInstanceState::Migrating, ]; /// The maximum number of disks that can be attached to an instance. diff --git a/nexus/db-queries/src/db/queries/network_interface.rs b/nexus/db-queries/src/db/queries/network_interface.rs index 69c1827b6d..483efa31f5 100644 --- a/nexus/db-queries/src/db/queries/network_interface.rs +++ b/nexus/db-queries/src/db/queries/network_interface.rs @@ -42,25 +42,25 @@ use uuid::Uuid; // States an instance must be in to operate on its network interfaces, in // most situations. static INSTANCE_STOPPED: Lazy = - Lazy::new(|| db::model::InstanceState(external::InstanceState::Stopped)); + Lazy::new(|| db::model::InstanceState::Stopped); static INSTANCE_FAILED: Lazy = - Lazy::new(|| db::model::InstanceState(external::InstanceState::Failed)); + Lazy::new(|| db::model::InstanceState::Failed); // An instance can be in the creating state while we manipulate its // interfaces. The intention is for this only to be the case during sagas. static INSTANCE_CREATING: Lazy = - Lazy::new(|| db::model::InstanceState(external::InstanceState::Creating)); + Lazy::new(|| db::model::InstanceState::Creating); // A sentinel value for the instance state when the instance actually does // not exist. static INSTANCE_DESTROYED: Lazy = - Lazy::new(|| db::model::InstanceState(external::InstanceState::Destroyed)); + Lazy::new(|| db::model::InstanceState::Destroyed); // A sentinel value for the instance state when the instance has an active // VMM, irrespective of that VMM's actual state. static INSTANCE_RUNNING: Lazy = - Lazy::new(|| db::model::InstanceState(external::InstanceState::Running)); + Lazy::new(|| db::model::InstanceState::Running); static NO_INSTANCE_SENTINEL_STRING: Lazy = Lazy::new(|| String::from(NO_INSTANCE_SENTINEL)); diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 63b080b436..c1dc6ec221 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -18,6 +18,7 @@ use crate::external_api::params; use cancel_safe_futures::prelude::*; use futures::future::Fuse; use futures::{FutureExt, SinkExt, StreamExt}; +use nexus_db_model::InstanceState as DbInstanceState; use nexus_db_model::IpAttachState; use nexus_db_model::IpKind; use nexus_db_queries::authn; @@ -477,7 +478,7 @@ impl super::Nexus { let (instance, vmm) = (state.instance(), state.vmm()); if vmm.is_none() - || vmm.as_ref().unwrap().runtime.state.0 != InstanceState::Running + || vmm.as_ref().unwrap().runtime.state != InstanceState::Running { return Err(Error::invalid_request( "instance must be running before it can migrate", @@ -704,45 +705,29 @@ impl super::Nexus { .db_datastore .instance_fetch_with_vmm(opctx, &authz_instance) .await?; - let (instance, vmm) = (state.instance(), state.vmm()); - - if let Some(vmm) = vmm { - match vmm.runtime.state.0 { - InstanceState::Starting - | InstanceState::Running - | InstanceState::Rebooting => { - debug!(self.log, "asked to start an active instance"; - "instance_id" => %authz_instance.id()); - - return Ok(state); - } - InstanceState::Stopped => { - let propolis_id = instance - .runtime() - .propolis_id - .expect("needed a VMM ID to fetch a VMM record"); - error!(self.log, - "instance is stopped but still has an active VMM"; - "instance_id" => %authz_instance.id(), - "propolis_id" => %propolis_id); - - return Err(Error::internal_error( - "instance is stopped but still has an active VMM", - )); - } - _ => { - return Err(Error::conflict(&format!( - "instance is in state {} but must be {} to be started", - vmm.runtime.state.0, - InstanceState::Stopped - ))); - } + // An instance may only be started if it is in the `Stopped` effective state. + match state.effective_state() { + InstanceState::Starting + | InstanceState::Running + | InstanceState::Rebooting => { + debug!(self.log, "asked to start an active instance"; + "instance_id" => %authz_instance.id()); + + return Ok(state); + } + // Okay to start + InstanceState::Stopped => {} + instance_state => { + return Err(Error::conflict(&format!( + "instance is in state {instance_state} but must be {} to be started", + InstanceState::Stopped + ))) } } let saga_params = sagas::instance_start::Params { serialized_authn: authn::saga::Serialized::for_opctx(opctx), - db_instance: instance.clone(), + db_instance: state.instance().clone(), }; self.execute_saga::( @@ -840,10 +825,13 @@ impl super::Nexus { vmm_state: &Option, requested: &InstanceStateChangeRequest, ) -> Result { + // N.B. that this *doesn't* use the `InstanceAndVmms::effective_state + // method, because we actually want to inspect the real state of the + // active VMM here. let effective_state = if let Some(vmm) = vmm_state { - vmm.runtime.state.0 + vmm.runtime.state } else { - instance_state.runtime().nexus_state.0 + instance_state.runtime().nexus_state }; // Requests that operate on active instances have to be directed to the @@ -856,7 +844,7 @@ impl super::Nexus { // If there's no active sled because the instance is stopped, // allow requests to stop to succeed silently for idempotency, // but reject requests to do anything else. - InstanceState::Stopped => match requested { + DbInstanceState::Stopped => match requested { InstanceStateChangeRequest::Run => { return Err(Error::invalid_request(&format!( "cannot run an instance in state {} with no VMM", @@ -883,7 +871,7 @@ impl super::Nexus { // If the instance is still being created (such that it hasn't // even begun to start yet), no runtime state change is valid. // Return a specific error message explaining the problem. - InstanceState::Creating => { + DbInstanceState::Creating => { return Err(Error::invalid_request( "cannot change instance state while it is \ still being created" @@ -896,7 +884,7 @@ impl super::Nexus { // TODO(#2825): Failed instances should be allowed to stop, but // this requires a special action because there is no sled to // send the request to. - InstanceState::Failed | InstanceState::Destroyed => { + DbInstanceState::Failed | DbInstanceState::Destroyed | DbInstanceState::SagaUnwound => { return Err(Error::invalid_request(&format!( "instance state cannot be changed from {}", effective_state @@ -926,27 +914,30 @@ impl super::Nexus { InstanceStateChangeRequest::Run | InstanceStateChangeRequest::Reboot | InstanceStateChangeRequest::Stop => match effective_state { - InstanceState::Creating - | InstanceState::Starting - | InstanceState::Running - | InstanceState::Stopping - | InstanceState::Stopped - | InstanceState::Rebooting - | InstanceState::Migrating => true, - InstanceState::Repairing | InstanceState::Failed => false, - InstanceState::Destroyed => false, + DbInstanceState::Creating + | DbInstanceState::Starting + | DbInstanceState::Running + | DbInstanceState::Stopping + | DbInstanceState::Stopped + | DbInstanceState::Rebooting + | DbInstanceState::Migrating => true, + DbInstanceState::Repairing | DbInstanceState::Failed => false, + DbInstanceState::Destroyed | DbInstanceState::SagaUnwound => { + false + } }, InstanceStateChangeRequest::Migrate(_) => match effective_state { - InstanceState::Running - | InstanceState::Rebooting - | InstanceState::Migrating => true, - InstanceState::Creating - | InstanceState::Starting - | InstanceState::Stopping - | InstanceState::Stopped - | InstanceState::Repairing - | InstanceState::Failed - | InstanceState::Destroyed => false, + DbInstanceState::Running + | DbInstanceState::Rebooting + | DbInstanceState::Migrating => true, + DbInstanceState::Creating + | DbInstanceState::Starting + | DbInstanceState::Stopping + | DbInstanceState::Stopped + | DbInstanceState::Repairing + | DbInstanceState::Failed + | DbInstanceState::Destroyed + | DbInstanceState::SagaUnwound => false, }, }; @@ -1647,24 +1638,24 @@ impl super::Nexus { let (instance, vmm) = (state.instance(), state.vmm()); if let Some(vmm) = vmm { - match vmm.runtime.state.0 { - InstanceState::Running - | InstanceState::Rebooting - | InstanceState::Migrating - | InstanceState::Repairing => { + match vmm.runtime.state { + DbInstanceState::Running + | DbInstanceState::Rebooting + | DbInstanceState::Migrating + | DbInstanceState::Repairing => { Ok(SocketAddr::new(vmm.propolis_ip.ip(), vmm.propolis_port.into())) } - InstanceState::Creating - | InstanceState::Starting - | InstanceState::Stopping - | InstanceState::Stopped - | InstanceState::Failed => { + DbInstanceState::Creating + | DbInstanceState::Starting + | DbInstanceState::Stopping + | DbInstanceState::Stopped + | DbInstanceState::Failed => { Err(Error::invalid_request(format!( "cannot connect to serial console of instance in state \"{}\"", - vmm.runtime.state.0, + vmm.runtime.state, ))) } - InstanceState::Destroyed => Err(Error::invalid_request( + DbInstanceState::Destroyed | DbInstanceState::SagaUnwound => Err(Error::invalid_request( "cannot connect to serial console of destroyed instance", )), } diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index b941739393..8ab69c5be4 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -9,14 +9,13 @@ use std::net::{IpAddr, Ipv6Addr}; use crate::Nexus; use chrono::Utc; use nexus_db_model::{ - ByteCount, ExternalIp, IpAttachState, Ipv4NatEntry, + ByteCount, ExternalIp, InstanceState, IpAttachState, Ipv4NatEntry, SledReservationConstraints, SledResource, }; use nexus_db_queries::authz; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::queries::external_ip::SAFE_TRANSIENT_INSTANCE_STATES; use nexus_db_queries::{authn, context::OpContext, db, db::DataStore}; -use omicron_common::api::external::InstanceState; use omicron_common::api::external::{Error, NameOrId}; use serde::{Deserialize, Serialize}; use steno::ActionError; @@ -113,19 +112,19 @@ pub async fn create_and_insert_vmm_record( Ok(vmm) } -/// Given a previously-inserted VMM record, set its state to Destroyed and then +/// Given a previously-inserted VMM record, set its state to `SagaUnwound` and then /// delete it. /// /// This function succeeds idempotently if called with the same parameters, /// provided that the VMM record was not changed by some other actor after the /// calling saga inserted it. -pub async fn destroy_vmm_record( +pub async fn unwind_vmm_record( datastore: &DataStore, opctx: &OpContext, prev_record: &db::model::Vmm, ) -> Result<(), anyhow::Error> { let new_runtime = db::model::VmmRuntimeState { - state: db::model::InstanceState(InstanceState::Destroyed), + state: db::model::InstanceState::SagaUnwound, time_state_updated: Utc::now(), gen: prev_record.runtime.gen.next().into(), }; @@ -220,7 +219,9 @@ pub async fn instance_ip_get_instance_state( .await .map_err(ActionError::action_failed)?; - let found_state = inst_and_vmm.instance().runtime_state.nexus_state.0; + // XXX(eliza): not sure if this should operate on + // `InstanceAndActiveVmm::effective_state` or not? + let found_state = inst_and_vmm.instance().runtime_state.nexus_state; let mut sled_id = inst_and_vmm.sled_id(); // Arriving here means we started in a correct state (running/stopped). @@ -243,12 +244,12 @@ pub async fn instance_ip_get_instance_state( sled_id = None; } InstanceState::Running => {} - state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state.into()) => { + state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => { return Err(ActionError::action_failed(Error::unavail(&format!( "can't {verb} in transient state {state}" )))) } - InstanceState::Destroyed => { + InstanceState::Destroyed | InstanceState::SagaUnwound => { return Err(ActionError::action_failed(Error::not_found_by_id( omicron_common::api::external::ResourceType::Instance, &authz_instance.id(), diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 1cfd170faf..6f9613e8a5 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -235,7 +235,7 @@ async fn sim_destroy_vmm_record( info!(osagactx.log(), "destroying vmm record for migration unwind"; "propolis_id" => %vmm.id); - super::instance_common::destroy_vmm_record( + super::instance_common::unwind_vmm_record( osagactx.datastore(), &opctx, &vmm, @@ -755,8 +755,8 @@ mod tests { let new_instance = new_state.instance(); let new_vmm = new_state.vmm().as_ref(); assert_eq!( - new_instance.runtime().nexus_state.0, - omicron_common::api::external::InstanceState::Stopped + new_instance.runtime().nexus_state, + nexus_db_model::InstanceState::Stopped ); assert!(new_instance.runtime().propolis_id.is_none()); assert!(new_vmm.is_none()); diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index e7caedfc9c..4d1c62c0b9 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -14,9 +14,10 @@ use crate::app::instance::InstanceRegisterReason; use crate::app::instance::InstanceStateChangeError; use crate::app::sagas::declare_saga_actions; use chrono::Utc; +use nexus_db_model::InstanceState; use nexus_db_queries::db::{identity::Resource, lookup::LookupPath}; use nexus_db_queries::{authn, authz, db}; -use omicron_common::api::external::{Error, InstanceState}; +use omicron_common::api::external::Error; use serde::{Deserialize, Serialize}; use slog::info; use steno::{ActionError, Node}; @@ -200,7 +201,7 @@ async fn sis_destroy_vmm_record( ); let vmm = sagactx.lookup::("vmm_record")?; - super::instance_common::destroy_vmm_record( + super::instance_common::unwind_vmm_record( osagactx.datastore(), &opctx, &vmm, @@ -260,9 +261,7 @@ async fn sis_move_to_starting( // be running before Propolis thinks it has started.) None => { let new_runtime = db::model::InstanceRuntimeState { - nexus_state: db::model::InstanceState::new( - InstanceState::Running, - ), + nexus_state: db::model::InstanceState::Running, propolis_id: Some(propolis_id), time_updated: Utc::now(), gen: db_instance.runtime().gen.next().into(), @@ -300,7 +299,7 @@ async fn sis_move_to_starting_undo( "instance_id" => %instance_id); let new_runtime = db::model::InstanceRuntimeState { - nexus_state: db::model::InstanceState::new(InstanceState::Stopped), + nexus_state: InstanceState::Stopped, propolis_id: None, gen: db_instance.runtime_state.gen.next().into(), ..db_instance.runtime_state @@ -762,8 +761,7 @@ mod test { .as_ref() .expect("running instance should have a vmm") .runtime - .state - .0; + .state; assert_eq!(vmm_state, InstanceState::Running); } @@ -818,7 +816,7 @@ mod test { assert!(new_db_instance.runtime().propolis_id.is_none()); assert_eq!( - new_db_instance.runtime().nexus_state.0, + new_db_instance.runtime().nexus_state, InstanceState::Stopped ); @@ -861,8 +859,7 @@ mod test { .as_ref() .expect("running instance should have a vmm") .runtime - .state - .0; + .state; assert_eq!(vmm_state, InstanceState::Running); } @@ -930,7 +927,7 @@ mod test { assert_eq!( db_instance.instance().runtime_state.nexus_state, - nexus_db_model::InstanceState(InstanceState::Stopped) + nexus_db_model::InstanceState::Stopped ); assert!(db_instance.vmm().is_none()); diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs new file mode 100644 index 0000000000..d8ae65a18c --- /dev/null +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -0,0 +1,216 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::{ + ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, + ACTION_GENERATE_ID, +}; +use crate::app::db::datastore::instance; +use crate::app::db::datastore::InstanceAndVmms; +use crate::app::sagas::declare_saga_actions; +use nexus_db_queries::{authn, authz}; +use omicron_common::api::external::InstanceState; +use serde::{Deserialize, Serialize}; +use steno::{ActionError, DagBuilder, Node, SagaName}; +use uuid::Uuid; + +mod destroyed; + +// The public interface to this saga is actually a smaller saga that starts the +// "real" update saga, which inherits the lock from the start saga. This is +// because the decision of which subsaga(s) to run depends on the state of the +// instance record read from the database *once the lock has been acquired*, +// and the saga DAG for the "real" instance update saga may be constructed only +// after the instance state has been fetched. However, since the the instance +// state must be read inside the lock, that *also* needs to happen in a saga, +// so that the lock is always dropped when unwinding. Thus, we have a second, +// smaller saga which starts our real saga, and then the real saga, which +// decides what DAG to build based on the instance fetched by the start saga. +// +// Don't worry, this won't be on the test. +mod start; +pub(crate) use self::start::{Params, SagaInstanceUpdate}; + +/// Parameters to the "real" instance update saga. +#[derive(Debug, Deserialize, Serialize)] +struct RealParams { + serialized_authn: authn::saga::Serialized, + + authz_instance: authz::Instance, + + state: InstanceAndVmms, + + orig_lock: instance::UpdaterLock, +} + +const INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; +const INSTANCE_LOCK: &str = "updater_lock"; + +// instance update saga: actions + +declare_saga_actions! { + instance_update; + + // Become the instance updater + BECOME_UPDATER -> "updater_lock" { + + siu_become_updater + - siu_unbecome_updater + } + + UNLOCK_INSTANCE -> "unlocked" { + + siu_unlock_instance + } +} + +// instance update saga: definition +struct SagaDoActualInstanceUpdate; + +impl NexusSaga for SagaDoActualInstanceUpdate { + const NAME: &'static str = "instance-update"; + type Params = RealParams; + + fn register_actions(registry: &mut ActionRegistry) { + instance_update_register_actions(registry); + } + + fn make_saga_dag( + params: &Self::Params, + mut builder: DagBuilder, + ) -> Result { + builder.append(Node::action( + INSTANCE_LOCK_ID, + "GenerateInstanceLockId", + ACTION_GENERATE_ID.as_ref(), + )); + builder.append(become_updater_action()); + + // determine which subsaga(s) to execute based on the state of the instance + // and the VMMs associated with it. + if let Some(ref active_vmm) = params.state.active_vmm { + // If the active VMM is `Destroyed`, schedule the active VMM + // destroyed subsaga. + if active_vmm.runtime.state.state() == InstanceState::Destroyed { + const DESTROYED_SUBSAGA_PARAMS: &str = + "params_for_vmm_destroyed_subsaga"; + let subsaga_params = destroyed::Params { + serialized_authn: params.serialized_authn.clone(), + authz_instance: params.authz_instance.clone(), + vmm_id: active_vmm.id, + instance: params.state.instance.clone(), + }; + let subsaga_dag = { + let subsaga_builder = DagBuilder::new(SagaName::new( + destroyed::SagaVmmDestroyed::NAME, + )); + destroyed::SagaVmmDestroyed::make_saga_dag( + &subsaga_params, + subsaga_builder, + )? + }; + + builder.append(Node::constant( + DESTROYED_SUBSAGA_PARAMS, + serde_json::to_value(&subsaga_params).map_err(|e| { + SagaInitError::SerializeError( + DESTROYED_SUBSAGA_PARAMS.to_string(), + e, + ) + })?, + )); + + builder.append(Node::subsaga( + "vmm_destroyed_subsaga_no_result", + subsaga_dag, + DESTROYED_SUBSAGA_PARAMS, + )); + } + } + + builder.append(unlock_instance_action()); + Ok(builder.build()?) + } +} + +async fn siu_become_updater( + sagactx: NexusActionContext, +) -> Result { + let RealParams { + ref serialized_authn, ref authz_instance, orig_lock, .. + } = sagactx.saga_params::()?; + let saga_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + let osagactx = sagactx.user_data(); + + slog::debug!( + osagactx.log(), + "instance update: trying to become instance updater..."; + "instance_id" => %authz_instance.id(), + "saga_id" => %saga_id, + "parent_lock" => ?orig_lock, + ); + + let lock = osagactx + .datastore() + .instance_updater_inherit_lock( + &opctx, + &authz_instance, + orig_lock, + saga_id, + ) + .await + .map_err(ActionError::action_failed)?; + + slog::info!( + osagactx.log(), + "Now, I am become Updater, the destroyer of VMMs."; + "instance_id" => %authz_instance.id(), + "saga_id" => %saga_id, + ); + + Ok(lock) +} + +async fn siu_unbecome_updater( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let RealParams { ref serialized_authn, ref authz_instance, .. } = + sagactx.saga_params::()?; + unlock_instance_inner(serialized_authn, authz_instance, &sagactx).await?; + + Ok(()) +} + +async fn siu_unlock_instance( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let RealParams { ref serialized_authn, ref authz_instance, .. } = + sagactx.saga_params::()?; + unlock_instance_inner(serialized_authn, authz_instance, &sagactx).await +} + +async fn unlock_instance_inner( + serialized_authn: &authn::saga::Serialized, + authz_instance: &authz::Instance, + sagactx: &NexusActionContext, +) -> Result<(), ActionError> { + let lock = sagactx.lookup::(INSTANCE_LOCK)?; + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + let osagactx = sagactx.user_data(); + slog::info!( + osagactx.log(), + "instance update: unlocking instance"; + "instance_id" => %authz_instance.id(), + "lock" => ?lock, + ); + + osagactx + .datastore() + .instance_updater_unlock(&opctx, authz_instance, lock) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index f8b56b3522..19768735e7 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -1609,6 +1609,7 @@ mod test { ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; use dropshot::test_util::ClientTestContext; + use nexus_db_model::InstanceState; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::datastore::InstanceAndActiveVmm; use nexus_db_queries::db::DataStore; @@ -2041,13 +2042,9 @@ mod test { .as_ref() .expect("running instance should have a sled") .runtime - .state - .0; + .state; - assert_eq!( - new_state, - omicron_common::api::external::InstanceState::Running - ); + assert_eq!(new_state, InstanceState::Running); instance_state }