From 8749c9f6905da1eb76d24fb8ff4480349e3f337d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 4 Jun 2024 16:17:44 -0700 Subject: [PATCH] Add `VmmState::SagaUnwound` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This branch is part of ongoing work on the `instance-update` saga (see PR #5749); I've factored it out into a separate PR. This is largely because this branch makes mechanical changes to a bunch of different files that aren't directly related to the core change of #5749, and I'd like to avoid the additional merge conflicts that are likely when these changes remain un-merged for a long time. --- Depends on #5854. As part of #5749, it is desirable to distinguish between *why* a VMM record was marked as `Destroyed`, in order to determine which saga is responsible for cleaning up that VMM's resources. The `instance-update` saga must be the only entity that can set an instance's VMM IDs (active and target) to NULL. Presently, however, the `instance-start` and `instance-migrate` sagas may also do this when they unwind. This is a requirement to avoid situations where a failed `instance-update` saga cannot unwind, because the instance's generation number has changed while the saga was executing. We want to ensure that if a start or migrate request fails, the instance will appear to be in the correct post-state as soon as the relevant API call returns. In order to do this, without having to also guarantee that an instance update has occurred, we introduce a new VMM state, `SagaUnwound`, with the following rules: - It is legal to start or migrate an instance if the `active_propolis_id` or `destination_propolis_id` (respectively) is either `NULL` or refers to a VMM that’s in the `SagaUnwound` state (the new VMM ID directly replaces the old ID). - If an instance has an `active_propolis_id` in the `SagaUnwound` state, then the instance appears to be `Stopped`. - If an instance has a `destination_propolis_id` in the `SagaUnwound` state, nothing special happens–the instance’s state is still derived from its active VMM’s state. - The update saga treats `SagaUnwound` VMMs as identical to `Destroyed` VMMs for purposes of deciding whether to remove a VMM ID from an instance. This branch adds a new `VmmState::SagaUnwound` variant. The `SagaUnwound` state is an internal implementation detail that shouldn't be exposed to an operator or in the external API. Sled-agents will never report that a VMM is in this state. Instead, this state is set my the `instance-start` and `instance-migrate` sagas when they unwind. When determining the API instance state from an instance and active VMM query so that the `SagaUnwound` state is mapped to `Destroyed`. Closes #5848, which this replaces. --- nexus/db-model/src/schema_versions.rs | 3 ++- nexus/db-model/src/vmm_state.rs | 8 +++++--- nexus/src/app/instance.rs | 2 +- nexus/src/app/sagas/instance_common.rs | 16 +++++++++++----- nexus/src/app/sagas/instance_migrate.rs | 2 +- nexus/src/app/sagas/instance_start.rs | 2 +- schema/crdb/add-saga-unwound-vmm-state/up.sql | 2 ++ schema/crdb/dbinit.sql | 5 +++-- 8 files changed, 26 insertions(+), 14 deletions(-) create mode 100644 schema/crdb/add-saga-unwound-vmm-state/up.sql diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 25c16df9e2e..325d2a66365 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(69, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(70, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(69, "add-saga-unwound-vmm-state"), KnownVersion::new(69, "separate-instance-and-vmm-states"), KnownVersion::new(68, "filter-v2p-mapping-by-instance-state"), KnownVersion::new(67, "add-instance-updater-lock"), diff --git a/nexus/db-model/src/vmm_state.rs b/nexus/db-model/src/vmm_state.rs index f737f48f695..648d2593d48 100644 --- a/nexus/db-model/src/vmm_state.rs +++ b/nexus/db-model/src/vmm_state.rs @@ -24,6 +24,7 @@ impl_enum_type!( Migrating => b"migrating" Failed => b"failed" Destroyed => b"destroyed" + SagaUnwound => b"saga_unwound" ); impl VmmState { @@ -37,6 +38,7 @@ impl VmmState { VmmState::Migrating => "migrating", VmmState::Failed => "failed", VmmState::Destroyed => "destroyed", + VmmState::SagaUnwound => "saga_unwound", } } } @@ -58,7 +60,7 @@ impl From for omicron_common::api::internal::nexus::VmmState { VmmState::Rebooting => Output::Rebooting, VmmState::Migrating => Output::Migrating, VmmState::Failed => Output::Failed, - VmmState::Destroyed => Output::Destroyed, + VmmState::Destroyed | VmmState::SagaUnwound => Output::Destroyed, } } } @@ -74,7 +76,7 @@ impl From for sled_agent_client::types::VmmState { VmmState::Rebooting => Output::Rebooting, VmmState::Migrating => Output::Migrating, VmmState::Failed => Output::Failed, - VmmState::Destroyed => Output::Destroyed, + VmmState::Destroyed | VmmState::SagaUnwound => Output::Destroyed, } } } @@ -108,7 +110,7 @@ impl From for omicron_common::api::external::InstanceState { VmmState::Rebooting => Output::Rebooting, VmmState::Migrating => Output::Migrating, VmmState::Failed => Output::Failed, - VmmState::Destroyed => Output::Destroyed, + VmmState::Destroyed | VmmState::SagaUnwound => Output::Destroyed, } } } diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 1132f1f5b89..b5f3f1131ae 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1664,7 +1664,7 @@ impl super::Nexus { vmm.runtime.state, ))) } - DbVmmState::Destroyed => Err(Error::invalid_request( + DbVmmState::Destroyed | DbVmmState::SagaUnwound => Err(Error::invalid_request( "cannot connect to serial console of destroyed instance", )), } diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index ff2ebd517f0..1192e47ba7f 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -111,19 +111,22 @@ pub async fn create_and_insert_vmm_record( Ok(vmm) } -/// Given a previously-inserted VMM record, set its state to Destroyed and then -/// delete it. +/// Given a previously-inserted VMM record, set its state to `SagaUnwound` and +/// then delete it. /// /// This function succeeds idempotently if called with the same parameters, /// provided that the VMM record was not changed by some other actor after the /// calling saga inserted it. -pub async fn destroy_vmm_record( +/// +/// This function is intended to be called when a saga which created a VMM +/// record unwinds. +pub async fn unwind_vmm_record( datastore: &DataStore, opctx: &OpContext, prev_record: &db::model::Vmm, ) -> Result<(), anyhow::Error> { let new_runtime = db::model::VmmRuntimeState { - state: db::model::VmmState::Destroyed, + state: db::model::VmmState::SagaUnwound, time_state_updated: Utc::now(), gen: prev_record.runtime.gen.next().into(), }; @@ -324,7 +327,10 @@ pub async fn instance_ip_get_instance_state( ), ))); } - (InstanceState::Vmm, Some(VmmState::Destroyed)) => { + ( + InstanceState::Vmm, + Some(VmmState::Destroyed | VmmState::SagaUnwound), + ) => { return Err(ActionError::action_failed(Error::internal_error( &format!( "instance {} points to destroyed VMM", diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index db1d838014b..14340646660 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -235,7 +235,7 @@ async fn sim_destroy_vmm_record( info!(osagactx.log(), "destroying vmm record for migration unwind"; "propolis_id" => %vmm.id); - super::instance_common::destroy_vmm_record( + super::instance_common::unwind_vmm_record( osagactx.datastore(), &opctx, &vmm, diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index d67ff02c20c..0013a63d1a1 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -200,7 +200,7 @@ async fn sis_destroy_vmm_record( ); let vmm = sagactx.lookup::("vmm_record")?; - super::instance_common::destroy_vmm_record( + super::instance_common::unwind_vmm_record( osagactx.datastore(), &opctx, &vmm, diff --git a/schema/crdb/add-saga-unwound-vmm-state/up.sql b/schema/crdb/add-saga-unwound-vmm-state/up.sql new file mode 100644 index 00000000000..65ab5b5c855 --- /dev/null +++ b/schema/crdb/add-saga-unwound-vmm-state/up.sql @@ -0,0 +1,2 @@ +ALTER TYPE omicron.public.vmm_state + ADD VALUE IF NOT EXISTS 'saga_unwound' AFTER 'destroyed'; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 7de3b015281..18892784dc0 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -982,7 +982,8 @@ CREATE TYPE IF NOT EXISTS omicron.public.vmm_state AS ENUM ( 'rebooting', 'migrating', 'failed', - 'destroyed' + 'destroyed', + 'saga_unwound', ); /* @@ -4054,7 +4055,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '69.0.0', NULL) + (TRUE, NOW(), NOW(), '70.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT;