From d2a063668aaa60385592ec3bfa861d8b1f0cf2b6 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Sat, 7 Oct 2023 00:05:32 +0000 Subject: [PATCH] make Nexus sole owner of Instance records' state values Sled agent has one path that updates the `state` column in an instance record (the `fallback_state` field in an `InstanceRuntimeState`): if an active Propolis is retired (i.e. the instance's Propolis ID goes from `Some` to `None`), the instance goes to the Stopped state. This rule is so simple that it can be implemented entirely in Nexus, giving Nexus sole ownership of this table column: - Remove the `fallback_state` field from `api::internal::InstanceRuntimeState`. This completely hides the field from the sled agent. - When Nexus processes a new `SledInstanceState`, check the `active_propolis_id`: if it's `Some`, the state is Running; if it's `None`, the state is Stopped. The only other states that an instance can now have are Creating and Destroyed, both of which are set only by Nexus and only when it is known that no sled agent refers to the instance (either by definition when an instance is being created, or because the instance has no active VMM when it's being destroyed). Rename the `fallback_state` field to `nexus_state` to reflect its new ownership. --- clients/nexus-client/src/lib.rs | 1 - clients/sled-agent-client/src/lib.rs | 2 -- common/src/api/internal/nexus.rs | 2 -- nexus/db-model/src/instance.rs | 13 ++++++++---- nexus/db-queries/src/db/datastore/disk.rs | 8 ++++---- nexus/db-queries/src/db/datastore/instance.rs | 10 +++++----- .../src/db/datastore/network_interface.rs | 4 ++-- .../src/db/queries/network_interface.rs | 2 +- nexus/src/app/instance.rs | 6 +++--- nexus/src/app/sagas/instance_create.rs | 4 ++-- nexus/src/app/sagas/instance_migrate.rs | 2 +- nexus/src/app/sagas/instance_start.rs | 6 +++--- openapi/nexus-internal.json | 9 --------- openapi/sled-agent.json | 9 --------- sled-agent/src/common/instance.rs | 20 ++++--------------- sled-agent/src/sim/collection.rs | 5 +---- 16 files changed, 35 insertions(+), 68 deletions(-) diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 642ffcf012..33a68cb3ce 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -89,7 +89,6 @@ impl From ) -> Self { Self { dst_propolis_id: s.dst_propolis_id, - fallback_state: s.fallback_state.into(), gen: s.gen.into(), migration_id: s.migration_id, propolis_id: s.propolis_id, diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 62fcc41f7b..3daac7dd60 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -24,7 +24,6 @@ impl From s: omicron_common::api::internal::nexus::InstanceRuntimeState, ) -> Self { Self { - fallback_state: s.fallback_state.into(), propolis_id: s.propolis_id, dst_propolis_id: s.dst_propolis_id, migration_id: s.migration_id, @@ -79,7 +78,6 @@ impl From { fn from(s: types::InstanceRuntimeState) -> Self { Self { - fallback_state: s.fallback_state.into(), propolis_id: s.propolis_id, dst_propolis_id: s.dst_propolis_id, migration_id: s.migration_id, diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index e2291d32ce..a4a539ad9b 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -45,8 +45,6 @@ pub struct InstanceProperties { /// no active VMM. #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] pub struct InstanceRuntimeState { - /// The state of the instance if it has no active VMM. - pub fallback_state: InstanceState, /// The instance's currently active VMM ID. pub propolis_id: Option, /// If a migration is active, the ID of the target VMM. diff --git a/nexus/db-model/src/instance.rs b/nexus/db-model/src/instance.rs index 38fd7390b4..9252926547 100644 --- a/nexus/db-model/src/instance.rs +++ b/nexus/db-model/src/instance.rs @@ -125,7 +125,7 @@ pub struct InstanceRuntimeState { /// /// This field is guarded by the instance's `gen` field. #[diesel(column_name = state)] - pub fallback_state: InstanceState, + pub nexus_state: InstanceState, /// The time at which the runtime state was last updated. This is distinct /// from the time the record was last modified, because some updates don't @@ -166,7 +166,7 @@ pub struct InstanceRuntimeState { impl InstanceRuntimeState { fn new(initial_state: InstanceState, creation_time: DateTime) -> Self { Self { - fallback_state: initial_state, + nexus_state: initial_state, time_updated: creation_time, propolis_id: None, dst_propolis_id: None, @@ -182,8 +182,14 @@ impl From fn from( state: omicron_common::api::internal::nexus::InstanceRuntimeState, ) -> Self { + let nexus_state = if state.propolis_id.is_some() { + omicron_common::api::external::InstanceState::Running + } else { + omicron_common::api::external::InstanceState::Stopped + }; + Self { - fallback_state: InstanceState::new(state.fallback_state), + nexus_state: InstanceState::new(nexus_state), time_updated: state.time_updated, gen: state.gen.into(), propolis_id: state.propolis_id, @@ -199,7 +205,6 @@ impl From fn from(state: InstanceRuntimeState) -> Self { Self { dst_propolis_id: state.dst_propolis_id, - fallback_state: state.fallback_state.into(), gen: state.gen.into(), migration_id: state.migration_id, propolis_id: state.propolis_id, diff --git a/nexus/db-queries/src/db/datastore/disk.rs b/nexus/db-queries/src/db/datastore/disk.rs index 19bed3580d..a0d9bf12c3 100644 --- a/nexus/db-queries/src/db/datastore/disk.rs +++ b/nexus/db-queries/src/db/datastore/disk.rs @@ -240,7 +240,7 @@ impl DataStore { ) ); } - match collection.runtime_state.fallback_state.state() { + match collection.runtime_state.nexus_state.state() { // Ok-to-be-attached instance states: api::external::InstanceState::Creating | api::external::InstanceState::Stopped => { @@ -264,7 +264,7 @@ impl DataStore { _ => { Err(Error::invalid_request(&format!( "cannot attach disk to instance in {} state", - collection.runtime_state.fallback_state.state(), + collection.runtime_state.nexus_state.state(), ))) } } @@ -381,7 +381,7 @@ impl DataStore { ) ); } - match collection.runtime_state.fallback_state.state() { + match collection.runtime_state.nexus_state.state() { // Ok-to-be-detached instance states: api::external::InstanceState::Creating | api::external::InstanceState::Stopped => { @@ -395,7 +395,7 @@ impl DataStore { _ => { Err(Error::invalid_request(&format!( "cannot detach disk from instance in {} state", - collection.runtime_state.fallback_state.state(), + collection.runtime_state.nexus_state.state(), ))) } } diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index c1748a6a50..66a9c9af48 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -68,7 +68,7 @@ impl InstanceAndActiveVmm { if let Some(vmm) = &self.vmm { vmm.runtime.state.0 } else { - self.instance.runtime().fallback_state.0 + self.instance.runtime().nexus_state.0 } } } @@ -85,7 +85,7 @@ impl From for omicron_common::api::external::Instance { (vmm.runtime.state, vmm.runtime.time_state_updated) } else { ( - value.instance.runtime_state.fallback_state.clone(), + value.instance.runtime_state.nexus_state.clone(), value.instance.runtime_state.time_updated, ) }; @@ -161,10 +161,10 @@ impl DataStore { })?; bail_unless!( - instance.runtime().fallback_state.state() + instance.runtime().nexus_state.state() == &api::external::InstanceState::Creating, "newly-created Instance has unexpected state: {:?}", - instance.runtime().fallback_state + instance.runtime().nexus_state ); bail_unless!( instance.runtime().gen == gen, @@ -435,7 +435,7 @@ impl DataStore { ); } let instance_state = - collection.runtime_state.fallback_state.state(); + collection.runtime_state.nexus_state.state(); match instance_state { api::external::InstanceState::Stopped | api::external::InstanceState::Failed => { diff --git a/nexus/db-queries/src/db/datastore/network_interface.rs b/nexus/db-queries/src/db/datastore/network_interface.rs index 1d9e6312c2..e79f4ec0d1 100644 --- a/nexus/db-queries/src/db/datastore/network_interface.rs +++ b/nexus/db-queries/src/db/datastore/network_interface.rs @@ -473,7 +473,7 @@ impl DataStore { let instance_runtime = instance_query.get_result_async(&conn).await?.runtime_state; if instance_runtime.propolis_id.is_some() - || instance_runtime.fallback_state != stopped + || instance_runtime.nexus_state != stopped { return Err(TxnError::CustomError( NetworkInterfaceUpdateError::InstanceNotStopped, @@ -516,7 +516,7 @@ impl DataStore { let instance_state = instance_query.get_result_async(&conn).await?.runtime_state; if instance_state.propolis_id.is_some() - || instance_state.fallback_state != stopped + || instance_state.nexus_state != stopped { return Err(TxnError::CustomError( NetworkInterfaceUpdateError::InstanceNotStopped, diff --git a/nexus/db-queries/src/db/queries/network_interface.rs b/nexus/db-queries/src/db/queries/network_interface.rs index d1b1069d7a..7b0c008ce3 100644 --- a/nexus/db-queries/src/db/queries/network_interface.rs +++ b/nexus/db-queries/src/db/queries/network_interface.rs @@ -1789,7 +1789,7 @@ mod tests { state: external::InstanceState, ) -> Instance { let new_runtime = model::InstanceRuntimeState { - fallback_state: model::InstanceState::new(state), + nexus_state: model::InstanceState::new(state), gen: instance.runtime_state.gen.next().into(), ..instance.runtime_state.clone() }; diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 36f75bb036..39e0108ca0 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -661,7 +661,7 @@ impl super::Nexus { let effective_state = if let Some(vmm) = vmm_state { vmm.runtime.state.0 } else { - instance_state.runtime().fallback_state.0 + instance_state.runtime().nexus_state.0 }; // Requests that operate on active instances have to be directed to the @@ -1118,7 +1118,7 @@ impl super::Nexus { // that the VMM is gone, however.) _ => { let new_runtime = db::model::InstanceRuntimeState { - fallback_state: db::model::InstanceState::new( + nexus_state: db::model::InstanceState::new( InstanceState::Failed, ), @@ -1543,7 +1543,7 @@ impl super::Nexus { internal_message: format!( "instance is in state {:?} and has no active serial console \ server", - instance.runtime().fallback_state + instance.runtime().nexus_state ) }) } diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index c32be42b6b..58b04a4c95 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -851,7 +851,7 @@ async fn sic_delete_instance_record( }; let runtime_state = db::model::InstanceRuntimeState { - fallback_state: db::model::InstanceState::new(InstanceState::Failed), + nexus_state: db::model::InstanceState::new(InstanceState::Failed), // Must update the generation, or the database query will fail. // // The runtime state of the instance record is only changed as a result @@ -892,7 +892,7 @@ async fn sic_move_to_stopped( // update will (correctly) be ignored because its generation number is out // of date. let new_state = db::model::InstanceRuntimeState { - fallback_state: db::model::InstanceState::new(InstanceState::Stopped), + nexus_state: db::model::InstanceState::new(InstanceState::Stopped), gen: db::model::Generation::from( instance_record.runtime_state.gen.next(), ), diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 16f6c5a41a..d32a20bc40 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -727,7 +727,7 @@ mod tests { let new_instance = new_state.instance(); let new_vmm = new_state.vmm().as_ref(); assert_eq!( - new_instance.runtime().fallback_state.0, + new_instance.runtime().nexus_state.0, omicron_common::api::external::InstanceState::Stopped ); assert!(new_instance.runtime().propolis_id.is_none()); diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index 0eef60c6b6..5d02d44b6b 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -246,7 +246,7 @@ async fn sis_move_to_starting( // be running before Propolis thinks it has started.) None => { let new_runtime = db::model::InstanceRuntimeState { - fallback_state: db::model::InstanceState::new( + nexus_state: db::model::InstanceState::new( InstanceState::Running, ), propolis_id: Some(propolis_id), @@ -286,7 +286,7 @@ async fn sis_move_to_starting_undo( "instance_id" => %instance_id); let new_runtime = db::model::InstanceRuntimeState { - fallback_state: db::model::InstanceState::new(InstanceState::Stopped), + nexus_state: db::model::InstanceState::new(InstanceState::Stopped), propolis_id: None, gen: db_instance.runtime_state.gen.next().into(), ..db_instance.runtime_state @@ -703,7 +703,7 @@ mod test { assert!(new_db_instance.runtime().propolis_id.is_none()); assert_eq!( - new_db_instance.runtime().fallback_state.0, + new_db_instance.runtime().nexus_state.0, InstanceState::Stopped ); } diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 56fde797f1..67db222155 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -3547,14 +3547,6 @@ "type": "string", "format": "uuid" }, - "fallback_state": { - "description": "The state of the instance if it has no active VMM.", - "allOf": [ - { - "$ref": "#/components/schemas/InstanceState" - } - ] - }, "gen": { "description": "Generation number for this state.", "allOf": [ @@ -3582,7 +3574,6 @@ } }, "required": [ - "fallback_state", "gen", "time_updated" ] diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 91b2d9445e..56437ab283 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -1889,14 +1889,6 @@ "type": "string", "format": "uuid" }, - "fallback_state": { - "description": "The state of the instance if it has no active VMM.", - "allOf": [ - { - "$ref": "#/components/schemas/InstanceState" - } - ] - }, "gen": { "description": "Generation number for this state.", "allOf": [ @@ -1924,7 +1916,6 @@ } }, "required": [ - "fallback_state", "gen", "time_updated" ] diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index a33680239e..9e285840e0 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -57,9 +57,9 @@ impl From for ApiInstanceState { // Nexus needs to learn when a VM has entered the "destroyed" state // so that it can release its resource reservation. When this // happens, this module also clears the active VMM ID from the - // instance record (and updates the instance's fallback state - // accordingly), so this state won't be used as an - // externally-visible instance state. + // instance record, which will accordingly set the Nexus-owned + // instance state to Stopped, preventing this state from being used + // as an externally-visible instance state. State::Destroyed => ApiInstanceState::Destroyed, } } @@ -358,12 +358,6 @@ impl InstanceStates { fn retire_active_propolis(&mut self, now: DateTime) { assert!(self.propolis_role() == PropolisRole::Active); - self.instance.fallback_state = match self.vmm.state { - ApiInstanceState::Destroyed => ApiInstanceState::Stopped, - ApiInstanceState::Failed => ApiInstanceState::Failed, - _ => unreachable!("Can only retire active Propolis if VMM exited"), - }; - self.instance.propolis_id = None; self.instance.gen = self.instance.gen.next(); self.instance.time_updated = now; @@ -523,7 +517,6 @@ mod test { let propolis_id = Uuid::new_v4(); let now = Utc::now(); let instance = InstanceRuntimeState { - fallback_state: ApiInstanceState::Starting, propolis_id: Some(propolis_id), dst_propolis_id: None, migration_id: None, @@ -585,10 +578,6 @@ mod test { // sleds that expect their own attempts to advance the generation number // to cause new state to be recorded. if prev.instance.gen == next.instance.gen { - assert_eq!( - prev.instance.fallback_state, - next.instance.fallback_state - ); assert_eq!(prev.instance.propolis_id, next.instance.propolis_id); assert_eq!( prev.instance.dst_propolis_id, @@ -597,8 +586,7 @@ mod test { assert_eq!(prev.instance.migration_id, next.instance.migration_id); } else { assert!( - (prev.instance.fallback_state != next.instance.fallback_state) - || (prev.instance.propolis_id != next.instance.propolis_id) + (prev.instance.propolis_id != next.instance.propolis_id) || (prev.instance.dst_propolis_id != next.instance.dst_propolis_id) || (prev.instance.migration_id diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index eb64661de3..bd6ed4aa90 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -424,7 +424,6 @@ mod test { ) -> (SimObject, Receiver<()>) { let propolis_id = Uuid::new_v4(); let instance_vmm = InstanceRuntimeState { - fallback_state: InstanceState::Creating, propolis_id: Some(propolis_id), dst_propolis_id: None, migration_id: None, @@ -484,7 +483,7 @@ mod test { assert!(rx.try_next().is_err()); // Stopping an instance that was never started synchronously destroys - // its VMM and sets the fallback state to Stopped. + // its VMM. let rprev = r1; let dropped = instance.transition(InstanceStateRequested::Stopped).unwrap(); @@ -498,7 +497,6 @@ mod test { >= rprev.instance_state.time_updated ); assert!(rnext.vmm_state.time_updated >= rprev.vmm_state.time_updated); - assert_eq!(rnext.instance_state.fallback_state, InstanceState::Stopped); assert!(rnext.instance_state.propolis_id.is_none()); assert_eq!(rnext.vmm_state.state, InstanceState::Destroyed); assert!(rx.try_next().is_err()); @@ -624,7 +622,6 @@ mod test { assert_eq!(rprev.vmm_state.state, InstanceState::Stopping); assert_eq!(rnext.vmm_state.state, InstanceState::Destroyed); assert!(rnext.instance_state.gen > rprev.instance_state.gen); - assert_eq!(rnext.instance_state.fallback_state, InstanceState::Stopped); logctx.cleanup_successful(); }