Skip to content

Commit

Permalink
make Nexus sole owner of Instance records' state values
Browse files Browse the repository at this point in the history
Sled agent has one path that updates the `state` column in an instance record
(the `fallback_state` field in an `InstanceRuntimeState`): if an active Propolis
is retired (i.e. the instance's Propolis ID goes from `Some` to `None`), the
instance goes to the Stopped state.

This rule is so simple that it can be implemented entirely in Nexus, giving
Nexus sole ownership of this table column:

- Remove the `fallback_state` field from `api::internal::InstanceRuntimeState`.
  This completely hides the field from the sled agent.
- When Nexus processes a new `SledInstanceState`, check the
  `active_propolis_id`: if it's `Some`, the state is Running; if it's `None`,
  the state is Stopped.

The only other states that an instance can now have are Creating and Destroyed,
both of which are set only by Nexus and only when it is known that no sled agent
refers to the instance (either by definition when an instance is being created,
or because the instance has no active VMM when it's being destroyed).

Rename the `fallback_state` field to `nexus_state` to reflect its new ownership.
  • Loading branch information
gjcolombo committed Oct 7, 2023
1 parent 78f72e5 commit d2a0636
Show file tree
Hide file tree
Showing 16 changed files with 35 additions and 68 deletions.
1 change: 0 additions & 1 deletion clients/nexus-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,6 @@ impl From<omicron_common::api::internal::nexus::InstanceRuntimeState>
) -> Self {
Self {
dst_propolis_id: s.dst_propolis_id,
fallback_state: s.fallback_state.into(),
gen: s.gen.into(),
migration_id: s.migration_id,
propolis_id: s.propolis_id,
Expand Down
2 changes: 0 additions & 2 deletions clients/sled-agent-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ impl From<omicron_common::api::internal::nexus::InstanceRuntimeState>
s: omicron_common::api::internal::nexus::InstanceRuntimeState,
) -> Self {
Self {
fallback_state: s.fallback_state.into(),
propolis_id: s.propolis_id,
dst_propolis_id: s.dst_propolis_id,
migration_id: s.migration_id,
Expand Down Expand Up @@ -79,7 +78,6 @@ impl From<types::InstanceRuntimeState>
{
fn from(s: types::InstanceRuntimeState) -> Self {
Self {
fallback_state: s.fallback_state.into(),
propolis_id: s.propolis_id,
dst_propolis_id: s.dst_propolis_id,
migration_id: s.migration_id,
Expand Down
2 changes: 0 additions & 2 deletions common/src/api/internal/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@ pub struct InstanceProperties {
/// no active VMM.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
pub struct InstanceRuntimeState {
/// The state of the instance if it has no active VMM.
pub fallback_state: InstanceState,
/// The instance's currently active VMM ID.
pub propolis_id: Option<Uuid>,
/// If a migration is active, the ID of the target VMM.
Expand Down
13 changes: 9 additions & 4 deletions nexus/db-model/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ pub struct InstanceRuntimeState {
///
/// This field is guarded by the instance's `gen` field.
#[diesel(column_name = state)]
pub fallback_state: InstanceState,
pub nexus_state: InstanceState,

/// The time at which the runtime state was last updated. This is distinct
/// from the time the record was last modified, because some updates don't
Expand Down Expand Up @@ -166,7 +166,7 @@ pub struct InstanceRuntimeState {
impl InstanceRuntimeState {
fn new(initial_state: InstanceState, creation_time: DateTime<Utc>) -> Self {
Self {
fallback_state: initial_state,
nexus_state: initial_state,
time_updated: creation_time,
propolis_id: None,
dst_propolis_id: None,
Expand All @@ -182,8 +182,14 @@ impl From<omicron_common::api::internal::nexus::InstanceRuntimeState>
fn from(
state: omicron_common::api::internal::nexus::InstanceRuntimeState,
) -> Self {
let nexus_state = if state.propolis_id.is_some() {
omicron_common::api::external::InstanceState::Running
} else {
omicron_common::api::external::InstanceState::Stopped
};

Self {
fallback_state: InstanceState::new(state.fallback_state),
nexus_state: InstanceState::new(nexus_state),
time_updated: state.time_updated,
gen: state.gen.into(),
propolis_id: state.propolis_id,
Expand All @@ -199,7 +205,6 @@ impl From<InstanceRuntimeState>
fn from(state: InstanceRuntimeState) -> Self {
Self {
dst_propolis_id: state.dst_propolis_id,
fallback_state: state.fallback_state.into(),
gen: state.gen.into(),
migration_id: state.migration_id,
propolis_id: state.propolis_id,
Expand Down
8 changes: 4 additions & 4 deletions nexus/db-queries/src/db/datastore/disk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ impl DataStore {
)
);
}
match collection.runtime_state.fallback_state.state() {
match collection.runtime_state.nexus_state.state() {
// Ok-to-be-attached instance states:
api::external::InstanceState::Creating |
api::external::InstanceState::Stopped => {
Expand All @@ -264,7 +264,7 @@ impl DataStore {
_ => {
Err(Error::invalid_request(&format!(
"cannot attach disk to instance in {} state",
collection.runtime_state.fallback_state.state(),
collection.runtime_state.nexus_state.state(),
)))
}
}
Expand Down Expand Up @@ -381,7 +381,7 @@ impl DataStore {
)
);
}
match collection.runtime_state.fallback_state.state() {
match collection.runtime_state.nexus_state.state() {
// Ok-to-be-detached instance states:
api::external::InstanceState::Creating |
api::external::InstanceState::Stopped => {
Expand All @@ -395,7 +395,7 @@ impl DataStore {
_ => {
Err(Error::invalid_request(&format!(
"cannot detach disk from instance in {} state",
collection.runtime_state.fallback_state.state(),
collection.runtime_state.nexus_state.state(),
)))
}
}
Expand Down
10 changes: 5 additions & 5 deletions nexus/db-queries/src/db/datastore/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ impl InstanceAndActiveVmm {
if let Some(vmm) = &self.vmm {
vmm.runtime.state.0
} else {
self.instance.runtime().fallback_state.0
self.instance.runtime().nexus_state.0
}
}
}
Expand All @@ -85,7 +85,7 @@ impl From<InstanceAndActiveVmm> for omicron_common::api::external::Instance {
(vmm.runtime.state, vmm.runtime.time_state_updated)
} else {
(
value.instance.runtime_state.fallback_state.clone(),
value.instance.runtime_state.nexus_state.clone(),
value.instance.runtime_state.time_updated,
)
};
Expand Down Expand Up @@ -161,10 +161,10 @@ impl DataStore {
})?;

bail_unless!(
instance.runtime().fallback_state.state()
instance.runtime().nexus_state.state()
== &api::external::InstanceState::Creating,
"newly-created Instance has unexpected state: {:?}",
instance.runtime().fallback_state
instance.runtime().nexus_state
);
bail_unless!(
instance.runtime().gen == gen,
Expand Down Expand Up @@ -435,7 +435,7 @@ impl DataStore {
);
}
let instance_state =
collection.runtime_state.fallback_state.state();
collection.runtime_state.nexus_state.state();
match instance_state {
api::external::InstanceState::Stopped
| api::external::InstanceState::Failed => {
Expand Down
4 changes: 2 additions & 2 deletions nexus/db-queries/src/db/datastore/network_interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,7 @@ impl DataStore {
let instance_runtime =
instance_query.get_result_async(&conn).await?.runtime_state;
if instance_runtime.propolis_id.is_some()
|| instance_runtime.fallback_state != stopped
|| instance_runtime.nexus_state != stopped
{
return Err(TxnError::CustomError(
NetworkInterfaceUpdateError::InstanceNotStopped,
Expand Down Expand Up @@ -516,7 +516,7 @@ impl DataStore {
let instance_state =
instance_query.get_result_async(&conn).await?.runtime_state;
if instance_state.propolis_id.is_some()
|| instance_state.fallback_state != stopped
|| instance_state.nexus_state != stopped
{
return Err(TxnError::CustomError(
NetworkInterfaceUpdateError::InstanceNotStopped,
Expand Down
2 changes: 1 addition & 1 deletion nexus/db-queries/src/db/queries/network_interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1789,7 +1789,7 @@ mod tests {
state: external::InstanceState,
) -> Instance {
let new_runtime = model::InstanceRuntimeState {
fallback_state: model::InstanceState::new(state),
nexus_state: model::InstanceState::new(state),
gen: instance.runtime_state.gen.next().into(),
..instance.runtime_state.clone()
};
Expand Down
6 changes: 3 additions & 3 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -661,7 +661,7 @@ impl super::Nexus {
let effective_state = if let Some(vmm) = vmm_state {
vmm.runtime.state.0
} else {
instance_state.runtime().fallback_state.0
instance_state.runtime().nexus_state.0
};

// Requests that operate on active instances have to be directed to the
Expand Down Expand Up @@ -1118,7 +1118,7 @@ impl super::Nexus {
// that the VMM is gone, however.)
_ => {
let new_runtime = db::model::InstanceRuntimeState {
fallback_state: db::model::InstanceState::new(
nexus_state: db::model::InstanceState::new(
InstanceState::Failed,
),

Expand Down Expand Up @@ -1543,7 +1543,7 @@ impl super::Nexus {
internal_message: format!(
"instance is in state {:?} and has no active serial console \
server",
instance.runtime().fallback_state
instance.runtime().nexus_state
)
})
}
Expand Down
4 changes: 2 additions & 2 deletions nexus/src/app/sagas/instance_create.rs
Original file line number Diff line number Diff line change
Expand Up @@ -851,7 +851,7 @@ async fn sic_delete_instance_record(
};

let runtime_state = db::model::InstanceRuntimeState {
fallback_state: db::model::InstanceState::new(InstanceState::Failed),
nexus_state: db::model::InstanceState::new(InstanceState::Failed),
// Must update the generation, or the database query will fail.
//
// The runtime state of the instance record is only changed as a result
Expand Down Expand Up @@ -892,7 +892,7 @@ async fn sic_move_to_stopped(
// update will (correctly) be ignored because its generation number is out
// of date.
let new_state = db::model::InstanceRuntimeState {
fallback_state: db::model::InstanceState::new(InstanceState::Stopped),
nexus_state: db::model::InstanceState::new(InstanceState::Stopped),
gen: db::model::Generation::from(
instance_record.runtime_state.gen.next(),
),
Expand Down
2 changes: 1 addition & 1 deletion nexus/src/app/sagas/instance_migrate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -727,7 +727,7 @@ mod tests {
let new_instance = new_state.instance();
let new_vmm = new_state.vmm().as_ref();
assert_eq!(
new_instance.runtime().fallback_state.0,
new_instance.runtime().nexus_state.0,
omicron_common::api::external::InstanceState::Stopped
);
assert!(new_instance.runtime().propolis_id.is_none());
Expand Down
6 changes: 3 additions & 3 deletions nexus/src/app/sagas/instance_start.rs
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ async fn sis_move_to_starting(
// be running before Propolis thinks it has started.)
None => {
let new_runtime = db::model::InstanceRuntimeState {
fallback_state: db::model::InstanceState::new(
nexus_state: db::model::InstanceState::new(
InstanceState::Running,
),
propolis_id: Some(propolis_id),
Expand Down Expand Up @@ -286,7 +286,7 @@ async fn sis_move_to_starting_undo(
"instance_id" => %instance_id);

let new_runtime = db::model::InstanceRuntimeState {
fallback_state: db::model::InstanceState::new(InstanceState::Stopped),
nexus_state: db::model::InstanceState::new(InstanceState::Stopped),
propolis_id: None,
gen: db_instance.runtime_state.gen.next().into(),
..db_instance.runtime_state
Expand Down Expand Up @@ -703,7 +703,7 @@ mod test {

assert!(new_db_instance.runtime().propolis_id.is_none());
assert_eq!(
new_db_instance.runtime().fallback_state.0,
new_db_instance.runtime().nexus_state.0,
InstanceState::Stopped
);
}
Expand Down
9 changes: 0 additions & 9 deletions openapi/nexus-internal.json
Original file line number Diff line number Diff line change
Expand Up @@ -3547,14 +3547,6 @@
"type": "string",
"format": "uuid"
},
"fallback_state": {
"description": "The state of the instance if it has no active VMM.",
"allOf": [
{
"$ref": "#/components/schemas/InstanceState"
}
]
},
"gen": {
"description": "Generation number for this state.",
"allOf": [
Expand Down Expand Up @@ -3582,7 +3574,6 @@
}
},
"required": [
"fallback_state",
"gen",
"time_updated"
]
Expand Down
9 changes: 0 additions & 9 deletions openapi/sled-agent.json
Original file line number Diff line number Diff line change
Expand Up @@ -1889,14 +1889,6 @@
"type": "string",
"format": "uuid"
},
"fallback_state": {
"description": "The state of the instance if it has no active VMM.",
"allOf": [
{
"$ref": "#/components/schemas/InstanceState"
}
]
},
"gen": {
"description": "Generation number for this state.",
"allOf": [
Expand Down Expand Up @@ -1924,7 +1916,6 @@
}
},
"required": [
"fallback_state",
"gen",
"time_updated"
]
Expand Down
20 changes: 4 additions & 16 deletions sled-agent/src/common/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ impl From<PropolisInstanceState> for ApiInstanceState {
// Nexus needs to learn when a VM has entered the "destroyed" state
// so that it can release its resource reservation. When this
// happens, this module also clears the active VMM ID from the
// instance record (and updates the instance's fallback state
// accordingly), so this state won't be used as an
// externally-visible instance state.
// instance record, which will accordingly set the Nexus-owned
// instance state to Stopped, preventing this state from being used
// as an externally-visible instance state.
State::Destroyed => ApiInstanceState::Destroyed,
}
}
Expand Down Expand Up @@ -358,12 +358,6 @@ impl InstanceStates {
fn retire_active_propolis(&mut self, now: DateTime<Utc>) {
assert!(self.propolis_role() == PropolisRole::Active);

self.instance.fallback_state = match self.vmm.state {
ApiInstanceState::Destroyed => ApiInstanceState::Stopped,
ApiInstanceState::Failed => ApiInstanceState::Failed,
_ => unreachable!("Can only retire active Propolis if VMM exited"),
};

self.instance.propolis_id = None;
self.instance.gen = self.instance.gen.next();
self.instance.time_updated = now;
Expand Down Expand Up @@ -523,7 +517,6 @@ mod test {
let propolis_id = Uuid::new_v4();
let now = Utc::now();
let instance = InstanceRuntimeState {
fallback_state: ApiInstanceState::Starting,
propolis_id: Some(propolis_id),
dst_propolis_id: None,
migration_id: None,
Expand Down Expand Up @@ -585,10 +578,6 @@ mod test {
// sleds that expect their own attempts to advance the generation number
// to cause new state to be recorded.
if prev.instance.gen == next.instance.gen {
assert_eq!(
prev.instance.fallback_state,
next.instance.fallback_state
);
assert_eq!(prev.instance.propolis_id, next.instance.propolis_id);
assert_eq!(
prev.instance.dst_propolis_id,
Expand All @@ -597,8 +586,7 @@ mod test {
assert_eq!(prev.instance.migration_id, next.instance.migration_id);
} else {
assert!(
(prev.instance.fallback_state != next.instance.fallback_state)
|| (prev.instance.propolis_id != next.instance.propolis_id)
(prev.instance.propolis_id != next.instance.propolis_id)
|| (prev.instance.dst_propolis_id
!= next.instance.dst_propolis_id)
|| (prev.instance.migration_id
Expand Down
5 changes: 1 addition & 4 deletions sled-agent/src/sim/collection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,6 @@ mod test {
) -> (SimObject<SimInstance>, Receiver<()>) {
let propolis_id = Uuid::new_v4();
let instance_vmm = InstanceRuntimeState {
fallback_state: InstanceState::Creating,
propolis_id: Some(propolis_id),
dst_propolis_id: None,
migration_id: None,
Expand Down Expand Up @@ -484,7 +483,7 @@ mod test {
assert!(rx.try_next().is_err());

// Stopping an instance that was never started synchronously destroys
// its VMM and sets the fallback state to Stopped.
// its VMM.
let rprev = r1;
let dropped =
instance.transition(InstanceStateRequested::Stopped).unwrap();
Expand All @@ -498,7 +497,6 @@ mod test {
>= rprev.instance_state.time_updated
);
assert!(rnext.vmm_state.time_updated >= rprev.vmm_state.time_updated);
assert_eq!(rnext.instance_state.fallback_state, InstanceState::Stopped);
assert!(rnext.instance_state.propolis_id.is_none());
assert_eq!(rnext.vmm_state.state, InstanceState::Destroyed);
assert!(rx.try_next().is_err());
Expand Down Expand Up @@ -624,7 +622,6 @@ mod test {
assert_eq!(rprev.vmm_state.state, InstanceState::Stopping);
assert_eq!(rnext.vmm_state.state, InstanceState::Destroyed);
assert!(rnext.instance_state.gen > rprev.instance_state.gen);
assert_eq!(rnext.instance_state.fallback_state, InstanceState::Stopped);
logctx.cleanup_successful();
}

Expand Down

0 comments on commit d2a0636

Please sign in to comment.