Skip to content

Commit

Permalink
Handle Failed active VMMs in instance-update saga
Browse files Browse the repository at this point in the history
  • Loading branch information
hawkw committed Aug 26, 2024
1 parent 6d2eda8 commit 75cad74
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 11 deletions.
11 changes: 11 additions & 0 deletions common/src/api/internal/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,17 @@ pub enum VmmState {
Destroyed,
}

impl VmmState {
/// States in which the VMM no longer exists and must be cleaned up.
pub const TERMINAL_STATES: &'static [Self] =
&[Self::Failed, Self::Destroyed];

/// Returns `true` if this VMM is in a terminal state.
pub fn is_terminal(&self) -> bool {
Self::TERMINAL_STATES.contains(self)
}
}

/// The dynamic runtime properties of an individual VMM process.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
pub struct VmmRuntimeState {
Expand Down
30 changes: 27 additions & 3 deletions nexus/db-model/src/vmm_state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use super::impl_enum_type;
use omicron_common::api::internal::nexus::VmmState as ApiState;
use serde::Deserialize;
use serde::Serialize;
use std::fmt;
Expand Down Expand Up @@ -46,6 +47,13 @@ impl VmmState {
/// it as deleted.
pub const DESTROYABLE_STATES: &'static [Self] =
&[Self::Destroyed, Self::SagaUnwound];

pub const TERMINAL_STATES: &'static [Self] =
&[Self::Destroyed, Self::Failed];

pub fn is_terminal(&self) -> bool {
Self::TERMINAL_STATES.contains(self)
}
}

impl fmt::Display for VmmState {
Expand Down Expand Up @@ -86,9 +94,8 @@ impl From<VmmState> for sled_agent_client::types::VmmState {
}
}

impl From<omicron_common::api::internal::nexus::VmmState> for VmmState {
fn from(value: omicron_common::api::internal::nexus::VmmState) -> Self {
use omicron_common::api::internal::nexus::VmmState as ApiState;
impl From<ApiState> for VmmState {
fn from(value: ApiState) -> Self {
use VmmState as Output;
match value {
ApiState::Starting => Output::Starting,
Expand Down Expand Up @@ -129,3 +136,20 @@ impl diesel::query_builder::QueryId for VmmStateEnum {
type QueryId = ();
const HAS_STATIC_QUERY_ID: bool = false;
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_all_terminal_api_states_are_terminal_db_states() {
for &api_state in ApiState::TERMINAL_STATES {
let db_state = VmmState::from(api_state);
assert!(
db_state.is_terminal(),
"API VmmState::{api_state:?} is considered terminal, but its \
corresponding DB state ({db_state:?}) is not!"
);
}
}
}
29 changes: 21 additions & 8 deletions nexus/src/app/sagas/instance_update/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -413,12 +413,12 @@ pub fn update_saga_needed(
) -> bool {
// Currently, an instance-update saga is required if (and only if):
//
// - The instance's active VMM has transitioned to `Destroyed`. We don't
// - The instance's active VMM has transitioned to `Destroyed` or `Failed`. We don't
// actually know whether the VMM whose state was updated here was the
// active VMM or not, so we will always attempt to run an instance-update
// saga if the VMM was `Destroyed`.
let vmm_needs_update = result.vmm_updated
&& state.vmm_state.state == nexus::VmmState::Destroyed;
// saga if the VMM was `Destroyed` (or `Failed`).
let vmm_needs_update =
result.vmm_updated && state.vmm_state.state.is_terminal();
// - A migration in to this VMM has transitioned to a terminal state
// (`Failed` or `Completed`).
let migrations = state.migrations();
Expand Down Expand Up @@ -514,20 +514,27 @@ impl UpdatesRequired {
let instance_id = snapshot.instance.id();

let mut update_required = false;
let mut active_vmm_failed = false;
let mut network_config = None;

// Has the active VMM been destroyed?
let destroy_active_vmm =
snapshot.active_vmm.as_ref().and_then(|active_vmm| {
if active_vmm.runtime.state == VmmState::Destroyed {
if active_vmm.runtime.state.is_terminal() {
let id = PropolisUuid::from_untyped_uuid(active_vmm.id);
// Unlink the active VMM ID. If the active VMM was destroyed
// because a migration out completed, the next block, which
// handles migration updates, will set this to the new VMM's ID,
// instead.
new_runtime.propolis_id = None;
update_required = true;
Some(id)

// If the active VMM's state is `Failed`, move the
// instance's new state to `Failed` rather than to `NoVmm`.
if active_vmm.runtime.state == VmmState::Failed {
active_vmm_failed = true;
}
Some((id, active_vmm.runtime.state))
} else {
None
}
Expand All @@ -536,7 +543,9 @@ impl UpdatesRequired {
// Okay, what about the target?
let destroy_target_vmm =
snapshot.target_vmm.as_ref().and_then(|target_vmm| {
if target_vmm.runtime.state == VmmState::Destroyed {
// XXX(eliza): AFAIK, target VMMs don't go to `Failed` until
// they become active, but...IDK. double-check that.
if target_vmm.runtime.state.is_terminal() {
// Unlink the target VMM ID.
new_runtime.dst_propolis_id = None;
update_required = true;
Expand Down Expand Up @@ -672,7 +681,11 @@ impl UpdatesRequired {
// was any actual state change.

// We no longer have a VMM.
new_runtime.nexus_state = InstanceState::NoVmm;
new_runtime.nexus_state = if active_vmm_failed {
InstanceState::Failed
} else {
InstanceState::NoVmm
};
// If the active VMM was destroyed and the instance has not migrated
// out of it, we must delete the instance's network configuration.
//
Expand Down

0 comments on commit 75cad74

Please sign in to comment.