From 637b7802d7e971234a69930ffa40fad618138c2d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 2 Sep 2024 11:49:28 -0700 Subject: [PATCH] make "sled failures only" the default policy --- nexus/db-model/src/instance.rs | 2 +- nexus/db-model/src/instance_auto_restart.rs | 12 +- .../tasks/instance_reincarnation.rs | 468 ++++++++++++++++++ schema/crdb/dbinit.sql | 7 + .../README.adoc | 8 +- .../up01.sql | 7 + .../up03.sql | 2 +- 7 files changed, 498 insertions(+), 8 deletions(-) create mode 100644 nexus/src/app/background/tasks/instance_reincarnation.rs diff --git a/nexus/db-model/src/instance.rs b/nexus/db-model/src/instance.rs index fc6678369c1..512500d15d4 100644 --- a/nexus/db-model/src/instance.rs +++ b/nexus/db-model/src/instance.rs @@ -109,7 +109,7 @@ impl Instance { ncpus: params.ncpus.into(), memory: params.memory.into(), hostname: params.hostname.to_string(), - auto_restart_policy: InstanceAutoRestart::Never, + auto_restart_policy: InstanceAutoRestart::default(), runtime_state, updater_gen: Generation::new(), diff --git a/nexus/db-model/src/instance_auto_restart.rs b/nexus/db-model/src/instance_auto_restart.rs index 1291a1e527c..15014455953 100644 --- a/nexus/db-model/src/instance_auto_restart.rs +++ b/nexus/db-model/src/instance_auto_restart.rs @@ -18,18 +18,26 @@ impl_enum_type!( // Enum values Never => b"never" + SledFailuresOnly => b"sled_failures_only" AllFailures => b"all_failures" ); impl InstanceAutoRestart { pub fn label(&self) -> &'static str { match self { - InstanceAutoRestart::Never => "never", - InstanceAutoRestart::AllFailures => "all_failures", + Self::Never => "never", + Self::SledFailuresOnly => "sled_failures_only", + Self::AllFailures => "all_failures", } } } +impl Default for InstanceAutoRestart { + fn default() -> Self { + Self::SledFailuresOnly + } +} + impl fmt::Display for InstanceAutoRestart { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}", self.label()) diff --git a/nexus/src/app/background/tasks/instance_reincarnation.rs b/nexus/src/app/background/tasks/instance_reincarnation.rs new file mode 100644 index 00000000000..05e0a22e1af --- /dev/null +++ b/nexus/src/app/background/tasks/instance_reincarnation.rs @@ -0,0 +1,468 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background task for automatically restarting failed instances. + +use crate::app::background::BackgroundTask; +use crate::app::saga::StartSaga; +use crate::app::sagas::instance_start; +use crate::app::sagas::NexusSaga; +use futures::future::BoxFuture; +use nexus_db_queries::authn; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::pagination::Paginator; +use nexus_db_queries::db::DataStore; +use nexus_types::identity::Resource; +use nexus_types::internal_api::background::InstanceReincarnationStatus; +use omicron_common::api::external::Error; +use std::num::NonZeroU32; +use std::sync::Arc; +use tokio::task::JoinSet; + +pub struct InstanceReincarnation { + datastore: Arc, + sagas: Arc, +} + +const BATCH_SIZE: NonZeroU32 = unsafe { + // Safety: last time I checked, 100 was greater than zero. + NonZeroU32::new_unchecked(100) +}; + +impl BackgroundTask for InstanceReincarnation { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + Box::pin(async move { + let mut status = InstanceReincarnationStatus::default(); + self.actually_activate(opctx, &mut status).await; + if !status.restart_errors.is_empty() || status.query_error.is_some() + { + error!( + &opctx.log, + "instance reincarnation completed with errors!"; + "instances_found" => status.instances_found, + "instances_reincarnated" => status.instances_reincarnated.len(), + "already_reincarnated" => status.already_reincarnated.len(), + "query_error" => ?status.query_error, + "restart_errors" => status.restart_errors.len(), + ); + } else if !status.instances_reincarnated.is_empty() { + info!( + &opctx.log, + "instance reincarnation completed"; + "instances_found" => status.instances_found, + "instances_reincarnated" => status.instances_reincarnated.len(), + "already_reincarnated" => status.already_reincarnated.len(), + ); + } else { + debug!( + &opctx.log, + "instance reincarnation completed; no instances \ + in need of reincarnation"; + "instances_found" => status.instances_found, + "already_reincarnated" => status.already_reincarnated.len(), + ); + }; + serde_json::json!(status) + }) + } +} + +impl InstanceReincarnation { + pub(crate) fn new( + datastore: Arc, + sagas: Arc, + ) -> Self { + Self { datastore, sagas } + } + + async fn actually_activate( + &mut self, + opctx: &OpContext, + status: &mut InstanceReincarnationStatus, + ) { + let mut tasks = JoinSet::new(); + let mut paginator = Paginator::new(BATCH_SIZE); + + while let Some(p) = paginator.next() { + let maybe_batch = self + .datastore + .find_reincarnatable_instances(opctx, &p.current_pagparams()) + .await; + let batch = match maybe_batch { + Ok(batch) => batch, + Err(error) => { + error!( + opctx.log, + "failed to list instances in need of reincarnation"; + "error" => &error, + ); + status.query_error = Some(error.to_string()); + break; + } + }; + + paginator = p.found_batch(&batch, &|instance| instance.id()); + + let found = batch.len(); + if found == 0 { + debug!( + opctx.log, + "no more instances in need of reincarnation"; + "total_found" => status.instances_found, + ); + break; + } + + let prev_sagas_started = tasks.len(); + status.instances_found += found; + + let serialized_authn = authn::saga::Serialized::for_opctx(opctx); + for db_instance in batch { + let instance_id = db_instance.id(); + let prepared_saga = instance_start::SagaInstanceStart::prepare( + &instance_start::Params { + db_instance, + serialized_authn: serialized_authn.clone(), + }, + ); + match prepared_saga { + Ok(saga) => { + let start_saga = self.sagas.clone(); + tasks.spawn(async move { + start_saga + .saga_start(saga) + .await + .map_err(|e| (instance_id, e))?; + Ok(instance_id) + }); + } + Err(error) => { + const ERR_MSG: &'static str = + "failed to prepare instance-start saga"; + error!( + opctx.log, + "{ERR_MSG} for {instance_id}"; + "instance_id" => %instance_id, + "error" => %error, + ); + status + .restart_errors + .push((instance_id, format!("{ERR_MSG}: {error}"))) + } + }; + } + + let total_sagas_started = tasks.len(); + debug!( + opctx.log, + "found instance in need of reincarnation"; + "instances_found" => found, + "total_found" => status.instances_found, + "sagas_started" => total_sagas_started - prev_sagas_started, + "total_sagas_started" => total_sagas_started, + ); + } + + // All sagas started, wait for them to come back... + while let Some(saga_result) = tasks.join_next().await { + match saga_result { + // Start saga completed successfully + Ok(Ok(instance_id)) => { + debug!( + opctx.log, + "welcome back to the realm of the living, {instance_id}!"; + "instance_id" => %instance_id, + ); + status.instances_reincarnated.push(instance_id); + } + // The instance was restarted by another saga, that's fine... + Ok(Err((instance_id, Error::Conflict { message }))) + if message.external_message() + == instance_start::ALREADY_STARTING_ERROR => + { + debug!( + opctx.log, + "instance {instance_id} was already reincarnated"; + "instance_id" => %instance_id, + ); + status.already_reincarnated.push(instance_id); + } + // Start saga failed + Ok(Err((instance_id, error))) => { + const ERR_MSG: &'static str = "instance-start saga failed"; + warn!(opctx.log, + "{ERR_MSG}"; + "instance_id" => %instance_id, + "error" => %error, + ); + status + .restart_errors + .push((instance_id, format!("{ERR_MSG}: {error}"))); + } + Err(e) => { + const JOIN_ERR_MSG: &'static str = + "tasks spawned on the JoinSet should never return a \ + JoinError, as nexus is compiled with panic=\"abort\", \ + and we never cancel them..."; + error!(opctx.log, "{JOIN_ERR_MSG}"; "error" => %e); + if cfg!(debug_assertions) { + unreachable!("{JOIN_ERR_MSG} but, I saw {e}!",) + } + } + } + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::app::background::init::test::NoopStartSaga; + use crate::external_api::params; + use chrono::Utc; + use nexus_db_model::Instance; + use nexus_db_model::InstanceAutoRestart; + use nexus_db_model::InstanceRuntimeState; + use nexus_db_model::InstanceState; + use nexus_db_queries::authz; + use nexus_db_queries::db::lookup::LookupPath; + use nexus_test_utils::resource_helpers::{ + create_default_ip_pool, create_project, + }; + use nexus_test_utils_macros::nexus_test; + use omicron_common::api::external::ByteCount; + use omicron_common::api::external::IdentityMetadataCreateParams; + use omicron_uuid_kinds::GenericUuid; + use omicron_uuid_kinds::InstanceUuid; + use uuid::Uuid; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + const PROJECT_NAME: &str = "reincarnation-station"; + + async fn setup_test_project( + cptestctx: &ControlPlaneTestContext, + opctx: &OpContext, + ) -> authz::Project { + create_default_ip_pool(&cptestctx.external_client).await; + let project = + create_project(&cptestctx.external_client, PROJECT_NAME).await; + + let datastore = cptestctx.server.server_context().nexus.datastore(); + let (_, authz_project) = LookupPath::new(opctx, datastore) + .project_id(project.identity.id) + .lookup_for(authz::Action::CreateChild) + .await + .expect("project must exist"); + authz_project + } + + async fn create_instance( + cptestctx: &ControlPlaneTestContext, + opctx: &OpContext, + authz_project: &authz::Project, + restart_policy: InstanceAutoRestart, + state: InstanceState, + ) -> InstanceUuid { + let id = InstanceUuid::from_untyped_uuid(Uuid::new_v4()); + // Use the first chunk of the UUID as the name, to avoid conflicts. + // Start with a lower ascii character to satisfy the name constraints. + let name = format!("instance-{id}").parse().unwrap(); + let mut instance = Instance::new( + id, + authz_project.id(), + ¶ms::InstanceCreate { + identity: IdentityMetadataCreateParams { + name, + description: "It's an instance".into(), + }, + ncpus: 2i64.try_into().unwrap(), + memory: ByteCount::from_gibibytes_u32(16), + hostname: "myhostname".try_into().unwrap(), + user_data: Vec::new(), + network_interfaces: + params::InstanceNetworkInterfaceAttachment::None, + external_ips: Vec::new(), + disks: Vec::new(), + ssh_public_keys: None, + start: false, + }, + ); + instance.auto_restart_policy = restart_policy; + let datastore = cptestctx.server.server_context().nexus.datastore(); + + let instance = datastore + .project_create_instance(opctx, authz_project, instance) + .await + .expect("test instance should be created successfully"); + let prev_state = instance.runtime_state; + datastore + .instance_update_runtime( + &id, + &InstanceRuntimeState { + time_updated: Utc::now(), + nexus_state: state, + r#gen: nexus_db_model::Generation(prev_state.r#gen.next()), + ..prev_state + }, + ) + .await + .expect("instance runtime state should update"); + eprintln!("instance {id}: policy={restart_policy:?}; state={state:?}"); + id + } + + #[nexus_test(server = crate::Server)] + async fn test_reincarnates_failed_instances( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let authz_project = setup_test_project(&cptestctx, &opctx).await; + + let starter = Arc::new(NoopStartSaga::new()); + let mut task = + InstanceReincarnation::new(datastore.clone(), starter.clone()); + + // Noop test + let result = task.activate(&opctx).await; + assert_eq!( + result, + serde_json::json!(InstanceReincarnationStatus::default()) + ); + assert_eq!(starter.count_reset(), 0); + + // Create an instance in the `Failed` state that's eligible to be + // restarted. + let instance_id = create_instance( + &cptestctx, + &opctx, + &authz_project, + InstanceAutoRestart::AllFailures, + InstanceState::Failed, + ) + .await; + + // Activate the task again, and check that our instance had an + // instance-start saga started. + let result = task.activate(&opctx).await; + let status = + serde_json::from_value::(result) + .expect("JSON must be correctly shaped"); + eprintln!("activation: {status:#?}"); + + assert_eq!(starter.count_reset(), 1); + assert_eq!(status.instances_found, 1); + assert_eq!( + status.instances_reincarnated, + vec![instance_id.into_untyped_uuid()] + ); + assert_eq!(status.already_reincarnated, Vec::new()); + assert_eq!(status.query_error, None); + assert_eq!(status.restart_errors, Vec::new()); + } + + #[nexus_test(server = crate::Server)] + async fn test_only_reincarnates_eligible_instances( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let authz_project = setup_test_project(&cptestctx, &opctx).await; + + let starter = Arc::new(NoopStartSaga::new()); + let mut task = + InstanceReincarnation::new(datastore.clone(), starter.clone()); + + // Create instances in the `Failed` state that are eligible to be + // restarted. + let mut will_reincarnate = std::collections::BTreeSet::new(); + for _ in 0..3 { + let id = create_instance( + &cptestctx, + &opctx, + &authz_project, + InstanceAutoRestart::AllFailures, + InstanceState::Failed, + ) + .await; + will_reincarnate.insert(id.into_untyped_uuid()); + } + + // Create some instances that will not reicnarnate. + let mut will_not_reincarnate = std::collections::BTreeSet::new(); + // Some instances which are `Failed`` but don't have policies permitting + // them to be reincarnated. + for _ in 0..3 { + let id = create_instance( + &cptestctx, + &opctx, + &authz_project, + InstanceAutoRestart::Never, + InstanceState::Failed, + ) + .await; + will_not_reincarnate.insert(id.into_untyped_uuid()); + } + + // Some instances with policies permitting them to be reincarnated, but + // which are not `Failed`. + for _ in 0..3 { + let id = create_instance( + &cptestctx, + &opctx, + &authz_project, + InstanceAutoRestart::Never, + InstanceState::NoVmm, + ) + .await; + will_not_reincarnate.insert(id.into_untyped_uuid()); + } + + // Activate the task again, and check that our instance had an + // instance-start saga started. + let result = task.activate(&opctx).await; + let status = + serde_json::from_value::(result) + .expect("JSON must be correctly shaped"); + eprintln!("activation: {status:#?}"); + + assert_eq!(starter.count_reset(), will_reincarnate.len() as u64); + assert_eq!(status.instances_found, will_reincarnate.len()); + assert_eq!(status.already_reincarnated, Vec::new()); + assert_eq!(status.query_error, None); + assert_eq!(status.restart_errors, Vec::new()); + + for id in &status.instances_reincarnated { + eprintln!("instance {id} reincarnated"); + assert!( + !will_not_reincarnate.contains(id), + "expected {id} not to reincarnate! reincarnated: {:?}", + status.instances_reincarnated + ); + } + + for id in will_reincarnate { + assert!( + status.instances_reincarnated.contains(&id), + "expected {id} to have reincarnated! reincarnated: {:?}", + status.instances_reincarnated + ) + } + } +} diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index c962ffda601..212346c8ebf 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1026,6 +1026,13 @@ CREATE TYPE IF NOT EXISTS omicron.public.instance_auto_restart AS ENUM ( * rebooted by the control plane. */ 'never', + /* + * The instance should be automatically restarted if, and only if, the sled + * it was running on has restarted or become unavailable. If the individual + * Propolis VMM process for this instance crashes, it should *not* be + * restarted automatically. + */ + 'sled_failures_only', /* * The instance should be automatically restarted any time a fault is * detected diff --git a/schema/crdb/turn-boot-on-fault-into-auto-restart/README.adoc b/schema/crdb/turn-boot-on-fault-into-auto-restart/README.adoc index ce6b6e5429b..bb021c55448 100644 --- a/schema/crdb/turn-boot-on-fault-into-auto-restart/README.adoc +++ b/schema/crdb/turn-boot-on-fault-into-auto-restart/README.adoc @@ -3,9 +3,9 @@ This migration replaces the `omicron.public.instance.boot_on_fault` column, which is a `bool`, with a new `auto_restart_policy` column, which is an enum (`omicron.public.instance_auto_restart`). The new enum type will allow -auto-restart policies other than "always" and "never" to be added in the future. +auto-restart policies other than "always" and "never". Existing instance records are backfilled with the `all_failures` variant of -`instance_auto_restart` if `boot_on_fault` is `true`, or `never` if +`instance_auto_restart` if `boot_on_fault` is `true`, or `sled_failures_only` if `boot_on_fault` is `false`. The migration performs the following operations: @@ -14,8 +14,8 @@ The migration performs the following operations: 2. `up02.sql` adds a (nullable) `auto_restart_policy` column to the `instance` table. 3. `up03.sql` updates instance records by setting `auto_restart_policy` to - `all_failures` if `boot_on_fault` is `true`, or `never` if `boot_on_fault` is - `false`. + `all_failures` if `boot_on_fault` is `true`, or `sled_failures_only` if + `boot_on_fault` is `false`. 4. Now that all instance records have a value for `auto_restart_policy`, `up04.sql` makes the `auto_restart_policy` column non-null. 5. Finally, `up05.sql` drops the now-defunct `boot_on_fault` column. diff --git a/schema/crdb/turn-boot-on-fault-into-auto-restart/up01.sql b/schema/crdb/turn-boot-on-fault-into-auto-restart/up01.sql index ad1ca45a428..263fe81844b 100644 --- a/schema/crdb/turn-boot-on-fault-into-auto-restart/up01.sql +++ b/schema/crdb/turn-boot-on-fault-into-auto-restart/up01.sql @@ -4,6 +4,13 @@ CREATE TYPE IF NOT EXISTS omicron.public.instance_auto_restart AS ENUM ( * rebooted by the control plane. */ 'never', + /* + * The instance should be automatically restarted if, and only if, the sled + * it was running on has restarted or become unavailable. If the individual + * Propolis VMM process for this instance crashes, it should *not* be + * restarted automatically. + */ + 'sled_failures_only', /* * The instance should be automatically restarted any time a fault is * detected diff --git a/schema/crdb/turn-boot-on-fault-into-auto-restart/up03.sql b/schema/crdb/turn-boot-on-fault-into-auto-restart/up03.sql index 98a17781a40..c101e0d7151 100644 --- a/schema/crdb/turn-boot-on-fault-into-auto-restart/up03.sql +++ b/schema/crdb/turn-boot-on-fault-into-auto-restart/up03.sql @@ -1,5 +1,5 @@ SET LOCAL disallow_full_table_scans = off; UPDATE omicron.public.instance SET auto_restart_policy = CASE WHEN boot_on_fault = true THEN 'all_failures' - ELSE 'never' + ELSE 'sled_failures_only' END;