diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 49c78dae53b..9d8bf1ac9ba 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -568,6 +568,15 @@ pub struct InstanceUpdaterConfig { /// period (in seconds) for periodic activations of this background task #[serde_as(as = "DurationSeconds<u64>")] pub period_secs: Duration, + + /// disable background checks for instances in need of updates. + /// + /// This config is intended for use in testing, and should generally not be + /// enabled in real life. + /// + /// Default: Off + #[serde(default)] + pub disable: bool, } #[serde_as] @@ -859,6 +868,7 @@ mod test { region_replacement_driver.period_secs = 30 instance_watcher.period_secs = 30 instance_updater.period_secs = 30 + instance_updater.disable = false service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 @@ -1008,6 +1018,7 @@ mod test { }, instance_updater: InstanceUpdaterConfig { period_secs: Duration::from_secs(30), + disable: false, }, service_firewall_propagation: ServiceFirewallPropagationConfig { diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 385d95c317c..977067f8fae 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -638,11 +638,19 @@ impl BackgroundTasksInitializer { let updater = instance_updater::InstanceUpdater::new( datastore.clone(), sagas.clone(), + config.instance_updater.disable, ); + let period = if config.instance_updater.disable { + // If we're explicitly disabled by the config, don't waste + // energy activating the background task just to have it do nothing. 
+ std::time::Duration::MAX + } else { + config.instance_updater.period_secs + }; driver.register( TaskDefinition { name: "instance_updater", description: "detects if instances require update sagas and schedules them", - period: config.instance_updater.period_secs, + period, task_impl: Box::new(updater), opctx: opctx.child(BTreeMap::new()), watchers: vec![], diff --git a/nexus/src/app/background/tasks/instance_updater.rs b/nexus/src/app/background/tasks/instance_updater.rs index 183a12fe24d..46a3bead215 100644 --- a/nexus/src/app/background/tasks/instance_updater.rs +++ b/nexus/src/app/background/tasks/instance_updater.rs @@ -26,14 +26,19 @@ use tokio::task::JoinSet; pub struct InstanceUpdater { datastore: Arc<DataStore>, sagas: Arc<dyn StartSaga>, + disable: bool, } impl InstanceUpdater { - pub fn new(datastore: Arc<DataStore>, sagas: Arc<dyn StartSaga>) -> Self { - InstanceUpdater { datastore, sagas } + pub fn new( + datastore: Arc<DataStore>, + sagas: Arc<dyn StartSaga>, + disable: bool, + ) -> Self { + InstanceUpdater { datastore, sagas, disable } } - async fn activate2( + async fn actually_activate( &mut self, opctx: &OpContext, stats: &mut ActivationStats, @@ -205,43 +210,49 @@ impl BackgroundTask for InstanceUpdater { ) -> BoxFuture<'a, serde_json::Value> { async { let mut stats = ActivationStats::default(); - let error = match self.activate2(opctx, &mut stats).await { - Ok(()) => { - slog::info!( - &opctx.log, - "instance updater activation completed"; - "destroyed_active_vmms" => stats.destroyed_active_vmms, - "terminated_active_migrations" => stats.terminated_active_migrations, - "update_sagas_started" => stats.sagas_started, - "update_sagas_completed" => stats.sagas_completed, - ); - debug_assert_eq!( - stats.sagas_failed, - 0, - "if the task completed successfully, then no sagas \ - should have failed", - ); - debug_assert_eq!( - stats.saga_start_failures, - 0, - "if the task completed successfully, all sagas \ - should have started successfully" - ); - None - } - Err(error) => { - slog::warn!( - &opctx.log, - "instance updater 
activation failed!"; - "error" => %error, - "destroyed_active_vmms" => stats.destroyed_active_vmms, - "terminated_active_migrations" => stats.terminated_active_migrations, - "update_sagas_started" => stats.sagas_started, - "update_sagas_completed" => stats.sagas_completed, - "update_sagas_failed" => stats.sagas_failed, - "update_saga_start_failures" => stats.saga_start_failures, - ); - Some(error.to_string()) + + let error = if self.disable { + slog::info!(&opctx.log, "background instance updater explicitly disabled"); + None + } else { + match self.actually_activate(opctx, &mut stats).await { + Ok(()) => { + slog::info!( + &opctx.log, + "instance updater activation completed"; + "destroyed_active_vmms" => stats.destroyed_active_vmms, + "terminated_active_migrations" => stats.terminated_active_migrations, + "update_sagas_started" => stats.sagas_started, + "update_sagas_completed" => stats.sagas_completed, + ); + debug_assert_eq!( + stats.sagas_failed, + 0, + "if the task completed successfully, then no sagas \ + should have failed", + ); + debug_assert_eq!( + stats.saga_start_failures, + 0, + "if the task completed successfully, all sagas \ + should have started successfully" + ); + None + } + Err(error) => { + slog::warn!( + &opctx.log, + "instance updater activation failed!"; + "error" => %error, + "destroyed_active_vmms" => stats.destroyed_active_vmms, + "terminated_active_migrations" => stats.terminated_active_migrations, + "update_sagas_started" => stats.sagas_started, + "update_sagas_completed" => stats.sagas_completed, + "update_sagas_failed" => stats.sagas_failed, + "update_saga_start_failures" => stats.saga_start_failures, + ); + Some(error.to_string()) + } } }; json!({ diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index f78aee3a88d..8f65a73204a 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -134,10 +134,9 @@ lookup_region_port.period_secs = 60 # to be executed in a timely manner, so for 
integration tests, we don't want to # *rely* on the instance-updater background task for running these sagas. # -# Therefore, set a period long enough that this task won't activate during a -# reasonable integration test execution. Tests for the instance-updater task -# will explictly activate it. -instance_updater.period_secs = 600 +# Therefore, disable the background task during tests. +instance_updater.disable = true +instance_updater.period_secs = 60 [default_region_allocation_strategy] # we only have one sled in the test environment, so we need to use the