From 49f6e01001462fe31c09c83ce423dc73cc3cc1ce Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 25 Jun 2024 21:11:29 -0700 Subject: [PATCH] move specific background tasks into submodule (#5930) --- .../app/background/{common.rs => driver.rs} | 155 +-------------- nexus/src/app/background/init.rs | 103 +++++----- nexus/src/app/background/mod.rs | 180 +++++++++++++++--- .../{ => tasks}/abandoned_vmm_reaper.rs | 2 +- nexus/src/app/background/{ => tasks}/bfd.rs | 4 +- .../{ => tasks}/blueprint_execution.rs | 4 +- .../background/{ => tasks}/blueprint_load.rs | 4 +- .../{ => tasks}/crdb_node_id_collector.rs | 2 +- .../app/background/{ => tasks}/dns_config.rs | 6 +- .../background/{ => tasks}/dns_propagation.rs | 8 +- .../app/background/{ => tasks}/dns_servers.rs | 2 +- .../{ => tasks}/external_endpoints.rs | 6 +- .../{ => tasks}/instance_watcher.rs | 2 +- .../{ => tasks}/inventory_collection.rs | 8 +- .../{ => tasks}/metrics_producer_gc.rs | 2 +- nexus/src/app/background/tasks/mod.rs | 28 +++ .../app/background/{ => tasks}/nat_cleanup.rs | 2 +- .../app/background/{ => tasks}/networking.rs | 0 .../background/{ => tasks}/phantom_disks.rs | 2 +- .../{ => tasks}/physical_disk_adoption.rs | 2 +- .../{ => tasks}/region_replacement.rs | 2 +- .../{ => tasks}/region_replacement_driver.rs | 2 +- .../{ => tasks}/service_firewall_rules.rs | 2 +- .../{ => tasks}/sync_service_zone_nat.rs | 2 +- .../{ => tasks}/sync_switch_configuration.rs | 4 +- .../background/{ => tasks}/v2p_mappings.rs | 2 +- 26 files changed, 275 insertions(+), 261 deletions(-) rename nexus/src/app/background/{common.rs => driver.rs} (76%) rename nexus/src/app/background/{ => tasks}/abandoned_vmm_reaper.rs (99%) rename nexus/src/app/background/{ => tasks}/bfd.rs (98%) rename nexus/src/app/background/{ => tasks}/blueprint_execution.rs (99%) rename nexus/src/app/background/{ => tasks}/blueprint_load.rs (99%) rename nexus/src/app/background/{ => tasks}/crdb_node_id_collector.rs (99%) rename nexus/src/app/background/{ => tasks}/dns_config.rs (98%) rename nexus/src/app/background/{ => tasks}/dns_propagation.rs (98%) rename nexus/src/app/background/{ => tasks}/dns_servers.rs (99%) rename nexus/src/app/background/{ => tasks}/external_endpoints.rs (97%) rename nexus/src/app/background/{ => tasks}/instance_watcher.rs (99%) rename nexus/src/app/background/{ => tasks}/inventory_collection.rs (98%) rename nexus/src/app/background/{ => tasks}/metrics_producer_gc.rs (99%) create mode 100644 nexus/src/app/background/tasks/mod.rs rename nexus/src/app/background/{ => tasks}/nat_cleanup.rs (99%) rename nexus/src/app/background/{ => tasks}/networking.rs (100%) rename nexus/src/app/background/{ => tasks}/phantom_disks.rs (98%) rename nexus/src/app/background/{ => tasks}/physical_disk_adoption.rs (99%) rename nexus/src/app/background/{ => tasks}/region_replacement.rs (99%) rename nexus/src/app/background/{ => tasks}/region_replacement_driver.rs (99%) rename nexus/src/app/background/{ => tasks}/service_firewall_rules.rs (98%) rename nexus/src/app/background/{ => tasks}/sync_service_zone_nat.rs (99%) rename nexus/src/app/background/{ => tasks}/sync_switch_configuration.rs (99%) rename nexus/src/app/background/{ => tasks}/v2p_mappings.rs (99%) diff --git a/nexus/src/app/background/common.rs b/nexus/src/app/background/driver.rs similarity index 76% rename from nexus/src/app/background/common.rs rename to nexus/src/app/background/driver.rs index da595dc4e1..f1982b1ad8 100644 --- a/nexus/src/app/background/common.rs +++ b/nexus/src/app/background/driver.rs @@ -2,132 +2,10 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! # Nexus Background Tasks -//! -//! A **background task** in Nexus is any operation that can be activated both -//! periodically and by an explicit signal. This is aimed at RFD 373-style -//! "reliable persistent workflows", also called "reconcilers" or "controllers". -//! These are a kind of automation that examines some _current_ state, compares -//! it to some _intended_ state, and potentially takes action to try to bring -//! the current state in sync with the intended state. Our canonical example is -//! that we want to have Nexus monitor the intended DNS configuration. When it -//! changes, we want to propagate the new configuration to all DNS servers. We -//! implement this with three different background tasks: -//! -//! 1. `DnsConfigWatcher` reads the DNS configuration from the database, stores -//! it in memory, and makes it available via a `tokio::sync::watch` channel. -//! 2. `DnsServersWatcher` reads the list of DNS servers from the database, -//! stores it in memory, and makes it available via a `tokio::sync::watch` -//! channel. -//! 3. `DnsPropagator` uses the the watch channels provided by the other two -//! background tasks to notice when either the DNS configuration or the list -//! of DNS servers has changed. It uses the latest values to make a request -//! to each server to update its configuration. -//! -//! When Nexus changes the DNS configuration, it will update the database with -//! the new configuration and then explicitly activate the `DnsConfigWatcher`. -//! When it reads the new config, it will send it to its watch channel, and that -//! will activate the `DnsPropagator`. If any of this fails, or if Nexus -//! crashes at any point, then the periodic activation of every background task -//! will eventually cause the latest config to be propagated to all of the -//! current servers. -//! -//! The background task framework here is pretty minimal: essentially what it -//! gives you is that you just write an idempotent function that you want to -//! happen periodically or on-demand, wrap it in an impl of `BackgroundTask`, -//! register that with the `Driver`, and you're done. The framework will take -//! care of: -//! -//! * providing a way for Nexus at-large to activate your task -//! * activating your task periodically -//! * ensuring that the task is activated only once at a time in this Nexus -//! (but note that it may always be running concurrently in other Nexus -//! instances) -//! * providing basic visibility into whether the task is running, when the task -//! last ran, etc. -//! -//! We may well want to extend the framework as we build more tasks in general -//! and reconcilers specifically. But we should be mindful not to create -//! footguns for ourselves! See "Design notes" below. -//! -//! ## Notes for background task implementors -//! -//! Background tasks are not necessarily just for reconcilers. That's just the -//! design center. The first two DNS background tasks above aren't reconcilers -//! in any non-trivial sense. -//! -//! Background task activations do not accept input, by design. See "Design -//! notes" below. -//! -//! Generally, you probably don't want to have your background task do retries. -//! If things fail, you rely on the periodic reactivation to try again. -//! -//! ## Design notes -//! -//! The underlying design for RFD 373-style reconcilers is inspired by a few -//! related principles: -//! -//! * the principle in distributed systems of having exactly one code path to -//! achieve a thing, and then always using that path to do that thing (as -//! opposed to having separate paths for, say, the happy path vs. failover, -//! and having one of those paths rarely used) -//! * the [constant-work pattern][1], which basically suggests that a system can -//! be more robust and scalable if it's constructed in a way that always does -//! the same amount of work. Imagine if we made requests to the DNS servers -//! to incrementally update their config every time the DNS data changed. -//! This system does more work as users make more requests. During overloads, -//! things can fall over. Compare with a system whose frontend merely updates -//! the DNS configuration that _should_ exist and whose backend periodically -//! scans the complete intended state and then sets its own state accordingly. -//! The backend does the same amount of work no matter how many requests were -//! made, making it more resistant to overload. A big downside of this -//! approach is increased latency from the user making a request to seeing it -//! applied. This can be mitigated (sacrificing some, but not all, of the -//! "constant work" property) by triggering a backend scan operation when user -//! requests complete. -//! * the design pattern in distributed systems of keeping two copies of data in -//! sync using both event notifications (like a changelog) _and_ periodic full -//! scans. The hope is that a full scan never finds a change that wasn't -//! correctly sync'd, but incorporating an occasional full scan into the -//! design ensures that such bugs are found and their impact repaired -//! automatically. -//! -//! [1]: https://aws.amazon.com/builders-library/reliability-and-constant-work/ -//! -//! Combining these, we get a design pattern for a "reconciler" where: -//! -//! * The reconciler is activated by explicit request (when we know it has work -//! to do) _and_ periodically (to deal with all manner of transient failures) -//! * The reconciler's activity is idempotent: given the same underlying state -//! (e.g., database state), it always attempts to do the same thing. -//! * Each activation of the reconciler accepts no input. That is, even when we -//! think we know what changed, we do not use that information. This ensures -//! that the reconciler really is idempotent and its actions are based solely -//! on the state that it's watching. Put differently: having reconcilers -//! accept an explicit hint about what changed (and then doing something -//! differently based on that) bifurcates the code: there's the common case -//! where that hint is available and the rarely-exercised case when it's not -//! (e.g., because Nexus crashed and it's the subsequent periodic activation -//! that's propagating this change). This is what we're trying to avoid. -//! * We do allow reconcilers to be triggered by a `tokio::sync::watch` channel -//! -- but again, not using the _data_ from that channel. There are two big -//! advantages here: (1) reduced latency from when a change is made to when -//! the reconciler applies it, and (2) (arguably another way to say the same -//! thing) we can space out the periodic activations much further, knowing -//! that most of the time we're not increasing latency by doing this. This -//! compromises the "constant-work" pattern a bit: we might wind up running -//! the reconciler more often during busy times than during idle times, and we -//! could find that overloads something. However, the _operation_ of the -//! reconciler can still be constant work, and there's no more than that -//! amount of work going on at any given time. -//! -//! `watch` channels are a convenient primitive here because they only store -//! one value. With a little care, we can ensure that the writer never blocks -//! and the readers can all see the latest value. (By design, reconcilers -//! generally only care about the latest state of something, not any -//! intermediate states.) We don't have to worry about an unbounded queue, or -//! handling a full queue, or other forms of backpressure. +//! Manages execution of background tasks +use super::BackgroundTask; +use super::TaskHandle; use assert_matches::assert_matches; use chrono::Utc; use futures::future::BoxFuture; @@ -149,16 +27,6 @@ use tokio::sync::watch; use tokio::sync::Notify; use tokio::time::MissedTickBehavior; -/// An operation activated both periodically and by an explicit signal -/// -/// See module-level documentation for details. -pub trait BackgroundTask: Send + Sync { - fn activate<'a>( - &'a mut self, - opctx: &'a OpContext, - ) -> BoxFuture<'a, serde_json::Value>; -} - /// Drives the execution of background tasks /// /// Nexus has only one Driver. All background tasks are registered with the @@ -170,21 +38,6 @@ pub struct Driver { tasks: BTreeMap, } -/// Identifies a background task -/// -/// This is returned by [`Driver::register()`] to identify the corresponding -/// background task. It's then accepted by functions like -/// [`Driver::activate()`] and [`Driver::task_status()`] to identify the task. -#[derive(Clone, Debug, Ord, PartialOrd, PartialEq, Eq)] -pub struct TaskHandle(String); - -impl TaskHandle { - /// Returns the unique name of this background task - pub fn name(&self) -> &str { - &self.0 - } -} - /// Driver-side state of a background task struct Task { /// what this task does (for developers) @@ -466,7 +319,6 @@ impl GenericWatcher for watch::Receiver { mod test { use super::BackgroundTask; use super::Driver; - use crate::app::background::common::ActivationReason; use crate::app::sagas::SagaRequest; use assert_matches::assert_matches; use chrono::Utc; @@ -474,6 +326,7 @@ mod test { use futures::FutureExt; use nexus_db_queries::context::OpContext; use nexus_test_utils_macros::nexus_test; + use nexus_types::internal_api::views::ActivationReason; use std::time::Duration; use std::time::Instant; use tokio::sync::mpsc; diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 1fc1e858b7..a32f60c2b6 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -2,30 +2,31 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Background task initialization - -use super::abandoned_vmm_reaper; -use super::bfd; -use super::blueprint_execution; -use super::blueprint_load; -use super::common; -use super::crdb_node_id_collector; -use super::dns_config; -use super::dns_propagation; -use super::dns_servers; -use super::external_endpoints; -use super::instance_watcher; -use super::inventory_collection; -use super::metrics_producer_gc; -use super::nat_cleanup; -use super::phantom_disks; -use super::physical_disk_adoption; -use super::region_replacement; -use super::region_replacement_driver; -use super::service_firewall_rules; -use super::sync_service_zone_nat::ServiceZoneNatTracker; -use super::sync_switch_configuration::SwitchPortSettingsManager; -use super::v2p_mappings::V2PManager; +//! Specific background task initialization + +use super::tasks::abandoned_vmm_reaper; +use super::tasks::bfd; +use super::tasks::blueprint_execution; +use super::tasks::blueprint_load; +use super::tasks::crdb_node_id_collector; +use super::tasks::dns_config; +use super::tasks::dns_propagation; +use super::tasks::dns_servers; +use super::tasks::external_endpoints; +use super::tasks::instance_watcher; +use super::tasks::inventory_collection; +use super::tasks::metrics_producer_gc; +use super::tasks::nat_cleanup; +use super::tasks::phantom_disks; +use super::tasks::physical_disk_adoption; +use super::tasks::region_replacement; +use super::tasks::region_replacement_driver; +use super::tasks::service_firewall_rules; +use super::tasks::sync_service_zone_nat::ServiceZoneNatTracker; +use super::tasks::sync_switch_configuration::SwitchPortSettingsManager; +use super::tasks::v2p_mappings::V2PManager; +use super::Driver; +use super::TaskHandle; use crate::app::oximeter::PRODUCER_LEASE_DURATION; use crate::app::sagas::SagaRequest; use nexus_config::BackgroundTaskConfig; @@ -47,76 +48,76 @@ use uuid::Uuid; pub struct BackgroundTasks { /// interface for working with background tasks (activation, checking /// status, etc.) - pub driver: common::Driver, + pub driver: Driver, /// task handle for the internal DNS config background task - pub task_internal_dns_config: common::TaskHandle, + pub task_internal_dns_config: TaskHandle, /// task handle for the internal DNS servers background task - pub task_internal_dns_servers: common::TaskHandle, + pub task_internal_dns_servers: TaskHandle, /// task handle for the external DNS config background task - pub task_external_dns_config: common::TaskHandle, + pub task_external_dns_config: TaskHandle, /// task handle for the external DNS servers background task - pub task_external_dns_servers: common::TaskHandle, + pub task_external_dns_servers: TaskHandle, /// task handle for pruning metrics producers with expired leases - pub task_metrics_producer_gc: common::TaskHandle, + pub task_metrics_producer_gc: TaskHandle, /// task handle for the task that keeps track of external endpoints - pub task_external_endpoints: common::TaskHandle, + pub task_external_endpoints: TaskHandle, /// external endpoints read by the background task pub external_endpoints: tokio::sync::watch::Receiver< Option, >, /// task handle for the ipv4 nat entry garbage collector - pub nat_cleanup: common::TaskHandle, + pub nat_cleanup: TaskHandle, /// task handle for the switch bfd manager - pub bfd_manager: common::TaskHandle, + pub bfd_manager: TaskHandle, /// task handle for the task that collects inventory - pub task_inventory_collection: common::TaskHandle, + pub task_inventory_collection: TaskHandle, /// task handle for the task that collects inventory - pub task_physical_disk_adoption: common::TaskHandle, + pub task_physical_disk_adoption: TaskHandle, /// task handle for the task that detects phantom disks - pub task_phantom_disks: common::TaskHandle, + pub task_phantom_disks: TaskHandle, /// task handle for blueprint target loader - pub task_blueprint_loader: common::TaskHandle, + pub task_blueprint_loader: TaskHandle, /// task handle for blueprint execution background task - pub task_blueprint_executor: common::TaskHandle, + pub task_blueprint_executor: TaskHandle, /// task handle for collecting CockroachDB node IDs - pub task_crdb_node_id_collector: common::TaskHandle, + pub task_crdb_node_id_collector: TaskHandle, /// task handle for the service zone nat tracker - pub task_service_zone_nat_tracker: common::TaskHandle, + pub task_service_zone_nat_tracker: TaskHandle, /// task handle for the switch port settings manager - pub task_switch_port_settings_manager: common::TaskHandle, + pub task_switch_port_settings_manager: TaskHandle, /// task handle for the opte v2p manager - pub task_v2p_manager: common::TaskHandle, + pub task_v2p_manager: TaskHandle, /// task handle for the task that detects if regions need replacement and /// begins the process - pub task_region_replacement: common::TaskHandle, + pub task_region_replacement: TaskHandle, /// task handle for the task that drives region replacements forward - pub task_region_replacement_driver: common::TaskHandle, + pub task_region_replacement_driver: TaskHandle, /// task handle for the task that polls sled agents for instance states. - pub task_instance_watcher: common::TaskHandle, + pub task_instance_watcher: TaskHandle, /// task handle for propagation of VPC firewall rules for Omicron services /// with external network connectivity, - pub task_service_firewall_propagation: common::TaskHandle, + pub task_service_firewall_propagation: TaskHandle, /// task handle for deletion of database records for VMMs abandoned by their /// instances. - pub task_abandoned_vmm_reaper: common::TaskHandle, + pub task_abandoned_vmm_reaper: TaskHandle, } impl BackgroundTasks { @@ -136,7 +137,7 @@ impl BackgroundTasks { ), producer_registry: &ProducerRegistry, ) -> BackgroundTasks { - let mut driver = common::Driver::new(); + let mut driver = Driver::new(); let (task_internal_dns_config, task_internal_dns_servers) = init_dns( &mut driver, @@ -494,19 +495,19 @@ impl BackgroundTasks { } } - pub fn activate(&self, task: &common::TaskHandle) { + pub fn activate(&self, task: &TaskHandle) { self.driver.activate(task); } } fn init_dns( - driver: &mut common::Driver, + driver: &mut Driver, opctx: &OpContext, datastore: Arc, dns_group: DnsGroup, resolver: internal_dns::resolver::Resolver, config: &DnsTasksConfig, -) -> (common::TaskHandle, common::TaskHandle) { +) -> (TaskHandle, TaskHandle) { let dns_group_name = dns_group.to_string(); let metadata = BTreeMap::from([("dns_group".to_string(), dns_group_name)]); diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs index 4d5a63e69f..40716aa036 100644 --- a/nexus/src/app/background/mod.rs +++ b/nexus/src/app/background/mod.rs @@ -2,32 +2,164 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Background tasks +//! # Nexus Background Tasks +//! +//! A **background task** in Nexus is any operation that can be activated both +//! periodically and by an explicit signal. This is aimed at RFD 373-style +//! "reliable persistent workflows", also called "reconcilers" or "controllers". +//! These are a kind of automation that examines some _current_ state, compares +//! it to some _intended_ state, and potentially takes action to try to bring +//! the current state in sync with the intended state. Our canonical example is +//! that we want to have Nexus monitor the intended DNS configuration. When it +//! changes, we want to propagate the new configuration to all DNS servers. We +//! implement this with three different background tasks: +//! +//! 1. `DnsConfigWatcher` reads the DNS configuration from the database, stores +//! it in memory, and makes it available via a `tokio::sync::watch` channel. +//! 2. `DnsServersWatcher` reads the list of DNS servers from the database, +//! stores it in memory, and makes it available via a `tokio::sync::watch` +//! channel. +//! 3. `DnsPropagator` uses the the watch channels provided by the other two +//! background tasks to notice when either the DNS configuration or the list +//! of DNS servers has changed. It uses the latest values to make a request +//! to each server to update its configuration. +//! +//! When Nexus changes the DNS configuration, it will update the database with +//! the new configuration and then explicitly activate the `DnsConfigWatcher`. +//! When it reads the new config, it will send it to its watch channel, and that +//! will activate the `DnsPropagator`. If any of this fails, or if Nexus +//! crashes at any point, then the periodic activation of every background task +//! will eventually cause the latest config to be propagated to all of the +//! current servers. +//! +//! The background task framework here is pretty minimal: essentially what it +//! gives you is that you just write an idempotent function that you want to +//! happen periodically or on-demand, wrap it in an impl of `BackgroundTask`, +//! register that with the `Driver`, and you're done. The framework will take +//! care of: +//! +//! * providing a way for Nexus at-large to activate your task +//! * activating your task periodically +//! * ensuring that the task is activated only once at a time in this Nexus +//! (but note that it may always be running concurrently in other Nexus +//! instances) +//! * providing basic visibility into whether the task is running, when the task +//! last ran, etc. +//! +//! We may well want to extend the framework as we build more tasks in general +//! and reconcilers specifically. But we should be mindful not to create +//! footguns for ourselves! See "Design notes" below. +//! +//! ## Notes for background task implementors +//! +//! Background tasks are not necessarily just for reconcilers. That's just the +//! design center. The first two DNS background tasks above aren't reconcilers +//! in any non-trivial sense. +//! +//! Background task activations do not accept input, by design. See "Design +//! notes" below. +//! +//! Generally, you probably don't want to have your background task do retries. +//! If things fail, you rely on the periodic reactivation to try again. +//! +//! ## Design notes +//! +//! The underlying design for RFD 373-style reconcilers is inspired by a few +//! related principles: +//! +//! * the principle in distributed systems of having exactly one code path to +//! achieve a thing, and then always using that path to do that thing (as +//! opposed to having separate paths for, say, the happy path vs. failover, +//! and having one of those paths rarely used) +//! * the [constant-work pattern][1], which basically suggests that a system can +//! be more robust and scalable if it's constructed in a way that always does +//! the same amount of work. Imagine if we made requests to the DNS servers +//! to incrementally update their config every time the DNS data changed. +//! This system does more work as users make more requests. During overloads, +//! things can fall over. Compare with a system whose frontend merely updates +//! the DNS configuration that _should_ exist and whose backend periodically +//! scans the complete intended state and then sets its own state accordingly. +//! The backend does the same amount of work no matter how many requests were +//! made, making it more resistant to overload. A big downside of this +//! approach is increased latency from the user making a request to seeing it +//! applied. This can be mitigated (sacrificing some, but not all, of the +//! "constant work" property) by triggering a backend scan operation when user +//! requests complete. +//! * the design pattern in distributed systems of keeping two copies of data in +//! sync using both event notifications (like a changelog) _and_ periodic full +//! scans. The hope is that a full scan never finds a change that wasn't +//! correctly sync'd, but incorporating an occasional full scan into the +//! design ensures that such bugs are found and their impact repaired +//! automatically. +//! +//! [1]: https://aws.amazon.com/builders-library/reliability-and-constant-work/ +//! +//! Combining these, we get a design pattern for a "reconciler" where: +//! +//! * The reconciler is activated by explicit request (when we know it has work +//! to do) _and_ periodically (to deal with all manner of transient failures) +//! * The reconciler's activity is idempotent: given the same underlying state +//! (e.g., database state), it always attempts to do the same thing. +//! * Each activation of the reconciler accepts no input. That is, even when we +//! think we know what changed, we do not use that information. This ensures +//! that the reconciler really is idempotent and its actions are based solely +//! on the state that it's watching. Put differently: having reconcilers +//! accept an explicit hint about what changed (and then doing something +//! differently based on that) bifurcates the code: there's the common case +//! where that hint is available and the rarely-exercised case when it's not +//! (e.g., because Nexus crashed and it's the subsequent periodic activation +//! that's propagating this change). This is what we're trying to avoid. +//! * We do allow reconcilers to be triggered by a `tokio::sync::watch` channel +//! -- but again, not using the _data_ from that channel. There are two big +//! advantages here: (1) reduced latency from when a change is made to when +//! the reconciler applies it, and (2) (arguably another way to say the same +//! thing) we can space out the periodic activations much further, knowing +//! that most of the time we're not increasing latency by doing this. This +//! compromises the "constant-work" pattern a bit: we might wind up running +//! the reconciler more often during busy times than during idle times, and we +//! could find that overloads something. However, the _operation_ of the +//! reconciler can still be constant work, and there's no more than that +//! amount of work going on at any given time. +//! +//! `watch` channels are a convenient primitive here because they only store +//! one value. With a little care, we can ensure that the writer never blocks +//! and the readers can all see the latest value. (By design, reconcilers +//! generally only care about the latest state of something, not any +//! intermediate states.) We don't have to worry about an unbounded queue, or +//! handling a full queue, or other forms of backpressure. -mod abandoned_vmm_reaper; -mod bfd; -mod blueprint_execution; -mod blueprint_load; -mod common; -mod crdb_node_id_collector; -mod dns_config; -mod dns_propagation; -mod dns_servers; -mod external_endpoints; +mod driver; mod init; -mod instance_watcher; -mod inventory_collection; -mod metrics_producer_gc; -mod nat_cleanup; -mod networking; -mod phantom_disks; -mod physical_disk_adoption; -mod region_replacement; -mod region_replacement_driver; -mod service_firewall_rules; mod status; -mod sync_service_zone_nat; -mod sync_switch_configuration; -mod v2p_mappings; +mod tasks; +pub use driver::Driver; pub use init::BackgroundTasks; + +use futures::future::BoxFuture; +use nexus_auth::context::OpContext; + +/// An operation activated both periodically and by an explicit signal +/// +/// See module-level documentation for details. +pub trait BackgroundTask: Send + Sync { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value>; +} + +/// Identifies a background task +/// +/// This is returned by [`Driver::register()`] to identify the corresponding +/// background task. It's then accepted by functions like +/// [`Driver::activate()`] and [`Driver::task_status()`] to identify the task. +#[derive(Clone, Debug, Ord, PartialOrd, PartialEq, Eq)] +pub struct TaskHandle(String); + +impl TaskHandle { + /// Returns the unique name of this background task + pub fn name(&self) -> &str { + &self.0 + } +} diff --git a/nexus/src/app/background/abandoned_vmm_reaper.rs b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs similarity index 99% rename from nexus/src/app/background/abandoned_vmm_reaper.rs rename to nexus/src/app/background/tasks/abandoned_vmm_reaper.rs index 3883185d9f..bd9a7b1602 100644 --- a/nexus/src/app/background/abandoned_vmm_reaper.rs +++ b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs @@ -31,7 +31,7 @@ //! is handled elsewhere, by `notify_instance_updated` and (eventually) the //! `instance-update` saga. -use super::common::BackgroundTask; +use crate::app::background::BackgroundTask; use anyhow::Context; use futures::future::BoxFuture; use futures::FutureExt; diff --git a/nexus/src/app/background/bfd.rs b/nexus/src/app/background/tasks/bfd.rs similarity index 98% rename from nexus/src/app/background/bfd.rs rename to nexus/src/app/background/tasks/bfd.rs index 39b3c8f661..67b15ee3d3 100644 --- a/nexus/src/app/background/bfd.rs +++ b/nexus/src/app/background/tasks/bfd.rs @@ -6,10 +6,10 @@ //! (BFD) sessions. use crate::app::{ - background::networking::build_mgd_clients, map_switch_zone_addrs, + background::tasks::networking::build_mgd_clients, map_switch_zone_addrs, }; -use super::common::BackgroundTask; +use crate::app::background::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use internal_dns::{resolver::Resolver, ServiceName}; diff --git a/nexus/src/app/background/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs similarity index 99% rename from nexus/src/app/background/blueprint_execution.rs rename to nexus/src/app/background/tasks/blueprint_execution.rs index b01d1213de..253a89a18d 100644 --- a/nexus/src/app/background/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -4,7 +4,7 @@ //! Background task for realizing a plan blueprint -use super::common::BackgroundTask; +use crate::app::background::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_queries::context::OpContext; @@ -111,7 +111,7 @@ impl BackgroundTask for BlueprintExecutor { #[cfg(test)] mod test { use super::BlueprintExecutor; - use crate::app::background::common::BackgroundTask; + use crate::app::background::BackgroundTask; use httptest::matchers::{all_of, request}; use httptest::responders::status_code; use httptest::Expectation; diff --git a/nexus/src/app/background/blueprint_load.rs b/nexus/src/app/background/tasks/blueprint_load.rs similarity index 99% rename from nexus/src/app/background/blueprint_load.rs rename to nexus/src/app/background/tasks/blueprint_load.rs index baf86d655f..38cfe85558 100644 --- a/nexus/src/app/background/blueprint_load.rs +++ b/nexus/src/app/background/tasks/blueprint_load.rs @@ -7,7 +7,7 @@ //! This task triggers the `blueprint_execution` background task when the //! blueprint changes. -use super::common::BackgroundTask; +use crate::app::background::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_queries::context::OpContext; @@ -185,7 +185,7 @@ impl BackgroundTask for TargetBlueprintLoader { #[cfg(test)] mod test { use super::*; - use crate::app::background::common::BackgroundTask; + use crate::app::background::BackgroundTask; use nexus_inventory::now_db_precision; use nexus_test_utils_macros::nexus_test; use nexus_types::deployment::{ diff --git a/nexus/src/app/background/crdb_node_id_collector.rs b/nexus/src/app/background/tasks/crdb_node_id_collector.rs similarity index 99% rename from nexus/src/app/background/crdb_node_id_collector.rs rename to nexus/src/app/background/tasks/crdb_node_id_collector.rs index 2736514021..2a0e1c6d3d 100644 --- a/nexus/src/app/background/crdb_node_id_collector.rs +++ b/nexus/src/app/background/tasks/crdb_node_id_collector.rs @@ -23,7 +23,7 @@ //! the status of all nodes and looking for orphans, perhaps) to determine //! whether a zone without a known node ID ever existed. -use super::common::BackgroundTask; +use crate::app::background::BackgroundTask; use anyhow::ensure; use anyhow::Context; use futures::future::BoxFuture; diff --git a/nexus/src/app/background/dns_config.rs b/nexus/src/app/background/tasks/dns_config.rs similarity index 98% rename from nexus/src/app/background/dns_config.rs rename to nexus/src/app/background/tasks/dns_config.rs index 71e0a812a7..4cf8c4d86d 100644 --- a/nexus/src/app/background/dns_config.rs +++ b/nexus/src/app/background/tasks/dns_config.rs @@ -4,7 +4,7 @@ //! Background task for keeping track of DNS configuration -use super::common::BackgroundTask; +use crate::app::background::BackgroundTask; use dns_service_client::types::DnsConfigParams; use futures::future::BoxFuture; use futures::FutureExt; @@ -157,9 +157,9 @@ impl BackgroundTask for DnsConfigWatcher { #[cfg(test)] mod test { - use crate::app::background::common::BackgroundTask; - use crate::app::background::dns_config::DnsConfigWatcher; + use super::DnsConfigWatcher; use crate::app::background::init::test::write_test_dns_generation; + use crate::app::background::BackgroundTask; use assert_matches::assert_matches; use async_bb8_diesel::AsyncRunQueryDsl; use async_bb8_diesel::AsyncSimpleConnection; diff --git a/nexus/src/app/background/dns_propagation.rs b/nexus/src/app/background/tasks/dns_propagation.rs similarity index 98% rename from nexus/src/app/background/dns_propagation.rs rename to nexus/src/app/background/tasks/dns_propagation.rs index 7d650f6f27..c680a6f010 100644 --- a/nexus/src/app/background/dns_propagation.rs +++ b/nexus/src/app/background/tasks/dns_propagation.rs @@ -4,8 +4,8 @@ //! Background task for propagating DNS configuration to all DNS servers -use super::common::BackgroundTask; use super::dns_servers::DnsServersList; +use crate::app::background::BackgroundTask; use anyhow::Context; use dns_service_client::types::DnsConfigParams; use futures::future::BoxFuture; @@ -177,9 +177,9 @@ async fn dns_propagate_one( #[cfg(test)] mod test { - use crate::app::background::common::BackgroundTask; - use crate::app::background::dns_propagation::DnsPropagator; - use crate::app::background::dns_servers::DnsServersList; + use super::DnsPropagator; + use crate::app::background::tasks::dns_servers::DnsServersList; + use crate::app::background::BackgroundTask; use dns_service_client::types::DnsConfigParams; use httptest::matchers::request; use httptest::responders::status_code; diff --git a/nexus/src/app/background/dns_servers.rs b/nexus/src/app/background/tasks/dns_servers.rs similarity index 99% rename from nexus/src/app/background/dns_servers.rs rename to nexus/src/app/background/tasks/dns_servers.rs index 8f4cce4ee0..9d99460917 100644 --- a/nexus/src/app/background/dns_servers.rs +++ b/nexus/src/app/background/tasks/dns_servers.rs @@ -4,7 +4,7 @@ //! Background task for keeping track of DNS servers -use super::common::BackgroundTask; +use crate::app::background::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use internal_dns::names::ServiceName; diff --git a/nexus/src/app/background/external_endpoints.rs b/nexus/src/app/background/tasks/external_endpoints.rs similarity index 97% rename from nexus/src/app/background/external_endpoints.rs rename to nexus/src/app/background/tasks/external_endpoints.rs index 1a587298d5..0ff1e06a46 100644 --- a/nexus/src/app/background/external_endpoints.rs +++ b/nexus/src/app/background/tasks/external_endpoints.rs @@ -6,7 +6,7 @@ //! all Silos, their externally-visible DNS names, and the TLS certificates //! associated with those names -use super::common::BackgroundTask; +use crate::app::background::BackgroundTask; use crate::app::external_endpoints::read_all_endpoints; pub use crate::app::external_endpoints::ExternalEndpoints; use futures::future::BoxFuture; @@ -117,8 +117,8 @@ impl BackgroundTask for ExternalEndpointsWatcher { #[cfg(test)] mod test { - use crate::app::background::common::BackgroundTask; - use crate::app::background::external_endpoints::ExternalEndpointsWatcher; + use super::ExternalEndpointsWatcher; + use crate::app::background::BackgroundTask; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::fixed_data::silo::DEFAULT_SILO; use nexus_test_utils::resource_helpers::create_silo; diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs similarity index 99% rename from nexus/src/app/background/instance_watcher.rs rename to nexus/src/app/background/tasks/instance_watcher.rs index 1b10605c5e..a6e579eb8a 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/tasks/instance_watcher.rs @@ -4,7 +4,7 @@ //! Background task for pulling instance state from sled-agents. -use super::common::BackgroundTask; +use crate::app::background::BackgroundTask; use futures::{future::BoxFuture, FutureExt}; use http::StatusCode; use nexus_db_model::Instance; diff --git a/nexus/src/app/background/inventory_collection.rs b/nexus/src/app/background/tasks/inventory_collection.rs similarity index 98% rename from nexus/src/app/background/inventory_collection.rs rename to nexus/src/app/background/tasks/inventory_collection.rs index 52ee8f6e13..95268334d9 100644 --- a/nexus/src/app/background/inventory_collection.rs +++ b/nexus/src/app/background/tasks/inventory_collection.rs @@ -4,7 +4,7 @@ //! Background task for reading inventory for the rack -use super::common::BackgroundTask; +use crate::app::background::BackgroundTask; use anyhow::ensure; use anyhow::Context; use futures::future::BoxFuture; @@ -186,10 +186,10 @@ impl<'a> nexus_inventory::SledAgentEnumerator for DbSledAgentEnumerator<'a> { #[cfg(test)] mod test { + use super::DbSledAgentEnumerator; + use super::InventoryCollector; use crate::app::authz; - use crate::app::background::common::BackgroundTask; - use crate::app::background::inventory_collection::DbSledAgentEnumerator; - use crate::app::background::inventory_collection::InventoryCollector; + use crate::app::background::BackgroundTask; use nexus_db_model::Generation; use nexus_db_model::SledBaseboard; use nexus_db_model::SledSystemHardware; diff --git a/nexus/src/app/background/metrics_producer_gc.rs b/nexus/src/app/background/tasks/metrics_producer_gc.rs similarity index 99% rename from nexus/src/app/background/metrics_producer_gc.rs rename to nexus/src/app/background/tasks/metrics_producer_gc.rs index 2a8464b80f..8c2c271cfb 100644 --- a/nexus/src/app/background/metrics_producer_gc.rs +++ b/nexus/src/app/background/tasks/metrics_producer_gc.rs @@ -5,7 +5,7 @@ //! Background task for garbage collecting metrics producers that have not //! renewed their lease -use super::common::BackgroundTask; +use crate::app::background::BackgroundTask; use chrono::TimeDelta; use chrono::Utc; use futures::future::BoxFuture; diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs new file mode 100644 index 0000000000..3886b43a30 --- /dev/null +++ b/nexus/src/app/background/tasks/mod.rs @@ -0,0 +1,28 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Implementations of specific background tasks + +pub mod abandoned_vmm_reaper; +pub mod bfd; +pub mod blueprint_execution; +pub mod blueprint_load; +pub mod crdb_node_id_collector; +pub mod dns_config; +pub mod dns_propagation; +pub mod dns_servers; +pub mod external_endpoints; +pub mod instance_watcher; +pub mod inventory_collection; +pub mod metrics_producer_gc; +pub mod nat_cleanup; +pub mod networking; +pub mod phantom_disks; +pub mod physical_disk_adoption; +pub mod region_replacement; +pub mod region_replacement_driver; +pub mod service_firewall_rules; +pub mod sync_service_zone_nat; +pub mod sync_switch_configuration; +pub mod v2p_mappings; diff --git a/nexus/src/app/background/nat_cleanup.rs b/nexus/src/app/background/tasks/nat_cleanup.rs similarity index 99% rename from nexus/src/app/background/nat_cleanup.rs rename to nexus/src/app/background/tasks/nat_cleanup.rs index 844dbffefe..675f4fc809 100644 --- a/nexus/src/app/background/nat_cleanup.rs +++ b/nexus/src/app/background/tasks/nat_cleanup.rs @@ -8,8 +8,8 @@ use crate::app::map_switch_zone_addrs; -use super::common::BackgroundTask; use super::networking::build_dpd_clients; +use crate::app::background::BackgroundTask; use chrono::{Duration, Utc}; use futures::future::BoxFuture; use futures::FutureExt; diff --git a/nexus/src/app/background/networking.rs b/nexus/src/app/background/tasks/networking.rs similarity index 100% rename from nexus/src/app/background/networking.rs rename to nexus/src/app/background/tasks/networking.rs diff --git a/nexus/src/app/background/phantom_disks.rs b/nexus/src/app/background/tasks/phantom_disks.rs similarity index 98% rename from nexus/src/app/background/phantom_disks.rs rename to nexus/src/app/background/tasks/phantom_disks.rs index 48688838e5..eac37eee19 100644 --- a/nexus/src/app/background/phantom_disks.rs +++ b/nexus/src/app/background/tasks/phantom_disks.rs @@ -18,7 +18,7 @@ //! this background task is required to apply the same fix for disks that are //! already in this phantom state. -use super::common::BackgroundTask; +use crate::app::background::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_queries::context::OpContext; diff --git a/nexus/src/app/background/physical_disk_adoption.rs b/nexus/src/app/background/tasks/physical_disk_adoption.rs similarity index 99% rename from nexus/src/app/background/physical_disk_adoption.rs rename to nexus/src/app/background/tasks/physical_disk_adoption.rs index 05c53963de..f3b9e8ac62 100644 --- a/nexus/src/app/background/physical_disk_adoption.rs +++ b/nexus/src/app/background/tasks/physical_disk_adoption.rs @@ -11,7 +11,7 @@ //! //! In the future, this may become more explicitly operator-controlled. -use super::common::BackgroundTask; +use crate::app::background::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_model::PhysicalDisk; diff --git a/nexus/src/app/background/region_replacement.rs b/nexus/src/app/background/tasks/region_replacement.rs similarity index 99% rename from nexus/src/app/background/region_replacement.rs rename to nexus/src/app/background/tasks/region_replacement.rs index 02ae548d75..635b956193 100644 --- a/nexus/src/app/background/region_replacement.rs +++ b/nexus/src/app/background/tasks/region_replacement.rs @@ -10,8 +10,8 @@ //! for any requests that are in state "Requested". See the documentation there //! for more information. -use super::common::BackgroundTask; use crate::app::authn; +use crate::app::background::BackgroundTask; use crate::app::sagas; use crate::app::RegionAllocationStrategy; use futures::future::BoxFuture; diff --git a/nexus/src/app/background/region_replacement_driver.rs b/nexus/src/app/background/tasks/region_replacement_driver.rs similarity index 99% rename from nexus/src/app/background/region_replacement_driver.rs rename to nexus/src/app/background/tasks/region_replacement_driver.rs index a02fa4e2e4..06155ffa24 100644 --- a/nexus/src/app/background/region_replacement_driver.rs +++ b/nexus/src/app/background/tasks/region_replacement_driver.rs @@ -18,8 +18,8 @@ //! Basically, keep starting either repair or reconcilation until they complete //! successfully, then "finish" the region replacement. -use super::common::BackgroundTask; use crate::app::authn; +use crate::app::background::BackgroundTask; use crate::app::sagas; use futures::future::BoxFuture; use futures::FutureExt; diff --git a/nexus/src/app/background/service_firewall_rules.rs b/nexus/src/app/background/tasks/service_firewall_rules.rs similarity index 98% rename from nexus/src/app/background/service_firewall_rules.rs rename to nexus/src/app/background/tasks/service_firewall_rules.rs index 1a705d1fae..230172dab8 100644 --- a/nexus/src/app/background/service_firewall_rules.rs +++ b/nexus/src/app/background/tasks/service_firewall_rules.rs @@ -10,7 +10,7 @@ //! handle general changes to customer-visible VPC firewalls, and is mostly in //! place to propagate changes in the IP allowlist for user-facing services. -use super::common::BackgroundTask; +use crate::app::background::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_queries::context::OpContext; diff --git a/nexus/src/app/background/sync_service_zone_nat.rs b/nexus/src/app/background/tasks/sync_service_zone_nat.rs similarity index 99% rename from nexus/src/app/background/sync_service_zone_nat.rs rename to nexus/src/app/background/tasks/sync_service_zone_nat.rs index b0a4c8cef2..59cd6a6a79 100644 --- a/nexus/src/app/background/sync_service_zone_nat.rs +++ b/nexus/src/app/background/tasks/sync_service_zone_nat.rs @@ -7,8 +7,8 @@ use crate::app::map_switch_zone_addrs; -use super::common::BackgroundTask; use super::networking::build_dpd_clients; +use crate::app::background::BackgroundTask; use anyhow::Context; use futures::future::BoxFuture; use futures::FutureExt; diff --git a/nexus/src/app/background/sync_switch_configuration.rs b/nexus/src/app/background/tasks/sync_switch_configuration.rs similarity index 99% rename from nexus/src/app/background/sync_switch_configuration.rs rename to nexus/src/app/background/tasks/sync_switch_configuration.rs index 8552d62988..ed13545d3b 100644 --- a/nexus/src/app/background/sync_switch_configuration.rs +++ b/nexus/src/app/background/tasks/sync_switch_configuration.rs @@ -6,7 +6,7 @@ //! to relevant management daemons (dendrite, mgd, sled-agent, etc.) use crate::app::{ - background::networking::{ + background::tasks::networking::{ api_to_dpd_port_settings, build_dpd_clients, build_mgd_clients, }, map_switch_zone_addrs, @@ -23,7 +23,7 @@ use nexus_db_model::{ }; use uuid::Uuid; -use super::common::BackgroundTask; +use crate::app::background::BackgroundTask; use display_error_chain::DisplayErrorChain; use dpd_client::types::PortId; use futures::future::BoxFuture; diff --git a/nexus/src/app/background/v2p_mappings.rs b/nexus/src/app/background/tasks/v2p_mappings.rs similarity index 99% rename from nexus/src/app/background/v2p_mappings.rs rename to nexus/src/app/background/tasks/v2p_mappings.rs index e2318f94d6..26ce131e9a 100644 --- a/nexus/src/app/background/v2p_mappings.rs +++ b/nexus/src/app/background/tasks/v2p_mappings.rs @@ -12,7 +12,7 @@ use omicron_common::api::external::Vni; use serde_json::json; use sled_agent_client::types::VirtualNetworkInterfaceHost; -use super::common::BackgroundTask; +use crate::app::background::BackgroundTask; pub struct V2PManager { datastore: Arc,