From fc933eb2d0d6226169c18fdfe838195d230c16c5 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 21 Jun 2024 15:19:17 -0700 Subject: [PATCH 01/23] phase one: move specific task implementations into submodule --- nexus/src/app/background/init.rs | 40 +++++++++---------- nexus/src/app/background/mod.rs | 24 +---------- .../{ => tasks}/abandoned_vmm_reaper.rs | 0 nexus/src/app/background/{ => tasks}/bfd.rs | 2 +- .../{ => tasks}/blueprint_execution.rs | 0 .../background/{ => tasks}/blueprint_load.rs | 0 .../{ => tasks}/crdb_node_id_collector.rs | 0 .../app/background/{ => tasks}/dns_config.rs | 2 +- .../background/{ => tasks}/dns_propagation.rs | 4 +- .../app/background/{ => tasks}/dns_servers.rs | 0 .../{ => tasks}/external_endpoints.rs | 2 +- .../{ => tasks}/instance_watcher.rs | 0 .../{ => tasks}/inventory_collection.rs | 4 +- .../{ => tasks}/metrics_producer_gc.rs | 0 nexus/src/app/background/tasks/mod.rs | 30 ++++++++++++++ .../app/background/{ => tasks}/nat_cleanup.rs | 0 .../app/background/{ => tasks}/networking.rs | 0 .../background/{ => tasks}/phantom_disks.rs | 0 .../{ => tasks}/physical_disk_adoption.rs | 0 .../{ => tasks}/region_replacement.rs | 0 .../{ => tasks}/service_firewall_rules.rs | 0 .../{ => tasks}/sync_service_zone_nat.rs | 0 .../{ => tasks}/sync_switch_configuration.rs | 2 +- .../background/{ => tasks}/v2p_mappings.rs | 0 24 files changed, 60 insertions(+), 50 deletions(-) rename nexus/src/app/background/{ => tasks}/abandoned_vmm_reaper.rs (100%) rename nexus/src/app/background/{ => tasks}/bfd.rs (99%) rename nexus/src/app/background/{ => tasks}/blueprint_execution.rs (100%) rename nexus/src/app/background/{ => tasks}/blueprint_load.rs (100%) rename nexus/src/app/background/{ => tasks}/crdb_node_id_collector.rs (100%) rename nexus/src/app/background/{ => tasks}/dns_config.rs (99%) rename nexus/src/app/background/{ => tasks}/dns_propagation.rs (98%) rename nexus/src/app/background/{ => tasks}/dns_servers.rs (100%) rename 
nexus/src/app/background/{ => tasks}/external_endpoints.rs (98%) rename nexus/src/app/background/{ => tasks}/instance_watcher.rs (100%) rename nexus/src/app/background/{ => tasks}/inventory_collection.rs (98%) rename nexus/src/app/background/{ => tasks}/metrics_producer_gc.rs (100%) create mode 100644 nexus/src/app/background/tasks/mod.rs rename nexus/src/app/background/{ => tasks}/nat_cleanup.rs (100%) rename nexus/src/app/background/{ => tasks}/networking.rs (100%) rename nexus/src/app/background/{ => tasks}/phantom_disks.rs (100%) rename nexus/src/app/background/{ => tasks}/physical_disk_adoption.rs (100%) rename nexus/src/app/background/{ => tasks}/region_replacement.rs (100%) rename nexus/src/app/background/{ => tasks}/service_firewall_rules.rs (100%) rename nexus/src/app/background/{ => tasks}/sync_service_zone_nat.rs (100%) rename nexus/src/app/background/{ => tasks}/sync_switch_configuration.rs (99%) rename nexus/src/app/background/{ => tasks}/v2p_mappings.rs (100%) diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index f78cb69d76..38af9c682a 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -4,27 +4,27 @@ //! 
Background task initialization -use super::abandoned_vmm_reaper; -use super::bfd; -use super::blueprint_execution; -use super::blueprint_load; use super::common; -use super::crdb_node_id_collector; -use super::dns_config; -use super::dns_propagation; -use super::dns_servers; -use super::external_endpoints; -use super::instance_watcher; -use super::inventory_collection; -use super::metrics_producer_gc; -use super::nat_cleanup; -use super::phantom_disks; -use super::physical_disk_adoption; -use super::region_replacement; -use super::service_firewall_rules; -use super::sync_service_zone_nat::ServiceZoneNatTracker; -use super::sync_switch_configuration::SwitchPortSettingsManager; -use super::v2p_mappings::V2PManager; +use super::tasks::abandoned_vmm_reaper; +use super::tasks::bfd; +use super::tasks::blueprint_execution; +use super::tasks::blueprint_load; +use super::tasks::crdb_node_id_collector; +use super::tasks::dns_config; +use super::tasks::dns_propagation; +use super::tasks::dns_servers; +use super::tasks::external_endpoints; +use super::tasks::instance_watcher; +use super::tasks::inventory_collection; +use super::tasks::metrics_producer_gc; +use super::tasks::nat_cleanup; +use super::tasks::phantom_disks; +use super::tasks::physical_disk_adoption; +use super::tasks::region_replacement; +use super::tasks::service_firewall_rules; +use super::tasks::sync_service_zone_nat::ServiceZoneNatTracker; +use super::tasks::sync_switch_configuration::SwitchPortSettingsManager; +use super::tasks::v2p_mappings::V2PManager; use crate::app::oximeter::PRODUCER_LEASE_DURATION; use crate::app::sagas::SagaRequest; use nexus_config::BackgroundTaskConfig; diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs index 7d1fc43d69..41240ab810 100644 --- a/nexus/src/app/background/mod.rs +++ b/nexus/src/app/background/mod.rs @@ -2,31 +2,11 @@ // License, v. 2.0. 
If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Background tasks +//! Background task subsystem -mod abandoned_vmm_reaper; -mod bfd; -mod blueprint_execution; -mod blueprint_load; mod common; -mod crdb_node_id_collector; -mod dns_config; -mod dns_propagation; -mod dns_servers; -mod external_endpoints; mod init; -mod instance_watcher; -mod inventory_collection; -mod metrics_producer_gc; -mod nat_cleanup; -mod networking; -mod phantom_disks; -mod physical_disk_adoption; -mod region_replacement; -mod service_firewall_rules; mod status; -mod sync_service_zone_nat; -mod sync_switch_configuration; -mod v2p_mappings; +mod tasks; pub use init::BackgroundTasks; diff --git a/nexus/src/app/background/abandoned_vmm_reaper.rs b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs similarity index 100% rename from nexus/src/app/background/abandoned_vmm_reaper.rs rename to nexus/src/app/background/tasks/abandoned_vmm_reaper.rs diff --git a/nexus/src/app/background/bfd.rs b/nexus/src/app/background/tasks/bfd.rs similarity index 99% rename from nexus/src/app/background/bfd.rs rename to nexus/src/app/background/tasks/bfd.rs index 39b3c8f661..2c8f5de216 100644 --- a/nexus/src/app/background/bfd.rs +++ b/nexus/src/app/background/tasks/bfd.rs @@ -6,7 +6,7 @@ //! (BFD) sessions. 
use crate::app::{ - background::networking::build_mgd_clients, map_switch_zone_addrs, + background::tasks::networking::build_mgd_clients, map_switch_zone_addrs, }; use super::common::BackgroundTask; diff --git a/nexus/src/app/background/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs similarity index 100% rename from nexus/src/app/background/blueprint_execution.rs rename to nexus/src/app/background/tasks/blueprint_execution.rs diff --git a/nexus/src/app/background/blueprint_load.rs b/nexus/src/app/background/tasks/blueprint_load.rs similarity index 100% rename from nexus/src/app/background/blueprint_load.rs rename to nexus/src/app/background/tasks/blueprint_load.rs diff --git a/nexus/src/app/background/crdb_node_id_collector.rs b/nexus/src/app/background/tasks/crdb_node_id_collector.rs similarity index 100% rename from nexus/src/app/background/crdb_node_id_collector.rs rename to nexus/src/app/background/tasks/crdb_node_id_collector.rs diff --git a/nexus/src/app/background/dns_config.rs b/nexus/src/app/background/tasks/dns_config.rs similarity index 99% rename from nexus/src/app/background/dns_config.rs rename to nexus/src/app/background/tasks/dns_config.rs index 71e0a812a7..c717d8cf05 100644 --- a/nexus/src/app/background/dns_config.rs +++ b/nexus/src/app/background/tasks/dns_config.rs @@ -157,8 +157,8 @@ impl BackgroundTask for DnsConfigWatcher { #[cfg(test)] mod test { + use super::DnsConfigWatcher; use crate::app::background::common::BackgroundTask; - use crate::app::background::dns_config::DnsConfigWatcher; use crate::app::background::init::test::write_test_dns_generation; use assert_matches::assert_matches; use async_bb8_diesel::AsyncRunQueryDsl; diff --git a/nexus/src/app/background/dns_propagation.rs b/nexus/src/app/background/tasks/dns_propagation.rs similarity index 98% rename from nexus/src/app/background/dns_propagation.rs rename to nexus/src/app/background/tasks/dns_propagation.rs index 7d650f6f27..4349453148 100644 --- 
a/nexus/src/app/background/dns_propagation.rs +++ b/nexus/src/app/background/tasks/dns_propagation.rs @@ -177,9 +177,9 @@ async fn dns_propagate_one( #[cfg(test)] mod test { + use super::DnsPropagator; use crate::app::background::common::BackgroundTask; - use crate::app::background::dns_propagation::DnsPropagator; - use crate::app::background::dns_servers::DnsServersList; + use crate::app::background::tasks::dns_servers::DnsServersList; use dns_service_client::types::DnsConfigParams; use httptest::matchers::request; use httptest::responders::status_code; diff --git a/nexus/src/app/background/dns_servers.rs b/nexus/src/app/background/tasks/dns_servers.rs similarity index 100% rename from nexus/src/app/background/dns_servers.rs rename to nexus/src/app/background/tasks/dns_servers.rs diff --git a/nexus/src/app/background/external_endpoints.rs b/nexus/src/app/background/tasks/external_endpoints.rs similarity index 98% rename from nexus/src/app/background/external_endpoints.rs rename to nexus/src/app/background/tasks/external_endpoints.rs index 1a587298d5..2cc8e18f0d 100644 --- a/nexus/src/app/background/external_endpoints.rs +++ b/nexus/src/app/background/tasks/external_endpoints.rs @@ -117,8 +117,8 @@ impl BackgroundTask for ExternalEndpointsWatcher { #[cfg(test)] mod test { + use super::ExternalEndpointsWatcher; use crate::app::background::common::BackgroundTask; - use crate::app::background::external_endpoints::ExternalEndpointsWatcher; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::fixed_data::silo::DEFAULT_SILO; use nexus_test_utils::resource_helpers::create_silo; diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs similarity index 100% rename from nexus/src/app/background/instance_watcher.rs rename to nexus/src/app/background/tasks/instance_watcher.rs diff --git a/nexus/src/app/background/inventory_collection.rs b/nexus/src/app/background/tasks/inventory_collection.rs similarity index 
98% rename from nexus/src/app/background/inventory_collection.rs rename to nexus/src/app/background/tasks/inventory_collection.rs index 52ee8f6e13..13410eb9d2 100644 --- a/nexus/src/app/background/inventory_collection.rs +++ b/nexus/src/app/background/tasks/inventory_collection.rs @@ -186,10 +186,10 @@ impl<'a> nexus_inventory::SledAgentEnumerator for DbSledAgentEnumerator<'a> { #[cfg(test)] mod test { + use super::DbSledAgentEnumerator; + use super::InventoryCollector; use crate::app::authz; use crate::app::background::common::BackgroundTask; - use crate::app::background::inventory_collection::DbSledAgentEnumerator; - use crate::app::background::inventory_collection::InventoryCollector; use nexus_db_model::Generation; use nexus_db_model::SledBaseboard; use nexus_db_model::SledSystemHardware; diff --git a/nexus/src/app/background/metrics_producer_gc.rs b/nexus/src/app/background/tasks/metrics_producer_gc.rs similarity index 100% rename from nexus/src/app/background/metrics_producer_gc.rs rename to nexus/src/app/background/tasks/metrics_producer_gc.rs diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs new file mode 100644 index 0000000000..ae7f8af74e --- /dev/null +++ b/nexus/src/app/background/tasks/mod.rs @@ -0,0 +1,30 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Implementations of specific background tasks + +pub mod abandoned_vmm_reaper; +pub mod bfd; +pub mod blueprint_execution; +pub mod blueprint_load; +pub mod crdb_node_id_collector; +pub mod dns_config; +pub mod dns_propagation; +pub mod dns_servers; +pub mod external_endpoints; +pub mod instance_watcher; +pub mod inventory_collection; +pub mod metrics_producer_gc; +pub mod nat_cleanup; +pub mod networking; +pub mod phantom_disks; +pub mod physical_disk_adoption; +pub mod region_replacement; +pub mod service_firewall_rules; +pub mod sync_service_zone_nat; +pub mod sync_switch_configuration; +pub mod v2p_mappings; + +// XXX-dap fix up all the individual tasks to fix up this import +use super::common; diff --git a/nexus/src/app/background/nat_cleanup.rs b/nexus/src/app/background/tasks/nat_cleanup.rs similarity index 100% rename from nexus/src/app/background/nat_cleanup.rs rename to nexus/src/app/background/tasks/nat_cleanup.rs diff --git a/nexus/src/app/background/networking.rs b/nexus/src/app/background/tasks/networking.rs similarity index 100% rename from nexus/src/app/background/networking.rs rename to nexus/src/app/background/tasks/networking.rs diff --git a/nexus/src/app/background/phantom_disks.rs b/nexus/src/app/background/tasks/phantom_disks.rs similarity index 100% rename from nexus/src/app/background/phantom_disks.rs rename to nexus/src/app/background/tasks/phantom_disks.rs diff --git a/nexus/src/app/background/physical_disk_adoption.rs b/nexus/src/app/background/tasks/physical_disk_adoption.rs similarity index 100% rename from nexus/src/app/background/physical_disk_adoption.rs rename to nexus/src/app/background/tasks/physical_disk_adoption.rs diff --git a/nexus/src/app/background/region_replacement.rs b/nexus/src/app/background/tasks/region_replacement.rs similarity index 100% rename from nexus/src/app/background/region_replacement.rs rename to nexus/src/app/background/tasks/region_replacement.rs diff --git 
a/nexus/src/app/background/service_firewall_rules.rs b/nexus/src/app/background/tasks/service_firewall_rules.rs similarity index 100% rename from nexus/src/app/background/service_firewall_rules.rs rename to nexus/src/app/background/tasks/service_firewall_rules.rs diff --git a/nexus/src/app/background/sync_service_zone_nat.rs b/nexus/src/app/background/tasks/sync_service_zone_nat.rs similarity index 100% rename from nexus/src/app/background/sync_service_zone_nat.rs rename to nexus/src/app/background/tasks/sync_service_zone_nat.rs diff --git a/nexus/src/app/background/sync_switch_configuration.rs b/nexus/src/app/background/tasks/sync_switch_configuration.rs similarity index 99% rename from nexus/src/app/background/sync_switch_configuration.rs rename to nexus/src/app/background/tasks/sync_switch_configuration.rs index 8552d62988..7069016bbc 100644 --- a/nexus/src/app/background/sync_switch_configuration.rs +++ b/nexus/src/app/background/tasks/sync_switch_configuration.rs @@ -6,7 +6,7 @@ //! to relevant management daemons (dendrite, mgd, sled-agent, etc.) 
use crate::app::{ - background::networking::{ + background::tasks::networking::{ api_to_dpd_port_settings, build_dpd_clients, build_mgd_clients, }, map_switch_zone_addrs, diff --git a/nexus/src/app/background/v2p_mappings.rs b/nexus/src/app/background/tasks/v2p_mappings.rs similarity index 100% rename from nexus/src/app/background/v2p_mappings.rs rename to nexus/src/app/background/tasks/v2p_mappings.rs From 79036c404fef48634d7fe5697034755004f627fd Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 21 Jun 2024 15:25:53 -0700 Subject: [PATCH 02/23] phase two: rework imports --- nexus/src/app/background/tasks/abandoned_vmm_reaper.rs | 2 +- nexus/src/app/background/tasks/bfd.rs | 2 +- nexus/src/app/background/tasks/blueprint_execution.rs | 2 +- nexus/src/app/background/tasks/blueprint_load.rs | 2 +- nexus/src/app/background/tasks/crdb_node_id_collector.rs | 2 +- nexus/src/app/background/tasks/dns_config.rs | 2 +- nexus/src/app/background/tasks/dns_propagation.rs | 2 +- nexus/src/app/background/tasks/dns_servers.rs | 2 +- nexus/src/app/background/tasks/external_endpoints.rs | 2 +- nexus/src/app/background/tasks/instance_watcher.rs | 2 +- nexus/src/app/background/tasks/inventory_collection.rs | 2 +- nexus/src/app/background/tasks/metrics_producer_gc.rs | 2 +- nexus/src/app/background/tasks/mod.rs | 3 --- nexus/src/app/background/tasks/nat_cleanup.rs | 2 +- nexus/src/app/background/tasks/phantom_disks.rs | 2 +- nexus/src/app/background/tasks/physical_disk_adoption.rs | 2 +- nexus/src/app/background/tasks/region_replacement.rs | 2 +- nexus/src/app/background/tasks/service_firewall_rules.rs | 2 +- nexus/src/app/background/tasks/sync_service_zone_nat.rs | 2 +- nexus/src/app/background/tasks/sync_switch_configuration.rs | 2 +- nexus/src/app/background/tasks/v2p_mappings.rs | 2 +- 21 files changed, 20 insertions(+), 23 deletions(-) diff --git a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs index 
3883185d9f..9f185004f3 100644 --- a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs +++ b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs @@ -31,7 +31,7 @@ //! is handled elsewhere, by `notify_instance_updated` and (eventually) the //! `instance-update` saga. -use super::common::BackgroundTask; +use crate::app::background::common::BackgroundTask; use anyhow::Context; use futures::future::BoxFuture; use futures::FutureExt; diff --git a/nexus/src/app/background/tasks/bfd.rs b/nexus/src/app/background/tasks/bfd.rs index 2c8f5de216..601ef4a5a6 100644 --- a/nexus/src/app/background/tasks/bfd.rs +++ b/nexus/src/app/background/tasks/bfd.rs @@ -9,7 +9,7 @@ use crate::app::{ background::tasks::networking::build_mgd_clients, map_switch_zone_addrs, }; -use super::common::BackgroundTask; +use crate::app::background::common::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use internal_dns::{resolver::Resolver, ServiceName}; diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index b01d1213de..2fc1779504 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -4,7 +4,7 @@ //! Background task for realizing a plan blueprint -use super::common::BackgroundTask; +use crate::app::background::common::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_queries::context::OpContext; diff --git a/nexus/src/app/background/tasks/blueprint_load.rs b/nexus/src/app/background/tasks/blueprint_load.rs index baf86d655f..c43207e281 100644 --- a/nexus/src/app/background/tasks/blueprint_load.rs +++ b/nexus/src/app/background/tasks/blueprint_load.rs @@ -7,7 +7,7 @@ //! This task triggers the `blueprint_execution` background task when the //! blueprint changes. 
-use super::common::BackgroundTask; +use crate::app::background::common::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_queries::context::OpContext; diff --git a/nexus/src/app/background/tasks/crdb_node_id_collector.rs b/nexus/src/app/background/tasks/crdb_node_id_collector.rs index 2736514021..da4e3448dd 100644 --- a/nexus/src/app/background/tasks/crdb_node_id_collector.rs +++ b/nexus/src/app/background/tasks/crdb_node_id_collector.rs @@ -23,7 +23,7 @@ //! the status of all nodes and looking for orphans, perhaps) to determine //! whether a zone without a known node ID ever existed. -use super::common::BackgroundTask; +use crate::app::background::common::BackgroundTask; use anyhow::ensure; use anyhow::Context; use futures::future::BoxFuture; diff --git a/nexus/src/app/background/tasks/dns_config.rs b/nexus/src/app/background/tasks/dns_config.rs index c717d8cf05..55bc88761c 100644 --- a/nexus/src/app/background/tasks/dns_config.rs +++ b/nexus/src/app/background/tasks/dns_config.rs @@ -4,7 +4,7 @@ //! Background task for keeping track of DNS configuration -use super::common::BackgroundTask; +use crate::app::background::common::BackgroundTask; use dns_service_client::types::DnsConfigParams; use futures::future::BoxFuture; use futures::FutureExt; diff --git a/nexus/src/app/background/tasks/dns_propagation.rs b/nexus/src/app/background/tasks/dns_propagation.rs index 4349453148..7ce321d7f5 100644 --- a/nexus/src/app/background/tasks/dns_propagation.rs +++ b/nexus/src/app/background/tasks/dns_propagation.rs @@ -4,8 +4,8 @@ //! 
Background task for propagating DNS configuration to all DNS servers -use super::common::BackgroundTask; use super::dns_servers::DnsServersList; +use crate::app::background::common::BackgroundTask; use anyhow::Context; use dns_service_client::types::DnsConfigParams; use futures::future::BoxFuture; diff --git a/nexus/src/app/background/tasks/dns_servers.rs b/nexus/src/app/background/tasks/dns_servers.rs index 8f4cce4ee0..19442f9655 100644 --- a/nexus/src/app/background/tasks/dns_servers.rs +++ b/nexus/src/app/background/tasks/dns_servers.rs @@ -4,7 +4,7 @@ //! Background task for keeping track of DNS servers -use super::common::BackgroundTask; +use crate::app::background::common::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use internal_dns::names::ServiceName; diff --git a/nexus/src/app/background/tasks/external_endpoints.rs b/nexus/src/app/background/tasks/external_endpoints.rs index 2cc8e18f0d..d0fd0ed20a 100644 --- a/nexus/src/app/background/tasks/external_endpoints.rs +++ b/nexus/src/app/background/tasks/external_endpoints.rs @@ -6,7 +6,7 @@ //! all Silos, their externally-visible DNS names, and the TLS certificates //! associated with those names -use super::common::BackgroundTask; +use crate::app::background::common::BackgroundTask; use crate::app::external_endpoints::read_all_endpoints; pub use crate::app::external_endpoints::ExternalEndpoints; use futures::future::BoxFuture; diff --git a/nexus/src/app/background/tasks/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs index 1b10605c5e..e1c94d577d 100644 --- a/nexus/src/app/background/tasks/instance_watcher.rs +++ b/nexus/src/app/background/tasks/instance_watcher.rs @@ -4,7 +4,7 @@ //! Background task for pulling instance state from sled-agents. 
-use super::common::BackgroundTask; +use crate::app::background::common::BackgroundTask; use futures::{future::BoxFuture, FutureExt}; use http::StatusCode; use nexus_db_model::Instance; diff --git a/nexus/src/app/background/tasks/inventory_collection.rs b/nexus/src/app/background/tasks/inventory_collection.rs index 13410eb9d2..dae5ccae8e 100644 --- a/nexus/src/app/background/tasks/inventory_collection.rs +++ b/nexus/src/app/background/tasks/inventory_collection.rs @@ -4,7 +4,7 @@ //! Background task for reading inventory for the rack -use super::common::BackgroundTask; +use crate::app::background::common::BackgroundTask; use anyhow::ensure; use anyhow::Context; use futures::future::BoxFuture; diff --git a/nexus/src/app/background/tasks/metrics_producer_gc.rs b/nexus/src/app/background/tasks/metrics_producer_gc.rs index 2a8464b80f..5dbb16ef80 100644 --- a/nexus/src/app/background/tasks/metrics_producer_gc.rs +++ b/nexus/src/app/background/tasks/metrics_producer_gc.rs @@ -5,7 +5,7 @@ //! Background task for garbage collecting metrics producers that have not //! 
renewed their lease -use super::common::BackgroundTask; +use crate::app::background::common::BackgroundTask; use chrono::TimeDelta; use chrono::Utc; use futures::future::BoxFuture; diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index ae7f8af74e..d4a8733eb6 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -25,6 +25,3 @@ pub mod service_firewall_rules; pub mod sync_service_zone_nat; pub mod sync_switch_configuration; pub mod v2p_mappings; - -// XXX-dap fix up all the individual tasks to fix up this import -use super::common; diff --git a/nexus/src/app/background/tasks/nat_cleanup.rs b/nexus/src/app/background/tasks/nat_cleanup.rs index 844dbffefe..399b999340 100644 --- a/nexus/src/app/background/tasks/nat_cleanup.rs +++ b/nexus/src/app/background/tasks/nat_cleanup.rs @@ -8,8 +8,8 @@ use crate::app::map_switch_zone_addrs; -use super::common::BackgroundTask; use super::networking::build_dpd_clients; +use crate::app::background::common::BackgroundTask; use chrono::{Duration, Utc}; use futures::future::BoxFuture; use futures::FutureExt; diff --git a/nexus/src/app/background/tasks/phantom_disks.rs b/nexus/src/app/background/tasks/phantom_disks.rs index 48688838e5..6b128760ed 100644 --- a/nexus/src/app/background/tasks/phantom_disks.rs +++ b/nexus/src/app/background/tasks/phantom_disks.rs @@ -18,7 +18,7 @@ //! this background task is required to apply the same fix for disks that are //! already in this phantom state. 
-use super::common::BackgroundTask; +use crate::app::background::common::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_queries::context::OpContext; diff --git a/nexus/src/app/background/tasks/physical_disk_adoption.rs b/nexus/src/app/background/tasks/physical_disk_adoption.rs index 05c53963de..db2b126d71 100644 --- a/nexus/src/app/background/tasks/physical_disk_adoption.rs +++ b/nexus/src/app/background/tasks/physical_disk_adoption.rs @@ -11,7 +11,7 @@ //! //! In the future, this may become more explicitly operator-controlled. -use super::common::BackgroundTask; +use crate::app::background::common::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_model::PhysicalDisk; diff --git a/nexus/src/app/background/tasks/region_replacement.rs b/nexus/src/app/background/tasks/region_replacement.rs index 02ae548d75..14e11cc86b 100644 --- a/nexus/src/app/background/tasks/region_replacement.rs +++ b/nexus/src/app/background/tasks/region_replacement.rs @@ -10,8 +10,8 @@ //! for any requests that are in state "Requested". See the documentation there //! for more information. -use super::common::BackgroundTask; use crate::app::authn; +use crate::app::background::common::BackgroundTask; use crate::app::sagas; use crate::app::RegionAllocationStrategy; use futures::future::BoxFuture; diff --git a/nexus/src/app/background/tasks/service_firewall_rules.rs b/nexus/src/app/background/tasks/service_firewall_rules.rs index 1a705d1fae..36a9926518 100644 --- a/nexus/src/app/background/tasks/service_firewall_rules.rs +++ b/nexus/src/app/background/tasks/service_firewall_rules.rs @@ -10,7 +10,7 @@ //! handle general changes to customer-visible VPC firewalls, and is mostly in //! place to propagate changes in the IP allowlist for user-facing services. 
-use super::common::BackgroundTask; +use crate::app::background::common::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_queries::context::OpContext; diff --git a/nexus/src/app/background/tasks/sync_service_zone_nat.rs b/nexus/src/app/background/tasks/sync_service_zone_nat.rs index b0a4c8cef2..3152ae468e 100644 --- a/nexus/src/app/background/tasks/sync_service_zone_nat.rs +++ b/nexus/src/app/background/tasks/sync_service_zone_nat.rs @@ -7,8 +7,8 @@ use crate::app::map_switch_zone_addrs; -use super::common::BackgroundTask; use super::networking::build_dpd_clients; +use crate::app::background::common::BackgroundTask; use anyhow::Context; use futures::future::BoxFuture; use futures::FutureExt; diff --git a/nexus/src/app/background/tasks/sync_switch_configuration.rs b/nexus/src/app/background/tasks/sync_switch_configuration.rs index 7069016bbc..dd21aeca31 100644 --- a/nexus/src/app/background/tasks/sync_switch_configuration.rs +++ b/nexus/src/app/background/tasks/sync_switch_configuration.rs @@ -23,7 +23,7 @@ use nexus_db_model::{ }; use uuid::Uuid; -use super::common::BackgroundTask; +use crate::app::background::common::BackgroundTask; use display_error_chain::DisplayErrorChain; use dpd_client::types::PortId; use futures::future::BoxFuture; diff --git a/nexus/src/app/background/tasks/v2p_mappings.rs b/nexus/src/app/background/tasks/v2p_mappings.rs index e2318f94d6..16c9d57526 100644 --- a/nexus/src/app/background/tasks/v2p_mappings.rs +++ b/nexus/src/app/background/tasks/v2p_mappings.rs @@ -12,7 +12,7 @@ use omicron_common::api::external::Vni; use serde_json::json; use sled_agent_client::types::VirtualNetworkInterfaceHost; -use super::common::BackgroundTask; +use crate::app::background::common::BackgroundTask; pub struct V2PManager { datastore: Arc, From 7472be7bc131627374e625f87ac62e0cbb360278 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 24 Jun 2024 11:09:42 -0700 Subject: [PATCH 03/23] reorganize the top-level code a 
bit --- .../app/background/{common.rs => driver.rs} | 155 +---------------- nexus/src/app/background/init.rs | 57 +++---- nexus/src/app/background/mod.rs | 157 +++++++++++++++++- .../background/tasks/abandoned_vmm_reaper.rs | 2 +- nexus/src/app/background/tasks/bfd.rs | 2 +- .../background/tasks/blueprint_execution.rs | 4 +- .../app/background/tasks/blueprint_load.rs | 4 +- .../tasks/crdb_node_id_collector.rs | 2 +- nexus/src/app/background/tasks/dns_config.rs | 4 +- .../app/background/tasks/dns_propagation.rs | 4 +- nexus/src/app/background/tasks/dns_servers.rs | 2 +- .../background/tasks/external_endpoints.rs | 4 +- .../app/background/tasks/instance_watcher.rs | 2 +- .../background/tasks/inventory_collection.rs | 4 +- .../background/tasks/metrics_producer_gc.rs | 2 +- nexus/src/app/background/tasks/nat_cleanup.rs | 2 +- .../src/app/background/tasks/phantom_disks.rs | 2 +- .../tasks/physical_disk_adoption.rs | 2 +- .../background/tasks/region_replacement.rs | 2 +- .../tasks/service_firewall_rules.rs | 2 +- .../background/tasks/sync_service_zone_nat.rs | 2 +- .../tasks/sync_switch_configuration.rs | 2 +- .../src/app/background/tasks/v2p_mappings.rs | 2 +- 23 files changed, 214 insertions(+), 207 deletions(-) rename nexus/src/app/background/{common.rs => driver.rs} (76%) diff --git a/nexus/src/app/background/common.rs b/nexus/src/app/background/driver.rs similarity index 76% rename from nexus/src/app/background/common.rs rename to nexus/src/app/background/driver.rs index da595dc4e1..f1982b1ad8 100644 --- a/nexus/src/app/background/common.rs +++ b/nexus/src/app/background/driver.rs @@ -2,132 +2,10 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! # Nexus Background Tasks -//! -//! A **background task** in Nexus is any operation that can be activated both -//! periodically and by an explicit signal. This is aimed at RFD 373-style -//! 
"reliable persistent workflows", also called "reconcilers" or "controllers". -//! These are a kind of automation that examines some _current_ state, compares -//! it to some _intended_ state, and potentially takes action to try to bring -//! the current state in sync with the intended state. Our canonical example is -//! that we want to have Nexus monitor the intended DNS configuration. When it -//! changes, we want to propagate the new configuration to all DNS servers. We -//! implement this with three different background tasks: -//! -//! 1. `DnsConfigWatcher` reads the DNS configuration from the database, stores -//! it in memory, and makes it available via a `tokio::sync::watch` channel. -//! 2. `DnsServersWatcher` reads the list of DNS servers from the database, -//! stores it in memory, and makes it available via a `tokio::sync::watch` -//! channel. -//! 3. `DnsPropagator` uses the the watch channels provided by the other two -//! background tasks to notice when either the DNS configuration or the list -//! of DNS servers has changed. It uses the latest values to make a request -//! to each server to update its configuration. -//! -//! When Nexus changes the DNS configuration, it will update the database with -//! the new configuration and then explicitly activate the `DnsConfigWatcher`. -//! When it reads the new config, it will send it to its watch channel, and that -//! will activate the `DnsPropagator`. If any of this fails, or if Nexus -//! crashes at any point, then the periodic activation of every background task -//! will eventually cause the latest config to be propagated to all of the -//! current servers. -//! -//! The background task framework here is pretty minimal: essentially what it -//! gives you is that you just write an idempotent function that you want to -//! happen periodically or on-demand, wrap it in an impl of `BackgroundTask`, -//! register that with the `Driver`, and you're done. The framework will take -//! care of: -//! -//! 
* providing a way for Nexus at-large to activate your task -//! * activating your task periodically -//! * ensuring that the task is activated only once at a time in this Nexus -//! (but note that it may always be running concurrently in other Nexus -//! instances) -//! * providing basic visibility into whether the task is running, when the task -//! last ran, etc. -//! -//! We may well want to extend the framework as we build more tasks in general -//! and reconcilers specifically. But we should be mindful not to create -//! footguns for ourselves! See "Design notes" below. -//! -//! ## Notes for background task implementors -//! -//! Background tasks are not necessarily just for reconcilers. That's just the -//! design center. The first two DNS background tasks above aren't reconcilers -//! in any non-trivial sense. -//! -//! Background task activations do not accept input, by design. See "Design -//! notes" below. -//! -//! Generally, you probably don't want to have your background task do retries. -//! If things fail, you rely on the periodic reactivation to try again. -//! -//! ## Design notes -//! -//! The underlying design for RFD 373-style reconcilers is inspired by a few -//! related principles: -//! -//! * the principle in distributed systems of having exactly one code path to -//! achieve a thing, and then always using that path to do that thing (as -//! opposed to having separate paths for, say, the happy path vs. failover, -//! and having one of those paths rarely used) -//! * the [constant-work pattern][1], which basically suggests that a system can -//! be more robust and scalable if it's constructed in a way that always does -//! the same amount of work. Imagine if we made requests to the DNS servers -//! to incrementally update their config every time the DNS data changed. -//! This system does more work as users make more requests. During overloads, -//! things can fall over. Compare with a system whose frontend merely updates -//! 
the DNS configuration that _should_ exist and whose backend periodically -//! scans the complete intended state and then sets its own state accordingly. -//! The backend does the same amount of work no matter how many requests were -//! made, making it more resistant to overload. A big downside of this -//! approach is increased latency from the user making a request to seeing it -//! applied. This can be mitigated (sacrificing some, but not all, of the -//! "constant work" property) by triggering a backend scan operation when user -//! requests complete. -//! * the design pattern in distributed systems of keeping two copies of data in -//! sync using both event notifications (like a changelog) _and_ periodic full -//! scans. The hope is that a full scan never finds a change that wasn't -//! correctly sync'd, but incorporating an occasional full scan into the -//! design ensures that such bugs are found and their impact repaired -//! automatically. -//! -//! [1]: https://aws.amazon.com/builders-library/reliability-and-constant-work/ -//! -//! Combining these, we get a design pattern for a "reconciler" where: -//! -//! * The reconciler is activated by explicit request (when we know it has work -//! to do) _and_ periodically (to deal with all manner of transient failures) -//! * The reconciler's activity is idempotent: given the same underlying state -//! (e.g., database state), it always attempts to do the same thing. -//! * Each activation of the reconciler accepts no input. That is, even when we -//! think we know what changed, we do not use that information. This ensures -//! that the reconciler really is idempotent and its actions are based solely -//! on the state that it's watching. Put differently: having reconcilers -//! accept an explicit hint about what changed (and then doing something -//! differently based on that) bifurcates the code: there's the common case -//! where that hint is available and the rarely-exercised case when it's not -//! 
(e.g., because Nexus crashed and it's the subsequent periodic activation -//! that's propagating this change). This is what we're trying to avoid. -//! * We do allow reconcilers to be triggered by a `tokio::sync::watch` channel -//! -- but again, not using the _data_ from that channel. There are two big -//! advantages here: (1) reduced latency from when a change is made to when -//! the reconciler applies it, and (2) (arguably another way to say the same -//! thing) we can space out the periodic activations much further, knowing -//! that most of the time we're not increasing latency by doing this. This -//! compromises the "constant-work" pattern a bit: we might wind up running -//! the reconciler more often during busy times than during idle times, and we -//! could find that overloads something. However, the _operation_ of the -//! reconciler can still be constant work, and there's no more than that -//! amount of work going on at any given time. -//! -//! `watch` channels are a convenient primitive here because they only store -//! one value. With a little care, we can ensure that the writer never blocks -//! and the readers can all see the latest value. (By design, reconcilers -//! generally only care about the latest state of something, not any -//! intermediate states.) We don't have to worry about an unbounded queue, or -//! handling a full queue, or other forms of backpressure. +//! Manages execution of background tasks +use super::BackgroundTask; +use super::TaskHandle; use assert_matches::assert_matches; use chrono::Utc; use futures::future::BoxFuture; @@ -149,16 +27,6 @@ use tokio::sync::watch; use tokio::sync::Notify; use tokio::time::MissedTickBehavior; -/// An operation activated both periodically and by an explicit signal -/// -/// See module-level documentation for details. 
-pub trait BackgroundTask: Send + Sync { - fn activate<'a>( - &'a mut self, - opctx: &'a OpContext, - ) -> BoxFuture<'a, serde_json::Value>; -} - /// Drives the execution of background tasks /// /// Nexus has only one Driver. All background tasks are registered with the @@ -170,21 +38,6 @@ pub struct Driver { tasks: BTreeMap, } -/// Identifies a background task -/// -/// This is returned by [`Driver::register()`] to identify the corresponding -/// background task. It's then accepted by functions like -/// [`Driver::activate()`] and [`Driver::task_status()`] to identify the task. -#[derive(Clone, Debug, Ord, PartialOrd, PartialEq, Eq)] -pub struct TaskHandle(String); - -impl TaskHandle { - /// Returns the unique name of this background task - pub fn name(&self) -> &str { - &self.0 - } -} - /// Driver-side state of a background task struct Task { /// what this task does (for developers) @@ -466,7 +319,6 @@ impl GenericWatcher for watch::Receiver { mod test { use super::BackgroundTask; use super::Driver; - use crate::app::background::common::ActivationReason; use crate::app::sagas::SagaRequest; use assert_matches::assert_matches; use chrono::Utc; @@ -474,6 +326,7 @@ mod test { use futures::FutureExt; use nexus_db_queries::context::OpContext; use nexus_test_utils_macros::nexus_test; + use nexus_types::internal_api::views::ActivationReason; use std::time::Duration; use std::time::Instant; use tokio::sync::mpsc; diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 38af9c682a..bae478458f 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -2,9 +2,8 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Background task initialization +//! 
Specific background task initialization -use super::common; use super::tasks::abandoned_vmm_reaper; use super::tasks::bfd; use super::tasks::blueprint_execution; @@ -25,6 +24,8 @@ use super::tasks::service_firewall_rules; use super::tasks::sync_service_zone_nat::ServiceZoneNatTracker; use super::tasks::sync_switch_configuration::SwitchPortSettingsManager; use super::tasks::v2p_mappings::V2PManager; +use super::Driver; +use super::TaskHandle; use crate::app::oximeter::PRODUCER_LEASE_DURATION; use crate::app::sagas::SagaRequest; use nexus_config::BackgroundTaskConfig; @@ -46,73 +47,73 @@ use uuid::Uuid; pub struct BackgroundTasks { /// interface for working with background tasks (activation, checking /// status, etc.) - pub driver: common::Driver, + pub driver: Driver, /// task handle for the internal DNS config background task - pub task_internal_dns_config: common::TaskHandle, + pub task_internal_dns_config: TaskHandle, /// task handle for the internal DNS servers background task - pub task_internal_dns_servers: common::TaskHandle, + pub task_internal_dns_servers: TaskHandle, /// task handle for the external DNS config background task - pub task_external_dns_config: common::TaskHandle, + pub task_external_dns_config: TaskHandle, /// task handle for the external DNS servers background task - pub task_external_dns_servers: common::TaskHandle, + pub task_external_dns_servers: TaskHandle, /// task handle for pruning metrics producers with expired leases - pub task_metrics_producer_gc: common::TaskHandle, + pub task_metrics_producer_gc: TaskHandle, /// task handle for the task that keeps track of external endpoints - pub task_external_endpoints: common::TaskHandle, + pub task_external_endpoints: TaskHandle, /// external endpoints read by the background task pub external_endpoints: tokio::sync::watch::Receiver< Option, >, /// task handle for the ipv4 nat entry garbage collector - pub nat_cleanup: common::TaskHandle, + pub nat_cleanup: TaskHandle, /// task handle for the 
switch bfd manager - pub bfd_manager: common::TaskHandle, + pub bfd_manager: TaskHandle, /// task handle for the task that collects inventory - pub task_inventory_collection: common::TaskHandle, + pub task_inventory_collection: TaskHandle, /// task handle for the task that collects inventory - pub task_physical_disk_adoption: common::TaskHandle, + pub task_physical_disk_adoption: TaskHandle, /// task handle for the task that detects phantom disks - pub task_phantom_disks: common::TaskHandle, + pub task_phantom_disks: TaskHandle, /// task handle for blueprint target loader - pub task_blueprint_loader: common::TaskHandle, + pub task_blueprint_loader: TaskHandle, /// task handle for blueprint execution background task - pub task_blueprint_executor: common::TaskHandle, + pub task_blueprint_executor: TaskHandle, /// task handle for collecting CockroachDB node IDs - pub task_crdb_node_id_collector: common::TaskHandle, + pub task_crdb_node_id_collector: TaskHandle, /// task handle for the service zone nat tracker - pub task_service_zone_nat_tracker: common::TaskHandle, + pub task_service_zone_nat_tracker: TaskHandle, /// task handle for the switch port settings manager - pub task_switch_port_settings_manager: common::TaskHandle, + pub task_switch_port_settings_manager: TaskHandle, /// task handle for the opte v2p manager - pub task_v2p_manager: common::TaskHandle, + pub task_v2p_manager: TaskHandle, /// task handle for the task that detects if regions need replacement and /// begins the process - pub task_region_replacement: common::TaskHandle, + pub task_region_replacement: TaskHandle, /// task handle for the task that polls sled agents for instance states. 
- pub task_instance_watcher: common::TaskHandle, + pub task_instance_watcher: TaskHandle, /// task handle for propagation of VPC firewall rules for Omicron services /// with external network connectivity, - pub task_service_firewall_propagation: common::TaskHandle, + pub task_service_firewall_propagation: TaskHandle, /// task handle for deletion of database records for VMMs abandoned by their /// instances. - pub task_abandoned_vmm_reaper: common::TaskHandle, + pub task_abandoned_vmm_reaper: TaskHandle, } impl BackgroundTasks { @@ -132,7 +133,7 @@ impl BackgroundTasks { ), producer_registry: &ProducerRegistry, ) -> BackgroundTasks { - let mut driver = common::Driver::new(); + let mut driver = Driver::new(); let (task_internal_dns_config, task_internal_dns_servers) = init_dns( &mut driver, @@ -468,19 +469,19 @@ impl BackgroundTasks { } } - pub fn activate(&self, task: &common::TaskHandle) { + pub fn activate(&self, task: &TaskHandle) { self.driver.activate(task); } } fn init_dns( - driver: &mut common::Driver, + driver: &mut Driver, opctx: &OpContext, datastore: Arc, dns_group: DnsGroup, resolver: internal_dns::resolver::Resolver, config: &DnsTasksConfig, -) -> (common::TaskHandle, common::TaskHandle) { +) -> (TaskHandle, TaskHandle) { let dns_group_name = dns_group.to_string(); let metadata = BTreeMap::from([("dns_group".to_string(), dns_group_name)]); diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs index 41240ab810..40716aa036 100644 --- a/nexus/src/app/background/mod.rs +++ b/nexus/src/app/background/mod.rs @@ -2,11 +2,164 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Background task subsystem +//! # Nexus Background Tasks +//! +//! A **background task** in Nexus is any operation that can be activated both +//! periodically and by an explicit signal. This is aimed at RFD 373-style +//! 
"reliable persistent workflows", also called "reconcilers" or "controllers". +//! These are a kind of automation that examines some _current_ state, compares +//! it to some _intended_ state, and potentially takes action to try to bring +//! the current state in sync with the intended state. Our canonical example is +//! that we want to have Nexus monitor the intended DNS configuration. When it +//! changes, we want to propagate the new configuration to all DNS servers. We +//! implement this with three different background tasks: +//! +//! 1. `DnsConfigWatcher` reads the DNS configuration from the database, stores +//! it in memory, and makes it available via a `tokio::sync::watch` channel. +//! 2. `DnsServersWatcher` reads the list of DNS servers from the database, +//! stores it in memory, and makes it available via a `tokio::sync::watch` +//! channel. +//! 3. `DnsPropagator` uses the the watch channels provided by the other two +//! background tasks to notice when either the DNS configuration or the list +//! of DNS servers has changed. It uses the latest values to make a request +//! to each server to update its configuration. +//! +//! When Nexus changes the DNS configuration, it will update the database with +//! the new configuration and then explicitly activate the `DnsConfigWatcher`. +//! When it reads the new config, it will send it to its watch channel, and that +//! will activate the `DnsPropagator`. If any of this fails, or if Nexus +//! crashes at any point, then the periodic activation of every background task +//! will eventually cause the latest config to be propagated to all of the +//! current servers. +//! +//! The background task framework here is pretty minimal: essentially what it +//! gives you is that you just write an idempotent function that you want to +//! happen periodically or on-demand, wrap it in an impl of `BackgroundTask`, +//! register that with the `Driver`, and you're done. The framework will take +//! care of: +//! +//! 
* providing a way for Nexus at-large to activate your task +//! * activating your task periodically +//! * ensuring that the task is activated only once at a time in this Nexus +//! (but note that it may always be running concurrently in other Nexus +//! instances) +//! * providing basic visibility into whether the task is running, when the task +//! last ran, etc. +//! +//! We may well want to extend the framework as we build more tasks in general +//! and reconcilers specifically. But we should be mindful not to create +//! footguns for ourselves! See "Design notes" below. +//! +//! ## Notes for background task implementors +//! +//! Background tasks are not necessarily just for reconcilers. That's just the +//! design center. The first two DNS background tasks above aren't reconcilers +//! in any non-trivial sense. +//! +//! Background task activations do not accept input, by design. See "Design +//! notes" below. +//! +//! Generally, you probably don't want to have your background task do retries. +//! If things fail, you rely on the periodic reactivation to try again. +//! +//! ## Design notes +//! +//! The underlying design for RFD 373-style reconcilers is inspired by a few +//! related principles: +//! +//! * the principle in distributed systems of having exactly one code path to +//! achieve a thing, and then always using that path to do that thing (as +//! opposed to having separate paths for, say, the happy path vs. failover, +//! and having one of those paths rarely used) +//! * the [constant-work pattern][1], which basically suggests that a system can +//! be more robust and scalable if it's constructed in a way that always does +//! the same amount of work. Imagine if we made requests to the DNS servers +//! to incrementally update their config every time the DNS data changed. +//! This system does more work as users make more requests. During overloads, +//! things can fall over. Compare with a system whose frontend merely updates +//! 
the DNS configuration that _should_ exist and whose backend periodically +//! scans the complete intended state and then sets its own state accordingly. +//! The backend does the same amount of work no matter how many requests were +//! made, making it more resistant to overload. A big downside of this +//! approach is increased latency from the user making a request to seeing it +//! applied. This can be mitigated (sacrificing some, but not all, of the +//! "constant work" property) by triggering a backend scan operation when user +//! requests complete. +//! * the design pattern in distributed systems of keeping two copies of data in +//! sync using both event notifications (like a changelog) _and_ periodic full +//! scans. The hope is that a full scan never finds a change that wasn't +//! correctly sync'd, but incorporating an occasional full scan into the +//! design ensures that such bugs are found and their impact repaired +//! automatically. +//! +//! [1]: https://aws.amazon.com/builders-library/reliability-and-constant-work/ +//! +//! Combining these, we get a design pattern for a "reconciler" where: +//! +//! * The reconciler is activated by explicit request (when we know it has work +//! to do) _and_ periodically (to deal with all manner of transient failures) +//! * The reconciler's activity is idempotent: given the same underlying state +//! (e.g., database state), it always attempts to do the same thing. +//! * Each activation of the reconciler accepts no input. That is, even when we +//! think we know what changed, we do not use that information. This ensures +//! that the reconciler really is idempotent and its actions are based solely +//! on the state that it's watching. Put differently: having reconcilers +//! accept an explicit hint about what changed (and then doing something +//! differently based on that) bifurcates the code: there's the common case +//! where that hint is available and the rarely-exercised case when it's not +//! 
(e.g., because Nexus crashed and it's the subsequent periodic activation +//! that's propagating this change). This is what we're trying to avoid. +//! * We do allow reconcilers to be triggered by a `tokio::sync::watch` channel +//! -- but again, not using the _data_ from that channel. There are two big +//! advantages here: (1) reduced latency from when a change is made to when +//! the reconciler applies it, and (2) (arguably another way to say the same +//! thing) we can space out the periodic activations much further, knowing +//! that most of the time we're not increasing latency by doing this. This +//! compromises the "constant-work" pattern a bit: we might wind up running +//! the reconciler more often during busy times than during idle times, and we +//! could find that overloads something. However, the _operation_ of the +//! reconciler can still be constant work, and there's no more than that +//! amount of work going on at any given time. +//! +//! `watch` channels are a convenient primitive here because they only store +//! one value. With a little care, we can ensure that the writer never blocks +//! and the readers can all see the latest value. (By design, reconcilers +//! generally only care about the latest state of something, not any +//! intermediate states.) We don't have to worry about an unbounded queue, or +//! handling a full queue, or other forms of backpressure. -mod common; +mod driver; mod init; mod status; mod tasks; +pub use driver::Driver; pub use init::BackgroundTasks; + +use futures::future::BoxFuture; +use nexus_auth::context::OpContext; + +/// An operation activated both periodically and by an explicit signal +/// +/// See module-level documentation for details. 
+pub trait BackgroundTask: Send + Sync { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value>; +} + +/// Identifies a background task +/// +/// This is returned by [`Driver::register()`] to identify the corresponding +/// background task. It's then accepted by functions like +/// [`Driver::activate()`] and [`Driver::task_status()`] to identify the task. +#[derive(Clone, Debug, Ord, PartialOrd, PartialEq, Eq)] +pub struct TaskHandle(String); + +impl TaskHandle { + /// Returns the unique name of this background task + pub fn name(&self) -> &str { + &self.0 + } +} diff --git a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs index 9f185004f3..bd9a7b1602 100644 --- a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs +++ b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs @@ -31,7 +31,7 @@ //! is handled elsewhere, by `notify_instance_updated` and (eventually) the //! `instance-update` saga. 
-use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use anyhow::Context; use futures::future::BoxFuture; use futures::FutureExt; diff --git a/nexus/src/app/background/tasks/bfd.rs b/nexus/src/app/background/tasks/bfd.rs index 601ef4a5a6..67b15ee3d3 100644 --- a/nexus/src/app/background/tasks/bfd.rs +++ b/nexus/src/app/background/tasks/bfd.rs @@ -9,7 +9,7 @@ use crate::app::{ background::tasks::networking::build_mgd_clients, map_switch_zone_addrs, }; -use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use internal_dns::{resolver::Resolver, ServiceName}; diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index 2fc1779504..253a89a18d 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -4,7 +4,7 @@ //! Background task for realizing a plan blueprint -use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_queries::context::OpContext; @@ -111,7 +111,7 @@ impl BackgroundTask for BlueprintExecutor { #[cfg(test)] mod test { use super::BlueprintExecutor; - use crate::app::background::common::BackgroundTask; + use crate::app::background::BackgroundTask; use httptest::matchers::{all_of, request}; use httptest::responders::status_code; use httptest::Expectation; diff --git a/nexus/src/app/background/tasks/blueprint_load.rs b/nexus/src/app/background/tasks/blueprint_load.rs index c43207e281..38cfe85558 100644 --- a/nexus/src/app/background/tasks/blueprint_load.rs +++ b/nexus/src/app/background/tasks/blueprint_load.rs @@ -7,7 +7,7 @@ //! This task triggers the `blueprint_execution` background task when the //! blueprint changes. 
-use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_queries::context::OpContext; @@ -185,7 +185,7 @@ impl BackgroundTask for TargetBlueprintLoader { #[cfg(test)] mod test { use super::*; - use crate::app::background::common::BackgroundTask; + use crate::app::background::BackgroundTask; use nexus_inventory::now_db_precision; use nexus_test_utils_macros::nexus_test; use nexus_types::deployment::{ diff --git a/nexus/src/app/background/tasks/crdb_node_id_collector.rs b/nexus/src/app/background/tasks/crdb_node_id_collector.rs index da4e3448dd..2a0e1c6d3d 100644 --- a/nexus/src/app/background/tasks/crdb_node_id_collector.rs +++ b/nexus/src/app/background/tasks/crdb_node_id_collector.rs @@ -23,7 +23,7 @@ //! the status of all nodes and looking for orphans, perhaps) to determine //! whether a zone without a known node ID ever existed. -use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use anyhow::ensure; use anyhow::Context; use futures::future::BoxFuture; diff --git a/nexus/src/app/background/tasks/dns_config.rs b/nexus/src/app/background/tasks/dns_config.rs index 55bc88761c..03e104090b 100644 --- a/nexus/src/app/background/tasks/dns_config.rs +++ b/nexus/src/app/background/tasks/dns_config.rs @@ -4,7 +4,7 @@ //! 
Background task for keeping track of DNS configuration -use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use dns_service_client::types::DnsConfigParams; use futures::future::BoxFuture; use futures::FutureExt; @@ -158,7 +158,7 @@ impl BackgroundTask for DnsConfigWatcher { #[cfg(test)] mod test { use super::DnsConfigWatcher; - use crate::app::background::common::BackgroundTask; + use crate::app::background::BackgroundTask; use crate::app::background::init::test::write_test_dns_generation; use assert_matches::assert_matches; use async_bb8_diesel::AsyncRunQueryDsl; diff --git a/nexus/src/app/background/tasks/dns_propagation.rs b/nexus/src/app/background/tasks/dns_propagation.rs index 7ce321d7f5..07b3783f72 100644 --- a/nexus/src/app/background/tasks/dns_propagation.rs +++ b/nexus/src/app/background/tasks/dns_propagation.rs @@ -5,7 +5,7 @@ //! Background task for propagating DNS configuration to all DNS servers use super::dns_servers::DnsServersList; -use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use anyhow::Context; use dns_service_client::types::DnsConfigParams; use futures::future::BoxFuture; @@ -178,7 +178,7 @@ async fn dns_propagate_one( #[cfg(test)] mod test { use super::DnsPropagator; - use crate::app::background::common::BackgroundTask; + use crate::app::background::BackgroundTask; use crate::app::background::tasks::dns_servers::DnsServersList; use dns_service_client::types::DnsConfigParams; use httptest::matchers::request; diff --git a/nexus/src/app/background/tasks/dns_servers.rs b/nexus/src/app/background/tasks/dns_servers.rs index 19442f9655..9d99460917 100644 --- a/nexus/src/app/background/tasks/dns_servers.rs +++ b/nexus/src/app/background/tasks/dns_servers.rs @@ -4,7 +4,7 @@ //! 
Background task for keeping track of DNS servers -use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use internal_dns::names::ServiceName; diff --git a/nexus/src/app/background/tasks/external_endpoints.rs b/nexus/src/app/background/tasks/external_endpoints.rs index d0fd0ed20a..0ff1e06a46 100644 --- a/nexus/src/app/background/tasks/external_endpoints.rs +++ b/nexus/src/app/background/tasks/external_endpoints.rs @@ -6,7 +6,7 @@ //! all Silos, their externally-visible DNS names, and the TLS certificates //! associated with those names -use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use crate::app::external_endpoints::read_all_endpoints; pub use crate::app::external_endpoints::ExternalEndpoints; use futures::future::BoxFuture; @@ -118,7 +118,7 @@ impl BackgroundTask for ExternalEndpointsWatcher { #[cfg(test)] mod test { use super::ExternalEndpointsWatcher; - use crate::app::background::common::BackgroundTask; + use crate::app::background::BackgroundTask; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::fixed_data::silo::DEFAULT_SILO; use nexus_test_utils::resource_helpers::create_silo; diff --git a/nexus/src/app/background/tasks/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs index e1c94d577d..a6e579eb8a 100644 --- a/nexus/src/app/background/tasks/instance_watcher.rs +++ b/nexus/src/app/background/tasks/instance_watcher.rs @@ -4,7 +4,7 @@ //! Background task for pulling instance state from sled-agents. 
-use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use futures::{future::BoxFuture, FutureExt}; use http::StatusCode; use nexus_db_model::Instance; diff --git a/nexus/src/app/background/tasks/inventory_collection.rs b/nexus/src/app/background/tasks/inventory_collection.rs index dae5ccae8e..95268334d9 100644 --- a/nexus/src/app/background/tasks/inventory_collection.rs +++ b/nexus/src/app/background/tasks/inventory_collection.rs @@ -4,7 +4,7 @@ //! Background task for reading inventory for the rack -use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use anyhow::ensure; use anyhow::Context; use futures::future::BoxFuture; @@ -189,7 +189,7 @@ mod test { use super::DbSledAgentEnumerator; use super::InventoryCollector; use crate::app::authz; - use crate::app::background::common::BackgroundTask; + use crate::app::background::BackgroundTask; use nexus_db_model::Generation; use nexus_db_model::SledBaseboard; use nexus_db_model::SledSystemHardware; diff --git a/nexus/src/app/background/tasks/metrics_producer_gc.rs b/nexus/src/app/background/tasks/metrics_producer_gc.rs index 5dbb16ef80..8c2c271cfb 100644 --- a/nexus/src/app/background/tasks/metrics_producer_gc.rs +++ b/nexus/src/app/background/tasks/metrics_producer_gc.rs @@ -5,7 +5,7 @@ //! Background task for garbage collecting metrics producers that have not //! 
renewed their lease -use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use chrono::TimeDelta; use chrono::Utc; use futures::future::BoxFuture; diff --git a/nexus/src/app/background/tasks/nat_cleanup.rs b/nexus/src/app/background/tasks/nat_cleanup.rs index 399b999340..675f4fc809 100644 --- a/nexus/src/app/background/tasks/nat_cleanup.rs +++ b/nexus/src/app/background/tasks/nat_cleanup.rs @@ -9,7 +9,7 @@ use crate::app::map_switch_zone_addrs; use super::networking::build_dpd_clients; -use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use chrono::{Duration, Utc}; use futures::future::BoxFuture; use futures::FutureExt; diff --git a/nexus/src/app/background/tasks/phantom_disks.rs b/nexus/src/app/background/tasks/phantom_disks.rs index 6b128760ed..eac37eee19 100644 --- a/nexus/src/app/background/tasks/phantom_disks.rs +++ b/nexus/src/app/background/tasks/phantom_disks.rs @@ -18,7 +18,7 @@ //! this background task is required to apply the same fix for disks that are //! already in this phantom state. -use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_queries::context::OpContext; diff --git a/nexus/src/app/background/tasks/physical_disk_adoption.rs b/nexus/src/app/background/tasks/physical_disk_adoption.rs index db2b126d71..f3b9e8ac62 100644 --- a/nexus/src/app/background/tasks/physical_disk_adoption.rs +++ b/nexus/src/app/background/tasks/physical_disk_adoption.rs @@ -11,7 +11,7 @@ //! //! In the future, this may become more explicitly operator-controlled. 
-use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_model::PhysicalDisk; diff --git a/nexus/src/app/background/tasks/region_replacement.rs b/nexus/src/app/background/tasks/region_replacement.rs index 14e11cc86b..635b956193 100644 --- a/nexus/src/app/background/tasks/region_replacement.rs +++ b/nexus/src/app/background/tasks/region_replacement.rs @@ -11,7 +11,7 @@ //! for more information. use crate::app::authn; -use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use crate::app::sagas; use crate::app::RegionAllocationStrategy; use futures::future::BoxFuture; diff --git a/nexus/src/app/background/tasks/service_firewall_rules.rs b/nexus/src/app/background/tasks/service_firewall_rules.rs index 36a9926518..230172dab8 100644 --- a/nexus/src/app/background/tasks/service_firewall_rules.rs +++ b/nexus/src/app/background/tasks/service_firewall_rules.rs @@ -10,7 +10,7 @@ //! handle general changes to customer-visible VPC firewalls, and is mostly in //! place to propagate changes in the IP allowlist for user-facing services. 
-use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_queries::context::OpContext; diff --git a/nexus/src/app/background/tasks/sync_service_zone_nat.rs b/nexus/src/app/background/tasks/sync_service_zone_nat.rs index 3152ae468e..59cd6a6a79 100644 --- a/nexus/src/app/background/tasks/sync_service_zone_nat.rs +++ b/nexus/src/app/background/tasks/sync_service_zone_nat.rs @@ -8,7 +8,7 @@ use crate::app::map_switch_zone_addrs; use super::networking::build_dpd_clients; -use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use anyhow::Context; use futures::future::BoxFuture; use futures::FutureExt; diff --git a/nexus/src/app/background/tasks/sync_switch_configuration.rs b/nexus/src/app/background/tasks/sync_switch_configuration.rs index dd21aeca31..ed13545d3b 100644 --- a/nexus/src/app/background/tasks/sync_switch_configuration.rs +++ b/nexus/src/app/background/tasks/sync_switch_configuration.rs @@ -23,7 +23,7 @@ use nexus_db_model::{ }; use uuid::Uuid; -use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; use display_error_chain::DisplayErrorChain; use dpd_client::types::PortId; use futures::future::BoxFuture; diff --git a/nexus/src/app/background/tasks/v2p_mappings.rs b/nexus/src/app/background/tasks/v2p_mappings.rs index 16c9d57526..26ce131e9a 100644 --- a/nexus/src/app/background/tasks/v2p_mappings.rs +++ b/nexus/src/app/background/tasks/v2p_mappings.rs @@ -12,7 +12,7 @@ use omicron_common::api::external::Vni; use serde_json::json; use sled_agent_client::types::VirtualNetworkInterfaceHost; -use crate::app::background::common::BackgroundTask; +use crate::app::background::BackgroundTask; pub struct V2PManager { datastore: Arc, From c2594098454737e6e221d9e07e5e77249103e440 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 24 Jun 2024 11:11:50 -0700 Subject: [PATCH 
04/23] rustfmt --- nexus/src/app/background/tasks/dns_config.rs | 2 +- nexus/src/app/background/tasks/dns_propagation.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/src/app/background/tasks/dns_config.rs b/nexus/src/app/background/tasks/dns_config.rs index 03e104090b..4cf8c4d86d 100644 --- a/nexus/src/app/background/tasks/dns_config.rs +++ b/nexus/src/app/background/tasks/dns_config.rs @@ -158,8 +158,8 @@ impl BackgroundTask for DnsConfigWatcher { #[cfg(test)] mod test { use super::DnsConfigWatcher; - use crate::app::background::BackgroundTask; use crate::app::background::init::test::write_test_dns_generation; + use crate::app::background::BackgroundTask; use assert_matches::assert_matches; use async_bb8_diesel::AsyncRunQueryDsl; use async_bb8_diesel::AsyncSimpleConnection; diff --git a/nexus/src/app/background/tasks/dns_propagation.rs b/nexus/src/app/background/tasks/dns_propagation.rs index 07b3783f72..c680a6f010 100644 --- a/nexus/src/app/background/tasks/dns_propagation.rs +++ b/nexus/src/app/background/tasks/dns_propagation.rs @@ -178,8 +178,8 @@ async fn dns_propagate_one( #[cfg(test)] mod test { use super::DnsPropagator; - use crate::app::background::BackgroundTask; use crate::app::background::tasks::dns_servers::DnsServersList; + use crate::app::background::BackgroundTask; use dns_service_client::types::DnsConfigParams; use httptest::matchers::request; use httptest::responders::status_code; From 2303eeacbc75ec4ccaecc0e7ea84dfb2444c80e2 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 24 Jun 2024 11:36:46 -0700 Subject: [PATCH 05/23] wrap too-long strings in background task subsystem --- nexus/src/app/background/init.rs | 68 ++++++----- .../background/tasks/abandoned_vmm_reaper.rs | 3 +- .../app/background/tasks/blueprint_load.rs | 4 +- nexus/src/app/background/tasks/dns_config.rs | 8 +- .../background/tasks/inventory_collection.rs | 12 +- .../background/tasks/metrics_producer_gc.rs | 2 +- 
.../src/app/background/tasks/phantom_disks.rs | 3 +- .../background/tasks/region_replacement.rs | 13 +-- .../tasks/service_firewall_rules.rs | 4 +- .../tasks/sync_switch_configuration.rs | 108 +++++++++--------- 10 files changed, 119 insertions(+), 106 deletions(-) diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index bae478458f..36125eaef6 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -161,7 +161,7 @@ impl BackgroundTasks { String::from("metrics_producer_gc"), String::from( "unregisters Oximeter metrics producers that have not \ - renewed their lease", + renewed their lease", ), config.metrics_producer_gc.period_secs, Box::new(gc), @@ -180,8 +180,8 @@ impl BackgroundTasks { String::from("external_endpoints"), String::from( "reads config for silos and TLS certificates to determine \ - the right set of HTTP endpoints, their HTTP server names, \ - and which TLS certificates to use on each one", + the right set of HTTP endpoints, their HTTP server \ + names, and which TLS certificates to use on each one", ), config.external_endpoints.period_secs, Box::new(watcher), @@ -195,13 +195,13 @@ impl BackgroundTasks { driver.register( "nat_v4_garbage_collector".to_string(), String::from( - "prunes soft-deleted IPV4 NAT entries from ipv4_nat_entry table \ - based on a predetermined retention policy", + "prunes soft-deleted IPV4 NAT entries from ipv4_nat_entry \ + table based on a predetermined retention policy", ), config.nat_cleanup.period_secs, Box::new(nat_cleanup::Ipv4NatGarbageCollector::new( datastore.clone(), - resolver.clone() + resolver.clone(), )), opctx.child(BTreeMap::new()), vec![], @@ -213,7 +213,7 @@ impl BackgroundTasks { "bfd_manager".to_string(), String::from( "Manages bidirectional fowarding detection (BFD) \ - configuration on rack switches", + configuration on rack switches", ), config.bfd_manager.period_secs, Box::new(bfd::BfdManager::new( @@ -307,7 +307,7 @@ impl BackgroundTasks { 
String::from("inventory_collection"), String::from( "collects hardware and software inventory data from the \ - whole system", + whole system", ), config.inventory.period_secs, Box::new(collector), @@ -339,7 +339,8 @@ impl BackgroundTasks { driver.register( "service_zone_nat_tracker".to_string(), String::from( - "ensures service zone nat records are recorded in NAT RPW table", + "ensures service zone nat records are recorded in NAT RPW \ + table", ), config.sync_service_zone_nat.period_secs, Box::new(ServiceZoneNatTracker::new( @@ -386,7 +387,10 @@ impl BackgroundTasks { let task = driver.register( String::from("region_replacement"), - String::from("detects if a region requires replacing and begins the process"), + String::from( + "detects if a region requires replacing and begins the \ + process", + ), config.region_replacement.period_secs, Box::new(detector), opctx.child(BTreeMap::new()), @@ -417,8 +421,8 @@ impl BackgroundTasks { let task_service_firewall_propagation = driver.register( String::from("service_firewall_rule_propagation"), String::from( - "propagates VPC firewall rules for Omicron \ - services with external network connectivity", + "propagates VPC firewall rules for Omicron services with \ + external network connectivity", ), config.service_firewall_propagation.period_secs, Box::new(service_firewall_rules::ServiceRulePropagator::new( @@ -430,17 +434,16 @@ impl BackgroundTasks { // Background task: abandoned VMM reaping let task_abandoned_vmm_reaper = driver.register( - String::from("abandoned_vmm_reaper"), - String::from( - "deletes sled reservations for VMMs that have been abandoned by their instances", - ), - config.abandoned_vmm_reaper.period_secs, - Box::new(abandoned_vmm_reaper::AbandonedVmmReaper::new( - datastore, - )), - opctx.child(BTreeMap::new()), - vec![], - ); + String::from("abandoned_vmm_reaper"), + String::from( + "deletes sled reservations for VMMs that have been abandoned \ + by their instances", + ), + 
config.abandoned_vmm_reaper.period_secs, + Box::new(abandoned_vmm_reaper::AbandonedVmmReaper::new(datastore)), + opctx.child(BTreeMap::new()), + vec![], + ); BackgroundTasks { driver, @@ -525,8 +528,8 @@ fn init_dns( format!("dns_propagation_{}", dns_group), format!( "propagates latest {} DNS configuration (from {:?} background \ - task) to the latest list of DNS servers (from {:?} background \ - task)", + task) to the latest list of DNS servers (from {:?} background \ + task)", dns_group, task_name_config, task_name_servers, ), config.period_secs_propagation, @@ -608,7 +611,10 @@ pub mod test { }; match record.get(0) { Some(dns_service_client::types::DnsRecord::Srv(srv)) => srv, - record => panic!("expected a SRV record for {internal_dns_srv_name}, found {record:?}"), + record => panic!( + "expected a SRV record for {internal_dns_srv_name}, found \ + {record:?}" + ), } }; @@ -755,7 +761,7 @@ pub mod test { ) { println!( "waiting for propagation of generation {generation} to {label} \ - DNS server ({addr})", + DNS server ({addr})", ); let client = dns_service_client::Client::new( @@ -786,13 +792,13 @@ pub mod test { .await; if let Err(err) = result { panic!( - "DNS generation {generation} not propagated to \ - {label} DNS server ({addr}) within {poll_max:?}: {err}" + "DNS generation {generation} not propagated to {label} DNS \ + server ({addr}) within {poll_max:?}: {err}" ); } else { println!( - "DNS generation {generation} propagated to {label} \ - DNS server ({addr}) successfully." + "DNS generation {generation} propagated to {label} DNS server \ + ({addr}) successfully." 
); } } diff --git a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs index bd9a7b1602..a81080ec75 100644 --- a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs +++ b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs @@ -135,7 +135,8 @@ impl AbandonedVmmReaper { results.error_count += 1; *last_err = Err(e).with_context(|| { format!( - "failed to delete sled reservation for VMM {vmm_id}" + "failed to delete sled reservation for VMM \ + {vmm_id}" ) }); } diff --git a/nexus/src/app/background/tasks/blueprint_load.rs b/nexus/src/app/background/tasks/blueprint_load.rs index 38cfe85558..31bc00441d 100644 --- a/nexus/src/app/background/tasks/blueprint_load.rs +++ b/nexus/src/app/background/tasks/blueprint_load.rs @@ -127,8 +127,8 @@ impl BackgroundTask for TargetBlueprintLoader { // bugs further up the stack. if *old_blueprint != new_blueprint { let message = format!( - "blueprint for id {} changed. \ - Blueprints are supposed to be immutable.", + "blueprint for id {} changed. Blueprints are supposed \ + to be immutable.", target_id ); error!(&log, "{}", message); diff --git a/nexus/src/app/background/tasks/dns_config.rs b/nexus/src/app/background/tasks/dns_config.rs index 4cf8c4d86d..1b0f627870 100644 --- a/nexus/src/app/background/tasks/dns_config.rs +++ b/nexus/src/app/background/tasks/dns_config.rs @@ -100,8 +100,8 @@ impl BackgroundTask for DnsConfigWatcher { // we just read. This should never happen because we // never remove the latest generation. let message = format!( - "found latest DNS generation ({}) is older \ - than the one we already know about ({})", + "found latest DNS generation ({}) is older than \ + the one we already know about ({})", new.generation, old.generation ); @@ -115,8 +115,8 @@ impl BackgroundTask for DnsConfigWatcher { // immutable once created. 
let message = format!( "found DNS config at generation {} that does \ - not match the config that we already have for \ - the same generation", + not match the config that we already have \ + for the same generation", new.generation ); error!(&log, "{}", message); diff --git a/nexus/src/app/background/tasks/inventory_collection.rs b/nexus/src/app/background/tasks/inventory_collection.rs index 95268334d9..1e2d3bda1f 100644 --- a/nexus/src/app/background/tasks/inventory_collection.rs +++ b/nexus/src/app/background/tasks/inventory_collection.rs @@ -270,8 +270,8 @@ mod test { // has pushed us out. if our_collections.is_empty() { println!( - "iter {i}: no test collections \ - ({num_collections} Nexus collections)", + "iter {i}: no test collections ({num_collections} Nexus \ + collections)", ); continue; } @@ -285,8 +285,8 @@ mod test { // tail of all IDs we've seen matches the ones we saw in this // iteration (i.e., we're pushing out old collections in order). println!( - "iter {i}: saw {our_collections:?}; \ - should match tail of {all_our_collection_ids:?}" + "iter {i}: saw {our_collections:?}; should match tail of \ + {all_our_collection_ids:?}" ); assert_eq!( all_our_collection_ids @@ -398,8 +398,8 @@ mod test { assert_eq!( removed_urls.len(), 1, - "expected to find exactly one sled URL matching our \ - expunged sled's URL" + "expected to find exactly one sled URL matching our expunged \ + sled's URL" ); let mut found_urls = db_enum.list_sled_agents().await.unwrap(); found_urls.sort(); diff --git a/nexus/src/app/background/tasks/metrics_producer_gc.rs b/nexus/src/app/background/tasks/metrics_producer_gc.rs index 8c2c271cfb..1df0afb7ed 100644 --- a/nexus/src/app/background/tasks/metrics_producer_gc.rs +++ b/nexus/src/app/background/tasks/metrics_producer_gc.rs @@ -144,7 +144,7 @@ mod tests { { panic!( "failed to update time_modified for producer {producer_id}: \ - {err}" + {err}" ); } } diff --git a/nexus/src/app/background/tasks/phantom_disks.rs 
b/nexus/src/app/background/tasks/phantom_disks.rs index eac37eee19..4b0d8bec38 100644 --- a/nexus/src/app/background/tasks/phantom_disks.rs +++ b/nexus/src/app/background/tasks/phantom_disks.rs @@ -74,7 +74,8 @@ impl BackgroundTask for PhantomDiskDetector { if let Err(e) = result { error!( &log, - "error un-deleting disk {} and setting to faulted: {:#}", + "error un-deleting disk {} and setting to faulted: \ + {:#}", disk.id(), e, ); diff --git a/nexus/src/app/background/tasks/region_replacement.rs b/nexus/src/app/background/tasks/region_replacement.rs index 635b956193..9e14c294ba 100644 --- a/nexus/src/app/background/tasks/region_replacement.rs +++ b/nexus/src/app/background/tasks/region_replacement.rs @@ -82,8 +82,7 @@ impl BackgroundTask for RegionReplacementDetector { Err(e) => { error!( &log, - "find_regions_on_expunged_physical_disks failed: \ - {e}" + "find_regions_on_expunged_physical_disks failed: {e}" ); err += 1; @@ -110,8 +109,8 @@ impl BackgroundTask for RegionReplacementDetector { Err(e) => { error!( &log, - "error looking for existing region \ - replacement requests for {}: {e}", + "error looking for existing region replacement \ + requests for {}: {e}", region.id(), ); continue; @@ -130,7 +129,7 @@ impl BackgroundTask for RegionReplacementDetector { info!( &log, "added region replacement request \ - {request_id} for {} volume {}", + {request_id} for {} volume {}", region.id(), region.volume_id(), ); @@ -140,7 +139,7 @@ impl BackgroundTask for RegionReplacementDetector { error!( &log, "error adding region replacement request for \ - region {} volume id {}: {e}", + region {} volume id {}: {e}", region.id(), region.volume_id(), ); @@ -172,7 +171,7 @@ impl BackgroundTask for RegionReplacementDetector { error!( &log, "sending region replacement start request \ - failed: {e}", + failed: {e}", ); err += 1; } diff --git a/nexus/src/app/background/tasks/service_firewall_rules.rs b/nexus/src/app/background/tasks/service_firewall_rules.rs index 
230172dab8..4004de42c8 100644 --- a/nexus/src/app/background/tasks/service_firewall_rules.rs +++ b/nexus/src/app/background/tasks/service_firewall_rules.rs @@ -38,8 +38,8 @@ impl BackgroundTask for ServiceRulePropagator { .new(slog::o!("component" => "service-firewall-rule-progator")); debug!( log, - "starting background task for service \ - firewall rule propagation" + "starting background task for service firewall rule \ + propagation" ); let start = std::time::Instant::now(); let res = nexus_networking::plumb_service_firewall_rules( diff --git a/nexus/src/app/background/tasks/sync_switch_configuration.rs b/nexus/src/app/background/tasks/sync_switch_configuration.rs index ed13545d3b..0351c9542a 100644 --- a/nexus/src/app/background/tasks/sync_switch_configuration.rs +++ b/nexus/src/app/background/tasks/sync_switch_configuration.rs @@ -235,14 +235,19 @@ impl SwitchPortSettingsManager { let config = sled_agent_client::types::BfdPeerConfig { local: spec.local.map(|x| x.ip()), remote: spec.remote.ip(), - detection_threshold: spec.detection_threshold.0.try_into().map_err(|_| { - omicron_common::api::external::Error::InternalError { - internal_message: format!( - "db_bfd_peer_configs: detection threshold overflow: {}", - spec.detection_threshold.0, - ), - } - })?, + detection_threshold: spec + .detection_threshold + .0 + .try_into() + .map_err(|_| { + omicron_common::api::external::Error::InternalError { + internal_message: format!( + "db_bfd_peer_configs: detection threshold \ + overflow: {}", + spec.detection_threshold.0, + ), + } + })?, required_rx: spec.required_rx.0.into(), mode: match spec.mode { nexus_db_model::BfdMode::SingleHop => { @@ -252,15 +257,17 @@ impl SwitchPortSettingsManager { sled_agent_client::types::BfdMode::MultiHop } }, - switch: spec.switch.parse().map_err(|e: ParseSwitchLocationError| { - omicron_common::api::external::Error::InternalError { - internal_message: format!( - "db_bfd_peer_configs: failed to parse switch name: {}: {:?}", - 
spec.switch, - e, - ), - } - })?, + switch: spec.switch.parse().map_err( + |e: ParseSwitchLocationError| { + omicron_common::api::external::Error::InternalError { + internal_message: format!( + "db_bfd_peer_configs: failed to parse switch \ + name: {}: {:?}", + spec.switch, e, + ), + } + }, + )?, }; result.push(config); } @@ -1760,45 +1767,44 @@ async fn static_routes_on_switch<'a>( let mut routes_on_switch = HashMap::new(); for (location, client) in mgd_clients { - let static_routes: SwitchStaticRoutes = match client - .static_list_v4_routes() - .await - { - Ok(routes) => { - let mut flattened = HashSet::new(); - for (destination, paths) in routes.iter() { - let Ok(dst) = destination.parse() else { - error!( + let static_routes: SwitchStaticRoutes = + match client.static_list_v4_routes().await { + Ok(routes) => { + let mut flattened = HashSet::new(); + for (destination, paths) in routes.iter() { + let Ok(dst) = destination.parse() else { + error!( log, - "failed to parse static route destination: {destination}" + "failed to parse static route destination: \ + {destination}" ); - continue; - }; - for p in paths.iter() { - let nh = match p.nexthop { - IpAddr::V4(addr) => addr, - IpAddr::V6(addr) => { - error!( - log, - "ipv6 nexthops not supported: {addr}" - ); - continue; - } + continue; }; - flattened.insert((nh, dst, p.vlan_id)); + for p in paths.iter() { + let nh = match p.nexthop { + IpAddr::V4(addr) => addr, + IpAddr::V6(addr) => { + error!( + log, + "ipv6 nexthops not supported: {addr}" + ); + continue; + } + }; + flattened.insert((nh, dst, p.vlan_id)); + } } + flattened } - flattened - } - Err(_) => { - error!( - &log, - "unable to retrieve routes from switch"; - "switch_location" => ?location, - ); - continue; - } - }; + Err(_) => { + error!( + &log, + "unable to retrieve routes from switch"; + "switch_location" => ?location, + ); + continue; + } + }; routes_on_switch.insert(*location, static_routes); } routes_on_switch From 
ee8bac540c9bc4a4cbfa4bef9b0e2a1a882556bc Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 24 Jun 2024 11:52:37 -0700 Subject: [PATCH 06/23] task name consistency; avoid using driver.activate() directly --- nexus/src/app/background/driver.rs | 2 +- nexus/src/app/background/init.rs | 16 ++++++++++------ nexus/src/app/bfd.rs | 10 ++-------- nexus/src/app/mod.rs | 2 +- nexus/src/app/switch_interface.rs | 2 -- nexus/src/app/switch_port.rs | 3 --- 6 files changed, 14 insertions(+), 21 deletions(-) diff --git a/nexus/src/app/background/driver.rs b/nexus/src/app/background/driver.rs index f1982b1ad8..e620f5d7bc 100644 --- a/nexus/src/app/background/driver.rs +++ b/nexus/src/app/background/driver.rs @@ -156,7 +156,7 @@ impl Driver { /// /// If the task is currently running, it will be activated again when it /// finishes. - pub fn activate(&self, task: &TaskHandle) { + pub(super) fn activate(&self, task: &TaskHandle) { self.task_required(task).notify.notify_one(); } diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 36125eaef6..447899b532 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -68,10 +68,10 @@ pub struct BackgroundTasks { Option, >, /// task handle for the ipv4 nat entry garbage collector - pub nat_cleanup: TaskHandle, + pub task_nat_cleanup: TaskHandle, /// task handle for the switch bfd manager - pub bfd_manager: TaskHandle, + pub task_bfd_manager: TaskHandle, /// task handle for the task that collects inventory pub task_inventory_collection: TaskHandle, @@ -191,7 +191,7 @@ impl BackgroundTasks { (task, watcher_channel) }; - let nat_cleanup = { + let task_nat_cleanup = { driver.register( "nat_v4_garbage_collector".to_string(), String::from( @@ -208,7 +208,7 @@ impl BackgroundTasks { ) }; - let bfd_manager = { + let task_bfd_manager = { driver.register( "bfd_manager".to_string(), String::from( @@ -454,8 +454,8 @@ impl BackgroundTasks { task_metrics_producer_gc, 
task_external_endpoints, external_endpoints, - nat_cleanup, - bfd_manager, + task_nat_cleanup, + task_bfd_manager, task_inventory_collection, task_physical_disk_adoption, task_phantom_disks, @@ -472,6 +472,10 @@ impl BackgroundTasks { } } + /// Activate the specified background task + /// + /// If the task is currently running, it will be activated again when it + /// finishes. pub fn activate(&self, task: &TaskHandle) { self.driver.activate(task); } diff --git a/nexus/src/app/bfd.rs b/nexus/src/app/bfd.rs index 0afa238ee3..1ae958c20d 100644 --- a/nexus/src/app/bfd.rs +++ b/nexus/src/app/bfd.rs @@ -39,12 +39,9 @@ impl super::Nexus { // add the bfd session to the db and trigger the bfd manager to handle // the reset self.datastore().bfd_session_create(opctx, &session).await?; - self.background_tasks - .driver - .activate(&self.background_tasks.bfd_manager); + self.background_tasks.activate(&self.background_tasks.task_bfd_manager); // for timely propagation to bootstore self.background_tasks - .driver .activate(&self.background_tasks.task_switch_port_settings_manager); Ok(()) } @@ -57,12 +54,9 @@ impl super::Nexus { // remove the bfd session from the db and trigger the bfd manager to // handle the reset self.datastore().bfd_session_delete(opctx, &session).await?; - self.background_tasks - .driver - .activate(&self.background_tasks.bfd_manager); + self.background_tasks.activate(&self.background_tasks.task_bfd_manager); // for timely propagation to bootstore self.background_tasks - .driver .activate(&self.background_tasks.task_switch_port_settings_manager); Ok(()) } diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 8e4a795a95..3462a1429b 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -504,7 +504,7 @@ impl Nexus { "populate complete; activating background tasks" ); for task in task_nexus.background_tasks.driver.tasks() { - task_nexus.background_tasks.driver.activate(task); + task_nexus.background_tasks.activate(task); } } Err(_) => { diff 
--git a/nexus/src/app/switch_interface.rs b/nexus/src/app/switch_interface.rs index bb4cba4c7b..c4e69d1e3e 100644 --- a/nexus/src/app/switch_interface.rs +++ b/nexus/src/app/switch_interface.rs @@ -57,7 +57,6 @@ impl super::Nexus { // eagerly propagate changes via rpw self.background_tasks - .driver .activate(&self.background_tasks.task_switch_port_settings_manager); Ok(value) @@ -86,7 +85,6 @@ impl super::Nexus { // eagerly propagate changes via rpw self.background_tasks - .driver .activate(&self.background_tasks.task_switch_port_settings_manager); Ok(()) diff --git a/nexus/src/app/switch_port.rs b/nexus/src/app/switch_port.rs index 7a6d56252a..bb35b6939e 100644 --- a/nexus/src/app/switch_port.rs +++ b/nexus/src/app/switch_port.rs @@ -100,7 +100,6 @@ impl super::Nexus { // eagerly propagate changes via rpw self.background_tasks - .driver .activate(&self.background_tasks.task_switch_port_settings_manager); Ok(result) @@ -214,7 +213,6 @@ impl super::Nexus { // eagerly propagate changes via rpw self.background_tasks - .driver .activate(&self.background_tasks.task_switch_port_settings_manager); Ok(()) @@ -248,7 +246,6 @@ impl super::Nexus { // eagerly propagate changes via rpw self.background_tasks - .driver .activate(&self.background_tasks.task_switch_port_settings_manager); Ok(()) From 87d6d8b810861d9f4cff8c904de2639b2176301d Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 25 Jun 2024 15:19:51 -0700 Subject: [PATCH 07/23] BackgroundTasks: initial cleanup --- nexus/src/app/background/init.rs | 56 ++++---------------------------- 1 file changed, 7 insertions(+), 49 deletions(-) diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 447899b532..d0461f0f6c 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -39,81 +39,39 @@ use std::sync::Arc; use tokio::sync::mpsc::Sender; use uuid::Uuid; -/// Describes ongoing background tasks and provides interfaces for working with -/// them -/// -/// Most 
interaction happens through the `driver` field. The rest of the fields -/// are specific background tasks. +/// Describes ongoing background tasks and provides interfaces for activating +/// them and accessing any data that they expose to Nexus at-large pub struct BackgroundTasks { /// interface for working with background tasks (activation, checking /// status, etc.) pub driver: Driver, - /// task handle for the internal DNS config background task pub task_internal_dns_config: TaskHandle, - /// task handle for the internal DNS servers background task pub task_internal_dns_servers: TaskHandle, - /// task handle for the external DNS config background task pub task_external_dns_config: TaskHandle, - /// task handle for the external DNS servers background task pub task_external_dns_servers: TaskHandle, - - /// task handle for pruning metrics producers with expired leases pub task_metrics_producer_gc: TaskHandle, - - /// task handle for the task that keeps track of external endpoints pub task_external_endpoints: TaskHandle, - /// external endpoints read by the background task - pub external_endpoints: tokio::sync::watch::Receiver< - Option, - >, - /// task handle for the ipv4 nat entry garbage collector pub task_nat_cleanup: TaskHandle, - - /// task handle for the switch bfd manager pub task_bfd_manager: TaskHandle, - - /// task handle for the task that collects inventory pub task_inventory_collection: TaskHandle, - - /// task handle for the task that collects inventory pub task_physical_disk_adoption: TaskHandle, - - /// task handle for the task that detects phantom disks pub task_phantom_disks: TaskHandle, - - /// task handle for blueprint target loader pub task_blueprint_loader: TaskHandle, - - /// task handle for blueprint execution background task pub task_blueprint_executor: TaskHandle, - - /// task handle for collecting CockroachDB node IDs pub task_crdb_node_id_collector: TaskHandle, - - /// task handle for the service zone nat tracker pub 
task_service_zone_nat_tracker: TaskHandle, - - /// task handle for the switch port settings manager pub task_switch_port_settings_manager: TaskHandle, - - /// task handle for the opte v2p manager pub task_v2p_manager: TaskHandle, - - /// task handle for the task that detects if regions need replacement and - /// begins the process pub task_region_replacement: TaskHandle, - - /// task handle for the task that polls sled agents for instance states. pub task_instance_watcher: TaskHandle, - - /// task handle for propagation of VPC firewall rules for Omicron services - /// with external network connectivity, pub task_service_firewall_propagation: TaskHandle, - - /// task handle for deletion of database records for VMMs abandoned by their - /// instances. pub task_abandoned_vmm_reaper: TaskHandle, + + /// external endpoints read by the background task + pub external_endpoints: tokio::sync::watch::Receiver< + Option, + >, } impl BackgroundTasks { From 257f18d1e3ab96ba00ad250ab51b8bb24456b83f Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 25 Jun 2024 15:23:45 -0700 Subject: [PATCH 08/23] TaskHandle -> TaskName --- nexus/src/app/background/driver.rs | 22 +++++++------- nexus/src/app/background/init.rs | 48 +++++++++++++++--------------- nexus/src/app/background/mod.rs | 7 ++--- nexus/src/app/background/status.rs | 8 ++--- 4 files changed, 42 insertions(+), 43 deletions(-) diff --git a/nexus/src/app/background/driver.rs b/nexus/src/app/background/driver.rs index e620f5d7bc..f6ddd6a5fc 100644 --- a/nexus/src/app/background/driver.rs +++ b/nexus/src/app/background/driver.rs @@ -5,7 +5,7 @@ //! 
Manages execution of background tasks use super::BackgroundTask; -use super::TaskHandle; +use super::TaskName; use assert_matches::assert_matches; use chrono::Utc; use futures::future::BoxFuture; @@ -35,7 +35,7 @@ use tokio::time::MissedTickBehavior; /// provides interfaces for monitoring high-level state of each task (e.g., when /// it last ran, whether it's currently running, etc.). pub struct Driver { - tasks: BTreeMap, + tasks: BTreeMap, } /// Driver-side state of a background task @@ -88,7 +88,7 @@ impl Driver { imp: Box, opctx: OpContext, watchers: Vec>, - ) -> TaskHandle { + ) -> TaskName { // Activation of the background task happens in a separate tokio task. // Set up a channel so that tokio task can report status back to us. let (status_tx, status_rx) = watch::channel(TaskStatus { @@ -115,13 +115,13 @@ impl Driver { // tokio task. let task = Task { description, period, status: status_rx, tokio_task, notify }; - if self.tasks.insert(TaskHandle(name.clone()), task).is_some() { + if self.tasks.insert(TaskName(name.clone()), task).is_some() { panic!("started two background tasks called {:?}", name); } // Return a handle that the caller can use to activate the task or get // its status. - TaskHandle(name) + TaskName(name) } /// Enumerate all registered background tasks @@ -129,11 +129,11 @@ impl Driver { /// This is aimed at callers that want to get the status of all background /// tasks. You'd call [`Driver::task_status()`] with each of the items /// produced by the iterator. - pub fn tasks(&self) -> impl Iterator { + pub fn tasks(&self) -> impl Iterator { self.tasks.keys() } - fn task_required(&self, task: &TaskHandle) -> &Task { + fn task_required(&self, task: &TaskName) -> &Task { // It should be hard to hit this in practice, since you'd have to have // gotten a TaskHandle from somewhere. It would have to be another // Driver instance. But it's generally a singleton. 
@@ -143,12 +143,12 @@ impl Driver { } /// Returns a summary of what this task does (for developers) - pub fn task_description(&self, task: &TaskHandle) -> &str { + pub fn task_description(&self, task: &TaskName) -> &str { &self.task_required(task).description } /// Returns the configured period of the task - pub fn task_period(&self, task: &TaskHandle) -> Duration { + pub fn task_period(&self, task: &TaskName) -> Duration { self.task_required(task).period } @@ -156,12 +156,12 @@ impl Driver { /// /// If the task is currently running, it will be activated again when it /// finishes. - pub(super) fn activate(&self, task: &TaskHandle) { + pub(super) fn activate(&self, task: &TaskName) { self.task_required(task).notify.notify_one(); } /// Returns the runtime status of the background task - pub fn task_status(&self, task: &TaskHandle) -> TaskStatus { + pub fn task_status(&self, task: &TaskName) -> TaskStatus { // Borrowing from a watch channel's receiver blocks the sender. Clone // the status to avoid an errant caller gumming up the works by hanging // on to a reference. diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index d0461f0f6c..109bf4f551 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -25,7 +25,7 @@ use super::tasks::sync_service_zone_nat::ServiceZoneNatTracker; use super::tasks::sync_switch_configuration::SwitchPortSettingsManager; use super::tasks::v2p_mappings::V2PManager; use super::Driver; -use super::TaskHandle; +use super::TaskName; use crate::app::oximeter::PRODUCER_LEASE_DURATION; use crate::app::sagas::SagaRequest; use nexus_config::BackgroundTaskConfig; @@ -46,27 +46,27 @@ pub struct BackgroundTasks { /// status, etc.) 
pub driver: Driver, - pub task_internal_dns_config: TaskHandle, - pub task_internal_dns_servers: TaskHandle, - pub task_external_dns_config: TaskHandle, - pub task_external_dns_servers: TaskHandle, - pub task_metrics_producer_gc: TaskHandle, - pub task_external_endpoints: TaskHandle, - pub task_nat_cleanup: TaskHandle, - pub task_bfd_manager: TaskHandle, - pub task_inventory_collection: TaskHandle, - pub task_physical_disk_adoption: TaskHandle, - pub task_phantom_disks: TaskHandle, - pub task_blueprint_loader: TaskHandle, - pub task_blueprint_executor: TaskHandle, - pub task_crdb_node_id_collector: TaskHandle, - pub task_service_zone_nat_tracker: TaskHandle, - pub task_switch_port_settings_manager: TaskHandle, - pub task_v2p_manager: TaskHandle, - pub task_region_replacement: TaskHandle, - pub task_instance_watcher: TaskHandle, - pub task_service_firewall_propagation: TaskHandle, - pub task_abandoned_vmm_reaper: TaskHandle, + pub task_internal_dns_config: TaskName, + pub task_internal_dns_servers: TaskName, + pub task_external_dns_config: TaskName, + pub task_external_dns_servers: TaskName, + pub task_metrics_producer_gc: TaskName, + pub task_external_endpoints: TaskName, + pub task_nat_cleanup: TaskName, + pub task_bfd_manager: TaskName, + pub task_inventory_collection: TaskName, + pub task_physical_disk_adoption: TaskName, + pub task_phantom_disks: TaskName, + pub task_blueprint_loader: TaskName, + pub task_blueprint_executor: TaskName, + pub task_crdb_node_id_collector: TaskName, + pub task_service_zone_nat_tracker: TaskName, + pub task_switch_port_settings_manager: TaskName, + pub task_v2p_manager: TaskName, + pub task_region_replacement: TaskName, + pub task_instance_watcher: TaskName, + pub task_service_firewall_propagation: TaskName, + pub task_abandoned_vmm_reaper: TaskName, /// external endpoints read by the background task pub external_endpoints: tokio::sync::watch::Receiver< @@ -434,7 +434,7 @@ impl BackgroundTasks { /// /// If the task is currently 
running, it will be activated again when it /// finishes. - pub fn activate(&self, task: &TaskHandle) { + pub fn activate(&self, task: &TaskName) { self.driver.activate(task); } } @@ -446,7 +446,7 @@ fn init_dns( dns_group: DnsGroup, resolver: internal_dns::resolver::Resolver, config: &DnsTasksConfig, -) -> (TaskHandle, TaskHandle) { +) -> (TaskName, TaskName) { let dns_group_name = dns_group.to_string(); let metadata = BTreeMap::from([("dns_group".to_string(), dns_group_name)]); diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs index 40716aa036..e70ec50016 100644 --- a/nexus/src/app/background/mod.rs +++ b/nexus/src/app/background/mod.rs @@ -155,11 +155,10 @@ pub trait BackgroundTask: Send + Sync { /// background task. It's then accepted by functions like /// [`Driver::activate()`] and [`Driver::task_status()`] to identify the task. #[derive(Clone, Debug, Ord, PartialOrd, PartialEq, Eq)] -pub struct TaskHandle(String); +pub struct TaskName(String); -impl TaskHandle { - /// Returns the unique name of this background task - pub fn name(&self) -> &str { +impl TaskName { + pub fn as_str(&self) -> &str { &self.0 } } diff --git a/nexus/src/app/background/status.rs b/nexus/src/app/background/status.rs index f4fb9e56e5..c6c8b1dcf1 100644 --- a/nexus/src/app/background/status.rs +++ b/nexus/src/app/background/status.rs @@ -25,7 +25,7 @@ impl Nexus { Ok(driver .tasks() .map(|t| { - let name = t.name(); + let name = t.as_str(); let description = driver.task_description(t); let period = driver.task_period(t); let status = driver.task_status(t); @@ -45,14 +45,14 @@ impl Nexus { opctx.authorize(authz::Action::Read, &authz::FLEET).await?; let driver = &self.background_tasks.driver; let task = - driver.tasks().find(|t| t.name() == name).ok_or_else(|| { + driver.tasks().find(|t| t.as_str() == name).ok_or_else(|| { LookupType::ByName(name.to_owned()) .into_not_found(ResourceType::BackgroundTask) })?; let description = driver.task_description(task); let 
status = driver.task_status(task); let period = driver.task_period(task); - Ok(BackgroundTask::new(task.name(), description, period, status)) + Ok(BackgroundTask::new(task.as_str(), description, period, status)) } pub(crate) async fn bgtask_activate( @@ -66,7 +66,7 @@ impl Nexus { // Ensure all task names are valid by removing them from the set of // names as we find them. let tasks_to_activate: Vec<_> = - driver.tasks().filter(|t| names.remove(t.name())).collect(); + driver.tasks().filter(|t| names.remove(t.as_str())).collect(); // If any names weren't recognized, return an error. if !names.is_empty() { From 0449bf4c620b3a02ffb8b42a046ff476fc43f9d1 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 26 Jun 2024 10:36:30 -0700 Subject: [PATCH 09/23] WIP: reworking initialization (still lots of stuff to fix up) --- nexus/src/app/background/driver.rs | 897 +++++++++--------- nexus/src/app/background/init.rs | 485 ++++++---- nexus/src/app/background/mod.rs | 1 + nexus/src/app/background/status.rs | 12 +- .../background/tasks/external_endpoints.rs | 180 ++-- nexus/src/app/mod.rs | 39 +- 6 files changed, 875 insertions(+), 739 deletions(-) diff --git a/nexus/src/app/background/driver.rs b/nexus/src/app/background/driver.rs index f6ddd6a5fc..0bcdfa6335 100644 --- a/nexus/src/app/background/driver.rs +++ b/nexus/src/app/background/driver.rs @@ -4,6 +4,7 @@ //! 
Manages execution of background tasks +use super::init::ExternalTaskHandle; use super::BackgroundTask; use super::TaskName; use assert_matches::assert_matches; @@ -20,6 +21,7 @@ use nexus_types::internal_api::views::LastResult; use nexus_types::internal_api::views::LastResultCompleted; use nexus_types::internal_api::views::TaskStatus; use std::collections::BTreeMap; +use std::sync::atomic::Ordering; use std::sync::Arc; use std::time::Duration; use std::time::Instant; @@ -88,7 +90,8 @@ impl Driver { imp: Box, opctx: OpContext, watchers: Vec>, - ) -> TaskName { + task_external: &ExternalTaskHandle, + ) { // Activation of the background task happens in a separate tokio task. // Set up a channel so that tokio task can report status back to us. let (status_tx, status_rx) = watch::channel(TaskStatus { @@ -96,9 +99,24 @@ impl Driver { last: LastResult::NeverCompleted, }); - // Set up a channel so that we can wake up the tokio task if somebody - // requests an explicit activation. - let notify = Arc::new(Notify::new()); + // Hook into the given external task handle's notification channel so + // that we can wake up the tokio task if somebody requests an explicit + // activation. + if let Err(previous) = task_external.wired_up.compare_exchange( + false, + true, + Ordering::SeqCst, + Ordering::SeqCst, + ) { + panic!( + "attempted to wire up the same background task handle twice \ + (previous \"wired_up\" = {}): currently attempting to wire \ + it up to task {:?}", + previous, name + ); + } + + let notify = Arc::clone(&task_external.notify); // Spawn the tokio task that will manage activation of the background // task. @@ -118,10 +136,6 @@ impl Driver { if self.tasks.insert(TaskName(name.clone()), task).is_some() { panic!("started two background tasks called {:?}", name); } - - // Return a handle that the caller can use to activate the task or get - // its status. 
- TaskName(name) } /// Enumerate all registered background tasks @@ -315,436 +329,437 @@ impl GenericWatcher for watch::Receiver { } } -#[cfg(test)] -mod test { - use super::BackgroundTask; - use super::Driver; - use crate::app::sagas::SagaRequest; - use assert_matches::assert_matches; - use chrono::Utc; - use futures::future::BoxFuture; - use futures::FutureExt; - use nexus_db_queries::context::OpContext; - use nexus_test_utils_macros::nexus_test; - use nexus_types::internal_api::views::ActivationReason; - use std::time::Duration; - use std::time::Instant; - use tokio::sync::mpsc; - use tokio::sync::mpsc::error::TryRecvError; - use tokio::sync::mpsc::Sender; - use tokio::sync::watch; - - type ControlPlaneTestContext = - nexus_test_utils::ControlPlaneTestContext; - - /// Simple BackgroundTask impl that just reports how many times it's run. - struct ReportingTask { - counter: usize, - tx: watch::Sender, - } - - impl ReportingTask { - fn new() -> (ReportingTask, watch::Receiver) { - let (tx, rx) = watch::channel(0); - (ReportingTask { counter: 1, tx }, rx) - } - } - - impl BackgroundTask for ReportingTask { - fn activate<'a>( - &'a mut self, - _: &'a OpContext, - ) -> BoxFuture<'a, serde_json::Value> { - async { - let count = self.counter; - self.counter += 1; - self.tx.send_replace(count); - serde_json::Value::Number(serde_json::Number::from(count)) - } - .boxed() - } - } - - async fn wait_until_count(mut rx: watch::Receiver, count: usize) { - loop { - let v = rx.borrow_and_update(); - assert!(*v <= count, "count went past what we expected"); - if *v == count { - return; - } - drop(v); - - tokio::time::timeout(Duration::from_secs(5), rx.changed()) - .await - .unwrap() - .unwrap(); - } - } - - // Verifies that activation through each of the three mechanisms (explicit - // signal, timeout, or dependency) causes exactly the right tasks to be - // activated - #[nexus_test(server = crate::Server)] - async fn test_driver_basic(cptestctx: &ControlPlaneTestContext) { - let 
nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let opctx = OpContext::for_tests( - cptestctx.logctx.log.clone(), - datastore.clone(), - ); - - // Create up front: - // - // - three ReportingTasks (our background tasks) - // - two "watch" channels used as dependencies for these tasks - - let (t1, rx1) = ReportingTask::new(); - let (t2, rx2) = ReportingTask::new(); - let (t3, rx3) = ReportingTask::new(); - let (dep_tx1, dep_rx1) = watch::channel(0); - let (dep_tx2, dep_rx2) = watch::channel(0); - let mut driver = Driver::new(); - - assert_eq!(*rx1.borrow(), 0); - let h1 = driver.register( - "t1".to_string(), - "test task".to_string(), - Duration::from_millis(100), - Box::new(t1), - opctx.child(std::collections::BTreeMap::new()), - vec![Box::new(dep_rx1.clone()), Box::new(dep_rx2.clone())], - ); - - let h2 = driver.register( - "t2".to_string(), - "test task".to_string(), - Duration::from_secs(300), // should never fire in this test - Box::new(t2), - opctx.child(std::collections::BTreeMap::new()), - vec![Box::new(dep_rx1.clone())], - ); - - let h3 = driver.register( - "t3".to_string(), - "test task".to_string(), - Duration::from_secs(300), // should never fire in this test - Box::new(t3), - opctx, - vec![Box::new(dep_rx1), Box::new(dep_rx2)], - ); - - // Wait for four activations of our task. (This is three periods.) That - // should take between 300ms and 400ms. Allow extra time for a busy - // system. - let start = Instant::now(); - let wall_start = Utc::now(); - wait_until_count(rx1.clone(), 4).await; - assert!(*rx1.borrow() == 4 || *rx1.borrow() == 5); - let duration = start.elapsed(); - println!("rx1 -> 3 took {:?}", duration); - assert!( - duration.as_millis() < 1000, - "took longer than 1s to activate our every-100ms-task three times" - ); - assert!(duration.as_millis() >= 300); - // Check how the last activation was reported. 
- let status = driver.task_status(&h1); - let last = status.last.unwrap_completion(); - // It's conceivable that there's been another activation already. - assert!(last.iteration == 3 || last.iteration == 4); - assert!(last.start_time >= wall_start); - assert!(last.start_time <= Utc::now()); - assert!(last.elapsed <= duration); - assert_matches!( - last.details, - serde_json::Value::Number(n) - if n.as_u64().unwrap() == last.iteration - ); - - // Tasks "t2" and "t3" ought to have seen only one activation in this - // time, from its beginning-of-time activation. - assert_eq!(*rx2.borrow(), 1); - assert_eq!(*rx3.borrow(), 1); - let status = driver.task_status(&h2); - let last = status.last.unwrap_completion(); - assert_eq!(last.iteration, 1); - let status = driver.task_status(&h3); - let last = status.last.unwrap_completion(); - assert_eq!(last.iteration, 1); - - // Explicitly wake up all of our tasks by reporting that dep1 has - // changed. - println!("firing dependency tx1"); - dep_tx1.send_replace(1); - wait_until_count(rx2.clone(), 2).await; - wait_until_count(rx3.clone(), 2).await; - assert_eq!(*rx2.borrow(), 2); - assert_eq!(*rx3.borrow(), 2); - let status = driver.task_status(&h2); - let last = status.last.unwrap_completion(); - assert_eq!(last.iteration, 2); - let status = driver.task_status(&h3); - let last = status.last.unwrap_completion(); - assert_eq!(last.iteration, 2); - - // Explicitly wake up just "t3" by reporting that dep2 has changed. - println!("firing dependency tx2"); - dep_tx2.send_replace(1); - wait_until_count(rx3.clone(), 3).await; - assert_eq!(*rx2.borrow(), 2); - assert_eq!(*rx3.borrow(), 3); - let status = driver.task_status(&h2); - let last = status.last.unwrap_completion(); - assert_eq!(last.iteration, 2); - let status = driver.task_status(&h3); - let last = status.last.unwrap_completion(); - assert_eq!(last.iteration, 3); - - // Explicitly activate just "t3". 
- driver.activate(&h3); - wait_until_count(rx3.clone(), 4).await; - assert_eq!(*rx2.borrow(), 2); - assert_eq!(*rx3.borrow(), 4); - let status = driver.task_status(&h2); - let last = status.last.unwrap_completion(); - assert_eq!(last.iteration, 2); - let status = driver.task_status(&h3); - let last = status.last.unwrap_completion(); - assert_eq!(last.iteration, 4); - } - - /// Simple background task that moves in lockstep with a consumer, allowing - /// the creator to be notified when it becomes active and to determine when - /// the activation finishes. - struct PausingTask { - counter: usize, - ready_tx: mpsc::Sender, - wait_rx: mpsc::Receiver<()>, - } - - impl PausingTask { - fn new( - wait_rx: mpsc::Receiver<()>, - ) -> (PausingTask, mpsc::Receiver) { - let (ready_tx, ready_rx) = mpsc::channel(10); - (PausingTask { counter: 1, wait_rx, ready_tx }, ready_rx) - } - } - - impl BackgroundTask for PausingTask { - fn activate<'a>( - &'a mut self, - _: &'a OpContext, - ) -> BoxFuture<'a, serde_json::Value> { - async { - let count = self.counter; - self.counter += 1; - let _ = self.ready_tx.send(count).await; - let _ = self.wait_rx.recv().await; - serde_json::Value::Null - } - .boxed() - } - } - - // Exercises various case of activation while a background task is currently - // activated. 
- #[nexus_test(server = crate::Server)] - async fn test_activation_in_progress(cptestctx: &ControlPlaneTestContext) { - let nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let opctx = OpContext::for_tests( - cptestctx.logctx.log.clone(), - datastore.clone(), - ); - - let mut driver = Driver::new(); - let (tx1, rx1) = mpsc::channel(10); - let (t1, mut ready_rx1) = PausingTask::new(rx1); - let (dep_tx1, dep_rx1) = watch::channel(0); - let before_wall = Utc::now(); - let before_instant = Instant::now(); - let h1 = driver.register( - "t1".to_string(), - "test task".to_string(), - Duration::from_secs(300), // should not elapse during test - Box::new(t1), - opctx.child(std::collections::BTreeMap::new()), - vec![Box::new(dep_rx1.clone())], - ); - - // Wait to enter the first activation. - let which = ready_rx1.recv().await.unwrap(); - assert_eq!(which, 1); - let after_wall = Utc::now(); - let after_instant = Instant::now(); - // Verify that it's a timeout-based activation. - let status = driver.task_status(&h1); - assert!(!status.last.has_completed()); - let current = status.current.unwrap_running(); - assert!(current.start_time >= before_wall); - assert!(current.start_time <= after_wall); - assert!(current.start_instant >= before_instant); - assert!(current.start_instant <= after_instant); - assert_eq!(current.iteration, 1); - assert_eq!(current.reason, ActivationReason::Timeout); - // Enqueue another activation by dependency while this one is still - // running. - dep_tx1.send_replace(1); - // Complete the activation. - tx1.send(()).await.unwrap(); - - // We should immediately see another activation. - let which = ready_rx1.recv().await.unwrap(); - assert_eq!(which, 2); - assert!(after_instant.elapsed().as_millis() < 5000); - // Verify that it's a dependency-caused activation. 
- let status = driver.task_status(&h1); - let last = status.last.unwrap_completion(); - assert_eq!(last.start_time, current.start_time); - assert_eq!(last.iteration, current.iteration); - let current = status.current.unwrap_running(); - assert!(current.start_time >= after_wall); - assert!(current.start_instant >= after_instant); - assert_eq!(current.iteration, 2); - assert_eq!(current.reason, ActivationReason::Dependency); - // Enqueue another activation by explicit signal while this one is still - // running. - driver.activate(&h1); - // Complete the activation. - tx1.send(()).await.unwrap(); - - // We should immediately see another activation. - let which = ready_rx1.recv().await.unwrap(); - assert_eq!(which, 3); - assert!(after_instant.elapsed().as_millis() < 10000); - // Verify that it's a signal-caused activation. - let status = driver.task_status(&h1); - let last = status.last.unwrap_completion(); - assert_eq!(last.start_time, current.start_time); - assert_eq!(last.iteration, current.iteration); - let current = status.current.unwrap_running(); - assert_eq!(current.iteration, 3); - assert_eq!(current.reason, ActivationReason::Signaled); - // This time, queue up several explicit activations. - driver.activate(&h1); - driver.activate(&h1); - driver.activate(&h1); - tx1.send(()).await.unwrap(); - - // Again, we should see an activation basically immediately. - let which = ready_rx1.recv().await.unwrap(); - assert_eq!(which, 4); - tx1.send(()).await.unwrap(); - - // But we should not see any more activations. Those multiple - // notifications should have gotten collapsed. It is hard to know - // there's not another one coming, so we just wait long enough that we - // expect to have seen it if it is coming. 
- tokio::time::sleep(Duration::from_secs(1)).await; - let status = driver.task_status(&h1); - assert!(status.current.is_idle()); - assert_eq!(status.last.unwrap_completion().iteration, 4); - assert_matches!(ready_rx1.try_recv(), Err(TryRecvError::Empty)); - - // Now, trigger several dependency-based activations. We should see the - // same result: these get collapsed. - dep_tx1.send_replace(2); - dep_tx1.send_replace(3); - dep_tx1.send_replace(4); - let which = ready_rx1.recv().await.unwrap(); - assert_eq!(which, 5); - tx1.send(()).await.unwrap(); - tokio::time::sleep(Duration::from_secs(1)).await; - let status = driver.task_status(&h1); - assert!(status.current.is_idle()); - assert_eq!(status.last.unwrap_completion().iteration, 5); - assert_matches!(ready_rx1.try_recv(), Err(TryRecvError::Empty)); - - // It would be nice to also verify that multiple time-based activations - // also get collapsed, but this is a fair bit trickier. Using the same - // approach, we'd need to wait long enough that we'd catch any - // _erroneous_ activation, but not so long that we might catch the next - // legitimate periodic activation. It's hard to choose a period for - // such a task that would allow us to reliably distinguish between these - // two without also spending a lot of wall-clock time on this test. 
- } - - /// Simple BackgroundTask impl that sends a test-only SagaRequest - struct SagaRequestTask { - saga_request: Sender, - } - - impl SagaRequestTask { - fn new(saga_request: Sender) -> SagaRequestTask { - SagaRequestTask { saga_request } - } - } - - impl BackgroundTask for SagaRequestTask { - fn activate<'a>( - &'a mut self, - _: &'a OpContext, - ) -> BoxFuture<'a, serde_json::Value> { - async { - let _ = self.saga_request.send(SagaRequest::TestOnly).await; - serde_json::Value::Null - } - .boxed() - } - } - - #[nexus_test(server = crate::Server)] - async fn test_saga_request_flow(cptestctx: &ControlPlaneTestContext) { - let nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let opctx = OpContext::for_tests( - cptestctx.logctx.log.clone(), - datastore.clone(), - ); - - let (saga_request, mut saga_request_recv) = SagaRequest::channel(); - let t1 = SagaRequestTask::new(saga_request); - - let mut driver = Driver::new(); - let (_dep_tx1, dep_rx1) = watch::channel(0); - - let h1 = driver.register( - "t1".to_string(), - "test saga request flow task".to_string(), - Duration::from_secs(300), // should not fire in this test - Box::new(t1), - opctx.child(std::collections::BTreeMap::new()), - vec![Box::new(dep_rx1.clone())], - ); - - assert!(matches!( - saga_request_recv.try_recv(), - Err(mpsc::error::TryRecvError::Empty), - )); - - driver.activate(&h1); - - // wait 1 second for the saga request to arrive - tokio::select! 
{ - _ = tokio::time::sleep(tokio::time::Duration::from_secs(1)) => { - assert!(false); - } - - saga_request = saga_request_recv.recv() => { - match saga_request { - None => { - assert!(false); - } - - Some(saga_request) => { - assert!(matches!( - saga_request, - SagaRequest::TestOnly, - )); - } - } - } - } - } -} +// XXX-dap +// #[cfg(test)] +// mod test { +// use super::BackgroundTask; +// use super::Driver; +// use crate::app::sagas::SagaRequest; +// use assert_matches::assert_matches; +// use chrono::Utc; +// use futures::future::BoxFuture; +// use futures::FutureExt; +// use nexus_db_queries::context::OpContext; +// use nexus_test_utils_macros::nexus_test; +// use nexus_types::internal_api::views::ActivationReason; +// use std::time::Duration; +// use std::time::Instant; +// use tokio::sync::mpsc; +// use tokio::sync::mpsc::error::TryRecvError; +// use tokio::sync::mpsc::Sender; +// use tokio::sync::watch; +// +// type ControlPlaneTestContext = +// nexus_test_utils::ControlPlaneTestContext; +// +// /// Simple BackgroundTask impl that just reports how many times it's run. 
+// struct ReportingTask { +// counter: usize, +// tx: watch::Sender, +// } +// +// impl ReportingTask { +// fn new() -> (ReportingTask, watch::Receiver) { +// let (tx, rx) = watch::channel(0); +// (ReportingTask { counter: 1, tx }, rx) +// } +// } +// +// impl BackgroundTask for ReportingTask { +// fn activate<'a>( +// &'a mut self, +// _: &'a OpContext, +// ) -> BoxFuture<'a, serde_json::Value> { +// async { +// let count = self.counter; +// self.counter += 1; +// self.tx.send_replace(count); +// serde_json::Value::Number(serde_json::Number::from(count)) +// } +// .boxed() +// } +// } +// +// async fn wait_until_count(mut rx: watch::Receiver, count: usize) { +// loop { +// let v = rx.borrow_and_update(); +// assert!(*v <= count, "count went past what we expected"); +// if *v == count { +// return; +// } +// drop(v); +// +// tokio::time::timeout(Duration::from_secs(5), rx.changed()) +// .await +// .unwrap() +// .unwrap(); +// } +// } +// +// // Verifies that activation through each of the three mechanisms (explicit +// // signal, timeout, or dependency) causes exactly the right tasks to be +// // activated +// #[nexus_test(server = crate::Server)] +// async fn test_driver_basic(cptestctx: &ControlPlaneTestContext) { +// let nexus = &cptestctx.server.server_context().nexus; +// let datastore = nexus.datastore(); +// let opctx = OpContext::for_tests( +// cptestctx.logctx.log.clone(), +// datastore.clone(), +// ); +// +// // Create up front: +// // +// // - three ReportingTasks (our background tasks) +// // - two "watch" channels used as dependencies for these tasks +// +// let (t1, rx1) = ReportingTask::new(); +// let (t2, rx2) = ReportingTask::new(); +// let (t3, rx3) = ReportingTask::new(); +// let (dep_tx1, dep_rx1) = watch::channel(0); +// let (dep_tx2, dep_rx2) = watch::channel(0); +// let mut driver = Driver::new(); +// +// assert_eq!(*rx1.borrow(), 0); +// let h1 = driver.register( +// "t1".to_string(), +// "test task".to_string(), +// 
Duration::from_millis(100), +// Box::new(t1), +// opctx.child(std::collections::BTreeMap::new()), +// vec![Box::new(dep_rx1.clone()), Box::new(dep_rx2.clone())], +// ); +// +// let h2 = driver.register( +// "t2".to_string(), +// "test task".to_string(), +// Duration::from_secs(300), // should never fire in this test +// Box::new(t2), +// opctx.child(std::collections::BTreeMap::new()), +// vec![Box::new(dep_rx1.clone())], +// ); +// +// let h3 = driver.register( +// "t3".to_string(), +// "test task".to_string(), +// Duration::from_secs(300), // should never fire in this test +// Box::new(t3), +// opctx, +// vec![Box::new(dep_rx1), Box::new(dep_rx2)], +// ); +// +// // Wait for four activations of our task. (This is three periods.) That +// // should take between 300ms and 400ms. Allow extra time for a busy +// // system. +// let start = Instant::now(); +// let wall_start = Utc::now(); +// wait_until_count(rx1.clone(), 4).await; +// assert!(*rx1.borrow() == 4 || *rx1.borrow() == 5); +// let duration = start.elapsed(); +// println!("rx1 -> 3 took {:?}", duration); +// assert!( +// duration.as_millis() < 1000, +// "took longer than 1s to activate our every-100ms-task three times" +// ); +// assert!(duration.as_millis() >= 300); +// // Check how the last activation was reported. +// let status = driver.task_status(&h1); +// let last = status.last.unwrap_completion(); +// // It's conceivable that there's been another activation already. +// assert!(last.iteration == 3 || last.iteration == 4); +// assert!(last.start_time >= wall_start); +// assert!(last.start_time <= Utc::now()); +// assert!(last.elapsed <= duration); +// assert_matches!( +// last.details, +// serde_json::Value::Number(n) +// if n.as_u64().unwrap() == last.iteration +// ); +// +// // Tasks "t2" and "t3" ought to have seen only one activation in this +// // time, from its beginning-of-time activation. 
+// assert_eq!(*rx2.borrow(), 1); +// assert_eq!(*rx3.borrow(), 1); +// let status = driver.task_status(&h2); +// let last = status.last.unwrap_completion(); +// assert_eq!(last.iteration, 1); +// let status = driver.task_status(&h3); +// let last = status.last.unwrap_completion(); +// assert_eq!(last.iteration, 1); +// +// // Explicitly wake up all of our tasks by reporting that dep1 has +// // changed. +// println!("firing dependency tx1"); +// dep_tx1.send_replace(1); +// wait_until_count(rx2.clone(), 2).await; +// wait_until_count(rx3.clone(), 2).await; +// assert_eq!(*rx2.borrow(), 2); +// assert_eq!(*rx3.borrow(), 2); +// let status = driver.task_status(&h2); +// let last = status.last.unwrap_completion(); +// assert_eq!(last.iteration, 2); +// let status = driver.task_status(&h3); +// let last = status.last.unwrap_completion(); +// assert_eq!(last.iteration, 2); +// +// // Explicitly wake up just "t3" by reporting that dep2 has changed. +// println!("firing dependency tx2"); +// dep_tx2.send_replace(1); +// wait_until_count(rx3.clone(), 3).await; +// assert_eq!(*rx2.borrow(), 2); +// assert_eq!(*rx3.borrow(), 3); +// let status = driver.task_status(&h2); +// let last = status.last.unwrap_completion(); +// assert_eq!(last.iteration, 2); +// let status = driver.task_status(&h3); +// let last = status.last.unwrap_completion(); +// assert_eq!(last.iteration, 3); +// +// // Explicitly activate just "t3". 
+// driver.activate(&h3); +// wait_until_count(rx3.clone(), 4).await; +// assert_eq!(*rx2.borrow(), 2); +// assert_eq!(*rx3.borrow(), 4); +// let status = driver.task_status(&h2); +// let last = status.last.unwrap_completion(); +// assert_eq!(last.iteration, 2); +// let status = driver.task_status(&h3); +// let last = status.last.unwrap_completion(); +// assert_eq!(last.iteration, 4); +// } +// +// /// Simple background task that moves in lockstep with a consumer, allowing +// /// the creator to be notified when it becomes active and to determine when +// /// the activation finishes. +// struct PausingTask { +// counter: usize, +// ready_tx: mpsc::Sender, +// wait_rx: mpsc::Receiver<()>, +// } +// +// impl PausingTask { +// fn new( +// wait_rx: mpsc::Receiver<()>, +// ) -> (PausingTask, mpsc::Receiver) { +// let (ready_tx, ready_rx) = mpsc::channel(10); +// (PausingTask { counter: 1, wait_rx, ready_tx }, ready_rx) +// } +// } +// +// impl BackgroundTask for PausingTask { +// fn activate<'a>( +// &'a mut self, +// _: &'a OpContext, +// ) -> BoxFuture<'a, serde_json::Value> { +// async { +// let count = self.counter; +// self.counter += 1; +// let _ = self.ready_tx.send(count).await; +// let _ = self.wait_rx.recv().await; +// serde_json::Value::Null +// } +// .boxed() +// } +// } +// +// // Exercises various case of activation while a background task is currently +// // activated. 
+// #[nexus_test(server = crate::Server)] +// async fn test_activation_in_progress(cptestctx: &ControlPlaneTestContext) { +// let nexus = &cptestctx.server.server_context().nexus; +// let datastore = nexus.datastore(); +// let opctx = OpContext::for_tests( +// cptestctx.logctx.log.clone(), +// datastore.clone(), +// ); +// +// let mut driver = Driver::new(); +// let (tx1, rx1) = mpsc::channel(10); +// let (t1, mut ready_rx1) = PausingTask::new(rx1); +// let (dep_tx1, dep_rx1) = watch::channel(0); +// let before_wall = Utc::now(); +// let before_instant = Instant::now(); +// let h1 = driver.register( +// "t1".to_string(), +// "test task".to_string(), +// Duration::from_secs(300), // should not elapse during test +// Box::new(t1), +// opctx.child(std::collections::BTreeMap::new()), +// vec![Box::new(dep_rx1.clone())], +// ); +// +// // Wait to enter the first activation. +// let which = ready_rx1.recv().await.unwrap(); +// assert_eq!(which, 1); +// let after_wall = Utc::now(); +// let after_instant = Instant::now(); +// // Verify that it's a timeout-based activation. +// let status = driver.task_status(&h1); +// assert!(!status.last.has_completed()); +// let current = status.current.unwrap_running(); +// assert!(current.start_time >= before_wall); +// assert!(current.start_time <= after_wall); +// assert!(current.start_instant >= before_instant); +// assert!(current.start_instant <= after_instant); +// assert_eq!(current.iteration, 1); +// assert_eq!(current.reason, ActivationReason::Timeout); +// // Enqueue another activation by dependency while this one is still +// // running. +// dep_tx1.send_replace(1); +// // Complete the activation. +// tx1.send(()).await.unwrap(); +// +// // We should immediately see another activation. +// let which = ready_rx1.recv().await.unwrap(); +// assert_eq!(which, 2); +// assert!(after_instant.elapsed().as_millis() < 5000); +// // Verify that it's a dependency-caused activation. 
+// let status = driver.task_status(&h1); +// let last = status.last.unwrap_completion(); +// assert_eq!(last.start_time, current.start_time); +// assert_eq!(last.iteration, current.iteration); +// let current = status.current.unwrap_running(); +// assert!(current.start_time >= after_wall); +// assert!(current.start_instant >= after_instant); +// assert_eq!(current.iteration, 2); +// assert_eq!(current.reason, ActivationReason::Dependency); +// // Enqueue another activation by explicit signal while this one is still +// // running. +// driver.activate(&h1); +// // Complete the activation. +// tx1.send(()).await.unwrap(); +// +// // We should immediately see another activation. +// let which = ready_rx1.recv().await.unwrap(); +// assert_eq!(which, 3); +// assert!(after_instant.elapsed().as_millis() < 10000); +// // Verify that it's a signal-caused activation. +// let status = driver.task_status(&h1); +// let last = status.last.unwrap_completion(); +// assert_eq!(last.start_time, current.start_time); +// assert_eq!(last.iteration, current.iteration); +// let current = status.current.unwrap_running(); +// assert_eq!(current.iteration, 3); +// assert_eq!(current.reason, ActivationReason::Signaled); +// // This time, queue up several explicit activations. +// driver.activate(&h1); +// driver.activate(&h1); +// driver.activate(&h1); +// tx1.send(()).await.unwrap(); +// +// // Again, we should see an activation basically immediately. +// let which = ready_rx1.recv().await.unwrap(); +// assert_eq!(which, 4); +// tx1.send(()).await.unwrap(); +// +// // But we should not see any more activations. Those multiple +// // notifications should have gotten collapsed. It is hard to know +// // there's not another one coming, so we just wait long enough that we +// // expect to have seen it if it is coming. 
+// tokio::time::sleep(Duration::from_secs(1)).await; +// let status = driver.task_status(&h1); +// assert!(status.current.is_idle()); +// assert_eq!(status.last.unwrap_completion().iteration, 4); +// assert_matches!(ready_rx1.try_recv(), Err(TryRecvError::Empty)); +// +// // Now, trigger several dependency-based activations. We should see the +// // same result: these get collapsed. +// dep_tx1.send_replace(2); +// dep_tx1.send_replace(3); +// dep_tx1.send_replace(4); +// let which = ready_rx1.recv().await.unwrap(); +// assert_eq!(which, 5); +// tx1.send(()).await.unwrap(); +// tokio::time::sleep(Duration::from_secs(1)).await; +// let status = driver.task_status(&h1); +// assert!(status.current.is_idle()); +// assert_eq!(status.last.unwrap_completion().iteration, 5); +// assert_matches!(ready_rx1.try_recv(), Err(TryRecvError::Empty)); +// +// // It would be nice to also verify that multiple time-based activations +// // also get collapsed, but this is a fair bit trickier. Using the same +// // approach, we'd need to wait long enough that we'd catch any +// // _erroneous_ activation, but not so long that we might catch the next +// // legitimate periodic activation. It's hard to choose a period for +// // such a task that would allow us to reliably distinguish between these +// // two without also spending a lot of wall-clock time on this test. 
+// } +// +// /// Simple BackgroundTask impl that sends a test-only SagaRequest +// struct SagaRequestTask { +// saga_request: Sender, +// } +// +// impl SagaRequestTask { +// fn new(saga_request: Sender) -> SagaRequestTask { +// SagaRequestTask { saga_request } +// } +// } +// +// impl BackgroundTask for SagaRequestTask { +// fn activate<'a>( +// &'a mut self, +// _: &'a OpContext, +// ) -> BoxFuture<'a, serde_json::Value> { +// async { +// let _ = self.saga_request.send(SagaRequest::TestOnly).await; +// serde_json::Value::Null +// } +// .boxed() +// } +// } +// +// #[nexus_test(server = crate::Server)] +// async fn test_saga_request_flow(cptestctx: &ControlPlaneTestContext) { +// let nexus = &cptestctx.server.server_context().nexus; +// let datastore = nexus.datastore(); +// let opctx = OpContext::for_tests( +// cptestctx.logctx.log.clone(), +// datastore.clone(), +// ); +// +// let (saga_request, mut saga_request_recv) = SagaRequest::channel(); +// let t1 = SagaRequestTask::new(saga_request); +// +// let mut driver = Driver::new(); +// let (_dep_tx1, dep_rx1) = watch::channel(0); +// +// let h1 = driver.register( +// "t1".to_string(), +// "test saga request flow task".to_string(), +// Duration::from_secs(300), // should not fire in this test +// Box::new(t1), +// opctx.child(std::collections::BTreeMap::new()), +// vec![Box::new(dep_rx1.clone())], +// ); +// +// assert!(matches!( +// saga_request_recv.try_recv(), +// Err(mpsc::error::TryRecvError::Empty), +// )); +// +// driver.activate(&h1); +// +// // wait 1 second for the saga request to arrive +// tokio::select! 
{ +// _ = tokio::time::sleep(tokio::time::Duration::from_secs(1)) => { +// assert!(false); +// } +// +// saga_request = saga_request_recv.recv() => { +// match saga_request { +// None => { +// assert!(false); +// } +// +// Some(saga_request) => { +// assert!(matches!( +// saga_request, +// SagaRequest::TestOnly, +// )); +// } +// } +// } +// } +// } +// } diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 109bf4f551..ec698e7d07 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -3,6 +3,7 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. //! Specific background task initialization +// XXX-dap TODO-doc this whole file use super::tasks::abandoned_vmm_reaper; use super::tasks::bfd; @@ -25,7 +26,6 @@ use super::tasks::sync_service_zone_nat::ServiceZoneNatTracker; use super::tasks::sync_switch_configuration::SwitchPortSettingsManager; use super::tasks::v2p_mappings::V2PManager; use super::Driver; -use super::TaskName; use crate::app::oximeter::PRODUCER_LEASE_DURATION; use crate::app::sagas::SagaRequest; use nexus_config::BackgroundTaskConfig; @@ -35,82 +35,198 @@ use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use oximeter::types::ProducerRegistry; use std::collections::BTreeMap; +use std::sync::atomic::AtomicBool; use std::sync::Arc; use tokio::sync::mpsc::Sender; +use tokio::sync::watch; +use tokio::sync::Notify; use uuid::Uuid; /// Describes ongoing background tasks and provides interfaces for activating /// them and accessing any data that they expose to Nexus at-large +// XXX-dap TODO-doc describe two-phase initialization pub struct BackgroundTasks { - /// interface for working with background tasks (activation, checking - /// status, etc.) 
- pub driver: Driver, - - pub task_internal_dns_config: TaskName, - pub task_internal_dns_servers: TaskName, - pub task_external_dns_config: TaskName, - pub task_external_dns_servers: TaskName, - pub task_metrics_producer_gc: TaskName, - pub task_external_endpoints: TaskName, - pub task_nat_cleanup: TaskName, - pub task_bfd_manager: TaskName, - pub task_inventory_collection: TaskName, - pub task_physical_disk_adoption: TaskName, - pub task_phantom_disks: TaskName, - pub task_blueprint_loader: TaskName, - pub task_blueprint_executor: TaskName, - pub task_crdb_node_id_collector: TaskName, - pub task_service_zone_nat_tracker: TaskName, - pub task_switch_port_settings_manager: TaskName, - pub task_v2p_manager: TaskName, - pub task_region_replacement: TaskName, - pub task_instance_watcher: TaskName, - pub task_service_firewall_propagation: TaskName, - pub task_abandoned_vmm_reaper: TaskName, - + // handles to individual background tasks, used to activate them + pub task_internal_dns_config: ExternalTaskHandle, + pub task_internal_dns_servers: ExternalTaskHandle, + pub task_external_dns_config: ExternalTaskHandle, + pub task_external_dns_servers: ExternalTaskHandle, + pub task_metrics_producer_gc: ExternalTaskHandle, + pub task_external_endpoints: ExternalTaskHandle, + pub task_nat_cleanup: ExternalTaskHandle, + pub task_bfd_manager: ExternalTaskHandle, + pub task_inventory_collection: ExternalTaskHandle, + pub task_physical_disk_adoption: ExternalTaskHandle, + pub task_phantom_disks: ExternalTaskHandle, + pub task_blueprint_loader: ExternalTaskHandle, + pub task_blueprint_executor: ExternalTaskHandle, + pub task_crdb_node_id_collector: ExternalTaskHandle, + pub task_service_zone_nat_tracker: ExternalTaskHandle, + pub task_switch_port_settings_manager: ExternalTaskHandle, + pub task_v2p_manager: ExternalTaskHandle, + pub task_region_replacement: ExternalTaskHandle, + pub task_instance_watcher: ExternalTaskHandle, + pub task_service_firewall_propagation: 
ExternalTaskHandle, + pub task_abandoned_vmm_reaper: ExternalTaskHandle, + + // XXX-dap stuff that wasn't here before because it couldn't be externally + // activated + task_internal_dns_propagation: ExternalTaskHandle, + task_external_dns_propagation: ExternalTaskHandle, + + // data exposed by various background tasks to Nexus at-large /// external endpoints read by the background task - pub external_endpoints: tokio::sync::watch::Receiver< - Option, - >, + pub external_endpoints: + watch::Receiver>, } -impl BackgroundTasks { - /// Kick off all background tasks - #[allow(clippy::too_many_arguments)] - pub fn start( - opctx: &OpContext, +pub struct BackgroundTasksInitializer { + driver: Driver, + + external_endpoints_tx: + watch::Sender>, + + opctx: OpContext, + datastore: Arc, + config: BackgroundTaskConfig, + rack_id: Uuid, + nexus_id: Uuid, + resolver: internal_dns::resolver::Resolver, + saga_request: Sender, + v2p_watcher: (watch::Sender<()>, watch::Receiver<()>), + producer_registry: ProducerRegistry, +} + +impl BackgroundTasksInitializer { + // XXX-dap now that we're accepting owned copies of these, there's no need + // for them to be in `new()` or the struct + pub fn new( + opctx: OpContext, datastore: Arc, - config: &BackgroundTaskConfig, + config: BackgroundTaskConfig, rack_id: Uuid, nexus_id: Uuid, resolver: internal_dns::resolver::Resolver, saga_request: Sender, - v2p_watcher: ( - tokio::sync::watch::Sender<()>, - tokio::sync::watch::Receiver<()>, - ), - producer_registry: &ProducerRegistry, - ) -> BackgroundTasks { - let mut driver = Driver::new(); + v2p_watcher: (watch::Sender<()>, watch::Receiver<()>), + producer_registry: ProducerRegistry, + ) -> (BackgroundTasksInitializer, BackgroundTasks) { + let (external_endpoints_tx, external_endpoints_rx) = + watch::channel(None); + + let initializer = BackgroundTasksInitializer { + driver: Driver::new(), + external_endpoints_tx, + + opctx, + datastore, + config, + rack_id, + nexus_id, + resolver, + 
saga_request, + v2p_watcher, + producer_registry, + }; + + let background_tasks = BackgroundTasks { + task_internal_dns_config: ExternalTaskHandle::new_stub(), + task_internal_dns_servers: ExternalTaskHandle::new_stub(), + task_external_dns_config: ExternalTaskHandle::new_stub(), + task_external_dns_servers: ExternalTaskHandle::new_stub(), + task_metrics_producer_gc: ExternalTaskHandle::new_stub(), + task_external_endpoints: ExternalTaskHandle::new_stub(), + task_nat_cleanup: ExternalTaskHandle::new_stub(), + task_bfd_manager: ExternalTaskHandle::new_stub(), + task_inventory_collection: ExternalTaskHandle::new_stub(), + task_physical_disk_adoption: ExternalTaskHandle::new_stub(), + task_phantom_disks: ExternalTaskHandle::new_stub(), + task_blueprint_loader: ExternalTaskHandle::new_stub(), + task_blueprint_executor: ExternalTaskHandle::new_stub(), + task_crdb_node_id_collector: ExternalTaskHandle::new_stub(), + task_service_zone_nat_tracker: ExternalTaskHandle::new_stub(), + task_switch_port_settings_manager: ExternalTaskHandle::new_stub(), + task_v2p_manager: ExternalTaskHandle::new_stub(), + task_region_replacement: ExternalTaskHandle::new_stub(), + task_instance_watcher: ExternalTaskHandle::new_stub(), + task_service_firewall_propagation: ExternalTaskHandle::new_stub(), + task_abandoned_vmm_reaper: ExternalTaskHandle::new_stub(), + + task_internal_dns_propagation: ExternalTaskHandle::new_stub(), + task_external_dns_propagation: ExternalTaskHandle::new_stub(), + + external_endpoints: external_endpoints_rx, + }; + + (initializer, background_tasks) + } + + pub fn start(self, background_tasks: &'_ BackgroundTasks) -> Driver { + let mut driver = self.driver; + let opctx = &self.opctx; + let datastore = self.datastore; + let config = self.config; + let rack_id = self.rack_id; + let nexus_id = self.nexus_id; + let resolver = self.resolver; + let saga_request = self.saga_request; + let v2p_watcher = self.v2p_watcher; + let producer_registry = &self.producer_registry; + + 
// XXX-dap TODO-doc this construction helps ensure we don't miss + // anything + let BackgroundTasks { + task_internal_dns_config, + task_internal_dns_servers, + task_internal_dns_propagation, + task_external_dns_config, + task_external_dns_servers, + task_external_dns_propagation, + task_metrics_producer_gc, + task_external_endpoints, + task_nat_cleanup, + task_bfd_manager, + task_inventory_collection, + task_physical_disk_adoption, + task_phantom_disks, + task_blueprint_loader, + task_blueprint_executor, + task_crdb_node_id_collector, + task_service_zone_nat_tracker, + task_switch_port_settings_manager, + task_v2p_manager, + task_region_replacement, + task_instance_watcher, + task_service_firewall_propagation, + task_abandoned_vmm_reaper, + external_endpoints: _external_endpoints, + } = &background_tasks; - let (task_internal_dns_config, task_internal_dns_servers) = init_dns( + init_dns( &mut driver, opctx, datastore.clone(), DnsGroup::Internal, resolver.clone(), &config.dns_internal, + task_internal_dns_config, + task_internal_dns_servers, + task_internal_dns_propagation, ); - let (task_external_dns_config, task_external_dns_servers) = init_dns( + + init_dns( &mut driver, opctx, datastore.clone(), DnsGroup::External, resolver.clone(), &config.dns_external, + task_external_dns_config, + task_external_dns_servers, + task_external_dns_propagation, ); - let task_metrics_producer_gc = { + { let gc = metrics_producer_gc::MetricProducerGc::new( datastore.clone(), PRODUCER_LEASE_DURATION, @@ -125,16 +241,17 @@ impl BackgroundTasks { Box::new(gc), opctx.child(BTreeMap::new()), vec![], + task_metrics_producer_gc, ) }; // Background task: External endpoints list watcher - let (task_external_endpoints, external_endpoints) = { + { let watcher = external_endpoints::ExternalEndpointsWatcher::new( datastore.clone(), + self.external_endpoints_tx, ); - let watcher_channel = watcher.watcher(); - let task = driver.register( + driver.register( String::from("external_endpoints"), 
String::from( "reads config for silos and TLS certificates to determine \ @@ -145,72 +262,66 @@ impl BackgroundTasks { Box::new(watcher), opctx.child(BTreeMap::new()), vec![], + task_external_endpoints, ); - (task, watcher_channel) - }; + } - let task_nat_cleanup = { - driver.register( - "nat_v4_garbage_collector".to_string(), - String::from( - "prunes soft-deleted IPV4 NAT entries from ipv4_nat_entry \ - table based on a predetermined retention policy", - ), - config.nat_cleanup.period_secs, - Box::new(nat_cleanup::Ipv4NatGarbageCollector::new( - datastore.clone(), - resolver.clone(), - )), - opctx.child(BTreeMap::new()), - vec![], - ) - }; + driver.register( + "nat_v4_garbage_collector".to_string(), + String::from( + "prunes soft-deleted IPV4 NAT entries from ipv4_nat_entry \ + table based on a predetermined retention policy", + ), + config.nat_cleanup.period_secs, + Box::new(nat_cleanup::Ipv4NatGarbageCollector::new( + datastore.clone(), + resolver.clone(), + )), + opctx.child(BTreeMap::new()), + vec![], + task_nat_cleanup, + ); - let task_bfd_manager = { - driver.register( - "bfd_manager".to_string(), - String::from( - "Manages bidirectional fowarding detection (BFD) \ - configuration on rack switches", - ), - config.bfd_manager.period_secs, - Box::new(bfd::BfdManager::new( - datastore.clone(), - resolver.clone(), - )), - opctx.child(BTreeMap::new()), - vec![], - ) - }; + driver.register( + "bfd_manager".to_string(), + String::from( + "Manages bidirectional fowarding detection (BFD) \ + configuration on rack switches", + ), + config.bfd_manager.period_secs, + Box::new(bfd::BfdManager::new(datastore.clone(), resolver.clone())), + opctx.child(BTreeMap::new()), + vec![], + task_bfd_manager, + ); // Background task: phantom disk detection - let task_phantom_disks = { + { let detector = phantom_disks::PhantomDiskDetector::new(datastore.clone()); - - let task = driver.register( + driver.register( String::from("phantom_disks"), String::from("detects and un-deletes 
phantom disks"), config.phantom_disks.period_secs, Box::new(detector), opctx.child(BTreeMap::new()), vec![], + task_phantom_disks, ); - - task }; // Background task: blueprint loader let blueprint_loader = blueprint_load::TargetBlueprintLoader::new(datastore.clone()); let rx_blueprint = blueprint_loader.watcher(); - let task_blueprint_loader = driver.register( + driver.register( String::from("blueprint_loader"), String::from("Loads the current target blueprint from the DB"), config.blueprints.period_secs_load, Box::new(blueprint_loader), opctx.child(BTreeMap::new()), vec![], + task_blueprint_loader, ); // Background task: blueprint executor @@ -220,13 +331,14 @@ impl BackgroundTasks { nexus_id.to_string(), ); let rx_blueprint_exec = blueprint_executor.watcher(); - let task_blueprint_executor = driver.register( + driver.register( String::from("blueprint_executor"), String::from("Executes the target blueprint"), config.blueprints.period_secs_execute, Box::new(blueprint_executor), opctx.child(BTreeMap::new()), vec![Box::new(rx_blueprint.clone())], + task_blueprint_executor, ); // Background task: CockroachDB node ID collector @@ -235,13 +347,14 @@ impl BackgroundTasks { datastore.clone(), rx_blueprint.clone(), ); - let task_crdb_node_id_collector = driver.register( + driver.register( String::from("crdb_node_id_collector"), String::from("Collects node IDs of running CockroachDB zones"), config.blueprints.period_secs_collect_crdb_node_ids, Box::new(crdb_node_id_collector), opctx.child(BTreeMap::new()), vec![Box::new(rx_blueprint)], + task_crdb_node_id_collector, ); // Background task: inventory collector @@ -252,7 +365,7 @@ impl BackgroundTasks { // because the blueprint executor might also depend indirectly on the // inventory collector. In that case, we may need to do something more // complicated. But for now, this works. 
- let (task_inventory_collection, inventory_watcher) = { + let inventory_watcher = { let collector = inventory_collection::InventoryCollector::new( datastore.clone(), resolver.clone(), @@ -261,7 +374,7 @@ impl BackgroundTasks { config.inventory.disable, ); let inventory_watcher = collector.watcher(); - let task = driver.register( + driver.register( String::from("inventory_collection"), String::from( "collects hardware and software inventory data from the \ @@ -271,79 +384,76 @@ impl BackgroundTasks { Box::new(collector), opctx.child(BTreeMap::new()), vec![Box::new(rx_blueprint_exec)], + task_inventory_collection, ); - (task, inventory_watcher) + inventory_watcher }; - let task_physical_disk_adoption = { - driver.register( - "physical_disk_adoption".to_string(), - "ensure new physical disks are automatically marked in-service" - .to_string(), - config.physical_disk_adoption.period_secs, - Box::new(physical_disk_adoption::PhysicalDiskAdoption::new( - datastore.clone(), - inventory_watcher.clone(), - config.physical_disk_adoption.disable, - rack_id, - )), - opctx.child(BTreeMap::new()), - vec![Box::new(inventory_watcher)], - ) - }; + driver.register( + "physical_disk_adoption".to_string(), + "ensure new physical disks are automatically marked in-service" + .to_string(), + config.physical_disk_adoption.period_secs, + Box::new(physical_disk_adoption::PhysicalDiskAdoption::new( + datastore.clone(), + inventory_watcher.clone(), + config.physical_disk_adoption.disable, + rack_id, + )), + opctx.child(BTreeMap::new()), + vec![Box::new(inventory_watcher)], + task_physical_disk_adoption, + ); - let task_service_zone_nat_tracker = { - driver.register( - "service_zone_nat_tracker".to_string(), - String::from( - "ensures service zone nat records are recorded in NAT RPW \ - table", - ), - config.sync_service_zone_nat.period_secs, - Box::new(ServiceZoneNatTracker::new( - datastore.clone(), - resolver.clone(), - )), - opctx.child(BTreeMap::new()), - vec![], - ) - }; + 
driver.register( + "service_zone_nat_tracker".to_string(), + String::from( + "ensures service zone nat records are recorded in NAT RPW \ + table", + ), + config.sync_service_zone_nat.period_secs, + Box::new(ServiceZoneNatTracker::new( + datastore.clone(), + resolver.clone(), + )), + opctx.child(BTreeMap::new()), + vec![], + task_service_zone_nat_tracker, + ); - let task_switch_port_settings_manager = { - driver.register( - "switch_port_config_manager".to_string(), - String::from("manages switch port settings for rack switches"), - config.switch_port_settings_manager.period_secs, - Box::new(SwitchPortSettingsManager::new( - datastore.clone(), - resolver.clone(), - )), - opctx.child(BTreeMap::new()), - vec![], - ) - }; + driver.register( + "switch_port_config_manager".to_string(), + String::from("manages switch port settings for rack switches"), + config.switch_port_settings_manager.period_secs, + Box::new(SwitchPortSettingsManager::new( + datastore.clone(), + resolver.clone(), + )), + opctx.child(BTreeMap::new()), + vec![], + task_switch_port_settings_manager, + ); - let task_v2p_manager = { - driver.register( - "v2p_manager".to_string(), - String::from("manages opte v2p mappings for vpc networking"), - config.v2p_mapping_propagation.period_secs, - Box::new(V2PManager::new(datastore.clone())), - opctx.child(BTreeMap::new()), - vec![Box::new(v2p_watcher.1)], - ) - }; + driver.register( + "v2p_manager".to_string(), + String::from("manages opte v2p mappings for vpc networking"), + config.v2p_mapping_propagation.period_secs, + Box::new(V2PManager::new(datastore.clone())), + opctx.child(BTreeMap::new()), + vec![Box::new(v2p_watcher.1)], + task_v2p_manager, + ); // Background task: detect if a region needs replacement and begin the // process - let task_region_replacement = { + { let detector = region_replacement::RegionReplacementDetector::new( datastore.clone(), saga_request.clone(), ); - let task = driver.register( + driver.register( String::from("region_replacement"), 
String::from( "detects if a region requires replacing and begins the \ @@ -353,12 +463,11 @@ impl BackgroundTasks { Box::new(detector), opctx.child(BTreeMap::new()), vec![], + task_region_replacement, ); - - task }; - let task_instance_watcher = { + { let watcher = instance_watcher::InstanceWatcher::new( datastore.clone(), resolver.clone(), @@ -373,10 +482,12 @@ impl BackgroundTasks { Box::new(watcher), opctx.child(BTreeMap::new()), vec![], + task_instance_watcher, ) }; + // Background task: service firewall rule propagation - let task_service_firewall_propagation = driver.register( + driver.register( String::from("service_firewall_rule_propagation"), String::from( "propagates VPC firewall rules for Omicron services with \ @@ -388,10 +499,11 @@ impl BackgroundTasks { )), opctx.child(BTreeMap::new()), vec![], + task_service_firewall_propagation, ); // Background task: abandoned VMM reaping - let task_abandoned_vmm_reaper = driver.register( + driver.register( String::from("abandoned_vmm_reaper"), String::from( "deletes sled reservations for VMMs that have been abandoned \ @@ -401,41 +513,36 @@ impl BackgroundTasks { Box::new(abandoned_vmm_reaper::AbandonedVmmReaper::new(datastore)), opctx.child(BTreeMap::new()), vec![], + task_abandoned_vmm_reaper, ); - BackgroundTasks { - driver, - task_internal_dns_config, - task_internal_dns_servers, - task_external_dns_config, - task_external_dns_servers, - task_metrics_producer_gc, - task_external_endpoints, - external_endpoints, - task_nat_cleanup, - task_bfd_manager, - task_inventory_collection, - task_physical_disk_adoption, - task_phantom_disks, - task_blueprint_loader, - task_blueprint_executor, - task_crdb_node_id_collector, - task_service_zone_nat_tracker, - task_switch_port_settings_manager, - task_v2p_manager, - task_region_replacement, - task_instance_watcher, - task_service_firewall_propagation, - task_abandoned_vmm_reaper, + // XXX-dap consider checking all the bools inside BackgroundTasks + + driver + } +} + +pub 
struct ExternalTaskHandle { + pub(super) notify: Arc, + pub(super) wired_up: AtomicBool, +} + +impl ExternalTaskHandle { + fn new_stub() -> ExternalTaskHandle { + ExternalTaskHandle { + notify: Arc::new(Notify::new()), + wired_up: AtomicBool::new(false), } } +} +impl BackgroundTasks { /// Activate the specified background task /// /// If the task is currently running, it will be activated again when it /// finishes. - pub fn activate(&self, task: &TaskName) { - self.driver.activate(task); + pub fn activate(&self, _task: &ExternalTaskHandle) { + todo!(); // XXX-dap } } @@ -446,7 +553,10 @@ fn init_dns( dns_group: DnsGroup, resolver: internal_dns::resolver::Resolver, config: &DnsTasksConfig, -) -> (TaskName, TaskName) { + task_config: &ExternalTaskHandle, + task_servers: &ExternalTaskHandle, + task_propagation: &ExternalTaskHandle, +) { let dns_group_name = dns_group.to_string(); let metadata = BTreeMap::from([("dns_group".to_string(), dns_group_name)]); @@ -455,20 +565,21 @@ fn init_dns( dns_config::DnsConfigWatcher::new(Arc::clone(&datastore), dns_group); let dns_config_watcher = dns_config.watcher(); let task_name_config = format!("dns_config_{}", dns_group); - let task_config = driver.register( + driver.register( task_name_config.clone(), format!("watches {} DNS data stored in CockroachDB", dns_group), config.period_secs_config, Box::new(dns_config), opctx.child(metadata.clone()), vec![], + task_config, ); // Background task: DNS server list watcher let dns_servers = dns_servers::DnsServersWatcher::new(dns_group, resolver); let dns_servers_watcher = dns_servers.watcher(); let task_name_servers = format!("dns_servers_{}", dns_group); - let task_servers = driver.register( + driver.register( task_name_servers.clone(), format!( "watches list of {} DNS servers stored in internal DNS", @@ -478,6 +589,7 @@ fn init_dns( Box::new(dns_servers), opctx.child(metadata.clone()), vec![], + task_servers, ); // Background task: DNS propagation @@ -498,9 +610,8 @@ fn init_dns( 
Box::new(dns_propagate), opctx.child(metadata), vec![Box::new(dns_config_watcher), Box::new(dns_servers_watcher)], + task_propagation, ); - - (task_config, task_servers) } #[cfg(test)] diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs index e70ec50016..64bbf1563c 100644 --- a/nexus/src/app/background/mod.rs +++ b/nexus/src/app/background/mod.rs @@ -135,6 +135,7 @@ mod tasks; pub use driver::Driver; pub use init::BackgroundTasks; +pub use init::BackgroundTasksInitializer; use futures::future::BoxFuture; use nexus_auth::context::OpContext; diff --git a/nexus/src/app/background/status.rs b/nexus/src/app/background/status.rs index c6c8b1dcf1..d5e3f73204 100644 --- a/nexus/src/app/background/status.rs +++ b/nexus/src/app/background/status.rs @@ -21,7 +21,9 @@ impl Nexus { opctx: &OpContext, ) -> Result, Error> { opctx.authorize(authz::Action::Read, &authz::FLEET).await?; - let driver = &self.background_tasks.driver; + // XXX-dap + let driver_wrapper = self.background_tasks_driver.lock().unwrap(); + let driver = driver_wrapper.as_ref().unwrap(); Ok(driver .tasks() .map(|t| { @@ -43,7 +45,9 @@ impl Nexus { name: &str, ) -> LookupResult { opctx.authorize(authz::Action::Read, &authz::FLEET).await?; - let driver = &self.background_tasks.driver; + // XXX-dap + let driver_wrapper = self.background_tasks_driver.lock().unwrap(); + let driver = driver_wrapper.as_ref().unwrap(); let task = driver.tasks().find(|t| t.as_str() == name).ok_or_else(|| { LookupType::ByName(name.to_owned()) @@ -61,7 +65,9 @@ impl Nexus { mut names: BTreeSet, ) -> Result<(), Error> { opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; - let driver = &self.background_tasks.driver; + // XXX-dap + let driver_wrapper = self.background_tasks_driver.lock().unwrap(); + let driver = driver_wrapper.as_ref().unwrap(); // Ensure all task names are valid by removing them from the set of // names as we find them. 
diff --git a/nexus/src/app/background/tasks/external_endpoints.rs b/nexus/src/app/background/tasks/external_endpoints.rs index 0ff1e06a46..2c3921bb97 100644 --- a/nexus/src/app/background/tasks/external_endpoints.rs +++ b/nexus/src/app/background/tasks/external_endpoints.rs @@ -23,21 +23,14 @@ pub struct ExternalEndpointsWatcher { datastore: Arc, last: Option, tx: watch::Sender>, - rx: watch::Receiver>, } impl ExternalEndpointsWatcher { - pub fn new(datastore: Arc) -> ExternalEndpointsWatcher { - let (tx, rx) = watch::channel(None); - ExternalEndpointsWatcher { datastore, last: None, tx, rx } - } - - /// Exposes the latest set of TLS certificates - /// - /// You can use the returned [`watch::Receiver`] to look at the latest - /// configuration or to be notified when it changes. - pub fn watcher(&self) -> watch::Receiver> { - self.rx.clone() + pub fn new( + datastore: Arc, + tx: watch::Sender>, + ) -> ExternalEndpointsWatcher { + ExternalEndpointsWatcher { datastore, last: None, tx } } } @@ -115,84 +108,85 @@ impl BackgroundTask for ExternalEndpointsWatcher { } } -#[cfg(test)] -mod test { - use super::ExternalEndpointsWatcher; - use crate::app::background::BackgroundTask; - use nexus_db_queries::context::OpContext; - use nexus_db_queries::db::fixed_data::silo::DEFAULT_SILO; - use nexus_test_utils::resource_helpers::create_silo; - use nexus_test_utils_macros::nexus_test; - use nexus_types::external_api::shared::SiloIdentityMode; - use nexus_types::identity::Resource; - - type ControlPlaneTestContext = - nexus_test_utils::ControlPlaneTestContext; - - #[nexus_test(server = crate::Server)] - async fn test_basic(cptestctx: &ControlPlaneTestContext) { - let nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let opctx = OpContext::for_tests( - cptestctx.logctx.log.clone(), - datastore.clone(), - ); - - // Verify the initial state. 
- let mut task = ExternalEndpointsWatcher::new(datastore.clone()); - let watcher = task.watcher(); - assert!(watcher.borrow().is_none()); - - // The datastore from the ControlPlaneTestContext is initialized with - // two Silos: the built-in Silo and the recovery Silo. - let recovery_silo_dns_name = format!( - "{}.sys.{}", - cptestctx.silo_name.as_str(), - cptestctx.external_dns_zone_name - ); - let builtin_silo_dns_name = format!( - "{}.sys.{}", - DEFAULT_SILO.identity().name.as_str(), - cptestctx.external_dns_zone_name, - ); - let _ = task.activate(&opctx).await; - let initial_state_raw = watcher.borrow(); - let initial_state = initial_state_raw.as_ref().unwrap(); - assert!(initial_state.has_domain(recovery_silo_dns_name.as_str())); - assert!(initial_state.has_domain(builtin_silo_dns_name.as_str())); - // There are no other Silos. - assert_eq!(initial_state.ndomains(), 2); - // Neither of these will have a valid certificate in this configuration. - assert_eq!(initial_state.nwarnings(), 2); - drop(initial_state_raw); - - // If we create another Silo, we should see that one, too. - let new_silo_name = "test-silo"; - create_silo( - &cptestctx.external_client, - new_silo_name, - false, - SiloIdentityMode::LocalOnly, - ) - .await; - let new_silo_dns_name = format!( - "{}.sys.{}", - new_silo_name, cptestctx.external_dns_zone_name - ); - let _ = task.activate(&opctx).await; - let new_state_raw = watcher.borrow(); - let new_state = new_state_raw.as_ref().unwrap(); - assert!(new_state.has_domain(recovery_silo_dns_name.as_str())); - assert!(new_state.has_domain(builtin_silo_dns_name.as_str())); - assert!(new_state.has_domain(new_silo_dns_name.as_str())); - // There are no other Silos. - assert_eq!(new_state.ndomains(), 3); - // None of these will have a valid certificate in this configuration. - assert_eq!(new_state.nwarnings(), 3); - - // That's it. We're not testing all possible cases. That's done with - // unit tests for the underlying function. 
We're just testing that the - // background task reports updated state when the underlying state - // changes. - } -} +// XXX-dap +// #[cfg(test)] +// mod test { +// use super::ExternalEndpointsWatcher; +// use crate::app::background::BackgroundTask; +// use nexus_db_queries::context::OpContext; +// use nexus_db_queries::db::fixed_data::silo::DEFAULT_SILO; +// use nexus_test_utils::resource_helpers::create_silo; +// use nexus_test_utils_macros::nexus_test; +// use nexus_types::external_api::shared::SiloIdentityMode; +// use nexus_types::identity::Resource; +// +// type ControlPlaneTestContext = +// nexus_test_utils::ControlPlaneTestContext; +// +// #[nexus_test(server = crate::Server)] +// async fn test_basic(cptestctx: &ControlPlaneTestContext) { +// let nexus = &cptestctx.server.server_context().nexus; +// let datastore = nexus.datastore(); +// let opctx = OpContext::for_tests( +// cptestctx.logctx.log.clone(), +// datastore.clone(), +// ); +// +// // Verify the initial state. +// let mut task = ExternalEndpointsWatcher::new(datastore.clone()); +// let watcher = task.watcher(); +// assert!(watcher.borrow().is_none()); +// +// // The datastore from the ControlPlaneTestContext is initialized with +// // two Silos: the built-in Silo and the recovery Silo. +// let recovery_silo_dns_name = format!( +// "{}.sys.{}", +// cptestctx.silo_name.as_str(), +// cptestctx.external_dns_zone_name +// ); +// let builtin_silo_dns_name = format!( +// "{}.sys.{}", +// DEFAULT_SILO.identity().name.as_str(), +// cptestctx.external_dns_zone_name, +// ); +// let _ = task.activate(&opctx).await; +// let initial_state_raw = watcher.borrow(); +// let initial_state = initial_state_raw.as_ref().unwrap(); +// assert!(initial_state.has_domain(recovery_silo_dns_name.as_str())); +// assert!(initial_state.has_domain(builtin_silo_dns_name.as_str())); +// // There are no other Silos. 
+// assert_eq!(initial_state.ndomains(), 2); +// // Neither of these will have a valid certificate in this configuration. +// assert_eq!(initial_state.nwarnings(), 2); +// drop(initial_state_raw); +// +// // If we create another Silo, we should see that one, too. +// let new_silo_name = "test-silo"; +// create_silo( +// &cptestctx.external_client, +// new_silo_name, +// false, +// SiloIdentityMode::LocalOnly, +// ) +// .await; +// let new_silo_dns_name = format!( +// "{}.sys.{}", +// new_silo_name, cptestctx.external_dns_zone_name +// ); +// let _ = task.activate(&opctx).await; +// let new_state_raw = watcher.borrow(); +// let new_state = new_state_raw.as_ref().unwrap(); +// assert!(new_state.has_domain(recovery_silo_dns_name.as_str())); +// assert!(new_state.has_domain(builtin_silo_dns_name.as_str())); +// assert!(new_state.has_domain(new_silo_dns_name.as_str())); +// // There are no other Silos. +// assert_eq!(new_state.ndomains(), 3); +// // None of these will have a valid certificate in this configuration. +// assert_eq!(new_state.nwarnings(), 3); +// +// // That's it. We're not testing all possible cases. That's done with +// // unit tests for the underlying function. We're just testing that the +// // background task reports updated state when the underlying state +// // changes. 
+// } +// } diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 3462a1429b..673dd825b6 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -196,7 +196,11 @@ pub struct Nexus { // https://github.com/oxidecomputer/omicron/issues/3732 external_dns_servers: Vec, - /// Background tasks + /// Background task driver + // XXX-dap + background_tasks_driver: std::sync::Mutex>, + + /// Handles to various specific tasks background_tasks: background::BackgroundTasks, /// Default Crucible region allocation strategy @@ -398,17 +402,18 @@ impl Nexus { let (saga_request, mut saga_request_recv) = SagaRequest::channel(); - let background_tasks = background::BackgroundTasks::start( - &background_ctx, - Arc::clone(&db_datastore), - &config.pkg.background_tasks, - rack_id, - config.deployment.id, - resolver.clone(), - saga_request, - v2p_watcher_channel.clone(), - producer_registry, - ); + let (background_tasks_initializer, background_tasks) = + background::BackgroundTasksInitializer::new( + background_ctx, + Arc::clone(&db_datastore), + config.pkg.background_tasks.clone(), + rack_id, + config.deployment.id, + resolver.clone(), + saga_request, + v2p_watcher_channel.clone(), + producer_registry.clone(), + ); let external_resolver = { if config.deployment.external_dns_servers.is_empty() { @@ -457,6 +462,7 @@ impl Nexus { .deployment .external_dns_servers .clone(), + background_tasks_driver: std::sync::Mutex::new(None), background_tasks, default_region_allocation_strategy: config .pkg @@ -503,9 +509,12 @@ impl Nexus { task_log, "populate complete; activating background tasks" ); - for task in task_nexus.background_tasks.driver.tasks() { - task_nexus.background_tasks.activate(task); - } + + let driver = background_tasks_initializer + .start(&task_nexus.background_tasks); + + *task_nexus.background_tasks_driver.lock().unwrap() = + Some(driver); } Err(_) => { error!(task_log, "populate failed"); From 4f058f3fa0130de2a734a443c70ba7b226ef1e95 Mon Sep 17 00:00:00 2001 
From: David Pacheco Date: Wed, 26 Jun 2024 11:25:01 -0700 Subject: [PATCH 10/23] WIP: flesh it out more --- nexus/src/app/background/driver.rs | 930 +++++++++++++++-------------- nexus/src/app/background/init.rs | 156 ++--- 2 files changed, 556 insertions(+), 530 deletions(-) diff --git a/nexus/src/app/background/driver.rs b/nexus/src/app/background/driver.rs index 0bcdfa6335..cb1436de7c 100644 --- a/nexus/src/app/background/driver.rs +++ b/nexus/src/app/background/driver.rs @@ -4,7 +4,7 @@ //! Manages execution of background tasks -use super::init::ExternalTaskHandle; +use super::init::Activator; use super::BackgroundTask; use super::TaskName; use assert_matches::assert_matches; @@ -82,6 +82,8 @@ impl Driver { /// changed. This can be used to create dependencies between background /// tasks, so that when one of them finishes doing something, it kicks off /// another one. + // XXX-dap TODO-doc activator + // XXX-dap TODO-coverage activator pub fn register( &mut self, name: String, @@ -90,8 +92,13 @@ impl Driver { imp: Box, opctx: OpContext, watchers: Vec>, - task_external: &ExternalTaskHandle, - ) { + // XXX-dap consider making this non-optional? If it's just for the + // tests, we can have them create an Activator. + // XXX-dap can we do something about TaskName? It's only used by the + // tests, too. It's nice to have it for sure but is it really + // necessary? + activator: Option<&Activator>, + ) -> TaskName { // Activation of the background task happens in a separate tokio task. // Set up a channel so that tokio task can report status back to us. let (status_tx, status_rx) = watch::channel(TaskStatus { @@ -99,24 +106,31 @@ impl Driver { last: LastResult::NeverCompleted, }); - // Hook into the given external task handle's notification channel so - // that we can wake up the tokio task if somebody requests an explicit - // activation. 
- if let Err(previous) = task_external.wired_up.compare_exchange( - false, - true, - Ordering::SeqCst, - Ordering::SeqCst, - ) { - panic!( - "attempted to wire up the same background task handle twice \ - (previous \"wired_up\" = {}): currently attempting to wire \ - it up to task {:?}", - previous, name - ); - } + // We'll use a `Notify` to wake up that tokio task when an activation is + // requested. The caller may provide their own Activator, which + // just provides a specific Notify for us to use here. This allows them + // to have set up the activator before the task is actually created. + // They can also choose not to bother with this, in which case we'll + // just create our own. + let notify = if let Some(activator) = activator { + if let Err(previous) = activator.wired_up.compare_exchange( + false, + true, + Ordering::SeqCst, + Ordering::SeqCst, + ) { + panic!( + "attempted to wire up the same background task handle \ + twice (previous \"wired_up\" = {}): currently attempting \ + to wire it up to task {:?}", + previous, name + ); + } - let notify = Arc::clone(&task_external.notify); + Arc::clone(&activator.notify) + } else { + Arc::new(Notify::new()) + }; // Spawn the tokio task that will manage activation of the background // task. @@ -136,6 +150,10 @@ impl Driver { if self.tasks.insert(TaskName(name.clone()), task).is_some() { panic!("started two background tasks called {:?}", name); } + + // Return a handle that the caller can use to activate the task or get + // its status. 
+ TaskName(name) } /// Enumerate all registered background tasks @@ -329,437 +347,441 @@ impl GenericWatcher for watch::Receiver { } } -// XXX-dap -// #[cfg(test)] -// mod test { -// use super::BackgroundTask; -// use super::Driver; -// use crate::app::sagas::SagaRequest; -// use assert_matches::assert_matches; -// use chrono::Utc; -// use futures::future::BoxFuture; -// use futures::FutureExt; -// use nexus_db_queries::context::OpContext; -// use nexus_test_utils_macros::nexus_test; -// use nexus_types::internal_api::views::ActivationReason; -// use std::time::Duration; -// use std::time::Instant; -// use tokio::sync::mpsc; -// use tokio::sync::mpsc::error::TryRecvError; -// use tokio::sync::mpsc::Sender; -// use tokio::sync::watch; -// -// type ControlPlaneTestContext = -// nexus_test_utils::ControlPlaneTestContext; -// -// /// Simple BackgroundTask impl that just reports how many times it's run. -// struct ReportingTask { -// counter: usize, -// tx: watch::Sender, -// } -// -// impl ReportingTask { -// fn new() -> (ReportingTask, watch::Receiver) { -// let (tx, rx) = watch::channel(0); -// (ReportingTask { counter: 1, tx }, rx) -// } -// } -// -// impl BackgroundTask for ReportingTask { -// fn activate<'a>( -// &'a mut self, -// _: &'a OpContext, -// ) -> BoxFuture<'a, serde_json::Value> { -// async { -// let count = self.counter; -// self.counter += 1; -// self.tx.send_replace(count); -// serde_json::Value::Number(serde_json::Number::from(count)) -// } -// .boxed() -// } -// } -// -// async fn wait_until_count(mut rx: watch::Receiver, count: usize) { -// loop { -// let v = rx.borrow_and_update(); -// assert!(*v <= count, "count went past what we expected"); -// if *v == count { -// return; -// } -// drop(v); -// -// tokio::time::timeout(Duration::from_secs(5), rx.changed()) -// .await -// .unwrap() -// .unwrap(); -// } -// } -// -// // Verifies that activation through each of the three mechanisms (explicit -// // signal, timeout, or dependency) causes exactly 
the right tasks to be -// // activated -// #[nexus_test(server = crate::Server)] -// async fn test_driver_basic(cptestctx: &ControlPlaneTestContext) { -// let nexus = &cptestctx.server.server_context().nexus; -// let datastore = nexus.datastore(); -// let opctx = OpContext::for_tests( -// cptestctx.logctx.log.clone(), -// datastore.clone(), -// ); -// -// // Create up front: -// // -// // - three ReportingTasks (our background tasks) -// // - two "watch" channels used as dependencies for these tasks -// -// let (t1, rx1) = ReportingTask::new(); -// let (t2, rx2) = ReportingTask::new(); -// let (t3, rx3) = ReportingTask::new(); -// let (dep_tx1, dep_rx1) = watch::channel(0); -// let (dep_tx2, dep_rx2) = watch::channel(0); -// let mut driver = Driver::new(); -// -// assert_eq!(*rx1.borrow(), 0); -// let h1 = driver.register( -// "t1".to_string(), -// "test task".to_string(), -// Duration::from_millis(100), -// Box::new(t1), -// opctx.child(std::collections::BTreeMap::new()), -// vec![Box::new(dep_rx1.clone()), Box::new(dep_rx2.clone())], -// ); -// -// let h2 = driver.register( -// "t2".to_string(), -// "test task".to_string(), -// Duration::from_secs(300), // should never fire in this test -// Box::new(t2), -// opctx.child(std::collections::BTreeMap::new()), -// vec![Box::new(dep_rx1.clone())], -// ); -// -// let h3 = driver.register( -// "t3".to_string(), -// "test task".to_string(), -// Duration::from_secs(300), // should never fire in this test -// Box::new(t3), -// opctx, -// vec![Box::new(dep_rx1), Box::new(dep_rx2)], -// ); -// -// // Wait for four activations of our task. (This is three periods.) That -// // should take between 300ms and 400ms. Allow extra time for a busy -// // system. 
-// let start = Instant::now(); -// let wall_start = Utc::now(); -// wait_until_count(rx1.clone(), 4).await; -// assert!(*rx1.borrow() == 4 || *rx1.borrow() == 5); -// let duration = start.elapsed(); -// println!("rx1 -> 3 took {:?}", duration); -// assert!( -// duration.as_millis() < 1000, -// "took longer than 1s to activate our every-100ms-task three times" -// ); -// assert!(duration.as_millis() >= 300); -// // Check how the last activation was reported. -// let status = driver.task_status(&h1); -// let last = status.last.unwrap_completion(); -// // It's conceivable that there's been another activation already. -// assert!(last.iteration == 3 || last.iteration == 4); -// assert!(last.start_time >= wall_start); -// assert!(last.start_time <= Utc::now()); -// assert!(last.elapsed <= duration); -// assert_matches!( -// last.details, -// serde_json::Value::Number(n) -// if n.as_u64().unwrap() == last.iteration -// ); -// -// // Tasks "t2" and "t3" ought to have seen only one activation in this -// // time, from its beginning-of-time activation. -// assert_eq!(*rx2.borrow(), 1); -// assert_eq!(*rx3.borrow(), 1); -// let status = driver.task_status(&h2); -// let last = status.last.unwrap_completion(); -// assert_eq!(last.iteration, 1); -// let status = driver.task_status(&h3); -// let last = status.last.unwrap_completion(); -// assert_eq!(last.iteration, 1); -// -// // Explicitly wake up all of our tasks by reporting that dep1 has -// // changed. 
-// println!("firing dependency tx1"); -// dep_tx1.send_replace(1); -// wait_until_count(rx2.clone(), 2).await; -// wait_until_count(rx3.clone(), 2).await; -// assert_eq!(*rx2.borrow(), 2); -// assert_eq!(*rx3.borrow(), 2); -// let status = driver.task_status(&h2); -// let last = status.last.unwrap_completion(); -// assert_eq!(last.iteration, 2); -// let status = driver.task_status(&h3); -// let last = status.last.unwrap_completion(); -// assert_eq!(last.iteration, 2); -// -// // Explicitly wake up just "t3" by reporting that dep2 has changed. -// println!("firing dependency tx2"); -// dep_tx2.send_replace(1); -// wait_until_count(rx3.clone(), 3).await; -// assert_eq!(*rx2.borrow(), 2); -// assert_eq!(*rx3.borrow(), 3); -// let status = driver.task_status(&h2); -// let last = status.last.unwrap_completion(); -// assert_eq!(last.iteration, 2); -// let status = driver.task_status(&h3); -// let last = status.last.unwrap_completion(); -// assert_eq!(last.iteration, 3); -// -// // Explicitly activate just "t3". -// driver.activate(&h3); -// wait_until_count(rx3.clone(), 4).await; -// assert_eq!(*rx2.borrow(), 2); -// assert_eq!(*rx3.borrow(), 4); -// let status = driver.task_status(&h2); -// let last = status.last.unwrap_completion(); -// assert_eq!(last.iteration, 2); -// let status = driver.task_status(&h3); -// let last = status.last.unwrap_completion(); -// assert_eq!(last.iteration, 4); -// } -// -// /// Simple background task that moves in lockstep with a consumer, allowing -// /// the creator to be notified when it becomes active and to determine when -// /// the activation finishes. 
-// struct PausingTask { -// counter: usize, -// ready_tx: mpsc::Sender, -// wait_rx: mpsc::Receiver<()>, -// } -// -// impl PausingTask { -// fn new( -// wait_rx: mpsc::Receiver<()>, -// ) -> (PausingTask, mpsc::Receiver) { -// let (ready_tx, ready_rx) = mpsc::channel(10); -// (PausingTask { counter: 1, wait_rx, ready_tx }, ready_rx) -// } -// } -// -// impl BackgroundTask for PausingTask { -// fn activate<'a>( -// &'a mut self, -// _: &'a OpContext, -// ) -> BoxFuture<'a, serde_json::Value> { -// async { -// let count = self.counter; -// self.counter += 1; -// let _ = self.ready_tx.send(count).await; -// let _ = self.wait_rx.recv().await; -// serde_json::Value::Null -// } -// .boxed() -// } -// } -// -// // Exercises various case of activation while a background task is currently -// // activated. -// #[nexus_test(server = crate::Server)] -// async fn test_activation_in_progress(cptestctx: &ControlPlaneTestContext) { -// let nexus = &cptestctx.server.server_context().nexus; -// let datastore = nexus.datastore(); -// let opctx = OpContext::for_tests( -// cptestctx.logctx.log.clone(), -// datastore.clone(), -// ); -// -// let mut driver = Driver::new(); -// let (tx1, rx1) = mpsc::channel(10); -// let (t1, mut ready_rx1) = PausingTask::new(rx1); -// let (dep_tx1, dep_rx1) = watch::channel(0); -// let before_wall = Utc::now(); -// let before_instant = Instant::now(); -// let h1 = driver.register( -// "t1".to_string(), -// "test task".to_string(), -// Duration::from_secs(300), // should not elapse during test -// Box::new(t1), -// opctx.child(std::collections::BTreeMap::new()), -// vec![Box::new(dep_rx1.clone())], -// ); -// -// // Wait to enter the first activation. -// let which = ready_rx1.recv().await.unwrap(); -// assert_eq!(which, 1); -// let after_wall = Utc::now(); -// let after_instant = Instant::now(); -// // Verify that it's a timeout-based activation. 
-// let status = driver.task_status(&h1); -// assert!(!status.last.has_completed()); -// let current = status.current.unwrap_running(); -// assert!(current.start_time >= before_wall); -// assert!(current.start_time <= after_wall); -// assert!(current.start_instant >= before_instant); -// assert!(current.start_instant <= after_instant); -// assert_eq!(current.iteration, 1); -// assert_eq!(current.reason, ActivationReason::Timeout); -// // Enqueue another activation by dependency while this one is still -// // running. -// dep_tx1.send_replace(1); -// // Complete the activation. -// tx1.send(()).await.unwrap(); -// -// // We should immediately see another activation. -// let which = ready_rx1.recv().await.unwrap(); -// assert_eq!(which, 2); -// assert!(after_instant.elapsed().as_millis() < 5000); -// // Verify that it's a dependency-caused activation. -// let status = driver.task_status(&h1); -// let last = status.last.unwrap_completion(); -// assert_eq!(last.start_time, current.start_time); -// assert_eq!(last.iteration, current.iteration); -// let current = status.current.unwrap_running(); -// assert!(current.start_time >= after_wall); -// assert!(current.start_instant >= after_instant); -// assert_eq!(current.iteration, 2); -// assert_eq!(current.reason, ActivationReason::Dependency); -// // Enqueue another activation by explicit signal while this one is still -// // running. -// driver.activate(&h1); -// // Complete the activation. -// tx1.send(()).await.unwrap(); -// -// // We should immediately see another activation. -// let which = ready_rx1.recv().await.unwrap(); -// assert_eq!(which, 3); -// assert!(after_instant.elapsed().as_millis() < 10000); -// // Verify that it's a signal-caused activation. 
-// let status = driver.task_status(&h1); -// let last = status.last.unwrap_completion(); -// assert_eq!(last.start_time, current.start_time); -// assert_eq!(last.iteration, current.iteration); -// let current = status.current.unwrap_running(); -// assert_eq!(current.iteration, 3); -// assert_eq!(current.reason, ActivationReason::Signaled); -// // This time, queue up several explicit activations. -// driver.activate(&h1); -// driver.activate(&h1); -// driver.activate(&h1); -// tx1.send(()).await.unwrap(); -// -// // Again, we should see an activation basically immediately. -// let which = ready_rx1.recv().await.unwrap(); -// assert_eq!(which, 4); -// tx1.send(()).await.unwrap(); -// -// // But we should not see any more activations. Those multiple -// // notifications should have gotten collapsed. It is hard to know -// // there's not another one coming, so we just wait long enough that we -// // expect to have seen it if it is coming. -// tokio::time::sleep(Duration::from_secs(1)).await; -// let status = driver.task_status(&h1); -// assert!(status.current.is_idle()); -// assert_eq!(status.last.unwrap_completion().iteration, 4); -// assert_matches!(ready_rx1.try_recv(), Err(TryRecvError::Empty)); -// -// // Now, trigger several dependency-based activations. We should see the -// // same result: these get collapsed. -// dep_tx1.send_replace(2); -// dep_tx1.send_replace(3); -// dep_tx1.send_replace(4); -// let which = ready_rx1.recv().await.unwrap(); -// assert_eq!(which, 5); -// tx1.send(()).await.unwrap(); -// tokio::time::sleep(Duration::from_secs(1)).await; -// let status = driver.task_status(&h1); -// assert!(status.current.is_idle()); -// assert_eq!(status.last.unwrap_completion().iteration, 5); -// assert_matches!(ready_rx1.try_recv(), Err(TryRecvError::Empty)); -// -// // It would be nice to also verify that multiple time-based activations -// // also get collapsed, but this is a fair bit trickier. 
Using the same -// // approach, we'd need to wait long enough that we'd catch any -// // _erroneous_ activation, but not so long that we might catch the next -// // legitimate periodic activation. It's hard to choose a period for -// // such a task that would allow us to reliably distinguish between these -// // two without also spending a lot of wall-clock time on this test. -// } -// -// /// Simple BackgroundTask impl that sends a test-only SagaRequest -// struct SagaRequestTask { -// saga_request: Sender, -// } -// -// impl SagaRequestTask { -// fn new(saga_request: Sender) -> SagaRequestTask { -// SagaRequestTask { saga_request } -// } -// } -// -// impl BackgroundTask for SagaRequestTask { -// fn activate<'a>( -// &'a mut self, -// _: &'a OpContext, -// ) -> BoxFuture<'a, serde_json::Value> { -// async { -// let _ = self.saga_request.send(SagaRequest::TestOnly).await; -// serde_json::Value::Null -// } -// .boxed() -// } -// } -// -// #[nexus_test(server = crate::Server)] -// async fn test_saga_request_flow(cptestctx: &ControlPlaneTestContext) { -// let nexus = &cptestctx.server.server_context().nexus; -// let datastore = nexus.datastore(); -// let opctx = OpContext::for_tests( -// cptestctx.logctx.log.clone(), -// datastore.clone(), -// ); -// -// let (saga_request, mut saga_request_recv) = SagaRequest::channel(); -// let t1 = SagaRequestTask::new(saga_request); -// -// let mut driver = Driver::new(); -// let (_dep_tx1, dep_rx1) = watch::channel(0); -// -// let h1 = driver.register( -// "t1".to_string(), -// "test saga request flow task".to_string(), -// Duration::from_secs(300), // should not fire in this test -// Box::new(t1), -// opctx.child(std::collections::BTreeMap::new()), -// vec![Box::new(dep_rx1.clone())], -// ); -// -// assert!(matches!( -// saga_request_recv.try_recv(), -// Err(mpsc::error::TryRecvError::Empty), -// )); -// -// driver.activate(&h1); -// -// // wait 1 second for the saga request to arrive -// tokio::select! 
{ -// _ = tokio::time::sleep(tokio::time::Duration::from_secs(1)) => { -// assert!(false); -// } -// -// saga_request = saga_request_recv.recv() => { -// match saga_request { -// None => { -// assert!(false); -// } -// -// Some(saga_request) => { -// assert!(matches!( -// saga_request, -// SagaRequest::TestOnly, -// )); -// } -// } -// } -// } -// } -// } +#[cfg(test)] +mod test { + use super::BackgroundTask; + use super::Driver; + use crate::app::sagas::SagaRequest; + use assert_matches::assert_matches; + use chrono::Utc; + use futures::future::BoxFuture; + use futures::FutureExt; + use nexus_db_queries::context::OpContext; + use nexus_test_utils_macros::nexus_test; + use nexus_types::internal_api::views::ActivationReason; + use std::time::Duration; + use std::time::Instant; + use tokio::sync::mpsc; + use tokio::sync::mpsc::error::TryRecvError; + use tokio::sync::mpsc::Sender; + use tokio::sync::watch; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + /// Simple BackgroundTask impl that just reports how many times it's run. 
+ struct ReportingTask { + counter: usize, + tx: watch::Sender, + } + + impl ReportingTask { + fn new() -> (ReportingTask, watch::Receiver) { + let (tx, rx) = watch::channel(0); + (ReportingTask { counter: 1, tx }, rx) + } + } + + impl BackgroundTask for ReportingTask { + fn activate<'a>( + &'a mut self, + _: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async { + let count = self.counter; + self.counter += 1; + self.tx.send_replace(count); + serde_json::Value::Number(serde_json::Number::from(count)) + } + .boxed() + } + } + + async fn wait_until_count(mut rx: watch::Receiver, count: usize) { + loop { + let v = rx.borrow_and_update(); + assert!(*v <= count, "count went past what we expected"); + if *v == count { + return; + } + drop(v); + + tokio::time::timeout(Duration::from_secs(5), rx.changed()) + .await + .unwrap() + .unwrap(); + } + } + + // Verifies that activation through each of the three mechanisms (explicit + // signal, timeout, or dependency) causes exactly the right tasks to be + // activated + #[nexus_test(server = crate::Server)] + async fn test_driver_basic(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + // Create up front: + // + // - three ReportingTasks (our background tasks) + // - two "watch" channels used as dependencies for these tasks + + let (t1, rx1) = ReportingTask::new(); + let (t2, rx2) = ReportingTask::new(); + let (t3, rx3) = ReportingTask::new(); + let (dep_tx1, dep_rx1) = watch::channel(0); + let (dep_tx2, dep_rx2) = watch::channel(0); + let mut driver = Driver::new(); + + assert_eq!(*rx1.borrow(), 0); + let h1 = driver.register( + "t1".to_string(), + "test task".to_string(), + Duration::from_millis(100), + Box::new(t1), + opctx.child(std::collections::BTreeMap::new()), + vec![Box::new(dep_rx1.clone()), Box::new(dep_rx2.clone())], + None, + 
); + + let h2 = driver.register( + "t2".to_string(), + "test task".to_string(), + Duration::from_secs(300), // should never fire in this test + Box::new(t2), + opctx.child(std::collections::BTreeMap::new()), + vec![Box::new(dep_rx1.clone())], + None, + ); + + let h3 = driver.register( + "t3".to_string(), + "test task".to_string(), + Duration::from_secs(300), // should never fire in this test + Box::new(t3), + opctx, + vec![Box::new(dep_rx1), Box::new(dep_rx2)], + None, + ); + + // Wait for four activations of our task. (This is three periods.) That + // should take between 300ms and 400ms. Allow extra time for a busy + // system. + let start = Instant::now(); + let wall_start = Utc::now(); + wait_until_count(rx1.clone(), 4).await; + assert!(*rx1.borrow() == 4 || *rx1.borrow() == 5); + let duration = start.elapsed(); + println!("rx1 -> 3 took {:?}", duration); + assert!( + duration.as_millis() < 1000, + "took longer than 1s to activate our every-100ms-task three times" + ); + assert!(duration.as_millis() >= 300); + // Check how the last activation was reported. + let status = driver.task_status(&h1); + let last = status.last.unwrap_completion(); + // It's conceivable that there's been another activation already. + assert!(last.iteration == 3 || last.iteration == 4); + assert!(last.start_time >= wall_start); + assert!(last.start_time <= Utc::now()); + assert!(last.elapsed <= duration); + assert_matches!( + last.details, + serde_json::Value::Number(n) + if n.as_u64().unwrap() == last.iteration + ); + + // Tasks "t2" and "t3" ought to have seen only one activation in this + // time, from its beginning-of-time activation. 
+ assert_eq!(*rx2.borrow(), 1); + assert_eq!(*rx3.borrow(), 1); + let status = driver.task_status(&h2); + let last = status.last.unwrap_completion(); + assert_eq!(last.iteration, 1); + let status = driver.task_status(&h3); + let last = status.last.unwrap_completion(); + assert_eq!(last.iteration, 1); + + // Explicitly wake up all of our tasks by reporting that dep1 has + // changed. + println!("firing dependency tx1"); + dep_tx1.send_replace(1); + wait_until_count(rx2.clone(), 2).await; + wait_until_count(rx3.clone(), 2).await; + assert_eq!(*rx2.borrow(), 2); + assert_eq!(*rx3.borrow(), 2); + let status = driver.task_status(&h2); + let last = status.last.unwrap_completion(); + assert_eq!(last.iteration, 2); + let status = driver.task_status(&h3); + let last = status.last.unwrap_completion(); + assert_eq!(last.iteration, 2); + + // Explicitly wake up just "t3" by reporting that dep2 has changed. + println!("firing dependency tx2"); + dep_tx2.send_replace(1); + wait_until_count(rx3.clone(), 3).await; + assert_eq!(*rx2.borrow(), 2); + assert_eq!(*rx3.borrow(), 3); + let status = driver.task_status(&h2); + let last = status.last.unwrap_completion(); + assert_eq!(last.iteration, 2); + let status = driver.task_status(&h3); + let last = status.last.unwrap_completion(); + assert_eq!(last.iteration, 3); + + // Explicitly activate just "t3". + driver.activate(&h3); + wait_until_count(rx3.clone(), 4).await; + assert_eq!(*rx2.borrow(), 2); + assert_eq!(*rx3.borrow(), 4); + let status = driver.task_status(&h2); + let last = status.last.unwrap_completion(); + assert_eq!(last.iteration, 2); + let status = driver.task_status(&h3); + let last = status.last.unwrap_completion(); + assert_eq!(last.iteration, 4); + } + + /// Simple background task that moves in lockstep with a consumer, allowing + /// the creator to be notified when it becomes active and to determine when + /// the activation finishes. 
+ struct PausingTask { + counter: usize, + ready_tx: mpsc::Sender, + wait_rx: mpsc::Receiver<()>, + } + + impl PausingTask { + fn new( + wait_rx: mpsc::Receiver<()>, + ) -> (PausingTask, mpsc::Receiver) { + let (ready_tx, ready_rx) = mpsc::channel(10); + (PausingTask { counter: 1, wait_rx, ready_tx }, ready_rx) + } + } + + impl BackgroundTask for PausingTask { + fn activate<'a>( + &'a mut self, + _: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async { + let count = self.counter; + self.counter += 1; + let _ = self.ready_tx.send(count).await; + let _ = self.wait_rx.recv().await; + serde_json::Value::Null + } + .boxed() + } + } + + // Exercises various case of activation while a background task is currently + // activated. + #[nexus_test(server = crate::Server)] + async fn test_activation_in_progress(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let mut driver = Driver::new(); + let (tx1, rx1) = mpsc::channel(10); + let (t1, mut ready_rx1) = PausingTask::new(rx1); + let (dep_tx1, dep_rx1) = watch::channel(0); + let before_wall = Utc::now(); + let before_instant = Instant::now(); + let h1 = driver.register( + "t1".to_string(), + "test task".to_string(), + Duration::from_secs(300), // should not elapse during test + Box::new(t1), + opctx.child(std::collections::BTreeMap::new()), + vec![Box::new(dep_rx1.clone())], + None, + ); + + // Wait to enter the first activation. + let which = ready_rx1.recv().await.unwrap(); + assert_eq!(which, 1); + let after_wall = Utc::now(); + let after_instant = Instant::now(); + // Verify that it's a timeout-based activation. 
+ let status = driver.task_status(&h1); + assert!(!status.last.has_completed()); + let current = status.current.unwrap_running(); + assert!(current.start_time >= before_wall); + assert!(current.start_time <= after_wall); + assert!(current.start_instant >= before_instant); + assert!(current.start_instant <= after_instant); + assert_eq!(current.iteration, 1); + assert_eq!(current.reason, ActivationReason::Timeout); + // Enqueue another activation by dependency while this one is still + // running. + dep_tx1.send_replace(1); + // Complete the activation. + tx1.send(()).await.unwrap(); + + // We should immediately see another activation. + let which = ready_rx1.recv().await.unwrap(); + assert_eq!(which, 2); + assert!(after_instant.elapsed().as_millis() < 5000); + // Verify that it's a dependency-caused activation. + let status = driver.task_status(&h1); + let last = status.last.unwrap_completion(); + assert_eq!(last.start_time, current.start_time); + assert_eq!(last.iteration, current.iteration); + let current = status.current.unwrap_running(); + assert!(current.start_time >= after_wall); + assert!(current.start_instant >= after_instant); + assert_eq!(current.iteration, 2); + assert_eq!(current.reason, ActivationReason::Dependency); + // Enqueue another activation by explicit signal while this one is still + // running. + driver.activate(&h1); + // Complete the activation. + tx1.send(()).await.unwrap(); + + // We should immediately see another activation. + let which = ready_rx1.recv().await.unwrap(); + assert_eq!(which, 3); + assert!(after_instant.elapsed().as_millis() < 10000); + // Verify that it's a signal-caused activation. 
+ let status = driver.task_status(&h1); + let last = status.last.unwrap_completion(); + assert_eq!(last.start_time, current.start_time); + assert_eq!(last.iteration, current.iteration); + let current = status.current.unwrap_running(); + assert_eq!(current.iteration, 3); + assert_eq!(current.reason, ActivationReason::Signaled); + // This time, queue up several explicit activations. + driver.activate(&h1); + driver.activate(&h1); + driver.activate(&h1); + tx1.send(()).await.unwrap(); + + // Again, we should see an activation basically immediately. + let which = ready_rx1.recv().await.unwrap(); + assert_eq!(which, 4); + tx1.send(()).await.unwrap(); + + // But we should not see any more activations. Those multiple + // notifications should have gotten collapsed. It is hard to know + // there's not another one coming, so we just wait long enough that we + // expect to have seen it if it is coming. + tokio::time::sleep(Duration::from_secs(1)).await; + let status = driver.task_status(&h1); + assert!(status.current.is_idle()); + assert_eq!(status.last.unwrap_completion().iteration, 4); + assert_matches!(ready_rx1.try_recv(), Err(TryRecvError::Empty)); + + // Now, trigger several dependency-based activations. We should see the + // same result: these get collapsed. + dep_tx1.send_replace(2); + dep_tx1.send_replace(3); + dep_tx1.send_replace(4); + let which = ready_rx1.recv().await.unwrap(); + assert_eq!(which, 5); + tx1.send(()).await.unwrap(); + tokio::time::sleep(Duration::from_secs(1)).await; + let status = driver.task_status(&h1); + assert!(status.current.is_idle()); + assert_eq!(status.last.unwrap_completion().iteration, 5); + assert_matches!(ready_rx1.try_recv(), Err(TryRecvError::Empty)); + + // It would be nice to also verify that multiple time-based activations + // also get collapsed, but this is a fair bit trickier. 
Using the same + // approach, we'd need to wait long enough that we'd catch any + // _erroneous_ activation, but not so long that we might catch the next + // legitimate periodic activation. It's hard to choose a period for + // such a task that would allow us to reliably distinguish between these + // two without also spending a lot of wall-clock time on this test. + } + + /// Simple BackgroundTask impl that sends a test-only SagaRequest + struct SagaRequestTask { + saga_request: Sender, + } + + impl SagaRequestTask { + fn new(saga_request: Sender) -> SagaRequestTask { + SagaRequestTask { saga_request } + } + } + + impl BackgroundTask for SagaRequestTask { + fn activate<'a>( + &'a mut self, + _: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async { + let _ = self.saga_request.send(SagaRequest::TestOnly).await; + serde_json::Value::Null + } + .boxed() + } + } + + #[nexus_test(server = crate::Server)] + async fn test_saga_request_flow(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let (saga_request, mut saga_request_recv) = SagaRequest::channel(); + let t1 = SagaRequestTask::new(saga_request); + + let mut driver = Driver::new(); + let (_dep_tx1, dep_rx1) = watch::channel(0); + + let h1 = driver.register( + "t1".to_string(), + "test saga request flow task".to_string(), + Duration::from_secs(300), // should not fire in this test + Box::new(t1), + opctx.child(std::collections::BTreeMap::new()), + vec![Box::new(dep_rx1.clone())], + None, + ); + + assert!(matches!( + saga_request_recv.try_recv(), + Err(mpsc::error::TryRecvError::Empty), + )); + + driver.activate(&h1); + + // wait 1 second for the saga request to arrive + tokio::select! 
{ + _ = tokio::time::sleep(tokio::time::Duration::from_secs(1)) => { + assert!(false); + } + + saga_request = saga_request_recv.recv() => { + match saga_request { + None => { + assert!(false); + } + + Some(saga_request) => { + assert!(matches!( + saga_request, + SagaRequest::TestOnly, + )); + } + } + } + } + } +} diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index ec698e7d07..9590d6aab4 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -47,32 +47,32 @@ use uuid::Uuid; // XXX-dap TODO-doc describe two-phase initialization pub struct BackgroundTasks { // handles to individual background tasks, used to activate them - pub task_internal_dns_config: ExternalTaskHandle, - pub task_internal_dns_servers: ExternalTaskHandle, - pub task_external_dns_config: ExternalTaskHandle, - pub task_external_dns_servers: ExternalTaskHandle, - pub task_metrics_producer_gc: ExternalTaskHandle, - pub task_external_endpoints: ExternalTaskHandle, - pub task_nat_cleanup: ExternalTaskHandle, - pub task_bfd_manager: ExternalTaskHandle, - pub task_inventory_collection: ExternalTaskHandle, - pub task_physical_disk_adoption: ExternalTaskHandle, - pub task_phantom_disks: ExternalTaskHandle, - pub task_blueprint_loader: ExternalTaskHandle, - pub task_blueprint_executor: ExternalTaskHandle, - pub task_crdb_node_id_collector: ExternalTaskHandle, - pub task_service_zone_nat_tracker: ExternalTaskHandle, - pub task_switch_port_settings_manager: ExternalTaskHandle, - pub task_v2p_manager: ExternalTaskHandle, - pub task_region_replacement: ExternalTaskHandle, - pub task_instance_watcher: ExternalTaskHandle, - pub task_service_firewall_propagation: ExternalTaskHandle, - pub task_abandoned_vmm_reaper: ExternalTaskHandle, + pub task_internal_dns_config: Activator, + pub task_internal_dns_servers: Activator, + pub task_external_dns_config: Activator, + pub task_external_dns_servers: Activator, + pub task_metrics_producer_gc: Activator, + 
pub task_external_endpoints: Activator, + pub task_nat_cleanup: Activator, + pub task_bfd_manager: Activator, + pub task_inventory_collection: Activator, + pub task_physical_disk_adoption: Activator, + pub task_phantom_disks: Activator, + pub task_blueprint_loader: Activator, + pub task_blueprint_executor: Activator, + pub task_crdb_node_id_collector: Activator, + pub task_service_zone_nat_tracker: Activator, + pub task_switch_port_settings_manager: Activator, + pub task_v2p_manager: Activator, + pub task_region_replacement: Activator, + pub task_instance_watcher: Activator, + pub task_service_firewall_propagation: Activator, + pub task_abandoned_vmm_reaper: Activator, // XXX-dap stuff that wasn't here before because it couldn't be externally // activated - task_internal_dns_propagation: ExternalTaskHandle, - task_external_dns_propagation: ExternalTaskHandle, + task_internal_dns_propagation: Activator, + task_external_dns_propagation: Activator, // data exposed by various background tasks to Nexus at-large /// external endpoints read by the background task @@ -130,30 +130,30 @@ impl BackgroundTasksInitializer { }; let background_tasks = BackgroundTasks { - task_internal_dns_config: ExternalTaskHandle::new_stub(), - task_internal_dns_servers: ExternalTaskHandle::new_stub(), - task_external_dns_config: ExternalTaskHandle::new_stub(), - task_external_dns_servers: ExternalTaskHandle::new_stub(), - task_metrics_producer_gc: ExternalTaskHandle::new_stub(), - task_external_endpoints: ExternalTaskHandle::new_stub(), - task_nat_cleanup: ExternalTaskHandle::new_stub(), - task_bfd_manager: ExternalTaskHandle::new_stub(), - task_inventory_collection: ExternalTaskHandle::new_stub(), - task_physical_disk_adoption: ExternalTaskHandle::new_stub(), - task_phantom_disks: ExternalTaskHandle::new_stub(), - task_blueprint_loader: ExternalTaskHandle::new_stub(), - task_blueprint_executor: ExternalTaskHandle::new_stub(), - task_crdb_node_id_collector: ExternalTaskHandle::new_stub(), - 
task_service_zone_nat_tracker: ExternalTaskHandle::new_stub(), - task_switch_port_settings_manager: ExternalTaskHandle::new_stub(), - task_v2p_manager: ExternalTaskHandle::new_stub(), - task_region_replacement: ExternalTaskHandle::new_stub(), - task_instance_watcher: ExternalTaskHandle::new_stub(), - task_service_firewall_propagation: ExternalTaskHandle::new_stub(), - task_abandoned_vmm_reaper: ExternalTaskHandle::new_stub(), - - task_internal_dns_propagation: ExternalTaskHandle::new_stub(), - task_external_dns_propagation: ExternalTaskHandle::new_stub(), + task_internal_dns_config: Activator::new_stub(), + task_internal_dns_servers: Activator::new_stub(), + task_external_dns_config: Activator::new_stub(), + task_external_dns_servers: Activator::new_stub(), + task_metrics_producer_gc: Activator::new_stub(), + task_external_endpoints: Activator::new_stub(), + task_nat_cleanup: Activator::new_stub(), + task_bfd_manager: Activator::new_stub(), + task_inventory_collection: Activator::new_stub(), + task_physical_disk_adoption: Activator::new_stub(), + task_phantom_disks: Activator::new_stub(), + task_blueprint_loader: Activator::new_stub(), + task_blueprint_executor: Activator::new_stub(), + task_crdb_node_id_collector: Activator::new_stub(), + task_service_zone_nat_tracker: Activator::new_stub(), + task_switch_port_settings_manager: Activator::new_stub(), + task_v2p_manager: Activator::new_stub(), + task_region_replacement: Activator::new_stub(), + task_instance_watcher: Activator::new_stub(), + task_service_firewall_propagation: Activator::new_stub(), + task_abandoned_vmm_reaper: Activator::new_stub(), + + task_internal_dns_propagation: Activator::new_stub(), + task_external_dns_propagation: Activator::new_stub(), external_endpoints: external_endpoints_rx, }; @@ -241,7 +241,7 @@ impl BackgroundTasksInitializer { Box::new(gc), opctx.child(BTreeMap::new()), vec![], - task_metrics_producer_gc, + Some(task_metrics_producer_gc), ) }; @@ -262,7 +262,7 @@ impl 
BackgroundTasksInitializer { Box::new(watcher), opctx.child(BTreeMap::new()), vec![], - task_external_endpoints, + Some(task_external_endpoints), ); } @@ -279,7 +279,7 @@ impl BackgroundTasksInitializer { )), opctx.child(BTreeMap::new()), vec![], - task_nat_cleanup, + Some(task_nat_cleanup), ); driver.register( @@ -292,7 +292,7 @@ impl BackgroundTasksInitializer { Box::new(bfd::BfdManager::new(datastore.clone(), resolver.clone())), opctx.child(BTreeMap::new()), vec![], - task_bfd_manager, + Some(task_bfd_manager), ); // Background task: phantom disk detection @@ -306,7 +306,7 @@ impl BackgroundTasksInitializer { Box::new(detector), opctx.child(BTreeMap::new()), vec![], - task_phantom_disks, + Some(task_phantom_disks), ); }; @@ -321,7 +321,7 @@ impl BackgroundTasksInitializer { Box::new(blueprint_loader), opctx.child(BTreeMap::new()), vec![], - task_blueprint_loader, + Some(task_blueprint_loader), ); // Background task: blueprint executor @@ -338,7 +338,7 @@ impl BackgroundTasksInitializer { Box::new(blueprint_executor), opctx.child(BTreeMap::new()), vec![Box::new(rx_blueprint.clone())], - task_blueprint_executor, + Some(task_blueprint_executor), ); // Background task: CockroachDB node ID collector @@ -354,7 +354,7 @@ impl BackgroundTasksInitializer { Box::new(crdb_node_id_collector), opctx.child(BTreeMap::new()), vec![Box::new(rx_blueprint)], - task_crdb_node_id_collector, + Some(task_crdb_node_id_collector), ); // Background task: inventory collector @@ -384,7 +384,7 @@ impl BackgroundTasksInitializer { Box::new(collector), opctx.child(BTreeMap::new()), vec![Box::new(rx_blueprint_exec)], - task_inventory_collection, + Some(task_inventory_collection), ); inventory_watcher @@ -403,7 +403,7 @@ impl BackgroundTasksInitializer { )), opctx.child(BTreeMap::new()), vec![Box::new(inventory_watcher)], - task_physical_disk_adoption, + Some(task_physical_disk_adoption), ); driver.register( @@ -419,7 +419,7 @@ impl BackgroundTasksInitializer { )), opctx.child(BTreeMap::new()), 
vec![], - task_service_zone_nat_tracker, + Some(task_service_zone_nat_tracker), ); driver.register( @@ -432,7 +432,7 @@ impl BackgroundTasksInitializer { )), opctx.child(BTreeMap::new()), vec![], - task_switch_port_settings_manager, + Some(task_switch_port_settings_manager), ); driver.register( @@ -442,7 +442,7 @@ impl BackgroundTasksInitializer { Box::new(V2PManager::new(datastore.clone())), opctx.child(BTreeMap::new()), vec![Box::new(v2p_watcher.1)], - task_v2p_manager, + Some(task_v2p_manager), ); // Background task: detect if a region needs replacement and begin the @@ -463,7 +463,7 @@ impl BackgroundTasksInitializer { Box::new(detector), opctx.child(BTreeMap::new()), vec![], - task_region_replacement, + Some(task_region_replacement), ); }; @@ -482,7 +482,7 @@ impl BackgroundTasksInitializer { Box::new(watcher), opctx.child(BTreeMap::new()), vec![], - task_instance_watcher, + Some(task_instance_watcher), ) }; @@ -499,7 +499,7 @@ impl BackgroundTasksInitializer { )), opctx.child(BTreeMap::new()), vec![], - task_service_firewall_propagation, + Some(task_service_firewall_propagation), ); // Background task: abandoned VMM reaping @@ -513,7 +513,7 @@ impl BackgroundTasksInitializer { Box::new(abandoned_vmm_reaper::AbandonedVmmReaper::new(datastore)), opctx.child(BTreeMap::new()), vec![], - task_abandoned_vmm_reaper, + Some(task_abandoned_vmm_reaper), ); // XXX-dap consider checking all the bools inside BackgroundTasks @@ -522,18 +522,22 @@ impl BackgroundTasksInitializer { } } -pub struct ExternalTaskHandle { +pub struct Activator { pub(super) notify: Arc, pub(super) wired_up: AtomicBool, } -impl ExternalTaskHandle { - fn new_stub() -> ExternalTaskHandle { - ExternalTaskHandle { +impl Activator { + fn new_stub() -> Activator { + Activator { notify: Arc::new(Notify::new()), wired_up: AtomicBool::new(false), } } + + pub fn activate(&self) { + self.notify.notify_one(); + } } impl BackgroundTasks { @@ -541,8 +545,8 @@ impl BackgroundTasks { /// /// If the task is 
currently running, it will be activated again when it /// finishes. - pub fn activate(&self, _task: &ExternalTaskHandle) { - todo!(); // XXX-dap + pub fn activate(&self, task: &Activator) { + task.activate(); } } @@ -553,9 +557,9 @@ fn init_dns( dns_group: DnsGroup, resolver: internal_dns::resolver::Resolver, config: &DnsTasksConfig, - task_config: &ExternalTaskHandle, - task_servers: &ExternalTaskHandle, - task_propagation: &ExternalTaskHandle, + task_config: &Activator, + task_servers: &Activator, + task_propagation: &Activator, ) { let dns_group_name = dns_group.to_string(); let metadata = BTreeMap::from([("dns_group".to_string(), dns_group_name)]); @@ -572,7 +576,7 @@ fn init_dns( Box::new(dns_config), opctx.child(metadata.clone()), vec![], - task_config, + Some(task_config), ); // Background task: DNS server list watcher @@ -589,7 +593,7 @@ fn init_dns( Box::new(dns_servers), opctx.child(metadata.clone()), vec![], - task_servers, + Some(task_servers), ); // Background task: DNS propagation @@ -610,7 +614,7 @@ fn init_dns( Box::new(dns_propagate), opctx.child(metadata), vec![Box::new(dns_config_watcher), Box::new(dns_servers_watcher)], - task_propagation, + Some(task_propagation), ); } From 87160da3fc255ab0d873c9a8a5ccbd3181c40fcb Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 26 Jun 2024 16:16:33 -0700 Subject: [PATCH 11/23] use OnceLock; defer the arguments to start() --- nexus/src/app/background/driver.rs | 1 + nexus/src/app/background/init.rs | 63 +++++++++--------------------- nexus/src/app/background/status.rs | 19 ++++----- nexus/src/app/mod.rs | 50 ++++++++++++++---------- 4 files changed, 58 insertions(+), 75 deletions(-) diff --git a/nexus/src/app/background/driver.rs b/nexus/src/app/background/driver.rs index cb1436de7c..0f0f0e21c0 100644 --- a/nexus/src/app/background/driver.rs +++ b/nexus/src/app/background/driver.rs @@ -84,6 +84,7 @@ impl Driver { /// another one. 
// XXX-dap TODO-doc activator // XXX-dap TODO-coverage activator + #[allow(clippy::too_many_arguments)] pub fn register( &mut self, name: String, diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 57fbb65827..be60f144c4 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -86,53 +86,19 @@ pub struct BackgroundTasks { pub struct BackgroundTasksInitializer { driver: Driver, - external_endpoints_tx: watch::Sender>, - - opctx: OpContext, - datastore: Arc, - config: BackgroundTaskConfig, - rack_id: Uuid, - nexus_id: Uuid, - resolver: internal_dns::resolver::Resolver, - saga_request: Sender, - v2p_watcher: (watch::Sender<()>, watch::Receiver<()>), - producer_registry: ProducerRegistry, } impl BackgroundTasksInitializer { - // XXX-dap now that we're accepting owned copies of these, there's no need - // for them to be in `new()` or the struct - pub fn new( - opctx: OpContext, - datastore: Arc, - config: BackgroundTaskConfig, - rack_id: Uuid, - nexus_id: Uuid, - resolver: internal_dns::resolver::Resolver, - saga_request: Sender, - v2p_watcher: (watch::Sender<()>, watch::Receiver<()>), - producer_registry: ProducerRegistry, - ) -> (BackgroundTasksInitializer, BackgroundTasks) { + pub fn new() -> (BackgroundTasksInitializer, BackgroundTasks) { let (external_endpoints_tx, external_endpoints_rx) = watch::channel(None); let initializer = BackgroundTasksInitializer { driver: Driver::new(), external_endpoints_tx, - - opctx, - datastore, - config, - rack_id, - nexus_id, - resolver, - saga_request, - v2p_watcher, - producer_registry, }; - let background_tasks = BackgroundTasks { task_internal_dns_config: Activator::new_stub(), task_internal_dns_servers: Activator::new_stub(), @@ -167,17 +133,23 @@ impl BackgroundTasksInitializer { (initializer, background_tasks) } - pub fn start(self, background_tasks: &'_ BackgroundTasks) -> Driver { + #[allow(clippy::too_many_arguments)] + pub fn start( + self, + 
background_tasks: &'_ BackgroundTasks, + opctx: OpContext, + datastore: Arc, + config: BackgroundTaskConfig, + rack_id: Uuid, + nexus_id: Uuid, + resolver: internal_dns::resolver::Resolver, + saga_request: Sender, + v2p_watcher: (watch::Sender<()>, watch::Receiver<()>), + producer_registry: ProducerRegistry, + ) -> Driver { let mut driver = self.driver; - let opctx = &self.opctx; - let datastore = self.datastore; - let config = self.config; - let rack_id = self.rack_id; - let nexus_id = self.nexus_id; - let resolver = self.resolver; - let saga_request = self.saga_request; - let v2p_watcher = self.v2p_watcher; - let producer_registry = &self.producer_registry; + let opctx = &opctx; + let producer_registry = &producer_registry; // XXX-dap TODO-doc this construction helps ensure we don't miss // anything @@ -591,6 +563,7 @@ impl BackgroundTasks { } } +#[allow(clippy::too_many_arguments)] fn init_dns( driver: &mut Driver, opctx: &OpContext, diff --git a/nexus/src/app/background/status.rs b/nexus/src/app/background/status.rs index d5e3f73204..9cf9f564c9 100644 --- a/nexus/src/app/background/status.rs +++ b/nexus/src/app/background/status.rs @@ -4,6 +4,7 @@ //! 
View status of background tasks (for support and debugging) +use super::Driver; use crate::Nexus; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; @@ -21,9 +22,7 @@ impl Nexus { opctx: &OpContext, ) -> Result, Error> { opctx.authorize(authz::Action::Read, &authz::FLEET).await?; - // XXX-dap - let driver_wrapper = self.background_tasks_driver.lock().unwrap(); - let driver = driver_wrapper.as_ref().unwrap(); + let driver = self.driver()?; Ok(driver .tasks() .map(|t| { @@ -45,9 +44,7 @@ impl Nexus { name: &str, ) -> LookupResult { opctx.authorize(authz::Action::Read, &authz::FLEET).await?; - // XXX-dap - let driver_wrapper = self.background_tasks_driver.lock().unwrap(); - let driver = driver_wrapper.as_ref().unwrap(); + let driver = self.driver()?; let task = driver.tasks().find(|t| t.as_str() == name).ok_or_else(|| { LookupType::ByName(name.to_owned()) @@ -65,9 +62,7 @@ impl Nexus { mut names: BTreeSet, ) -> Result<(), Error> { opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; - // XXX-dap - let driver_wrapper = self.background_tasks_driver.lock().unwrap(); - let driver = driver_wrapper.as_ref().unwrap(); + let driver = self.driver()?; // Ensure all task names are valid by removing them from the set of // names as we find them. 
@@ -94,4 +89,10 @@ impl Nexus { Ok(()) } + + fn driver(&self) -> Result<&Driver, Error> { + self.background_tasks_driver.get().ok_or_else(|| { + Error::internal_error("background tasks not yet initialized") + }) + } } diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 203a1191d1..47f51d706d 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -34,6 +34,7 @@ use std::collections::HashMap; use std::net::SocketAddrV6; use std::net::{IpAddr, Ipv6Addr}; use std::sync::Arc; +use std::sync::OnceLock; use uuid::Uuid; // The implementation of Nexus is large, and split into a number of submodules @@ -197,8 +198,7 @@ pub struct Nexus { external_dns_servers: Vec, /// Background task driver - // XXX-dap - background_tasks_driver: std::sync::Mutex>, + background_tasks_driver: OnceLock, /// Handles to various specific tasks background_tasks: background::BackgroundTasks, @@ -403,17 +403,7 @@ impl Nexus { let (saga_request, mut saga_request_recv) = SagaRequest::channel(); let (background_tasks_initializer, background_tasks) = - background::BackgroundTasksInitializer::new( - background_ctx, - Arc::clone(&db_datastore), - config.pkg.background_tasks.clone(), - rack_id, - config.deployment.id, - resolver.clone(), - saga_request, - v2p_watcher_channel.clone(), - producer_registry.clone(), - ); + background::BackgroundTasksInitializer::new(); let external_resolver = { if config.deployment.external_dns_servers.is_empty() { @@ -456,19 +446,19 @@ impl Nexus { as Arc, ), samael_max_issue_delay: std::sync::Mutex::new(None), - internal_resolver: resolver, + internal_resolver: resolver.clone(), external_resolver, external_dns_servers: config .deployment .external_dns_servers .clone(), - background_tasks_driver: std::sync::Mutex::new(None), + background_tasks_driver: OnceLock::new(), background_tasks, default_region_allocation_strategy: config .pkg .default_region_allocation_strategy .clone(), - v2p_notification_tx: v2p_watcher_channel.0, + v2p_notification_tx: 
v2p_watcher_channel.0.clone(), }; // TODO-cleanup all the extra Arcs here seems wrong @@ -488,7 +478,7 @@ impl Nexus { saga_logger, Arc::clone(&authz), ))), - db_datastore, + Arc::clone(&db_datastore), Arc::clone(&sec_client), sagas::ACTION_REGISTRY.clone(), ); @@ -502,6 +492,8 @@ impl Nexus { // not be retried for a while. let task_nexus = nexus.clone(); let task_log = nexus.log.clone(); + let task_registry = producer_registry.clone(); + let task_config = config.clone(); tokio::spawn(async move { match task_nexus.wait_for_populate().await { Ok(_) => { @@ -510,11 +502,27 @@ impl Nexus { "populate complete; activating background tasks" ); - let driver = background_tasks_initializer - .start(&task_nexus.background_tasks); + let driver = background_tasks_initializer.start( + &task_nexus.background_tasks, + background_ctx, + db_datastore, + task_config.pkg.background_tasks, + rack_id, + task_config.deployment.id, + resolver, + saga_request, + v2p_watcher_channel.clone(), + task_registry, + ); - *task_nexus.background_tasks_driver.lock().unwrap() = - Some(driver); + if let Err(_) = + task_nexus.background_tasks_driver.set(driver) + { + panic!( + "concurrent initialization of \ + background_tasks_driver?!" 
+ ) + } } Err(_) => { error!(task_log, "populate failed"); From 85eff7a9de8a3779e45c22100bf54476ba31607c Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 26 Jun 2024 16:56:46 -0700 Subject: [PATCH 12/23] fix up ExternalEndpoints test code --- .../background/tasks/external_endpoints.rs | 164 +++++++++--------- 1 file changed, 82 insertions(+), 82 deletions(-) diff --git a/nexus/src/app/background/tasks/external_endpoints.rs b/nexus/src/app/background/tasks/external_endpoints.rs index 2c3921bb97..c866cade75 100644 --- a/nexus/src/app/background/tasks/external_endpoints.rs +++ b/nexus/src/app/background/tasks/external_endpoints.rs @@ -108,85 +108,85 @@ impl BackgroundTask for ExternalEndpointsWatcher { } } -// XXX-dap -// #[cfg(test)] -// mod test { -// use super::ExternalEndpointsWatcher; -// use crate::app::background::BackgroundTask; -// use nexus_db_queries::context::OpContext; -// use nexus_db_queries::db::fixed_data::silo::DEFAULT_SILO; -// use nexus_test_utils::resource_helpers::create_silo; -// use nexus_test_utils_macros::nexus_test; -// use nexus_types::external_api::shared::SiloIdentityMode; -// use nexus_types::identity::Resource; -// -// type ControlPlaneTestContext = -// nexus_test_utils::ControlPlaneTestContext; -// -// #[nexus_test(server = crate::Server)] -// async fn test_basic(cptestctx: &ControlPlaneTestContext) { -// let nexus = &cptestctx.server.server_context().nexus; -// let datastore = nexus.datastore(); -// let opctx = OpContext::for_tests( -// cptestctx.logctx.log.clone(), -// datastore.clone(), -// ); -// -// // Verify the initial state. -// let mut task = ExternalEndpointsWatcher::new(datastore.clone()); -// let watcher = task.watcher(); -// assert!(watcher.borrow().is_none()); -// -// // The datastore from the ControlPlaneTestContext is initialized with -// // two Silos: the built-in Silo and the recovery Silo. 
-// let recovery_silo_dns_name = format!( -// "{}.sys.{}", -// cptestctx.silo_name.as_str(), -// cptestctx.external_dns_zone_name -// ); -// let builtin_silo_dns_name = format!( -// "{}.sys.{}", -// DEFAULT_SILO.identity().name.as_str(), -// cptestctx.external_dns_zone_name, -// ); -// let _ = task.activate(&opctx).await; -// let initial_state_raw = watcher.borrow(); -// let initial_state = initial_state_raw.as_ref().unwrap(); -// assert!(initial_state.has_domain(recovery_silo_dns_name.as_str())); -// assert!(initial_state.has_domain(builtin_silo_dns_name.as_str())); -// // There are no other Silos. -// assert_eq!(initial_state.ndomains(), 2); -// // Neither of these will have a valid certificate in this configuration. -// assert_eq!(initial_state.nwarnings(), 2); -// drop(initial_state_raw); -// -// // If we create another Silo, we should see that one, too. -// let new_silo_name = "test-silo"; -// create_silo( -// &cptestctx.external_client, -// new_silo_name, -// false, -// SiloIdentityMode::LocalOnly, -// ) -// .await; -// let new_silo_dns_name = format!( -// "{}.sys.{}", -// new_silo_name, cptestctx.external_dns_zone_name -// ); -// let _ = task.activate(&opctx).await; -// let new_state_raw = watcher.borrow(); -// let new_state = new_state_raw.as_ref().unwrap(); -// assert!(new_state.has_domain(recovery_silo_dns_name.as_str())); -// assert!(new_state.has_domain(builtin_silo_dns_name.as_str())); -// assert!(new_state.has_domain(new_silo_dns_name.as_str())); -// // There are no other Silos. -// assert_eq!(new_state.ndomains(), 3); -// // None of these will have a valid certificate in this configuration. -// assert_eq!(new_state.nwarnings(), 3); -// -// // That's it. We're not testing all possible cases. That's done with -// // unit tests for the underlying function. We're just testing that the -// // background task reports updated state when the underlying state -// // changes. 
-// } -// } +#[cfg(test)] +mod test { + use super::ExternalEndpointsWatcher; + use crate::app::background::BackgroundTask; + use nexus_db_queries::context::OpContext; + use nexus_db_queries::db::fixed_data::silo::DEFAULT_SILO; + use nexus_test_utils::resource_helpers::create_silo; + use nexus_test_utils_macros::nexus_test; + use nexus_types::external_api::shared::SiloIdentityMode; + use nexus_types::identity::Resource; + use tokio::sync::watch; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + #[nexus_test(server = crate::Server)] + async fn test_basic(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + // Verify the initial state. + let (tx, watcher) = watch::channel(None); + let mut task = ExternalEndpointsWatcher::new(datastore.clone(), tx); + assert!(watcher.borrow().is_none()); + + // The datastore from the ControlPlaneTestContext is initialized with + // two Silos: the built-in Silo and the recovery Silo. + let recovery_silo_dns_name = format!( + "{}.sys.{}", + cptestctx.silo_name.as_str(), + cptestctx.external_dns_zone_name + ); + let builtin_silo_dns_name = format!( + "{}.sys.{}", + DEFAULT_SILO.identity().name.as_str(), + cptestctx.external_dns_zone_name, + ); + let _ = task.activate(&opctx).await; + let initial_state_raw = watcher.borrow(); + let initial_state = initial_state_raw.as_ref().unwrap(); + assert!(initial_state.has_domain(recovery_silo_dns_name.as_str())); + assert!(initial_state.has_domain(builtin_silo_dns_name.as_str())); + // There are no other Silos. + assert_eq!(initial_state.ndomains(), 2); + // Neither of these will have a valid certificate in this configuration. + assert_eq!(initial_state.nwarnings(), 2); + drop(initial_state_raw); + + // If we create another Silo, we should see that one, too. 
+ let new_silo_name = "test-silo"; + create_silo( + &cptestctx.external_client, + new_silo_name, + false, + SiloIdentityMode::LocalOnly, + ) + .await; + let new_silo_dns_name = format!( + "{}.sys.{}", + new_silo_name, cptestctx.external_dns_zone_name + ); + let _ = task.activate(&opctx).await; + let new_state_raw = watcher.borrow(); + let new_state = new_state_raw.as_ref().unwrap(); + assert!(new_state.has_domain(recovery_silo_dns_name.as_str())); + assert!(new_state.has_domain(builtin_silo_dns_name.as_str())); + assert!(new_state.has_domain(new_silo_dns_name.as_str())); + // There are no other Silos. + assert_eq!(new_state.ndomains(), 3); + // None of these will have a valid certificate in this configuration. + assert_eq!(new_state.nwarnings(), 3); + + // That's it. We're not testing all possible cases. That's done with + // unit tests for the underlying function. We're just testing that the + // background task reports updated state when the underlying state + // changes. + } +} From a8e9629ddb3f9e9b41495d85d33a2f30bf5b3a96 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 26 Jun 2024 17:03:01 -0700 Subject: [PATCH 13/23] make Activator non-optional --- nexus/src/app/background/driver.rs | 57 ++++++++--------- nexus/src/app/background/init.rs | 98 +++++++++++++++--------------- 2 files changed, 76 insertions(+), 79 deletions(-) diff --git a/nexus/src/app/background/driver.rs b/nexus/src/app/background/driver.rs index 0f0f0e21c0..ff1745a250 100644 --- a/nexus/src/app/background/driver.rs +++ b/nexus/src/app/background/driver.rs @@ -93,12 +93,10 @@ impl Driver { imp: Box, opctx: OpContext, watchers: Vec>, - // XXX-dap consider making this non-optional? If it's just for the - // tests, we can have them create an Activator. // XXX-dap can we do something about TaskName? It's only used by the // tests, too. It's nice to have it for sure but is it really // necessary? 
- activator: Option<&Activator>, + activator: &Activator, ) -> TaskName { // Activation of the background task happens in a separate tokio task. // Set up a channel so that tokio task can report status back to us. @@ -108,30 +106,23 @@ impl Driver { }); // We'll use a `Notify` to wake up that tokio task when an activation is - // requested. The caller may provide their own Activator, which - // just provides a specific Notify for us to use here. This allows them - // to have set up the activator before the task is actually created. - // They can also choose not to bother with this, in which case we'll - // just create our own. - let notify = if let Some(activator) = activator { - if let Err(previous) = activator.wired_up.compare_exchange( - false, - true, - Ordering::SeqCst, - Ordering::SeqCst, - ) { - panic!( - "attempted to wire up the same background task handle \ + // requested. The caller provides their own Activator, which just + // provides a specific Notify for us to use here. This allows them to + // have set up the activator before the task is actually created. + if let Err(previous) = activator.wired_up.compare_exchange( + false, + true, + Ordering::SeqCst, + Ordering::SeqCst, + ) { + panic!( + "attempted to wire up the same background task handle \ twice (previous \"wired_up\" = {}): currently attempting \ to wire it up to task {:?}", - previous, name - ); - } - - Arc::clone(&activator.notify) - } else { - Arc::new(Notify::new()) - }; + previous, name + ); + } + let notify = Arc::clone(&activator.notify); // Spawn the tokio task that will manage activation of the background // task. 
@@ -352,6 +343,7 @@ impl GenericWatcher for watch::Receiver { mod test { use super::BackgroundTask; use super::Driver; + use crate::app::background::init::Activator; use crate::app::sagas::SagaRequest; use assert_matches::assert_matches; use chrono::Utc; @@ -436,6 +428,9 @@ mod test { let (t3, rx3) = ReportingTask::new(); let (dep_tx1, dep_rx1) = watch::channel(0); let (dep_tx2, dep_rx2) = watch::channel(0); + let act1 = Activator::new(); + let act2 = Activator::new(); + let act3 = Activator::new(); let mut driver = Driver::new(); assert_eq!(*rx1.borrow(), 0); @@ -446,7 +441,7 @@ mod test { Box::new(t1), opctx.child(std::collections::BTreeMap::new()), vec![Box::new(dep_rx1.clone()), Box::new(dep_rx2.clone())], - None, + &act1, ); let h2 = driver.register( @@ -456,7 +451,7 @@ mod test { Box::new(t2), opctx.child(std::collections::BTreeMap::new()), vec![Box::new(dep_rx1.clone())], - None, + &act2, ); let h3 = driver.register( @@ -466,7 +461,7 @@ mod test { Box::new(t3), opctx, vec![Box::new(dep_rx1), Box::new(dep_rx2)], - None, + &act3, ); // Wait for four activations of our task. (This is three periods.) That @@ -600,6 +595,7 @@ mod test { let (dep_tx1, dep_rx1) = watch::channel(0); let before_wall = Utc::now(); let before_instant = Instant::now(); + let act1 = Activator::new(); let h1 = driver.register( "t1".to_string(), "test task".to_string(), @@ -607,7 +603,7 @@ mod test { Box::new(t1), opctx.child(std::collections::BTreeMap::new()), vec![Box::new(dep_rx1.clone())], - None, + &act1, ); // Wait to enter the first activation. 
@@ -745,6 +741,7 @@ mod test { let mut driver = Driver::new(); let (_dep_tx1, dep_rx1) = watch::channel(0); + let act1 = Activator::new(); let h1 = driver.register( "t1".to_string(), @@ -753,7 +750,7 @@ mod test { Box::new(t1), opctx.child(std::collections::BTreeMap::new()), vec![Box::new(dep_rx1.clone())], - None, + &act1, ); assert!(matches!( diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index be60f144c4..257391e8e3 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -100,32 +100,32 @@ impl BackgroundTasksInitializer { external_endpoints_tx, }; let background_tasks = BackgroundTasks { - task_internal_dns_config: Activator::new_stub(), - task_internal_dns_servers: Activator::new_stub(), - task_external_dns_config: Activator::new_stub(), - task_external_dns_servers: Activator::new_stub(), - task_metrics_producer_gc: Activator::new_stub(), - task_external_endpoints: Activator::new_stub(), - task_nat_cleanup: Activator::new_stub(), - task_bfd_manager: Activator::new_stub(), - task_inventory_collection: Activator::new_stub(), - task_physical_disk_adoption: Activator::new_stub(), - task_phantom_disks: Activator::new_stub(), - task_blueprint_loader: Activator::new_stub(), - task_blueprint_executor: Activator::new_stub(), - task_crdb_node_id_collector: Activator::new_stub(), - task_service_zone_nat_tracker: Activator::new_stub(), - task_switch_port_settings_manager: Activator::new_stub(), - task_v2p_manager: Activator::new_stub(), - task_region_replacement: Activator::new_stub(), - task_region_replacement_driver: Activator::new_stub(), - task_instance_watcher: Activator::new_stub(), - task_service_firewall_propagation: Activator::new_stub(), - task_abandoned_vmm_reaper: Activator::new_stub(), - task_vpc_route_manager: Activator::new_stub(), - - task_internal_dns_propagation: Activator::new_stub(), - task_external_dns_propagation: Activator::new_stub(), + task_internal_dns_config: Activator::new(), + 
task_internal_dns_servers: Activator::new(), + task_external_dns_config: Activator::new(), + task_external_dns_servers: Activator::new(), + task_metrics_producer_gc: Activator::new(), + task_external_endpoints: Activator::new(), + task_nat_cleanup: Activator::new(), + task_bfd_manager: Activator::new(), + task_inventory_collection: Activator::new(), + task_physical_disk_adoption: Activator::new(), + task_phantom_disks: Activator::new(), + task_blueprint_loader: Activator::new(), + task_blueprint_executor: Activator::new(), + task_crdb_node_id_collector: Activator::new(), + task_service_zone_nat_tracker: Activator::new(), + task_switch_port_settings_manager: Activator::new(), + task_v2p_manager: Activator::new(), + task_region_replacement: Activator::new(), + task_region_replacement_driver: Activator::new(), + task_instance_watcher: Activator::new(), + task_service_firewall_propagation: Activator::new(), + task_abandoned_vmm_reaper: Activator::new(), + task_vpc_route_manager: Activator::new(), + + task_internal_dns_propagation: Activator::new(), + task_external_dns_propagation: Activator::new(), external_endpoints: external_endpoints_rx, }; @@ -221,7 +221,7 @@ impl BackgroundTasksInitializer { Box::new(gc), opctx.child(BTreeMap::new()), vec![], - Some(task_metrics_producer_gc), + task_metrics_producer_gc, ) }; @@ -242,7 +242,7 @@ impl BackgroundTasksInitializer { Box::new(watcher), opctx.child(BTreeMap::new()), vec![], - Some(task_external_endpoints), + task_external_endpoints, ); } @@ -259,7 +259,7 @@ impl BackgroundTasksInitializer { )), opctx.child(BTreeMap::new()), vec![], - Some(task_nat_cleanup), + task_nat_cleanup, ); driver.register( @@ -272,7 +272,7 @@ impl BackgroundTasksInitializer { Box::new(bfd::BfdManager::new(datastore.clone(), resolver.clone())), opctx.child(BTreeMap::new()), vec![], - Some(task_bfd_manager), + task_bfd_manager, ); // Background task: phantom disk detection @@ -286,7 +286,7 @@ impl BackgroundTasksInitializer { Box::new(detector), 
opctx.child(BTreeMap::new()), vec![], - Some(task_phantom_disks), + task_phantom_disks, ); }; @@ -301,7 +301,7 @@ impl BackgroundTasksInitializer { Box::new(blueprint_loader), opctx.child(BTreeMap::new()), vec![], - Some(task_blueprint_loader), + task_blueprint_loader, ); // Background task: blueprint executor @@ -318,7 +318,7 @@ impl BackgroundTasksInitializer { Box::new(blueprint_executor), opctx.child(BTreeMap::new()), vec![Box::new(rx_blueprint.clone())], - Some(task_blueprint_executor), + task_blueprint_executor, ); // Background task: CockroachDB node ID collector @@ -334,7 +334,7 @@ impl BackgroundTasksInitializer { Box::new(crdb_node_id_collector), opctx.child(BTreeMap::new()), vec![Box::new(rx_blueprint)], - Some(task_crdb_node_id_collector), + task_crdb_node_id_collector, ); // Background task: inventory collector @@ -364,7 +364,7 @@ impl BackgroundTasksInitializer { Box::new(collector), opctx.child(BTreeMap::new()), vec![Box::new(rx_blueprint_exec)], - Some(task_inventory_collection), + task_inventory_collection, ); inventory_watcher @@ -383,7 +383,7 @@ impl BackgroundTasksInitializer { )), opctx.child(BTreeMap::new()), vec![Box::new(inventory_watcher)], - Some(task_physical_disk_adoption), + task_physical_disk_adoption, ); driver.register( @@ -399,7 +399,7 @@ impl BackgroundTasksInitializer { )), opctx.child(BTreeMap::new()), vec![], - Some(task_service_zone_nat_tracker), + task_service_zone_nat_tracker, ); driver.register( @@ -412,7 +412,7 @@ impl BackgroundTasksInitializer { )), opctx.child(BTreeMap::new()), vec![], - Some(task_switch_port_settings_manager), + task_switch_port_settings_manager, ); driver.register( @@ -422,7 +422,7 @@ impl BackgroundTasksInitializer { Box::new(V2PManager::new(datastore.clone())), opctx.child(BTreeMap::new()), vec![Box::new(v2p_watcher.1)], - Some(task_v2p_manager), + task_v2p_manager, ); // Background task: detect if a region needs replacement and begin the @@ -443,7 +443,7 @@ impl BackgroundTasksInitializer { 
Box::new(detector), opctx.child(BTreeMap::new()), vec![], - Some(task_region_replacement), + task_region_replacement, ); }; @@ -462,7 +462,7 @@ impl BackgroundTasksInitializer { Box::new(detector), opctx.child(BTreeMap::new()), vec![], - Some(task_region_replacement_driver), + task_region_replacement_driver, ); }; @@ -481,7 +481,7 @@ impl BackgroundTasksInitializer { Box::new(watcher), opctx.child(BTreeMap::new()), vec![], - Some(task_instance_watcher), + task_instance_watcher, ) }; @@ -498,7 +498,7 @@ impl BackgroundTasksInitializer { )), opctx.child(BTreeMap::new()), vec![], - Some(task_service_firewall_propagation), + task_service_firewall_propagation, ); // Background task: OPTE port route propagation @@ -511,7 +511,7 @@ impl BackgroundTasksInitializer { Box::new(watcher), opctx.child(BTreeMap::new()), vec![], - Some(task_vpc_route_manager), + task_vpc_route_manager, ) }; @@ -526,7 +526,7 @@ impl BackgroundTasksInitializer { Box::new(abandoned_vmm_reaper::AbandonedVmmReaper::new(datastore)), opctx.child(BTreeMap::new()), vec![], - Some(task_abandoned_vmm_reaper), + task_abandoned_vmm_reaper, ); // XXX-dap consider checking all the bools inside BackgroundTasks @@ -541,7 +541,7 @@ pub struct Activator { } impl Activator { - fn new_stub() -> Activator { + pub fn new() -> Activator { Activator { notify: Arc::new(Notify::new()), wired_up: AtomicBool::new(false), @@ -590,7 +590,7 @@ fn init_dns( Box::new(dns_config), opctx.child(metadata.clone()), vec![], - Some(task_config), + task_config, ); // Background task: DNS server list watcher @@ -607,7 +607,7 @@ fn init_dns( Box::new(dns_servers), opctx.child(metadata.clone()), vec![], - Some(task_servers), + task_servers, ); // Background task: DNS propagation @@ -628,7 +628,7 @@ fn init_dns( Box::new(dns_propagate), opctx.child(metadata), vec![Box::new(dns_config_watcher), Box::new(dns_servers_watcher)], - Some(task_propagation), + task_propagation, ); } From e35fbde7f62d002891396d7a69b755153f489d26 Mon Sep 17 00:00:00 
2001 From: David Pacheco Date: Wed, 26 Jun 2024 21:15:03 -0700 Subject: [PATCH 14/23] flesh out some docs --- nexus/src/app/background/driver.rs | 5 +- nexus/src/app/background/init.rs | 166 ++++++++++++++++++++++++----- 2 files changed, 145 insertions(+), 26 deletions(-) diff --git a/nexus/src/app/background/driver.rs b/nexus/src/app/background/driver.rs index ff1745a250..ddb75a30f4 100644 --- a/nexus/src/app/background/driver.rs +++ b/nexus/src/app/background/driver.rs @@ -82,7 +82,10 @@ impl Driver { /// changed. This can be used to create dependencies between background /// tasks, so that when one of them finishes doing something, it kicks off /// another one. - // XXX-dap TODO-doc activator + /// + /// `activator` is an [`Activator`] that has not previously been used in a + /// call to this function. It will be wired up so that using it will + /// activate this newly-registered background task. // XXX-dap TODO-coverage activator #[allow(clippy::too_many_arguments)] pub fn register( diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 257391e8e3..3bea72a103 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -3,7 +3,6 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. //! 
Specific background task initialization -// XXX-dap TODO-doc this whole file use super::tasks::abandoned_vmm_reaper; use super::tasks::bfd; @@ -44,11 +43,10 @@ use tokio::sync::watch; use tokio::sync::Notify; use uuid::Uuid; -/// Describes ongoing background tasks and provides interfaces for activating -/// them and accessing any data that they expose to Nexus at-large -// XXX-dap TODO-doc describe two-phase initialization +/// Interface for activating various background tasks and reading data that they +/// expose to Nexus at-large pub struct BackgroundTasks { - // handles to individual background tasks, used to activate them + // Handles to activate specific background tasks pub task_internal_dns_config: Activator, pub task_internal_dns_servers: Activator, pub task_external_dns_config: Activator, @@ -73,17 +71,55 @@ pub struct BackgroundTasks { pub task_abandoned_vmm_reaper: Activator, pub task_vpc_route_manager: Activator, - // XXX-dap stuff that wasn't here before because it couldn't be externally - // activated + // Handles to activate background tasks that do not get used by Nexus + // at-large. These background tasks are implementation details as far as + // the rest of Nexus is concerned. These handles don't even really need to + // be here, but it's convenient. task_internal_dns_propagation: Activator, task_external_dns_propagation: Activator, - // data exposed by various background tasks to Nexus at-large - /// external endpoints read by the background task + // Data exposed by various background tasks to the rest of Nexus + /// list of currently configured external endpoints pub external_endpoints: watch::Receiver>, } +impl BackgroundTasks { + /// Activate the specified background task + /// + /// If the task is currently running, it will be activated again when it + /// finishes.
+ pub fn activate(&self, task: &Activator) { + task.activate(); + } +} + +/// Initializes the background task subsystem +/// +/// The background task subsystem is initialized in two phases: +/// +/// 1. First, the caller invokes [`BackgroundTasksInitializer::new()`] and gets +/// back both a [`BackgroundTasksInitializer`] object and a +/// [`BackgroundTasks`] object. As the name suggests, the initializer is a +/// transient object that's just used to finish setting up the subsystem. +/// The `BackgroundTasks` object will provide a handle for the consumer to +/// activate background tasks and read data provided by those tasks after the +/// subsystem is up and running. +/// +/// 2. Some time later, the caller resumes initialization by invoking +/// [`BackgroundTasksInitializer::start()`] to actually start all the +/// specific background tasks that are part of Nexus. +/// +/// These two steps are separate so that the `BackgroundTasks` object can be +/// stored into Nexus and used by other subsystems, _including_ other background +/// tasks, to activate background tasks. In other words: if `BackgroundTasks` +/// was created _with_ all the individual tasks, then it wouldn't be possible to +/// _provide_ it to the tasks and allow them to activate each other. (It's less +/// obvious why, but it also wouldn't be possible to provide this to subsystems +/// like sagas that want to activate background tasks that also want to kick off +/// sagas.) +/// +// See the definition of `Activator` for more design notes about this interface. pub struct BackgroundTasksInitializer { driver: Driver, external_endpoints_tx: @@ -91,6 +127,14 @@ pub struct BackgroundTasksInitializer { } impl BackgroundTasksInitializer { + /// Begin initializing the Nexus background task subsystem + /// + /// This step does not start any background tasks. 
It just returns: + /// + /// * a short-lived `BackgroundTasksInitializer` object, on which you can + /// call `start()` to actually start the tasks + /// * a long-lived `BackgroundTasks` object that you can use to activate any + /// of the tasks that will be started pub fn new() -> (BackgroundTasksInitializer, BackgroundTasks) { let (external_endpoints_tx, external_endpoints_rx) = watch::channel(None); @@ -99,6 +143,7 @@ impl BackgroundTasksInitializer { driver: Driver::new(), external_endpoints_tx, }; + let background_tasks = BackgroundTasks { task_internal_dns_config: Activator::new(), task_internal_dns_servers: Activator::new(), @@ -133,6 +178,10 @@ impl BackgroundTasksInitializer { (initializer, background_tasks) } + /// Starts all the Nexus background tasks + /// + /// This function will wire up the `Activator`s in `background_tasks` to the + /// corresponding tasks once they've been started. #[allow(clippy::too_many_arguments)] pub fn start( self, @@ -151,8 +200,8 @@ impl BackgroundTasksInitializer { let opctx = &opctx; let producer_registry = &producer_registry; - // XXX-dap TODO-doc this construction helps ensure we don't miss - // anything + // This "let" construction helps catch mistakes where someone forgets to + // wire up an activator to its corresponding background task. let BackgroundTasks { task_internal_dns_config, task_internal_dns_servers, @@ -179,7 +228,14 @@ impl BackgroundTasksInitializer { task_service_firewall_propagation, task_abandoned_vmm_reaper, task_vpc_route_manager, + // Add new background tasks here. Be sure to use this binding in a + // call to `Driver::register()` below. That's what actually wires + // up the Activator to the corresponding background task. + + // The following fields can be safely ignored here because they're + // already wired up as needed. external_endpoints: _external_endpoints, + // Do NOT add a `..` catch-all here! See above. 
} = &background_tasks; init_dns( @@ -343,8 +399,9 @@ impl BackgroundTasksInitializer { // order to automatically trigger inventory collection whenever the // blueprint executor runs. In the limit, this could become a problem // because the blueprint executor might also depend indirectly on the - // inventory collector. In that case, we may need to do something more - // complicated. But for now, this works. + // inventory collector. In that case, we could expose `Activator`s to + // one or both of these tasks to directly activate the other precisely + // when needed. But for now, this works. let inventory_watcher = { let collector = inventory_collection::InventoryCollector::new( datastore.clone(), @@ -529,18 +586,81 @@ impl BackgroundTasksInitializer { task_abandoned_vmm_reaper, ); - // XXX-dap consider checking all the bools inside BackgroundTasks - driver } } +/// Activates a background task +/// +/// See [`nexus::app::background`] module-level documentation for more on what +/// that means. +/// +/// Activators are created with [`Activator::new()`] and then wired up to +/// specific background tasks using [`Driver::register()`]. If you call +/// `Activator::activate()` before the activator is wired up to a background +/// task, then once the activator _is_ wired up to a task, that task will +/// immediately be activated. +/// +/// Activators are designed specifically so they can be created before the +/// corresponding task has been created and then wired up with just an +/// `&Activator` (not a `&mut Activator`). That's so that we can construct +/// `BackgroundTasks`, a static list of activators for all background tasks in +/// Nexus, before we've actually set up all the tasks. This in turn breaks what +/// would otherwise be a circular initialization, if background tasks ever need +/// to activate other background tasks or if background tasks need to use other +/// subsystems (like sagas) that themselves want to activate background tasks. 
+// More straightforward approaches are available (e.g., register a task, get +// back a handle and use that to activate it) so it's worth explaining why it +// works this way. We're trying to satisfy a few different constraints: +// +// - background tasks can activate other background tasks +// - background tasks can use other subsystems in Nexus (like sagas) that +// themselves can activate background tasks +// - we should be able to tell at compile-time which code activates what specific +// background tasks +// - we should be able to tell at compile-time if code is attempting to activate +// a background task that doesn't exist (this would be a risk if tasks were +// identified by names) +// +// Our compile-time goals suggest an approach where tasks are identified either +// by global constants or by methods or fields in a struct, with one method or +// field for each task. With constants, it's still possible that one of these +// might not be wired up to anything. So we opt for fields in a struct, called +// `BackgroundTasks`. But that struct then needs to be available to anything +// that wants to activate background tasks -- including other background tasks. +// By allowing these Activators to be created before the tasks are created, we +// can create this struct and pass it to all the background tasks (and anybody +// else that wants to activate background tasks), even though the actual tasks +// aren't wired up yet. Then we can wire it up behind the scenes. If someone +// uses them ahead of time, they'll get the expected behavior: the task will be +// activated shortly. +// +// There remain several ways someone could get this wrong when adding or +// reworking background task: +// +// - Forgetting to put an activator for a background task into +// `BackgroundTasks`. If you do this, you likely won't have the argument you +// need for `Driver::register()`. +// - Forgetting to wire up an Activator by passing it to `Driver::register()`. 
+// We attempt to avoid this with an exhaustive match in +// `BackgroundTasksInitializer::start()`. If you forget to wire something up, +// rustc should report an unused variable. +// - Wiring the activator up to the wrong task (e.g., by copying and pasting a +// `Driver::register()` call and forgetting to update the activator argument). +// If this happens, it's likely that either one Activator gets used more than +// once (which is caught with a panic only at runtime, but during Nexus +// initialization, so it should definitely be caught in testing) or else some +// Activator is unused (see the previous bullet). +// +// It's not foolproof but hopefully these mechanisms will catch the easy +// mistakes. pub struct Activator { pub(super) notify: Arc, pub(super) wired_up: AtomicBool, } impl Activator { + /// Create an activator that is not yet wired up to any background task pub fn new() -> Activator { Activator { notify: Arc::new(Notify::new()), @@ -548,21 +668,17 @@ impl Activator { } } + /// Activate the background task that this Activator has been wired up to + /// + /// If this Activator has not yet been wired up with [`Driver::register()`], + /// then whenever it _is_ wired up, that task will be immediately activated. pub fn activate(&self) { self.notify.notify_one(); } } -impl BackgroundTasks { - /// Activate the specified background task - /// - /// If the task is currently running, it will be activated again when it - /// finishes. 
- pub fn activate(&self, task: &Activator) { - task.activate(); - } -} - +/// Starts the three DNS-propagation-related background tasks for either +/// internal or external DNS (depending on the arguments) #[allow(clippy::too_many_arguments)] fn init_dns( driver: &mut Driver, From 1664e0d698e73f98263c7448bcd88b677f13de89 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 26 Jun 2024 21:27:14 -0700 Subject: [PATCH 15/23] doc comment editing --- nexus/src/app/background/driver.rs | 5 ++- nexus/src/app/background/init.rs | 56 +++++++++++++++++------------- 2 files changed, 35 insertions(+), 26 deletions(-) diff --git a/nexus/src/app/background/driver.rs b/nexus/src/app/background/driver.rs index ddb75a30f4..b7d0537f68 100644 --- a/nexus/src/app/background/driver.rs +++ b/nexus/src/app/background/driver.rs @@ -2,7 +2,10 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Manages execution of background tasks +//! Generic system for managing execution of background tasks +//! +//! None of this file is specific to Nexus or any of the specific background +//! tasks in Nexus, although the design is pretty bespoke for what Nexus needs. use super::init::Activator; use super::BackgroundTask; diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 3bea72a103..0cc7c5323f 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -2,7 +2,34 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Specific background task initialization +//! Initialize Nexus background tasks +//! +//! This file contains entirely Nexus-specific initialization (as opposed to +//! driver.rs, which doesn't really know much about Nexus). +//! +//! The design here is oriented around being able to initialize background tasks +//! in two phases: +//! +//! 1. 
Phase 1 assembles a `BackgroundTasks` struct containing `Activator` +//! objects that will be used by any part of Nexus (including background +//! tasks) to activate any background task or read data provided by another +//! background task. This is the interface between this subsystem and the +//! rest of Nexus. At this point in startup, none of the background tasks +//! themselves have been started yet. +//! +//! 2. Phase 2 starts all of the individual background tasks and then wires up +//! the Activators created in phase 1. +//! +//! This allows us to break what would otherwise be a circular dependency during +//! initialization. Concretely: Nexus startup does phase 1, stores the +//! `BackgroundTasks` into the `Arc` to which all of Nexus has a +//! reference, and _then_ starts the background tasks. If we didn't break it up +//! like this, then we couldn't make the `Arc` available to background +//! tasks during _their_ initialization (because it couldn't be constructed +//! yet), which means background tasks could not activate other background +//! tasks. We'd also have trouble allowing background tasks to use other +//! subsystems in Nexus (e.g., sagas), especially if those subsystems wanted to +//! activate background tasks. use super::tasks::abandoned_vmm_reaper; use super::tasks::bfd; @@ -96,29 +123,8 @@ impl BackgroundTasks { /// Initializes the background task subsystem /// -/// The background task subsystem is initialized in two phases: -/// -/// 1. First, the caller invokes [`BackgroundTasksInitializer::new()`] and gets -/// back both a [`BackgroundTasksInitializer`] object and a -/// [`BackgroundTasks`] object. As the name suggests, the initializer is a -/// transient object that's just used to finish setting up the subsystem. -/// The `BackgroundTasks` object will provide a handle for the consumer to -/// activate background tasks and read data provided by those tasks after the -/// subsystem is up and running. -/// -/// 2. 
Some time later, the caller resumes initialization by invoking -/// [`BackgroundTasksInitializer::start()`] to actually start all the -/// specific background tasks that are part of Nexus. -/// -/// These two steps are separate so that the `BackgroundTasks` object can be -/// stored into Nexus and used by other subsystems, _including_ other background -/// tasks, to activate background tasks. In other words: if `BackgroundTasks` -/// was created _with_ all the individual tasks, then it wouldn't be possible to -/// _provide_ it to the tasks and allow them to activate each other. (It's less -/// obvious why, but it also wouldn't be possible to provide this to subsystems -/// like sagas that want to activate background tasks that also want to kick off -/// sagas.) -/// +/// See the module-level documentation for more on the two-phase initialization +/// of this subsystem. // See the definition of `Activator` for more design notes about this interface. pub struct BackgroundTasksInitializer { driver: Driver, @@ -134,7 +140,7 @@ impl BackgroundTasksInitializer { /// * a short-lived `BackgroundTasksInitializer` object, on which you can /// call `start()` to actually start the tasks /// * a long-lived `BackgroundTasks` object that you can use to activate any - /// of the tasks that will be started + /// of the tasks that will be started and read data that they provide pub fn new() -> (BackgroundTasksInitializer, BackgroundTasks) { let (external_endpoints_tx, external_endpoints_rx) = watch::channel(None); From aefe76536f7a1046f17d51f3e5a06ddcac119a9d Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 26 Jun 2024 21:49:25 -0700 Subject: [PATCH 16/23] move Activator; continue editing docs --- nexus/src/app/background/driver.rs | 38 ++++++++ nexus/src/app/background/init.rs | 143 +++++++++++------------------ 2 files changed, 94 insertions(+), 87 deletions(-) diff --git a/nexus/src/app/background/driver.rs b/nexus/src/app/background/driver.rs index b7d0537f68..e4628d5446 
100644 --- a/nexus/src/app/background/driver.rs +++ b/nexus/src/app/background/driver.rs @@ -209,6 +209,44 @@ impl Drop for Driver { } } +/// Activates a background task +/// +/// See [`nexus::app::background`] module-level documentation for more on what +/// that means. +/// +/// Activators are created with [`Activator::new()`] and then wired up to +/// specific background tasks using [`Driver::register()`]. If you call +/// `Activator::activate()` before the activator is wired up to a background +/// task, then once the Activator _is_ wired up to a task, that task will +/// immediately be activated. +/// +/// Activators are designed specifically so they can be created before the +/// corresponding task has been created and then wired up with just an +/// `&Activator` (not a `&mut Activator`). See the [`init`] module-level +/// documentation for more on why. +pub struct Activator { + pub(super) notify: Arc, + pub(super) wired_up: AtomicBool, +} + +impl Activator { + /// Create an activator that is not yet wired up to any background task + pub fn new() -> Activator { + Activator { + notify: Arc::new(Notify::new()), + wired_up: AtomicBool::new(false), + } + } + + /// Activate the background task that this Activator has been wired up to + /// + /// If this Activator has not yet been wired up with [`Driver::register()`], + /// then whenever it _is_ wired up, that task will be immediately activated. + pub fn activate(&self) { + self.notify.notify_one(); + } +} + /// Encapsulates state needed by the background tokio task to manage activation /// of the background task struct TaskExec { diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 0cc7c5323f..26630ba784 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -30,6 +30,62 @@ //! tasks. We'd also have trouble allowing background tasks to use other //! subsystems in Nexus (e.g., sagas), especially if those subsystems wanted to //! 
activate background tasks. +//! +//! Why do we do things this way? We're trying to satisfy a few different +//! goals: +//! +//! - Background tasks should be able to activate other background tasks. +//! - Background tasks should be able to use other subsystems in Nexus (like +//! sagas) that themselves can activate background tasks. +//! - It should be hard to mess any of this up when adding or removing +//! background tasks. This means: +//! - We should be able to tell at compile-time which code activates what +//! specific background tasks. +//! - We should be able to tell at compile-time if code is attempting to +//! activate a background task that doesn't exist. +//! - It should be hard to add an Activator for a background task that is +//! not wired up to that task or is wired up to a different task. +//! +//! Ultimately, tasks are activated via the `Driver` which keeps track of tasks +//! by name. So how can we have code paths in Nexus refer to tasks in a way +//! that satisfies these goals? A conventional approach would be to have +//! `Driver::register()` return a handle that could be used to activate the +//! task, but then we wouldn't have the handle available until the task was +//! running, which is too late -- see the note above about the circular +//! dependency during initialization. We could make the task identifiers global +//! constants, but this is easy to mess up: someone could remove the task +//! without removing its constant. Then code paths could appear to activate the +//! task but fail at _runtime_ (rather than compile-time) because the task +//! actually doesn't exist. +//! +//! Instead, we assemble the `BackgroundTasks` struct, whose fields correspond +//! to specific tasks. This makes it super explicit what code paths are using +//! which tasks. And since the Activators in the struct can be created before +//! the tasks are created, we can create this whole struct and pass it to all +//! 
the background tasks (and anybody else that wants to activate background +//! tasks), even though the actual tasks aren't wired up yet. Then we can wire +//! it up behind the scenes. If someone uses the activators ahead of time, +//! they'll get the expected behavior: the task will be activated shortly. +//! +//! There remain several ways someone could get this wrong when adding or +//! reworking background tasks: +//! +//! - Forgetting to put an Activator for a background task into +//! `BackgroundTasks`. If you make this mistake, you won't get far because +//! you won't have the argument you need for `Driver::register()`. +//! - Forgetting to wire up an Activator by passing it to `Driver::register()`. +//! We attempt to avoid this with an exhaustive match inside +//! `BackgroundTasksInitializer::start()`. If you forget to wire something +//! up, rustc should report an unused variable. +//! - Wiring the Activator up to the wrong task (e.g., by copying and pasting a +//! `Driver::register()` call and forgetting to update the activator +//! argument). If this happens, it's likely that either one Activator gets +//! used more than once (which is caught with a panic only at runtime, but +//! it _is_ during Nexus initialization, so it should definitely be caught in +//! testing) or else some Activator is unused (see the previous bullet). +//! +//! It's not foolproof but hopefully these mechanisms will catch the easy +//! mistakes. use super::tasks::abandoned_vmm_reaper; use super::tasks::bfd; @@ -596,93 +652,6 @@ impl BackgroundTasksInitializer { } } -/// Activates a background task -/// -/// See [`nexus::app::background`] module-level documentation for more on what -/// that means. -/// -/// Activators are created with [`Activator::new()`] and then wired up to -/// specific background tasks using [`Driver::register()`]. 
If you call -/// `Activator::activate()` before the activator is wired up to a background -/// task, then once the activator _is_ wired up to a task, that task will -/// immediately be activated. -/// -/// Activators are designed specifically so they can be created before the -/// corresponding task has been created and then wired up with just an -/// `&Activator` (not a `&mut Activator`). That's so that we can construct -/// `BackgroundTasks`, a static list of activators for all background tasks in -/// Nexus, before we've actually set up all the tasks. This in turn breaks what -/// would otherwise be a circular initialization, if background tasks ever need -/// to activate other background tasks or if background tasks need to use other -/// subsystems (like sagas) that themselves want to activate background tasks. -// More straightforward approaches are available (e.g., register a task, get -// back a handle and use that to activate it) so it's worth explaining why it -// works this way. We're trying to satisfy a few different constraints: -// -// - background tasks can activate other background tasks -// - background tasks can use other subsystems in Nexus (like sagas) that -// themselves can activate background tasks -// - we should be able to tell at compile-time which code activates what specific -// background tasks -// - we should be able to tell at compile-time if code is attempting to activate -// a background task that doesn't exist (this would be a risk if tasks were -// identified by names) -// -// Our compile-time goals suggest an approach where tasks are identified either -// by global constants or by methods or fields in a struct, with one method or -// field for each task. With constants, it's still possible that one of these -// might not be wired up to anything. So we opt for fields in a struct, called -// `BackgroundTasks`. 
But that struct then needs to be available to anything -// that wants to activate background tasks -- including other background tasks. -// By allowing these Activators to be created before the tasks are created, we -// can create this struct and pass it to all the background tasks (and anybody -// else that wants to activate background tasks), even though the actual tasks -// aren't wired up yet. Then we can wire it up behind the scenes. If someone -// uses them ahead of time, they'll get the expected behavior: the task will be -// activated shortly. -// -// There remain several ways someone could get this wrong when adding or -// reworking background task: -// -// - Forgetting to put an activator for a background task into -// `BackgroundTasks`. If you do this, you likely won't have the argument you -// need for `Driver::register()`. -// - Forgetting to wire up an Activator by passing it to `Driver::register()`. -// We attempt to avoid this with an exhaustive match in -// `BackgroundTasksInitializer::start()`. If you forget to wire something up, -// rustc should report an unused variable. -// - Wiring the activator up to the wrong task (e.g., by copying and pasting a -// `Driver::register()` call and forgetting to update the activator argument). -// If this happens, it's likely that either one Activator gets used more than -// once (which is caught with a panic only at runtime, but during Nexus -// initialization, so it should definitely be caught in testing) or else some -// Activator is unused (see the previous bullet). -// -// It's not foolproof but hopefully these mechanisms will catch the easy -// mistakes. 
-pub struct Activator { - pub(super) notify: Arc, - pub(super) wired_up: AtomicBool, -} - -impl Activator { - /// Create an activator that is not yet wired up to any background task - pub fn new() -> Activator { - Activator { - notify: Arc::new(Notify::new()), - wired_up: AtomicBool::new(false), - } - } - - /// Activate the background task that this Activator has been wired up to - /// - /// If this Activator has not yet been wired up with [`Driver::register()`], - /// then whenever it _is_ wired up, that task will be immediately activated. - pub fn activate(&self) { - self.notify.notify_one(); - } -} - /// Starts the three DNS-propagation-related background tasks for either /// internal or external DNS (depending on the arguments) #[allow(clippy::too_many_arguments)] From f991eec27022a278fed63961df5425c1e39347b7 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 26 Jun 2024 22:06:05 -0700 Subject: [PATCH 17/23] fixups --- nexus/src/app/background/driver.rs | 8 ++++---- nexus/src/app/background/init.rs | 3 +-- nexus/src/app/background/mod.rs | 1 + 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/nexus/src/app/background/driver.rs b/nexus/src/app/background/driver.rs index e4628d5446..2b35c023c7 100644 --- a/nexus/src/app/background/driver.rs +++ b/nexus/src/app/background/driver.rs @@ -7,7 +7,6 @@ //! None of this file is specific to Nexus or any of the specific background //! tasks in Nexus, although the design is pretty bespoke for what Nexus needs. 
-use super::init::Activator; use super::BackgroundTask; use super::TaskName; use assert_matches::assert_matches; @@ -24,6 +23,7 @@ use nexus_types::internal_api::views::LastResult; use nexus_types::internal_api::views::LastResultCompleted; use nexus_types::internal_api::views::TaskStatus; use std::collections::BTreeMap; +use std::sync::atomic::AtomicBool; use std::sync::atomic::Ordering; use std::sync::Arc; use std::time::Duration; @@ -123,8 +123,8 @@ impl Driver { ) { panic!( "attempted to wire up the same background task handle \ - twice (previous \"wired_up\" = {}): currently attempting \ - to wire it up to task {:?}", + twice (previous \"wired_up\" = {}): currently attempting \ + to wire it up to task {:?}", previous, name ); } @@ -387,7 +387,7 @@ impl GenericWatcher for watch::Receiver { mod test { use super::BackgroundTask; use super::Driver; - use crate::app::background::init::Activator; + use crate::app::background::Activator; use crate::app::sagas::SagaRequest; use assert_matches::assert_matches; use chrono::Utc; diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 26630ba784..ea9242d404 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -109,6 +109,7 @@ use super::tasks::sync_service_zone_nat::ServiceZoneNatTracker; use super::tasks::sync_switch_configuration::SwitchPortSettingsManager; use super::tasks::v2p_mappings::V2PManager; use super::tasks::vpc_routes; +use super::Activator; use super::Driver; use crate::app::oximeter::PRODUCER_LEASE_DURATION; use crate::app::sagas::SagaRequest; @@ -119,11 +120,9 @@ use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use oximeter::types::ProducerRegistry; use std::collections::BTreeMap; -use std::sync::atomic::AtomicBool; use std::sync::Arc; use tokio::sync::mpsc::Sender; use tokio::sync::watch; -use tokio::sync::Notify; use uuid::Uuid; /// Interface for activating various background tasks and read data that they diff 
--git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs index 64bbf1563c..1bd7a323c3 100644 --- a/nexus/src/app/background/mod.rs +++ b/nexus/src/app/background/mod.rs @@ -133,6 +133,7 @@ mod init; mod status; mod tasks; +pub use driver::Activator; pub use driver::Driver; pub use init::BackgroundTasks; pub use init::BackgroundTasksInitializer; From 92ee2cd332c0d16e1c66e2287beace43b07c739d Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 26 Jun 2024 22:10:04 -0700 Subject: [PATCH 18/23] doc fixup --- nexus/src/app/background/driver.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/src/app/background/driver.rs b/nexus/src/app/background/driver.rs index 2b35c023c7..a766469b94 100644 --- a/nexus/src/app/background/driver.rs +++ b/nexus/src/app/background/driver.rs @@ -211,7 +211,7 @@ impl Drop for Driver { /// Activates a background task /// -/// See [`nexus::app::background`] module-level documentation for more on what +/// See [`crate::app::background`] module-level documentation for more on what /// that means. /// /// Activators are created with [`Activator::new()`] and then wired up to @@ -222,7 +222,7 @@ impl Drop for Driver { /// /// Activators are designed specifically so they can be created before the /// corresponding task has been created and then wired up with just an -/// `&Activator` (not a `&mut Activator`). See the [`init`] module-level +/// `&Activator` (not a `&mut Activator`). See the [`super::init`] module-level /// documentation for more on why. 
pub struct Activator { pub(super) notify: Arc, From 20f7c41bcc3514da419a392454415ddcd87671f9 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Thu, 27 Jun 2024 09:52:09 -0700 Subject: [PATCH 19/23] exercise Activator in tests --- nexus/src/app/background/driver.rs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/nexus/src/app/background/driver.rs b/nexus/src/app/background/driver.rs index a766469b94..f9149d0e37 100644 --- a/nexus/src/app/background/driver.rs +++ b/nexus/src/app/background/driver.rs @@ -89,7 +89,6 @@ impl Driver { /// `activator` is an [`Activator`] that has not previously been used in a /// call to this function. It will be wired up so that using it will /// activate this newly-registered background task. - // XXX-dap TODO-coverage activator #[allow(clippy::too_many_arguments)] pub fn register( &mut self, @@ -99,9 +98,6 @@ impl Driver { imp: Box, opctx: OpContext, watchers: Vec>, - // XXX-dap can we do something about TaskName? It's only used by the - // tests, too. It's nice to have it for sure but is it really - // necessary? activator: &Activator, ) -> TaskName { // Activation of the background task happens in a separate tokio task. @@ -586,6 +582,17 @@ mod test { let status = driver.task_status(&h3); let last = status.last.unwrap_completion(); assert_eq!(last.iteration, 4); + + // Explicitly activate just "t2", this time using its Activator. 
+ act2.activate(); + wait_until_count(rx2.clone(), 3).await; + assert_eq!(*rx3.borrow(), 4); + let status = driver.task_status(&h2); + let last = status.last.unwrap_completion(); + assert_eq!(last.iteration, 3); + let status = driver.task_status(&h3); + let last = status.last.unwrap_completion(); + assert_eq!(last.iteration, 4); } /// Simple background task that moves in lockstep with a consumer, allowing From 088335897ba533fc6d563d229efe5d2a5e518d3a Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Thu, 27 Jun 2024 10:05:57 -0700 Subject: [PATCH 20/23] that should be a 503 --- nexus/src/app/background/status.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/background/status.rs b/nexus/src/app/background/status.rs index 9cf9f564c9..fae66942f4 100644 --- a/nexus/src/app/background/status.rs +++ b/nexus/src/app/background/status.rs @@ -92,7 +92,7 @@ impl Nexus { fn driver(&self) -> Result<&Driver, Error> { self.background_tasks_driver.get().ok_or_else(|| { - Error::internal_error("background tasks not yet initialized") + Error::unavail("background tasks not yet initialized") }) } } From 249ee59d0a677c3a88223c30766ca62f1b6632be Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Thu, 27 Jun 2024 10:08:36 -0700 Subject: [PATCH 21/23] fix omdb test output --- dev-tools/omdb/tests/successes.out | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 032a574c8e..47d091443d 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -403,28 +403,28 @@ task: "dns_propagation_external" task: "nat_v4_garbage_collector" configured period: every 30s currently executing: no - last completed activation: , triggered by an explicit signal + last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms last completion reported error: failed to resolve addresses 
for Dendrite services: no record found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } task: "blueprint_loader" configured period: every 1m 40s currently executing: no - last completed activation: , triggered by an explicit signal + last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms last completion reported error: failed to read target blueprint: Internal Error: no target blueprint set task: "blueprint_executor" configured period: every 10m currently executing: no - last completed activation: , triggered by an explicit signal + last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms last completion reported error: no blueprint task: "abandoned_vmm_reaper" configured period: every 1m currently executing: no - last completed activation: , triggered by an explicit signal + last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms total abandoned VMMs found: 0 VMM records deleted: 0 @@ -434,14 +434,14 @@ task: "abandoned_vmm_reaper" task: "bfd_manager" configured period: every 30s currently executing: no - last completed activation: , triggered by an explicit signal + last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms last completion reported error: failed to resolve addresses for Dendrite services: no record found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } task: "crdb_node_id_collector" configured period: every 10m currently executing: no - last completed activation: , triggered by an explicit signal + last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms last completion reported error: no blueprint @@ -465,7 +465,7 @@ task: "external_endpoints" task: "instance_watcher" configured period: every 30s currently executing: no 
- last completed activation: , triggered by an explicit signal + last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms total instances checked: 0 checks completed: 0 @@ -486,14 +486,14 @@ task: "inventory_collection" task: "metrics_producer_gc" configured period: every 1m currently executing: no - last completed activation: , triggered by an explicit signal + last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms warning: unknown background task: "metrics_producer_gc" (don't know how to interpret details: Object {"expiration": String(""), "pruned": Array []}) task: "phantom_disks" configured period: every 30s currently executing: no - last completed activation: , triggered by an explicit signal + last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms number of phantom disks deleted: 0 number of phantom disk delete errors: 0 @@ -508,7 +508,7 @@ task: "physical_disk_adoption" task: "region_replacement" configured period: every 30s currently executing: no - last completed activation: , triggered by an explicit signal + last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms number of region replacements started ok: 0 number of region replacement start errors: 0 @@ -516,7 +516,7 @@ task: "region_replacement" task: "region_replacement_driver" configured period: every 30s currently executing: no - last completed activation: , triggered by an explicit signal + last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms number of region replacement drive sagas started ok: 0 number of region replacement finish sagas started ok: 0 @@ -525,34 +525,34 @@ task: "region_replacement_driver" task: "service_firewall_rule_propagation" configured period: every 5m currently executing: no - last completed activation: , triggered by an explicit signal + last completed 
activation: , triggered by a periodic timer firing started at (s ago) and ran for ms task: "service_zone_nat_tracker" configured period: every 30s currently executing: no - last completed activation: , triggered by an explicit signal + last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms last completion reported error: inventory collection is None task: "switch_port_config_manager" configured period: every 30s currently executing: no - last completed activation: , triggered by an explicit signal + last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms warning: unknown background task: "switch_port_config_manager" (don't know how to interpret details: Object {}) task: "v2p_manager" configured period: every 30s currently executing: no - last completed activation: , triggered by an explicit signal + last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms warning: unknown background task: "v2p_manager" (don't know how to interpret details: Object {}) task: "vpc_route_manager" configured period: every 30s currently executing: no - last completed activation: , triggered by an explicit signal + last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms warning: unknown background task: "vpc_route_manager" (don't know how to interpret details: Object {}) From 1dcf10f620ae3eb536a044ca0044160cef76687d Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 28 Jun 2024 13:42:37 -0700 Subject: [PATCH 22/23] review feedback --- nexus/src/app/background/driver.rs | 13 ++++++++++--- nexus/src/app/background/init.rs | 26 +++++++++++++------------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/nexus/src/app/background/driver.rs b/nexus/src/app/background/driver.rs index f9149d0e37..ad547a443c 100644 --- a/nexus/src/app/background/driver.rs +++ b/nexus/src/app/background/driver.rs @@ -88,7 
+88,15 @@ impl Driver { /// /// `activator` is an [`Activator`] that has not previously been used in a /// call to this function. It will be wired up so that using it will - /// activate this newly-registered background task. + /// activate this newly-registered background task. This is an argument + /// rather than a returned value because it's useful for consumers to create + /// these ahead of time, before tasks have been set up. See [`super::init`] + /// module-level docs for more on this. + /// + /// # Panics + /// + /// This function panics if the `name` or `activator` has previously been + /// passed to a call to this function. #[allow(clippy::too_many_arguments)] pub fn register( &mut self, @@ -109,8 +117,7 @@ impl Driver { // We'll use a `Notify` to wake up that tokio task when an activation is // requested. The caller provides their own Activator, which just - // provides a specific Notify for us to use here. This allows them to - // have set up the activator before the task is actually created. + // provides a specific Notify for us to use here. if let Err(previous) = activator.wired_up.compare_exchange( false, true, diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index ea9242d404..fd37de344d 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -18,7 +18,7 @@ //! themselves have been started yet. //! //! 2. Phase 2 starts all of the individual background tasks and then wires up -//! the Activators created in phase 1. +//! the `Activator`s created in phase 1. //! //! This allows us to break what would otherwise be a circular dependency during //! initialization. Concretely: Nexus startup does phase 1, stores the @@ -43,7 +43,7 @@ //! specific background tasks. //! - We should be able to tell at compile-time if code is attempting to //! activate a background task that doesn't exist. -//! - It should be hard to add an Activator for a background task that is +//! 
- It should be hard to add an `Activator` for a background task that is //! not wired up to that task or is wired up to a different task. //! //! Ultimately, tasks are activated via the `Driver` which keeps track of tasks @@ -60,7 +60,7 @@ //! //! Instead, we assemble the `BackgroundTasks` struct, whose fields correspond //! to specific tasks. This makes it super explicit what code paths are using -//! which tasks. And since the Activators in the struct can be created before +//! which tasks. And since the `Activator`s in the struct can be created before //! the tasks are created, we can create this whole struct and pass it to all //! the background tasks (and anybody else that wants to activate background //! tasks), even though the actual tasks aren't wired up yet. Then we can wire @@ -70,19 +70,19 @@ //! There remain several ways someone could get this wrong when adding or //! reworking background tasks: //! -//! - Forgetting to put an Activator for a background task into +//! - Forgetting to put an `Activator` for a background task into //! `BackgroundTasks`. If you make this mistake, you won't get far because //! you won't have the argument you need for `Driver::register()`. -//! - Forgetting to wire up an Activator by passing it to `Driver::register()`. -//! We attempt to avoid this with an exhaustive match inside -//! `BackgroundTasksInitializer::start()`. If you forget to wire something -//! up, rustc should report an unused variable. -//! - Wiring the Activator up to the wrong task (e.g., by copying and pasting a -//! `Driver::register()` call and forgetting to update the activator -//! argument). If this happens, it's likely that either one Activator gets +//! - Forgetting to wire up an `Activator` by passing it to +//! `Driver::register()`. We attempt to avoid this with an exhaustive match +//! inside `BackgroundTasksInitializer::start()`. If you forget to wire +//! something up, rustc should report an unused variable. +//! 
- Wiring the `Activator` up to the wrong task (e.g., by copying and pasting +//! a `Driver::register()` call and forgetting to update the activator +//! argument). If this happens, it's likely that either one `Activator` gets //! used more than once (which is caught with a panic only at runtime, but //! it _is_ during Nexus initialization, so it should definitely be caught in -//! testing) or else some Activator is unused (see the previous bullet). +//! testing) or else some `Activator` is unused (see the previous bullet). //! //! It's not foolproof but hopefully these mechanisms will catch the easy //! mistakes. @@ -295,7 +295,7 @@ impl BackgroundTasksInitializer { // The following fields can be safely ignored here because they're // already wired up as needed. - external_endpoints: _external_endpoints, + external_endpoints: _, // Do NOT add a `..` catch-all here! See above. } = &background_tasks; From 514e2b21ef1ff7cbe210a77ef5b4545debf56b8a Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 28 Jun 2024 17:06:24 -0700 Subject: [PATCH 23/23] start background tasks even if populate fails (see 5980) --- nexus/src/app/mod.rs | 61 ++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 47f51d706d..58b405878e 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -485,11 +485,11 @@ impl Nexus { *nexus.recovery_task.lock().unwrap() = Some(recovery_task); - // Kick all background tasks once the populate step finishes. Among - // other things, the populate step installs role assignments for - // internal identities that are used by the background tasks. If we - // don't do this here, those tasks might fail spuriously on startup and - // not be retried for a while. + // Wait to start background tasks until after the populate step + // finishes. 
Among other things, the populate step installs role + // assignments for internal identities that are used by the background + // tasks. If we don't do this here, those tasks would fail spuriously + // on startup and not be retried for a while. let task_nexus = nexus.clone(); let task_log = nexus.log.clone(); let task_registry = producer_registry.clone(); @@ -497,36 +497,35 @@ impl Nexus { tokio::spawn(async move { match task_nexus.wait_for_populate().await { Ok(_) => { - info!( - task_log, - "populate complete; activating background tasks" - ); - - let driver = background_tasks_initializer.start( - &task_nexus.background_tasks, - background_ctx, - db_datastore, - task_config.pkg.background_tasks, - rack_id, - task_config.deployment.id, - resolver, - saga_request, - v2p_watcher_channel.clone(), - task_registry, - ); - - if let Err(_) = - task_nexus.background_tasks_driver.set(driver) - { - panic!( - "concurrent initialization of \ - background_tasks_driver?!" - ) - } + info!(task_log, "populate complete"); } Err(_) => { error!(task_log, "populate failed"); } + }; + + // That said, even if the populate step fails, we may as well try to + // start the background tasks so that whatever can work will work. + info!(task_log, "activating background tasks"); + + let driver = background_tasks_initializer.start( + &task_nexus.background_tasks, + background_ctx, + db_datastore, + task_config.pkg.background_tasks, + rack_id, + task_config.deployment.id, + resolver, + saga_request, + v2p_watcher_channel.clone(), + task_registry, + ); + + if let Err(_) = task_nexus.background_tasks_driver.set(driver) { + panic!( + "concurrent initialization of \ + background_tasks_driver?!" + ) } });