From 9eba46df9980aa76a5043cf0300feddc6e79202c Mon Sep 17 00:00:00 2001 From: Rain Date: Tue, 9 Jan 2024 18:27:32 -0800 Subject: [PATCH] [nexus-test-utils] set 60s timeouts for each init step (#4789) In #4779 we're tracking what appears to be a ClickHouse initialization failure during Nexus startup. Set a timeout of 60s for each step in the initialization process. These steps should usually not take more than 5 seconds each, so 60s is a really comfortable buffer. --- Cargo.lock | 2 + common/src/api/external/mod.rs | 2 +- nexus/test-interface/src/lib.rs | 4 +- nexus/test-utils/Cargo.toml | 2 + nexus/test-utils/src/lib.rs | 160 +++++++++++++++++++++++++++----- 5 files changed, 142 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3db966876d..2974dfe98e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4265,6 +4265,7 @@ dependencies = [ "dns-server", "dns-service-client", "dropshot", + "futures", "gateway-messages", "gateway-test-utils", "headers", @@ -4286,6 +4287,7 @@ dependencies = [ "serde_json", "serde_urlencoded", "slog", + "tokio", "trust-dns-resolver", "uuid", ] diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index 312d400d2f..899f15a04b 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -1919,7 +1919,7 @@ impl MacAddr { /// Iterate the MAC addresses in the system address range /// (used as an allocator in contexts where collisions are not expected and /// determinism is useful, like in the test suite) - pub fn iter_system() -> impl Iterator { + pub fn iter_system() -> impl Iterator + Send { ((Self::MAX_SYSTEM_RESV + 1)..=Self::MAX_SYSTEM_ADDR) .map(Self::from_i64) } diff --git a/nexus/test-interface/src/lib.rs b/nexus/test-interface/src/lib.rs index 2456f27684..23326a5ecb 100644 --- a/nexus/test-interface/src/lib.rs +++ b/nexus/test-interface/src/lib.rs @@ -38,8 +38,8 @@ use std::net::{SocketAddr, SocketAddrV6}; use uuid::Uuid; #[async_trait] -pub trait NexusServer { - type InternalServer; +pub trait NexusServer: Send + Sync + 'static { + type InternalServer: Send + Sync + 'static; async fn start_internal( config: &Config, diff --git a/nexus/test-utils/Cargo.toml b/nexus/test-utils/Cargo.toml index 024cba958b..4a7924770e 100644 --- a/nexus/test-utils/Cargo.toml +++ b/nexus/test-utils/Cargo.toml @@ -14,6 +14,7 @@ crucible-agent-client.workspace = true dns-server.workspace = true dns-service-client.workspace = true dropshot.workspace = true +futures.workspace = true gateway-messages.workspace = true gateway-test-utils.workspace = true headers.workspace = true @@ -34,6 +35,7 @@ serde.workspace = true serde_json.workspace = true serde_urlencoded.workspace = true slog.workspace = true +tokio.workspace = true trust-dns-resolver.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index d2ac0405fc..c6dc9fefe9 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -14,6 +14,8 @@ use dropshot::ConfigDropshot; use dropshot::ConfigLogging; use dropshot::ConfigLoggingLevel; use dropshot::HandlerTaskMode; +use futures::future::BoxFuture; +use futures::FutureExt; use gateway_test_utils::setup::GatewayTestContext; use nexus_test_interface::NexusServer; use nexus_types::external_api::params::UserId; @@ -39,7 +41,7 @@ use omicron_test_utils::dev; use oximeter_collector::Oximeter; use oximeter_producer::LogConfig; use oximeter_producer::Server as ProducerServer; -use slog::{debug, o, Logger}; +use slog::{debug, error, o, Logger}; use std::collections::HashMap; use std::fmt::Debug; use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6}; @@ -158,7 +160,7 @@ struct RackInitRequestBuilder { services: Vec, datasets: Vec, internal_dns_config: internal_dns::DnsConfigBuilder, - mac_addrs: Box>, + mac_addrs: Box + Send>, } impl RackInitRequestBuilder { @@ -254,11 +256,18 @@ pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> { pub external_dns_zone_name: Option, pub external_dns: Option, pub internal_dns: Option, + dns_config: Option, pub silo_name: Option, pub user_name: Option, } +type StepInitFn<'a, N> = Box< + dyn for<'b> FnOnce( + &'b mut ControlPlaneTestContextBuilder<'a, N>, + ) -> BoxFuture<'b, ()>, +>; + impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { pub fn new( test_name: &'a str, @@ -290,11 +299,37 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { external_dns_zone_name: None, external_dns: None, internal_dns: None, + dns_config: None, silo_name: None, user_name: None, } } + pub async fn init_with_steps( + &mut self, + steps: Vec<(&str, StepInitFn<'a, N>)>, + timeout: Duration, + ) { + let log = self.logctx.log.new(o!("component" => "init_with_steps")); + for (step_name, step) in steps { + debug!(log, "Running step {step_name}"); + let step_fut = step(self); + match tokio::time::timeout(timeout, step_fut).await { + Ok(()) => {} + Err(_) => { + error!( + log, + "Timed out after {timeout:?} \ + while running step {step_name}, failing test" + ); + panic!( + "Timed out after {timeout:?} while running step {step_name}", + ); + } + } + } + } + pub async fn start_crdb(&mut self, populate: bool) { let populate = if populate { PopulateCrdb::FromEnvironmentSeed @@ -581,7 +616,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { self.nexus_internal_addr = Some(nexus_internal_addr); } - pub async fn populate_internal_dns(&mut self) -> DnsConfigParams { + pub async fn populate_internal_dns(&mut self) { let log = &self.logctx.log; debug!(log, "Populating Internal DNS"); @@ -604,18 +639,21 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { dns_config_client.dns_config_put(&dns_config).await.expect( "Failed to send initial DNS records to internal DNS server", ); - dns_config + self.dns_config = Some(dns_config); } // Perform RSS handoff pub async fn start_nexus_external( &mut self, - dns_config: DnsConfigParams, tls_certificates: Vec, ) { let log = &self.logctx.log; debug!(log, "Starting Nexus (external API)"); + let dns_config = self.dns_config.clone().expect( + "populate_internal_dns must be called before start_nexus_external", + ); + // Create a recovery silo let external_dns_zone_name = internal_dns::names::DNS_ZONE_EXTERNAL_TESTING.to_string(); @@ -956,30 +994,102 @@ async fn setup_with_config_impl( sim_mode: sim::SimMode, initial_cert: Option, ) -> ControlPlaneTestContext { - builder.start_crdb_impl(populate).await; - builder.start_clickhouse().await; - builder.start_gateway().await; - builder.start_dendrite(SwitchLocation::Switch0).await; - builder.start_dendrite(SwitchLocation::Switch1).await; - builder.start_mgd(SwitchLocation::Switch0).await; - builder.start_mgd(SwitchLocation::Switch1).await; - builder.start_internal_dns().await; - builder.start_external_dns().await; - builder.start_nexus_internal().await; - builder.start_sled(sim_mode).await; - builder.start_crucible_pantry().await; - builder.scrimlet_dns_setup().await; - - // Give Nexus necessary information to find the Crucible Pantry - let dns_config = builder.populate_internal_dns().await; + const STEP_TIMEOUT: Duration = Duration::from_secs(60); builder - .start_nexus_external(dns_config, initial_cert.into_iter().collect()) + .init_with_steps( + vec![ + ( + "start_crdb", + Box::new(|builder| { + builder.start_crdb_impl(populate).boxed() + }), + ), + ( + "start_clickhouse", + Box::new(|builder| builder.start_clickhouse().boxed()), + ), + ( + "start_gateway", + Box::new(|builder| builder.start_gateway().boxed()), + ), + ( + "start_dendrite_switch0", + Box::new(|builder| { + builder.start_dendrite(SwitchLocation::Switch0).boxed() + }), + ), + ( + "start_dendrite_switch1", + Box::new(|builder| { + builder.start_dendrite(SwitchLocation::Switch1).boxed() + }), + ), + ( + "start_mgd_switch0", + Box::new(|builder| { + builder.start_mgd(SwitchLocation::Switch0).boxed() + }), + ), + ( + "start_mgd_switch1", + Box::new(|builder| { + builder.start_mgd(SwitchLocation::Switch1).boxed() + }), + ), + ( + "start_internal_dns", + Box::new(|builder| builder.start_internal_dns().boxed()), + ), + ( + "start_external_dns", + Box::new(|builder| builder.start_external_dns().boxed()), + ), + ( + "start_nexus_internal", + Box::new(|builder| builder.start_nexus_internal().boxed()), + ), + ( + "start_sled", + Box::new(move |builder| { + builder.start_sled(sim_mode).boxed() + }), + ), + ( + "start_crucible_pantry", + Box::new(|builder| builder.start_crucible_pantry().boxed()), + ), + ( + "scrimlet_dns_setup", + Box::new(|builder| builder.scrimlet_dns_setup().boxed()), + ), + ( + "populate_internal_dns", + Box::new(|builder| builder.populate_internal_dns().boxed()), + ), + ( + "start_nexus_external", + Box::new(|builder| { + builder + .start_nexus_external( + initial_cert.into_iter().collect(), + ) + .boxed() + }), + ), + ( + "start_oximeter", + Box::new(|builder| builder.start_oximeter().boxed()), + ), + ( + "start_producer_server", + Box::new(|builder| builder.start_producer_server().boxed()), + ), + ], + STEP_TIMEOUT, + ) .await; - builder.start_oximeter().await; - builder.start_producer_server().await; - builder.build() }