Skip to content

Commit

Permalink
[nexus-test-utils] set 60s timeouts for each init step (#4789)
Browse files Browse the repository at this point in the history
In #4779 we're tracking what appears to be a ClickHouse initialization
failure during Nexus startup. Set a timeout of 60s for each step in the
initialization process.

These steps should usually not take more than 5 seconds each, so 60s is
a really comfortable buffer.
  • Loading branch information
sunshowers authored Jan 10, 2024
1 parent 45b6651 commit 9eba46d
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 28 deletions.
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion common/src/api/external/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1919,7 +1919,7 @@ impl MacAddr {
/// Iterate the MAC addresses in the system address range
/// (used as an allocator in contexts where collisions are not expected and
/// determinism is useful, like in the test suite)
pub fn iter_system() -> impl Iterator<Item = MacAddr> {
pub fn iter_system() -> impl Iterator<Item = MacAddr> + Send {
((Self::MAX_SYSTEM_RESV + 1)..=Self::MAX_SYSTEM_ADDR)
.map(Self::from_i64)
}
Expand Down
4 changes: 2 additions & 2 deletions nexus/test-interface/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ use std::net::{SocketAddr, SocketAddrV6};
use uuid::Uuid;

#[async_trait]
pub trait NexusServer {
type InternalServer;
pub trait NexusServer: Send + Sync + 'static {
type InternalServer: Send + Sync + 'static;

async fn start_internal(
config: &Config,
Expand Down
2 changes: 2 additions & 0 deletions nexus/test-utils/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ crucible-agent-client.workspace = true
dns-server.workspace = true
dns-service-client.workspace = true
dropshot.workspace = true
futures.workspace = true
gateway-messages.workspace = true
gateway-test-utils.workspace = true
headers.workspace = true
Expand All @@ -34,6 +35,7 @@ serde.workspace = true
serde_json.workspace = true
serde_urlencoded.workspace = true
slog.workspace = true
tokio.workspace = true
trust-dns-resolver.workspace = true
uuid.workspace = true
omicron-workspace-hack.workspace = true
Expand Down
160 changes: 135 additions & 25 deletions nexus/test-utils/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ use dropshot::ConfigDropshot;
use dropshot::ConfigLogging;
use dropshot::ConfigLoggingLevel;
use dropshot::HandlerTaskMode;
use futures::future::BoxFuture;
use futures::FutureExt;
use gateway_test_utils::setup::GatewayTestContext;
use nexus_test_interface::NexusServer;
use nexus_types::external_api::params::UserId;
Expand All @@ -39,7 +41,7 @@ use omicron_test_utils::dev;
use oximeter_collector::Oximeter;
use oximeter_producer::LogConfig;
use oximeter_producer::Server as ProducerServer;
use slog::{debug, o, Logger};
use slog::{debug, error, o, Logger};
use std::collections::HashMap;
use std::fmt::Debug;
use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6};
Expand Down Expand Up @@ -158,7 +160,7 @@ struct RackInitRequestBuilder {
services: Vec<nexus_types::internal_api::params::ServicePutRequest>,
datasets: Vec<nexus_types::internal_api::params::DatasetCreateRequest>,
internal_dns_config: internal_dns::DnsConfigBuilder,
mac_addrs: Box<dyn Iterator<Item = MacAddr>>,
mac_addrs: Box<dyn Iterator<Item = MacAddr> + Send>,
}

impl RackInitRequestBuilder {
Expand Down Expand Up @@ -254,11 +256,18 @@ pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> {
pub external_dns_zone_name: Option<String>,
pub external_dns: Option<dns_server::TransientServer>,
pub internal_dns: Option<dns_server::TransientServer>,
dns_config: Option<DnsConfigParams>,

pub silo_name: Option<Name>,
pub user_name: Option<UserId>,
}

type StepInitFn<'a, N> = Box<
dyn for<'b> FnOnce(
&'b mut ControlPlaneTestContextBuilder<'a, N>,
) -> BoxFuture<'b, ()>,
>;

impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> {
pub fn new(
test_name: &'a str,
Expand Down Expand Up @@ -290,11 +299,37 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> {
external_dns_zone_name: None,
external_dns: None,
internal_dns: None,
dns_config: None,
silo_name: None,
user_name: None,
}
}

pub async fn init_with_steps(
&mut self,
steps: Vec<(&str, StepInitFn<'a, N>)>,
timeout: Duration,
) {
let log = self.logctx.log.new(o!("component" => "init_with_steps"));
for (step_name, step) in steps {
debug!(log, "Running step {step_name}");
let step_fut = step(self);
match tokio::time::timeout(timeout, step_fut).await {
Ok(()) => {}
Err(_) => {
error!(
log,
"Timed out after {timeout:?} \
while running step {step_name}, failing test"
);
panic!(
"Timed out after {timeout:?} while running step {step_name}",
);
}
}
}
}

pub async fn start_crdb(&mut self, populate: bool) {
let populate = if populate {
PopulateCrdb::FromEnvironmentSeed
Expand Down Expand Up @@ -581,7 +616,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> {
self.nexus_internal_addr = Some(nexus_internal_addr);
}

pub async fn populate_internal_dns(&mut self) -> DnsConfigParams {
pub async fn populate_internal_dns(&mut self) {
let log = &self.logctx.log;
debug!(log, "Populating Internal DNS");

Expand All @@ -604,18 +639,21 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> {
dns_config_client.dns_config_put(&dns_config).await.expect(
"Failed to send initial DNS records to internal DNS server",
);
dns_config
self.dns_config = Some(dns_config);
}

// Perform RSS handoff
pub async fn start_nexus_external(
&mut self,
dns_config: DnsConfigParams,
tls_certificates: Vec<Certificate>,
) {
let log = &self.logctx.log;
debug!(log, "Starting Nexus (external API)");

let dns_config = self.dns_config.clone().expect(
"populate_internal_dns must be called before start_nexus_external",
);

// Create a recovery silo
let external_dns_zone_name =
internal_dns::names::DNS_ZONE_EXTERNAL_TESTING.to_string();
Expand Down Expand Up @@ -956,30 +994,102 @@ async fn setup_with_config_impl<N: NexusServer>(
sim_mode: sim::SimMode,
initial_cert: Option<Certificate>,
) -> ControlPlaneTestContext<N> {
builder.start_crdb_impl(populate).await;
builder.start_clickhouse().await;
builder.start_gateway().await;
builder.start_dendrite(SwitchLocation::Switch0).await;
builder.start_dendrite(SwitchLocation::Switch1).await;
builder.start_mgd(SwitchLocation::Switch0).await;
builder.start_mgd(SwitchLocation::Switch1).await;
builder.start_internal_dns().await;
builder.start_external_dns().await;
builder.start_nexus_internal().await;
builder.start_sled(sim_mode).await;
builder.start_crucible_pantry().await;
builder.scrimlet_dns_setup().await;

// Give Nexus necessary information to find the Crucible Pantry
let dns_config = builder.populate_internal_dns().await;
const STEP_TIMEOUT: Duration = Duration::from_secs(60);

builder
.start_nexus_external(dns_config, initial_cert.into_iter().collect())
.init_with_steps(
vec![
(
"start_crdb",
Box::new(|builder| {
builder.start_crdb_impl(populate).boxed()
}),
),
(
"start_clickhouse",
Box::new(|builder| builder.start_clickhouse().boxed()),
),
(
"start_gateway",
Box::new(|builder| builder.start_gateway().boxed()),
),
(
"start_dendrite_switch0",
Box::new(|builder| {
builder.start_dendrite(SwitchLocation::Switch0).boxed()
}),
),
(
"start_dendrite_switch1",
Box::new(|builder| {
builder.start_dendrite(SwitchLocation::Switch1).boxed()
}),
),
(
"start_mgd_switch0",
Box::new(|builder| {
builder.start_mgd(SwitchLocation::Switch0).boxed()
}),
),
(
"start_mgd_switch1",
Box::new(|builder| {
builder.start_mgd(SwitchLocation::Switch1).boxed()
}),
),
(
"start_internal_dns",
Box::new(|builder| builder.start_internal_dns().boxed()),
),
(
"start_external_dns",
Box::new(|builder| builder.start_external_dns().boxed()),
),
(
"start_nexus_internal",
Box::new(|builder| builder.start_nexus_internal().boxed()),
),
(
"start_sled",
Box::new(move |builder| {
builder.start_sled(sim_mode).boxed()
}),
),
(
"start_crucible_pantry",
Box::new(|builder| builder.start_crucible_pantry().boxed()),
),
(
"scrimlet_dns_setup",
Box::new(|builder| builder.scrimlet_dns_setup().boxed()),
),
(
"populate_internal_dns",
Box::new(|builder| builder.populate_internal_dns().boxed()),
),
(
"start_nexus_external",
Box::new(|builder| {
builder
.start_nexus_external(
initial_cert.into_iter().collect(),
)
.boxed()
}),
),
(
"start_oximeter",
Box::new(|builder| builder.start_oximeter().boxed()),
),
(
"start_producer_server",
Box::new(|builder| builder.start_producer_server().boxed()),
),
],
STEP_TIMEOUT,
)
.await;

builder.start_oximeter().await;
builder.start_producer_server().await;

builder.build()
}

Expand Down

0 comments on commit 9eba46d

Please sign in to comment.