From e45b7f1c3caa6c336c900110ceb3afb0c44fc43b Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Tue, 6 Aug 2024 17:33:51 +0000 Subject: [PATCH] Fix flaky instance zone-boot timeout test - Fixes #6197 - Use a multi-threaded runtime for the test so we can literally sleep in the boot closure without blocking the test itself. --- sled-agent/src/instance.rs | 64 ++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index b56f5d9df0..778abf4449 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -1617,6 +1617,7 @@ mod tests { use std::net::Ipv6Addr; use std::net::SocketAddrV6; use std::str::FromStr; + use std::time::Duration; use tokio::sync::watch::Receiver; use tokio::time::timeout; @@ -2043,7 +2044,7 @@ mod tests { logctx.cleanup_successful(); } - #[tokio::test] + #[tokio::test(flavor = "multi_thread")] async fn test_instance_create_timeout_while_creating_zone() { let logctx = omicron_test_utils::dev::test_setup_log( "test_instance_create_timeout_while_creating_zone", @@ -2055,17 +2056,30 @@ mod tests { // time out while booting zone, on purpose! let boot_ctx = MockZones::boot_context(); - let start = tokio::time::Instant::now(); + const TIMEOUT: Duration = Duration::from_secs(1); + let (boot_continued_tx, boot_continued_rx) = + std::sync::mpsc::sync_channel(1); boot_ctx.expect().times(1).return_once(move |_| { - // We need something that will look like the zone taking a long time - // to boot, but we cannot use a `tokio::time` construct here since - // this is a blocking context and we cannot call `block_on()` - // recursively. We advance time by this amount below, so this will - // most likely result in a small number of additional sleeps until - // the timeout has really elased. - while start.elapsed() < TIMEOUT_DURATION * 2 { - std::thread::sleep(std::time::Duration::from_millis(1)); - } + // We need a way to slow down zone boot, but that doesn't block the + // entire Tokio runtime. Since this closure is synchronous, it also + // has no way to await anything, all waits are blocking. That means + // we cannot use a single-threaded runtime, which also means no + // manually advancing time. The test has to take the full "slow boot + // time". + // + // To do this, we use a multi-threaded runtime, and call + // block_in_place so that we can just literally sleep for a while. + // The sleep duration here is twice a timeout we set on the attempt + // to actually set the instance running below. + // + // This boot method also directly signals the main test code to + // continue when it's done sleeping to synchronize with it. + tokio::task::block_in_place(move || { + println!("MockZones::boot() called, waiting for timeout"); + std::thread::sleep(TIMEOUT * 2); + println!("MockZones::boot() waited for timeout, continuing"); + boot_continued_tx.send(()).unwrap(); + }); Ok(()) }); let wait_ctx = illumos_utils::svc::wait_for_service_context(); @@ -2100,8 +2114,6 @@ mod tests { .await .expect("timed out creating Instance struct"); - tokio::time::pause(); - let (put_tx, put_rx) = oneshot::channel(); // pretending we're InstanceManager::ensure_state, try in vain to start @@ -2110,20 +2122,16 @@ mod tests { .await .expect("failed to send Instance::put_state"); - // Timeout our future waiting for the instance-state-change at - // `TIMEOUT_DURATION`, which should fail because zone boot will take - // twice that by construction. - let timeout_fut = timeout(TIMEOUT_DURATION, put_rx); - - // And advance time by twice that, so that the actual - // `MockZones::boot()` call should be exercised (or will be soon). - tokio::time::advance(TIMEOUT_DURATION * 2).await; - - tokio::time::resume(); - + // Timeout our future waiting for the instance-state-change at 1s. This + // is much shorter than the actual `TIMEOUT_DURATION`, but the test + // structure requires that we actually wait this period, since we cannot + // advance time manually in a multi-threaded runtime. + let timeout_fut = timeout(TIMEOUT, put_rx); + println!("Awaiting zone-boot timeout"); timeout_fut .await .expect_err("*should've* timed out waiting for Instance::put_state, but didn't?"); + println!("Zone-boot timeout awaited"); if let ReceivedInstanceState::InstancePut(SledInstanceState { vmm_state: VmmRuntimeState { state: VmmState::Running, .. }, @@ -2133,6 +2141,14 @@ mod tests { panic!("Nexus's InstanceState should never have reached running if zone creation timed out"); } + // Notify the "boot" closure that it can continue, and then wait to + // ensure it's actually called. + println!("Waiting for zone-boot to continue"); + tokio::task::spawn_blocking(move || boot_continued_rx.recv().unwrap()) + .await + .unwrap(); + println!("Received continued message from MockZones::boot()"); + storage_harness.cleanup().await; logctx.cleanup_successful(); }