diff --git a/end-to-end-tests/src/instance_launch.rs b/end-to-end-tests/src/instance_launch.rs
index 1aae46fe984..377fef4c0bc 100644
--- a/end-to-end-tests/src/instance_launch.rs
+++ b/end-to-end-tests/src/instance_launch.rs
@@ -7,7 +7,7 @@ use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError};
 use oxide_client::types::{
     ByteCount, DiskCreate, DiskSource, ExternalIp, ExternalIpCreate,
     InstanceCpuCount, InstanceCreate, InstanceDiskAttachment,
-    InstanceNetworkInterfaceAttachment, SshKeyCreate,
+    InstanceNetworkInterfaceAttachment, InstanceState, SshKeyCreate,
 };
 use oxide_client::{ClientDisksExt, ClientInstancesExt, ClientSessionExt};
 use russh::{ChannelMsg, Disconnect};
@@ -15,7 +15,6 @@ use russh_keys::key::{KeyPair, PublicKey};
 use russh_keys::PublicKeyBase64;
 use std::sync::Arc;
 use std::time::Duration;
-use tokio::time::sleep;

 #[tokio::test]
 async fn instance_launch() -> Result<()> {
@@ -106,6 +105,19 @@ async fn instance_launch() -> Result<()> {
             type Error =
                 CondCheckError<oxide_client::Error<oxide_client::types::Error>>;

+            let instance_state = ctx
+                .client
+                .instance_view()
+                .project(ctx.project_name.clone())
+                .instance(instance.name.clone())
+                .send()
+                .await?
+                .run_state;
+
+            if instance_state == InstanceState::Starting {
+                return Err(Error::NotYet);
+            }
+
             let data = String::from_utf8_lossy(
                 &ctx.client
                     .instance_serial_console()
@@ -188,19 +200,49 @@ async fn instance_launch() -> Result<()> {

     // check that we saw it on the console
     eprintln!("waiting for serial console");
-    sleep(Duration::from_secs(5)).await;
-    let data = String::from_utf8_lossy(
-        &ctx.client
-            .instance_serial_console()
-            .project(ctx.project_name.clone())
-            .instance(instance.name.clone())
-            .most_recent(1024 * 1024)
-            .max_bytes(1024 * 1024)
-            .send()
-            .await?
-            .data,
+
+    let data = wait_for_condition(
+        || async {
+            type Error =
+                CondCheckError<oxide_client::Error<oxide_client::types::Error>>;
+
+            let instance_state = ctx
+                .client
+                .instance_view()
+                .project(ctx.project_name.clone())
+                .instance(instance.name.clone())
+                .send()
+                .await?
+                .run_state;
+
+            if instance_state == InstanceState::Starting {
+                return Err(Error::NotYet);
+            }
+
+            let data = String::from_utf8_lossy(
+                &ctx.client
+                    .instance_serial_console()
+                    .project(ctx.project_name.clone())
+                    .instance(instance.name.clone())
+                    .most_recent(1024 * 1024)
+                    .max_bytes(1024 * 1024)
+                    .send()
+                    .await
+                    .map_err(|_e| Error::NotYet)?
+                    .data,
+            )
+            .into_owned();
+            if data.contains("-----END SSH HOST KEY KEYS-----") {
+                Ok(data)
+            } else {
+                Err(Error::NotYet)
+            }
+        },
+        &Duration::from_secs(5),
+        &Duration::from_secs(300),
     )
-    .into_owned();
+    .await?;
+
     ensure!(
         data.contains("Hello, Oxide!"),
         "string not seen on console\n{}",
diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs
index 2d09078e189..2300bd56f26 100644
--- a/nexus/src/app/instance.rs
+++ b/nexus/src/app/instance.rs
@@ -986,6 +986,9 @@ impl super::Nexus {
         //
         // If the operation failed, kick the sled agent error back up to
         // the caller to let it decide how to handle it.
+        //
+        // When creating the zone for the first time, we just get
+        // Ok(None) here, which is a no-op in write_returned_instance_state.
         match instance_put_result {
             Ok(state) => self
                 .write_returned_instance_state(&instance_id, state)
diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs
index 0df2b830085..733021475e1 100644
--- a/nexus/tests/integration_tests/instances.rs
+++ b/nexus/tests/integration_tests/instances.rs
@@ -67,6 +67,7 @@ use sled_agent_client::TestInterfaces as _;
 use std::convert::TryFrom;
 use std::net::Ipv4Addr;
 use std::sync::Arc;
+use std::time::Duration;
 use uuid::Uuid;

 use dropshot::test_util::ClientTestContext;
@@ -80,6 +81,8 @@ use nexus_test_utils::resource_helpers::{
 use nexus_test_utils_macros::nexus_test;
 use nexus_types::external_api::shared::SiloRole;
 use omicron_sled_agent::sim;
+use omicron_test_utils::dev::poll;
+use omicron_test_utils::dev::poll::CondCheckError;

 type ControlPlaneTestContext =
     nexus_test_utils::ControlPlaneTestContext<omicron_nexus::Server>;
@@ -3794,10 +3797,22 @@ async fn test_instance_serial(cptestctx: &ControlPlaneTestContext) {

     // Create an instance and poke it to ensure it's running.
     let instance = create_instance(client, PROJECT_NAME, instance_name).await;
-    instance_simulate(nexus, &instance.identity.id).await;
-    let instance_next = instance_get(&client, &instance_url).await;
+    let instance_next = poll::wait_for_condition(
+        || async {
+            instance_simulate(nexus, &instance.identity.id).await;
+            let instance_next = instance_get(&client, &instance_url).await;
+            if instance_next.runtime.run_state == InstanceState::Running {
+                Ok(instance_next)
+            } else {
+                Err(CondCheckError::<()>::NotYet)
+            }
+        },
+        &Duration::from_secs(5),
+        &Duration::from_secs(60),
+    )
+    .await
+    .unwrap();
     identity_eq(&instance.identity, &instance_next.identity);
-    assert_eq!(instance_next.runtime.run_state, InstanceState::Running);
     assert!(
         instance_next.runtime.time_run_state_updated
             > instance.runtime.time_run_state_updated
diff --git a/sled-agent/src/fakes/nexus.rs b/sled-agent/src/fakes/nexus.rs
index 4cff340c898..de37b77bcd5 100644
--- a/sled-agent/src/fakes/nexus.rs
+++ b/sled-agent/src/fakes/nexus.rs
@@ -121,7 +121,6 @@ async fn sled_agent_put(
 struct InstancePathParam {
     instance_id: Uuid,
 }
-
 #[endpoint {
     method = PUT,
     path = "/instances/{instance_id}",
diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs
index 890b30137fd..60cb33306bd 100644
--- a/sled-agent/src/instance.rs
+++ b/sled-agent/src/instance.rs
@@ -30,7 +30,6 @@ use illumos_utils::link::VnicAllocator;
 use illumos_utils::opte::{DhcpCfg, PortManager};
 use illumos_utils::running_zone::{RunningZone, ZoneBuilderFactory};
 use illumos_utils::svc::wait_for_service;
-use illumos_utils::zone::Zones;
 use illumos_utils::zone::PROPOLIS_ZONE_PREFIX;
 use omicron_common::address::NEXUS_INTERNAL_PORT;
 use omicron_common::api::internal::nexus::{
@@ -52,6 +51,11 @@ use std::sync::Arc;
 use tokio::sync::{mpsc, oneshot};
 use uuid::Uuid;

+#[cfg(test)]
+use illumos_utils::zone::MockZones as Zones;
+#[cfg(not(test))]
+use illumos_utils::zone::Zones;
+
 // The depth of the request queue for the instance.
 const QUEUE_SIZE: usize = 32;

@@ -411,7 +415,9 @@ impl InstanceRunner {
                     },
                     Some(PutState{ state, tx }) => {
                         tx.send(self.put_state(state).await
-                            .map(|r| InstancePutStateResponse { updated_runtime: Some(r) })
+                            .map(|r| InstancePutStateResponse {
+                                updated_runtime: Some(r),
+                            })
                             .map_err(|e| e.into()))
                             .map_err(|_| Error::FailedSendClientClosed)
                     },
@@ -1544,21 +1550,29 @@ mod tests {
     use internal_dns::resolver::Resolver;
     use internal_dns::ServiceName;
     use omicron_common::api::external::{
-        ByteCount, Generation, InstanceCpuCount, InstanceState,
+        ByteCount, Generation, Hostname, InstanceCpuCount, InstanceState,
     };
     use omicron_common::api::internal::nexus::InstanceProperties;
     use sled_storage::disk::{RawDisk, SyntheticDisk};
     use sled_storage::manager::FakeStorageManager;
     use std::net::Ipv6Addr;
+    use std::str::FromStr;
     use tokio::sync::watch::Receiver;
     use tokio::time::timeout;

     const TIMEOUT_DURATION: tokio::time::Duration =
         tokio::time::Duration::from_secs(3);

+    #[derive(Default, Clone)]
+    enum ReceivedInstanceState {
+        #[default]
+        None,
+        InstancePut(SledInstanceState),
+    }
+
     struct NexusServer {
         observed_runtime_state:
-            tokio::sync::watch::Sender<Option<SledInstanceState>>,
+            tokio::sync::watch::Sender<ReceivedInstanceState>,
     }
     impl FakeNexusServer for NexusServer {
         fn cpapi_instances_put(
@@ -1566,30 +1580,38 @@ mod tests {
             _instance_id: Uuid,
             new_runtime_state: SledInstanceState,
         ) -> Result<(), omicron_common::api::external::Error> {
-            self.observed_runtime_state.send(Some(new_runtime_state))
-                .map_err(|_| omicron_common::api::external::Error::internal_error("couldn't send updated SledInstanceState to test driver"))
+            self.observed_runtime_state
+                .send(ReceivedInstanceState::InstancePut(new_runtime_state))
+                .map_err(|_| {
+                    omicron_common::api::external::Error::internal_error(
+                        "couldn't send SledInstanceState to test driver",
+                    )
+                })
         }
     }

-    fn fake_nexus_server(
-        logctx: &LogContext,
-    ) -> (
-        NexusClient,
-        HttpServer,
-        Receiver<Option<SledInstanceState>>,
-    ) {
-        let (state_tx, state_rx) = tokio::sync::watch::channel(None);
-
-        let nexus_server = crate::fakes::nexus::start_test_server(
-            logctx.log.new(o!("component" => "FakeNexusServer")),
-            Box::new(NexusServer { observed_runtime_state: state_tx }),
-        );
-        let nexus_client = NexusClient::new(
-            &format!("http://{}", nexus_server.local_addr()),
-            logctx.log.new(o!("component" => "NexusClient")),
-        );
+    struct FakeNexusParts {
+        nexus_client: NexusClient,
+        nexus_server: HttpServer,
+        state_rx: Receiver<ReceivedInstanceState>,
+    }

-        (nexus_client, nexus_server, state_rx)
+    impl FakeNexusParts {
+        fn new(logctx: &LogContext) -> Self {
+            let (state_tx, state_rx) =
+                tokio::sync::watch::channel(ReceivedInstanceState::None);
+
+            let nexus_server = crate::fakes::nexus::start_test_server(
+                logctx.log.new(o!("component" => "FakeNexusServer")),
+                Box::new(NexusServer { observed_runtime_state: state_tx }),
+            );
+            let nexus_client = NexusClient::new(
+                &format!("http://{}", nexus_server.local_addr()),
+                logctx.log.new(o!("component" => "NexusClient")),
+            );
+
+            Self { nexus_client, nexus_server, state_rx }
+        }
     }

     fn mock_vnic_contexts(
@@ -1610,7 +1632,7 @@ mod tests {
     // which calls Instance::propolis_ensure,
     // which spawns Instance::monitor_state_task,
     // which calls cpapi_instances_put
-    // and calls Instance::setup_propolis_locked,
+    // and calls Instance::setup_propolis_inner,
     // which creates the zone (which isn't real in these tests, of course)
     fn mock_zone_contexts(
     ) -> (MockZonesBootContext, MockWaitForServiceContext, MockZonesIdContext)
@@ -1740,12 +1762,38 @@ mod tests {
     ) -> Instance {
         let id = Uuid::new_v4();
         let propolis_id = Uuid::new_v4();
+        let ticket = InstanceTicket::new_without_manager_for_test(id);
+
+        let initial_state =
+            fake_instance_initial_state(propolis_id, propolis_addr);
+
+        let services = fake_instance_manager_services(
+            logctx,
+            storage_handle,
+            nexus_client_with_resolver,
+        );
+
+        Instance::new(
+            logctx.log.new(o!("component" => "Instance")),
+            id,
+            propolis_id,
+            ticket,
+            initial_state,
+            services,
+        )
+        .unwrap()
+    }
+
+    fn fake_instance_initial_state(
+        propolis_id: Uuid,
+        propolis_addr: SocketAddr,
+    ) -> InstanceInitialState {
         let hardware = InstanceHardware {
             properties: InstanceProperties {
                 ncpus: InstanceCpuCount(1),
                 memory: ByteCount::from_gibibytes_u32(1),
-                hostname: "bert".to_string(),
+                hostname: Hostname::from_str("bert").unwrap(),
             },
             nics: vec![],
             source_nat: SourceNatConfig {
@@ -1765,7 +1813,7 @@ mod tests {
             cloud_init_bytes: None,
         };

-        let initial_state = InstanceInitialState {
+        InstanceInitialState {
             hardware,
             instance_runtime: InstanceRuntimeState {
                 propolis_id: Some(propolis_id),
@@ -1780,8 +1828,14 @@ mod tests {
                 time_updated: Default::default(),
             },
             propolis_addr,
-        };
+        }
+    }

+    fn fake_instance_manager_services(
+        logctx: &LogContext,
+        storage_handle: StorageHandle,
+        nexus_client_with_resolver: NexusClientWithResolver,
+    ) -> InstanceManagerServices {
         let vnic_allocator =
             VnicAllocator::new("Foo", Etherstub("mystub".to_string()));
         let port_manager = PortManager::new(
@@ -1796,24 +1850,14 @@ mod tests {
             cleanup_context,
         );

-        let services = InstanceManagerServices {
+        InstanceManagerServices {
             nexus_client: nexus_client_with_resolver,
             vnic_allocator,
             port_manager,
             storage: storage_handle,
             zone_bundler,
             zone_builder_factory: ZoneBuilderFactory::fake(),
-        };
-
-        Instance::new(
-            logctx.log.new(o!("component" => "Instance")),
-            id,
-            propolis_id,
-            ticket,
-            initial_state,
-            services,
-        )
-        .unwrap()
+        }
     }

     #[tokio::test]
@@ -1830,8 +1874,8 @@ mod tests {
         let _mock_vnic_contexts = mock_vnic_contexts();
         let _mock_zone_contexts = mock_zone_contexts();

-        let (nexus_client, nexus_server, mut state_rx) =
-            fake_nexus_server(&logctx);
+        let FakeNexusParts { nexus_client, nexus_server, mut state_rx } =
+            FakeNexusParts::new(&logctx);

         let (_dns_server, resolver, _dns_config_dir) =
             timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server))
@@ -1855,29 +1899,30 @@ mod tests {
         .await
         .expect("timed out creating Instance struct");

-        timeout(
-            TIMEOUT_DURATION,
-            inst.put_state(InstanceStateRequested::Running),
-        )
-        .await
-        .expect("timed out waiting for Instance::put_state")
-        .unwrap();
+        let (put_tx, put_rx) = oneshot::channel();
+
+        inst.put_state(put_tx, InstanceStateRequested::Running)
+            .await
+            .expect("failed to send Instance::put_state");
+
+        timeout(TIMEOUT_DURATION, put_rx)
+            .await
+            .expect("timed out waiting for Instance::put_state result")
+            .expect("failed to receive Instance::put_state result")
+            .expect("Instance::put_state failed");

         timeout(
             TIMEOUT_DURATION,
-            state_rx.wait_for(|maybe_state| {
-                maybe_state
-                    .as_ref()
-                    .map(|sled_inst_state| {
-                        sled_inst_state.vmm_state.state
-                            == InstanceState::Running
-                    })
-                    .unwrap_or(false)
+            state_rx.wait_for(|maybe_state| match maybe_state {
+                ReceivedInstanceState::InstancePut(sled_inst_state) => {
+                    sled_inst_state.vmm_state.state == InstanceState::Running
+                }
+                _ => false,
             }),
         )
         .await
         .expect("timed out waiting for InstanceState::Running in FakeNexus")
-        .unwrap();
+        .expect("failed to receive FakeNexus' InstanceState");

         logctx.cleanup_successful();
     }
@@ -1893,7 +1938,8 @@ mod tests {
         let _mock_vnic_contexts = mock_vnic_contexts();
         let _mock_zone_contexts = mock_zone_contexts();

-        let (nexus_client, nexus_server, state_rx) = fake_nexus_server(&logctx);
+        let FakeNexusParts { nexus_client, nexus_server, state_rx } =
+            FakeNexusParts::new(&logctx);

         let (_dns_server, resolver, _dns_config_dir) =
             timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server))
@@ -1918,11 +1964,17 @@ mod tests {
         .await
         .expect("timed out creating Instance struct");

-        timeout(TIMEOUT_DURATION, inst.put_state(InstanceStateRequested::Running))
+        let (put_tx, put_rx) = oneshot::channel();
+
+        inst.put_state(put_tx, InstanceStateRequested::Running)
+            .await
+            .expect("failed to send Instance::put_state");
+
+        timeout(TIMEOUT_DURATION, put_rx)
         .await
         .expect_err("*should've* timed out waiting for Instance::put_state, but didn't?");

-        if let Some(SledInstanceState {
+        if let ReceivedInstanceState::InstancePut(SledInstanceState {
             vmm_state: VmmRuntimeState { state: InstanceState::Running, .. },
             ..
         }) = state_rx.borrow().to_owned()
@@ -1949,13 +2001,12 @@ mod tests {
             Ok(())
         });
         let wait_ctx = illumos_utils::svc::wait_for_service_context();
-        wait_ctx.expect().times(..).returning(|_, _, _| Ok(()));
+        wait_ctx.expect().times(1..).returning(|_, _, _| Ok(()));
         let zone_id_ctx = MockZones::id_context();
-        zone_id_ctx.expect().times(..).returning(|_| Ok(Some(1)));
-        let halt_rm_ctx = MockZones::halt_and_remove_logged_context();
-        halt_rm_ctx.expect().times(..).returning(|_, _| Ok(()));
+        zone_id_ctx.expect().times(1..).returning(|_| Ok(Some(1)));

-        let (nexus_client, nexus_server, state_rx) = fake_nexus_server(&logctx);
+        let FakeNexusParts { nexus_client, nexus_server, state_rx } =
+            FakeNexusParts::new(&logctx);

         let (_dns_server, resolver, _dns_config_dir) =
             timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server))
@@ -1980,11 +2031,17 @@ mod tests {
         .await
         .expect("timed out creating Instance struct");

-        timeout(TIMEOUT_DURATION, inst.put_state(InstanceStateRequested::Running))
+        let (put_tx, put_rx) = oneshot::channel();
+
+        inst.put_state(put_tx, InstanceStateRequested::Running)
+            .await
+            .expect("failed to send Instance::put_state");
+
+        timeout(TIMEOUT_DURATION, put_rx)
         .await
         .expect_err("*should've* timed out waiting for Instance::put_state, but didn't?");

-        if let Some(SledInstanceState {
+        if let ReceivedInstanceState::InstancePut(SledInstanceState {
             vmm_state: VmmRuntimeState { state: InstanceState::Running, .. },
             ..
         }) = state_rx.borrow().to_owned()
@@ -1994,4 +2051,97 @@ mod tests {

         logctx.cleanup_successful();
     }
+
+    #[tokio::test]
+    async fn test_instance_manager_creation() {
+        let logctx = omicron_test_utils::dev::test_setup_log(
+            "test_instance_manager_creation",
+        );
+
+        // automock'd things used during this test
+        let _mock_vnic_contexts = mock_vnic_contexts();
+        let _mock_zone_contexts = mock_zone_contexts();
+
+        let storage_handle = fake_storage_manager_with_u2().await;
+
+        let FakeNexusParts { nexus_client, nexus_server, mut state_rx } =
+            FakeNexusParts::new(&logctx);
+
+        let (_dns_server, resolver, _dns_config_dir) =
+            timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server))
+                .await
+                .expect("timed out making DNS server and Resolver");
+
+        let nexus_client_with_resolver =
+            NexusClientWithResolver::new_with_client(nexus_client, resolver);
+
+        let InstanceManagerServices {
+            nexus_client,
+            vnic_allocator: _,
+            port_manager,
+            storage,
+            zone_bundler,
+            zone_builder_factory,
+        } = fake_instance_manager_services(
+            &logctx,
+            storage_handle,
+            nexus_client_with_resolver,
+        );
+
+        let etherstub = Etherstub("mystub".to_string());
+
+        let mgr = crate::instance_manager::InstanceManager::new(
+            logctx.log.new(o!("component" => "InstanceManager")),
+            nexus_client,
+            etherstub,
+            port_manager,
+            storage,
+            zone_bundler,
+            zone_builder_factory,
+        )
+        .unwrap();
+
+        let (propolis_server, _propolis_client) =
+            propolis_mock_server(&logctx.log);
+        let propolis_addr = propolis_server.local_addr();
+
+        let instance_id = Uuid::new_v4();
+        let propolis_id = Uuid::new_v4();
+        let InstanceInitialState {
+            hardware,
+            instance_runtime,
+            vmm_runtime,
+            propolis_addr,
+        } = fake_instance_initial_state(propolis_id, propolis_addr);
+
+        mgr.ensure_registered(
+            instance_id,
+            propolis_id,
+            hardware,
+            instance_runtime,
+            vmm_runtime,
+            propolis_addr,
+        )
+        .await
+        .unwrap();
+
+        mgr.ensure_state(instance_id, InstanceStateRequested::Running)
+            .await
+            .unwrap();
+
+        timeout(
+            TIMEOUT_DURATION,
+            state_rx.wait_for(|maybe_state| match maybe_state {
+                ReceivedInstanceState::InstancePut(sled_inst_state) => {
+                    sled_inst_state.vmm_state.state == InstanceState::Running
+                }
+                _ => false,
+            }),
+        )
+        .await
+        .expect("timed out waiting for InstanceState::Running in FakeNexus")
+        .expect("failed to receive FakeNexus' InstanceState");
+
+        logctx.cleanup_successful();
+    }
 }
diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs
index fee42849f42..47c9dfcb58c 100644
--- a/sled-agent/src/instance_manager.rs
+++ b/sled-agent/src/instance_manager.rs
@@ -185,6 +185,7 @@ impl InstanceManager {
         target: InstanceStateRequested,
     ) -> Result<InstancePutStateResponse, Error> {
         let (tx, rx) = oneshot::channel();
+
         self.inner
             .tx
             .send(InstanceManagerRequest::EnsureState {
@@ -194,7 +195,20 @@ impl InstanceManager {
             })
             .await
             .map_err(|_| Error::FailedSendInstanceManagerClosed)?;
-        rx.await?
+
+        match target {
+            // these may involve a long-running zone creation, so avoid HTTP
+            // request timeouts by decoupling the response
+            // (see InstanceRunner::put_state)
+            InstanceStateRequested::MigrationTarget(_)
+            | InstanceStateRequested::Running => {
+                // don't error on channel being closed
+                tokio::spawn(rx);
+                Ok(InstancePutStateResponse { updated_runtime: None })
+            }
+            InstanceStateRequested::Stopped
+            | InstanceStateRequested::Reboot => rx.await?,
+        }
     }

     pub async fn put_migration_ids(
@@ -736,7 +750,7 @@ impl InstanceTicket {

     #[cfg(test)]
     pub(crate) fn new_without_manager_for_test(id: Uuid) -> Self {
-        Self { id, inner: None }
+        Self { id, terminate_tx: None }
     }

     /// Idempotently removes this instance from the tracked set of
diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs
index 82813d3f1e0..fa9276dec2e 100644
--- a/sled-agent/src/params.rs
+++ b/sled-agent/src/params.rs
@@ -129,7 +129,7 @@ pub struct InstancePutStateBody {

 /// The response sent from a request to move an instance into a specific runtime
 /// state.
-#[derive(Serialize, Deserialize, JsonSchema)]
+#[derive(Debug, Serialize, Deserialize, JsonSchema)]
 pub struct InstancePutStateResponse {
     /// The current runtime state of the instance after handling the request to
     /// change its state. If the instance's state did not change, this field is
diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs
index 0b90bef590e..48d35861658 100644
--- a/sled-agent/src/sim/sled_agent.rs
+++ b/sled-agent/src/sim/sled_agent.rs
@@ -45,6 +45,7 @@ use std::collections::{HashMap, HashSet};
 use std::net::{IpAddr, Ipv6Addr, SocketAddr};
 use std::str::FromStr;
 use std::sync::Arc;
+use std::time::Duration;
 use uuid::Uuid;

 /// Simulates management of the control plane on a sled
@@ -74,6 +75,7 @@ pub struct SledAgent {
     config: Config,
     fake_zones: Mutex<OmicronZonesConfig>,
     instance_ensure_state_error: Mutex<Option<Error>>,
+    pub log: Logger,
 }

 fn extract_targets_from_volume_construction_request(
@@ -172,6 +174,7 @@ impl SledAgent {
                 zones: vec![],
             }),
             instance_ensure_state_error: Mutex::new(None),
+            log,
         })
     }

@@ -401,7 +404,28 @@ impl SledAgent {
                 ));
             }
             InstanceStateRequested::Running => {
-                propolis_client::types::InstanceStateRequested::Run
+                let instances = self.instances.clone();
+                let log = self.log.new(
+                    o!("component" => "SledAgent-insure_instance_state"),
+                );
+                tokio::spawn(async move {
+                    tokio::time::sleep(Duration::from_secs(10)).await;
+                    match instances
+                        .sim_ensure(&instance_id, current, Some(state))
+                        .await
+                    {
+                        Ok(state) => {
+                            let instance_state: nexus_client::types::SledInstanceState = state.into();
+                            info!(log, "sim_ensure success"; "instance_state" => #?instance_state);
+                        }
+                        Err(instance_put_error) => {
+                            error!(log, "sim_ensure failure"; "error" => #?instance_put_error);
+                        }
+                    }
+                });
+                return Ok(InstancePutStateResponse {
+                    updated_runtime: None,
+                });
             }
             InstanceStateRequested::Stopped => {
                 propolis_client::types::InstanceStateRequested::Stop