Skip to content

Commit

Permalink
sled-agent: don't block during instance creation request from nexus
Browse files Browse the repository at this point in the history
Alleviating request timeouts occurring when propolis zone installation takes too long
(Propolis zone installation took 81 seconds and caused instance start to time out oxidecomputer#3927)
by making the zone installation not happen during a request handler.

Since the instance creation request no longer blocks, we need to wait before proceeding in some cases where we had assumed that a successful return from the Nexus call meant the instance existed,
e.g. test_instance_serial now polls for the instance's running state before attempting to send serial console data requests.
  • Loading branch information
lif committed Mar 16, 2024
1 parent c0ba701 commit a3da136
Show file tree
Hide file tree
Showing 8 changed files with 337 additions and 90 deletions.
70 changes: 56 additions & 14 deletions end-to-end-tests/src/instance_launch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,14 @@ use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError};
use oxide_client::types::{
ByteCount, DiskCreate, DiskSource, ExternalIp, ExternalIpCreate,
InstanceCpuCount, InstanceCreate, InstanceDiskAttachment,
InstanceNetworkInterfaceAttachment, SshKeyCreate,
InstanceNetworkInterfaceAttachment, InstanceState, SshKeyCreate,
};
use oxide_client::{ClientDisksExt, ClientInstancesExt, ClientSessionExt};
use russh::{ChannelMsg, Disconnect};
use russh_keys::key::{KeyPair, PublicKey};
use russh_keys::PublicKeyBase64;
use std::sync::Arc;
use std::time::Duration;
use tokio::time::sleep;

#[tokio::test]
async fn instance_launch() -> Result<()> {
Expand Down Expand Up @@ -106,6 +105,19 @@ async fn instance_launch() -> Result<()> {
type Error =
CondCheckError<oxide_client::Error<oxide_client::types::Error>>;

let instance_state = ctx
.client
.instance_view()
.project(ctx.project_name.clone())
.instance(instance.name.clone())
.send()
.await?
.run_state;

if instance_state == InstanceState::Starting {
return Err(Error::NotYet);
}

let data = String::from_utf8_lossy(
&ctx.client
.instance_serial_console()
Expand Down Expand Up @@ -188,19 +200,49 @@ async fn instance_launch() -> Result<()> {

// check that we saw it on the console
eprintln!("waiting for serial console");
sleep(Duration::from_secs(5)).await;
let data = String::from_utf8_lossy(
&ctx.client
.instance_serial_console()
.project(ctx.project_name.clone())
.instance(instance.name.clone())
.most_recent(1024 * 1024)
.max_bytes(1024 * 1024)
.send()
.await?
.data,

let data = wait_for_condition(
|| async {
type Error =
CondCheckError<oxide_client::Error<oxide_client::types::Error>>;

let instance_state = ctx
.client
.instance_view()
.project(ctx.project_name.clone())
.instance(instance.name.clone())
.send()
.await?
.run_state;

if instance_state == InstanceState::Starting {
return Err(Error::NotYet);
}

let data = String::from_utf8_lossy(
&ctx.client
.instance_serial_console()
.project(ctx.project_name.clone())
.instance(instance.name.clone())
.most_recent(1024 * 1024)
.max_bytes(1024 * 1024)
.send()
.await
.map_err(|_e| Error::NotYet)?
.data,
)
.into_owned();
if data.contains("-----END SSH HOST KEY KEYS-----") {
Ok(data)
} else {
Err(Error::NotYet)
}
},
&Duration::from_secs(5),
&Duration::from_secs(300),
)
.into_owned();
.await?;

ensure!(
data.contains("Hello, Oxide!"),
"string not seen on console\n{}",
Expand Down
3 changes: 3 additions & 0 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -986,6 +986,9 @@ impl super::Nexus {
//
// If the operation failed, kick the sled agent error back up to
// the caller to let it decide how to handle it.
//
// When creating the zone for the first time, we just get
// Ok(None) here, which is a no-op in write_returned_instance_state.
match instance_put_result {
Ok(state) => self
.write_returned_instance_state(&instance_id, state)
Expand Down
21 changes: 18 additions & 3 deletions nexus/tests/integration_tests/instances.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ use sled_agent_client::TestInterfaces as _;
use std::convert::TryFrom;
use std::net::Ipv4Addr;
use std::sync::Arc;
use std::time::Duration;
use uuid::Uuid;

use dropshot::test_util::ClientTestContext;
Expand All @@ -80,6 +81,8 @@ use nexus_test_utils::resource_helpers::{
use nexus_test_utils_macros::nexus_test;
use nexus_types::external_api::shared::SiloRole;
use omicron_sled_agent::sim;
use omicron_test_utils::dev::poll;
use omicron_test_utils::dev::poll::CondCheckError;

type ControlPlaneTestContext =
nexus_test_utils::ControlPlaneTestContext<omicron_nexus::Server>;
Expand Down Expand Up @@ -3794,10 +3797,22 @@ async fn test_instance_serial(cptestctx: &ControlPlaneTestContext) {

// Create an instance and poke it to ensure it's running.
let instance = create_instance(client, PROJECT_NAME, instance_name).await;
instance_simulate(nexus, &instance.identity.id).await;
let instance_next = instance_get(&client, &instance_url).await;
let instance_next = poll::wait_for_condition(
|| async {
instance_simulate(nexus, &instance.identity.id).await;
let instance_next = instance_get(&client, &instance_url).await;
if instance_next.runtime.run_state == InstanceState::Running {
Ok(instance_next)
} else {
Err(CondCheckError::<()>::NotYet)
}
},
&Duration::from_secs(5),
&Duration::from_secs(60),
)
.await
.unwrap();
identity_eq(&instance.identity, &instance_next.identity);
assert_eq!(instance_next.runtime.run_state, InstanceState::Running);
assert!(
instance_next.runtime.time_run_state_updated
> instance.runtime.time_run_state_updated
Expand Down
1 change: 0 additions & 1 deletion sled-agent/src/fakes/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,6 @@ async fn sled_agent_put(
struct InstancePathParam {
instance_id: Uuid,
}

#[endpoint {
method = PUT,
path = "/instances/{instance_id}",
Expand Down
Loading

0 comments on commit a3da136

Please sign in to comment.