From 12e2dcb9aa1125b3d22d2597e9fff8b1c64aa4fe Mon Sep 17 00:00:00 2001 From: lif <> Date: Sat, 13 Jan 2024 01:57:22 -0800 Subject: [PATCH] messy WIP --- nexus/src/app/instance.rs | 40 ++++++++++++--- nexus/src/internal_api/http_entrypoints.rs | 57 +++++++++++++++------- nexus/types/src/internal_api/views.rs | 9 ++++ sled-agent/src/instance_manager.rs | 17 ++++--- 4 files changed, 93 insertions(+), 30 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index b5107be84a9..ac3cefe91da 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -55,6 +55,7 @@ use sled_agent_client::types::InstancePutStateBody; use std::matches; use std::net::SocketAddr; use std::sync::Arc; +use std::time::Duration; use tokio::io::{AsyncRead, AsyncWrite}; use uuid::Uuid; @@ -956,14 +957,39 @@ impl super::Nexus { // the caller to let it decide how to handle it. // // When creating the zone for the first time, we just get - // Ok(None) here, which is a no-op. We later asynchronously get - // a cpapi call invoking Self::write_returned_instance_state + // Ok(None) here, which is a no-op in write_returned_instance_state. + // (We later asynchronously receive a cpapi call that invokes + // write_returned_instance_state with the outcome.) match instance_put_result { - Ok(state) => self - .write_returned_instance_state(&instance_id, state) - .await - .map(|_| ()) - .map_err(Into::into), + Ok(state) => { + if state.is_none() + && matches!( + requested, + InstanceStateChangeRequest::Run + | InstanceStateChangeRequest::Migrate(..) + ) + { + // TODO: This is fragile -- suppose the nexus with + // this task crashes *and* the instance creation + // happens to also hang. The new nexus won't know + // to give up on the instance! In an ideal world + // this would be handled by a RPW that polls all + // the instances nexus knows about and finds any + // that have been in a still-starting state for too + // long -- say, InstanceRuntimeState::time_updated + // plus the timeout, assuming time_updated is the + // right point to measure from. + tokio::spawn(async { + tokio::time::sleep(Duration::from_secs(120)) + .await; + todo: fail instance + }) + } + self.write_returned_instance_state(&instance_id, state) + .await + .map(|_| ()) + .map_err(Into::into) + } Err(e) => Err(InstanceStateChangeError::SledAgent(e)), } } diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index acb6f02410a..1f90a8b1fd2 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -31,6 +31,7 @@ use nexus_types::internal_api::params::SwitchPutRequest; use nexus_types::internal_api::params::SwitchPutResponse; use nexus_types::internal_api::views::to_list; use nexus_types::internal_api::views::BackgroundTask; +use nexus_types::internal_api::views::HandleInstancePutResultResult; use nexus_types::internal_api::views::Saga; use omicron_common::api::external::http_pagination::data_page_params_for; use omicron_common::api::external::http_pagination::PaginatedById; @@ -283,42 +284,55 @@ async fn cpapi_instances_put( apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Asynchronously report the successful result of a potentially long-running -/// instance_put call to sled-agent made during instance creation. +/// Asynchronously report the successful result of certain instance_put calls +/// (such as the potentially long-running one made during instance creation) #[endpoint { -method = PUT, -path = "/instances/{instance_id}/creation-success", -}] + method = PUT, + path = "/instances/{instance_id}/creation-success", + }] async fn cpapi_handle_instance_put_success( rqctx: RequestContext>, path_params: Path, instance_state: TypedBody, -) -> Result { +) -> Result, HttpError> { let apictx = rqctx.context(); let nexus = &apictx.nexus; let path = path_params.into_inner(); let result = Ok(instance_state.into_inner()); let opctx = crate::context::op_context_for_internal_api(&rqctx).await; let handler = async { - nexus + // TODO: if instance_handle_creation_result errors because nexus gave + // up waiting and marked the instance as failed, tell sled-agent to + // destroy the instance + match nexus .instance_handle_creation_result(&opctx, &path.instance_id, result) - .await?; + .await + { + Ok(_) => Ok(HandleInstancePutResultResult::Ok.into()), + Err(err) => { + if todo { + Ok(HandleInstancePutResultResult::TimedOut.into()) + } else { + Err(err) + } + } + } Ok(HttpResponseUpdatedNoContent()) }; apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Asynchronously report the unsuccessful result of a potentially long-running -/// instance_put call to sled-agent made during instance creation. +/// Asynchronously report the unsuccessful result of certain instance_put calls +/// (such as the potentially long-running one made during instance creation) #[endpoint { -method = PUT, -path = "/instances/{instance_id}/creation-failure", -}] + method = PUT, + path = "/instances/{instance_id}/creation-failure", + }] async fn cpapi_handle_instance_put_failure( rqctx: RequestContext>, path_params: Path, error: TypedBody, -) -> Result { +) -> Result, HttpError> { let apictx = rqctx.context(); let nexus = &apictx.nexus; let path = path_params.into_inner(); @@ -327,10 +341,19 @@ async fn cpapi_handle_instance_put_failure( }); let opctx = crate::context::op_context_for_internal_api(&rqctx).await; let handler = async { - nexus + match nexus .instance_handle_creation_result(&opctx, &path.instance_id, result) - .await?; - Ok(HttpResponseUpdatedNoContent()) + .await + { + Ok(_) => Ok(HandleInstancePutResultResult::Ok.into()), + Err(err) => { + if todo { + Ok(HandleInstancePutResultResult::TimedOut.into()) + } else { + Err(err) + } + } + } }; apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await } diff --git a/nexus/types/src/internal_api/views.rs b/nexus/types/src/internal_api/views.rs index b7a097431b0..e23cd03549d 100644 --- a/nexus/types/src/internal_api/views.rs +++ b/nexus/types/src/internal_api/views.rs @@ -296,3 +296,12 @@ pub struct LastResultCompleted { /// arbitrary datum emitted by the background task pub details: serde_json::Value, } + +/// Tells sled-agent whether an instance whose status it's reporting is still +/// relevant, or if it's timed out during creation and been marked as failed +/// (such that sled-agent can destroy the tardy instance) +#[derive(Serialize, JsonSchema)] +pub enum HandleInstancePutResultResult { + Ok, + TimedOut, +} diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 096e7b78d1f..13a1f524143 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -376,18 +376,23 @@ impl InstanceManager { match instance.put_state(target).await { Ok(state) => { let instance_state: nexus_client::types::SledInstanceState = state.into(); - if let Err(err) = nexus_client + match nexus_client .cpapi_handle_instance_put_success( &instance_id, &instance_state, ) .await { - error!(log, "Failed to inform Nexus of instance_put success"; - "err" => %err, - "instance_state" => ?instance_state, - "instance_id" => %instance_id, - ); + Ok( + TODO HandleInstancePutResultResult::Ok + ) => {} + Err(err) => { + error!(log, "Failed to inform Nexus of instance_put success"; + "err" => %err, + "instance_state" => ?instance_state, + "instance_id" => %instance_id, + ) + } } } Err(instance_put_error) => {