From b6f0150dbe301d4fb6a0251c59b1bb28672bb7ce Mon Sep 17 00:00:00 2001 From: lif <> Date: Fri, 19 Jan 2024 02:01:28 -0800 Subject: [PATCH] wip: plumbing timeout state in internal api response --- nexus/src/app/instance.rs | 28 +++++++--- nexus/src/internal_api/http_entrypoints.rs | 29 +++------- nexus/types/src/internal_api/views.rs | 1 + openapi/nexus-internal.json | 63 +++++++++++++++++++--- sled-agent/src/instance_manager.rs | 28 ++++++---- 5 files changed, 101 insertions(+), 48 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index ac3cefe91da..863b77c8e8a 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -26,6 +26,7 @@ use nexus_db_queries::db::datastore::InstanceAndActiveVmm; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup; use nexus_db_queries::db::lookup::LookupPath; +use nexus_types::internal_api::views::HandleInstancePutResultResult; use omicron_common::address::PROPOLIS_PORT; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::ByteCount; @@ -140,6 +141,7 @@ impl From for omicron_common::api::external::Error { /// The kinds of state changes that can be requested of an instance's current /// VMM (i.e. the VMM pointed to be the instance's `propolis_id` field). +#[derive(Clone)] pub(crate) enum InstanceStateChangeRequest { Run, Reboot, @@ -943,7 +945,9 @@ impl super::Nexus { let instance_put_result = sa .instance_put_state( &instance_id, - &InstancePutStateBody { state: requested.into() }, + &InstancePutStateBody { + state: requested.clone().into(), + }, ) .await .map(|res| res.into_inner().updated_runtime.map(Into::into)) @@ -982,8 +986,8 @@ impl super::Nexus { tokio::spawn(async { tokio::time::sleep(Duration::from_secs(120)) .await; - todo: fail instance - }) + todo!("fail instance") + }); } self.write_returned_instance_state(&instance_id, state) .await @@ -1009,17 +1013,25 @@ impl super::Nexus { opctx: &OpContext, instance_id: &Uuid, result: Result, - ) -> Result<(), Error> { + ) -> Result { let (.., authz_instance) = LookupPath::new(&opctx, &self.db_datastore) .instance_id(*instance_id) .lookup_for(authz::Action::Modify) .await?; match result { - Ok(new_state) => self - .write_returned_instance_state(instance_id, Some(new_state)) - .await - .map(|_| ()), + Ok(new_state) => { + let (inst_updated, vmm_updated) = self + .write_returned_instance_state(instance_id, Some(new_state)) + .await?; + if !inst_updated && !vmm_updated { + // the generation number bumped up by the timeout task. + // TODO check actual state / put it in the TimedOut variant? + Ok(HandleInstancePutResultResult::TimedOut) + } else { + Ok(HandleInstancePutResultResult::Ok) + } + } Err(error) => { let state = self .db_datastore diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 1f90a8b1fd2..c1d6cbd9c11 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -304,20 +304,11 @@ async fn cpapi_handle_instance_put_success( // TODO: if instance_handle_creation_result errors because nexus gave // up waiting and marked the instance as failed, tell sled-agent to // destroy the instance - match nexus + nexus .instance_handle_creation_result(&opctx, &path.instance_id, result) .await - { - Ok(_) => Ok(HandleInstancePutResultResult::Ok.into()), - Err(err) => { - if todo { - Ok(HandleInstancePutResultResult::TimedOut.into()) - } else { - Err(err) - } - } - } - Ok(HttpResponseUpdatedNoContent()) + .map(HttpResponseOk) + .map_err(Into::into) }; apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await } @@ -341,19 +332,11 @@ async fn cpapi_handle_instance_put_failure( }); let opctx = crate::context::op_context_for_internal_api(&rqctx).await; let handler = async { - match nexus + nexus .instance_handle_creation_result(&opctx, &path.instance_id, result) .await - { - Ok(_) => Ok(HandleInstancePutResultResult::Ok.into()), - Err(err) => { - if todo { - Ok(HandleInstancePutResultResult::TimedOut.into()) - } else { - Err(err) - } - } - } + .map(HttpResponseOk) + .map_err(Into::into) }; apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await } diff --git a/nexus/types/src/internal_api/views.rs b/nexus/types/src/internal_api/views.rs index e23cd03549d..f36b912da79 100644 --- a/nexus/types/src/internal_api/views.rs +++ b/nexus/types/src/internal_api/views.rs @@ -301,6 +301,7 @@ pub struct LastResultCompleted { /// relevant, or if it's timed out during creation and been marked as failed /// (such that sled-agent can destroy the tardy instance) #[derive(Serialize, JsonSchema)] +#[serde(rename_all = "snake_case", tag = "last_result", content = "details")] pub enum HandleInstancePutResultResult { Ok, TimedOut, diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 1363a66b938..e7843d5f1b9 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -466,8 +466,8 @@ }, "/instances/{instance_id}/creation-failure": { "put": { - "summary": "Asynchronously report the unsuccessful result of a potentially long-running", - "description": "instance_put call to sled-agent made during instance creation.", + "summary": "Asynchronously report the unsuccessful result of certain instance_put calls", + "description": "(such as the potentially long-running one made during instance creation)", "operationId": "cpapi_handle_instance_put_failure", "parameters": [ { @@ -492,8 +492,15 @@ "required": true }, "responses": { - "204": { - "description": "resource updated" + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HandleInstancePutResultResult" + } + } + } }, "4XX": { "$ref": "#/components/responses/Error" @@ -506,8 +513,8 @@ }, "/instances/{instance_id}/creation-success": { "put": { - "summary": "Asynchronously report the successful result of a potentially long-running", - "description": "instance_put call to sled-agent made during instance creation.", + "summary": "Asynchronously report the successful result of certain instance_put calls", + "description": "(such as the potentially long-running one made during instance creation)", "operationId": "cpapi_handle_instance_put_success", "parameters": [ { @@ -531,8 +538,15 @@ "required": true }, "responses": { - "204": { - "description": "resource updated" + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HandleInstancePutResultResult" + } + } + } }, "4XX": { "$ref": "#/components/responses/Error" @@ -3753,6 +3767,39 @@ "format": "uint64", "minimum": 0 }, + "HandleInstancePutResultResult": { + "description": "Tells sled-agent whether an instance whose status it's reporting is still relevant, or if it's timed out during creation and been marked as failed (such that sled-agent can destroy the tardy instance)", + "oneOf": [ + { + "type": "object", + "properties": { + "last_result": { + "type": "string", + "enum": [ + "ok" + ] + } + }, + "required": [ + "last_result" + ] + }, + { + "type": "object", + "properties": { + "last_result": { + "type": "string", + "enum": [ + "timed_out" + ] + } + }, + "required": [ + "last_result" + ] + } + ] + }, "HistogramError": { "description": "Errors related to constructing histograms or adding samples into them.", "oneOf": [ diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 13a1f524143..641433d752d 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -19,6 +19,7 @@ use illumos_utils::link::VnicAllocator; use illumos_utils::opte::PortManager; use illumos_utils::running_zone::ZoneBuilderFactory; use illumos_utils::vmm_reservoir; +use nexus_client::types::HandleInstancePutResultResult; use omicron_common::api::external::ByteCount; use omicron_common::api::internal::nexus::InstanceRuntimeState; use omicron_common::api::internal::nexus::SledInstanceState; @@ -382,10 +383,12 @@ impl InstanceManager { &instance_state, ) .await + .map(nexus_client::ResponseValue::into_inner) { - Ok( - TODO HandleInstancePutResultResult::Ok - ) => {} + Ok(HandleInstancePutResultResult::Ok) => {} + Ok(HandleInstancePutResultResult::TimedOut) => { + todo!("nexus doesn't want us any more, terminate instance") + } Err(err) => { error!(log, "Failed to inform Nexus of instance_put success"; "err" => %err, @@ -396,18 +399,25 @@ impl InstanceManager { } } Err(instance_put_error) => { - if let Err(err) = nexus_client + match nexus_client .cpapi_handle_instance_put_failure( &instance_id, &instance_put_error.to_string(), ) .await + .map(nexus_client::ResponseValue::into_inner) { - error!(log, "Failed to inform Nexus of instance_put failure"; - "err" => %err, - "instance_id" => %instance_id, - "instance_put_error" => %instance_put_error, - ); + Ok(HandleInstancePutResultResult::Ok) => {} + Ok(HandleInstancePutResultResult::TimedOut) => { + todo!("well, i guess this is less awkward but clean up if we have to") + } + Err(err) => { + error!(log, "Failed to inform Nexus of instance_put failure"; + "err" => %err, + "instance_id" => %instance_id, + "instance_put_error" => %instance_put_error, + ); + } } } };