diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 81c97ca1084..81adbaa53fd 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -976,16 +976,12 @@ impl super::Nexus { &self, opctx: &OpContext, instance_id: &Uuid, - result: Result< - Option, - sled_agent_client::types::Error, - >, + result: Result, ) -> Result<(), Error> { - let (.., authz_instance, db_instance) = - LookupPath::new(&opctx, &self.db_datastore) - .instance_id(*instance_id) - .lookup_for(authz::Action::Modify) - .await?; + let (.., authz_instance) = LookupPath::new(&opctx, &self.db_datastore) + .instance_id(*instance_id) + .lookup_for(authz::Action::Modify) + .await?; let state = self .db_datastore @@ -993,14 +989,29 @@ impl super::Nexus { .await?; // TODO: add param for sled-agent to show its 'previous' and compare with this + // to validate consistency between nexus and sled-agent let prev_instance_runtime = &state.instance().runtime_state; - self.handle_instance_put_result( - instance_id, - prev_instance_runtime, - result.map_err(Into::into), // TODO: this isn't real - ).await?; - todo!() + match result { + Ok(new_state) => self + .db_datastore + .instance_and_vmm_update_runtime( + instance_id, + &new_state.instance_state.into(), + &new_state.propolis_id, + &new_state.vmm_state.into(), + ) + .await + .map(|_| ()), + Err(error) => { + self.mark_instance_failed( + instance_id, + prev_instance_runtime, + error, + ) + .await + } + } } /// Modifies the runtime state of the Instance as requested. This generally @@ -1296,7 +1307,7 @@ impl super::Nexus { &self, instance_id: &Uuid, prev_instance_runtime: &db::model::InstanceRuntimeState, - reason: &SledAgentInstancePutError, + reason: impl std::fmt::Debug, ) -> Result<(), Error> { error!(self.log, "marking instance failed due to sled agent API error"; "instance_id" => %instance_id, diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 762c0ead5a0..904342bfed5 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -58,6 +58,8 @@ pub(crate) fn internal_api() -> NexusApiDescription { api.register(physical_disk_delete)?; api.register(zpool_put)?; api.register(cpapi_instances_put)?; + api.register(cpapi_handle_instance_put_success)?; + api.register(cpapi_handle_instance_put_failure)?; api.register(cpapi_disks_put)?; api.register(cpapi_volume_remove_read_only_parent)?; api.register(cpapi_disk_remove_read_only_parent)?; @@ -269,24 +271,48 @@ async fn cpapi_instances_put( apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Asynchronously report the result of a potentially long-running +/// Asynchronously report the successful result of a potentially long-running /// instance_put call to sled-agent made during instance creation. #[endpoint { - method = PUT, - path = "/instances/{instance_id}/creation-result", +method = PUT, +path = "/instances/{instance_id}/creation-success", +}] +async fn cpapi_handle_instance_put_success( + rqctx: RequestContext>, + path_params: Path, + instance_state: TypedBody, +) -> Result { + let apictx = rqctx.context(); + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + let result = Ok(instance_state.into_inner()); + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + let handler = async { + nexus + .instance_handle_creation_result(&opctx, &path.instance_id, result) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + +/// Asynchronously report the unsuccessful result of a potentially long-running +/// instance_put call to sled-agent made during instance creation. +#[endpoint { +method = PUT, +path = "/instances/{instance_id}/creation-failure", }] -async fn cpapi_handle_instance_put_result( +async fn cpapi_handle_instance_put_failure( rqctx: RequestContext>, path_params: Path, - result: TypedBody, - sled_agent_client::types::Error - >>, + error: TypedBody, ) -> Result { let apictx = rqctx.context(); let nexus = &apictx.nexus; let path = path_params.into_inner(); - let result = result.into_inner(); + let result = Err(omicron_common::api::external::Error::InternalError { + internal_message: error.into_inner(), + }); let opctx = crate::context::op_context_for_internal_api(&rqctx).await; let handler = async { nexus diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index a1d70d838b1..4665691f61c 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -230,6 +230,85 @@ } } }, + "/instances/{instance_id}/creation-failure": { + "put": { + "summary": "Asynchronously report the unsuccessful result of a potentially long-running", + "description": "instance_put call to sled-agent made during instance creation.", + "operationId": "cpapi_handle_instance_put_failure", + "parameters": [ + { + "in": "path", + "name": "instance_id", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "title": "String", + "type": "string" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/instances/{instance_id}/creation-success": { + "put": { + "summary": "Asynchronously report the successful result of a potentially long-running", + "description": "instance_put call to sled-agent made during instance creation.", + "operationId": "cpapi_handle_instance_put_success", + "parameters": [ + { + "in": "path", + "name": "instance_id", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SledInstanceState" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/metrics/collect/{producer_id}": { "get": { "summary": "Endpoint for oximeter to collect nexus server metrics.", diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 1d9db9d4127..31cd9f06e73 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -362,8 +362,60 @@ impl InstanceManager { } }; - let new_state = instance.put_state(target).await?; - Ok(InstancePutStateResponse { updated_runtime: Some(new_state) }) + match target { + // these may involve a long-running zone creation, so avoid HTTP + // request timeouts by decoupling the response + InstanceStateRequested::MigrationTarget(_) + | InstanceStateRequested::Running => { + let inst_mgr = self.inner.clone(); + let log = + self.inner.log.new(o!("component" => "InstanceManager")); + tokio::spawn(async move { + let nexus_client = inst_mgr.nexus_client.client(); + match instance.put_state(target).await { + Ok(state) => { + let instance_state: nexus_client::types::SledInstanceState = state.into(); + if let Err(err) = nexus_client + .cpapi_handle_instance_put_success( + &instance_id, + &instance_state, + ) + .await + { + error!(log, "Failed to inform Nexus of instance_put success"; + "err" => %err, + "instance_state" => ?instance_state, + "instance_id" => %instance_id, + ); + } + } + Err(instance_put_error) => { + if let Err(err) = nexus_client + .cpapi_handle_instance_put_failure( + &instance_id, + &instance_put_error.to_string(), + ) + .await + { + error!(log, "Failed to inform Nexus of instance_put failure"; + "err" => %err, + "instance_id" => %instance_id, + "instance_put_error" => %instance_put_error, + ); + } + } + }; + }); + Ok(InstancePutStateResponse { updated_runtime: None }) + } + InstanceStateRequested::Stopped + | InstanceStateRequested::Reboot => { + let new_state = instance.put_state(target).await?; + Ok(InstancePutStateResponse { + updated_runtime: Some(new_state), + }) + } + } } /// Idempotently attempts to set the instance's migration IDs to the