diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs
index 987a8ac794..93386a66d0 100644
--- a/nexus/src/app/instance.rs
+++ b/nexus/src/app/instance.rs
@@ -61,6 +61,83 @@ use uuid::Uuid;
 
 const MAX_KEYS_PER_INSTANCE: u32 = 8;
 
+type SledAgentClientError =
+    sled_agent_client::Error<sled_agent_client::types::Error>;
+
+// Newtype wrapper to avoid the orphan type rule.
+#[derive(Debug)]
+pub struct SledAgentInstancePutError(pub SledAgentClientError);
+
+impl std::fmt::Display for SledAgentInstancePutError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+impl From<SledAgentClientError> for SledAgentInstancePutError {
+    fn from(value: SledAgentClientError) -> Self {
+        Self(value)
+    }
+}
+
+impl From<SledAgentInstancePutError> for omicron_common::api::external::Error {
+    fn from(value: SledAgentInstancePutError) -> Self {
+        value.0.into()
+    }
+}
+
+impl SledAgentInstancePutError {
+    /// Returns `true` if this error is of a class that indicates that Nexus
+    /// cannot assume anything about the health of the instance or its sled.
+    pub fn instance_unhealthy(&self) -> bool {
+        // TODO(#3238) TODO(#4226) For compatibility, this logic is lifted from
+        // the From impl that converts Progenitor client errors to
+        // `omicron_common::api::external::Error`s and from previous logic in
+        // this module that inferred instance health from those converted
+        // errors. In particular, some of the outer Progenitor client error
+        // variants (e.g. CommunicationError) can indicate transient conditions
+        // that aren't really fatal to an instance and don't otherwise indicate
+        // that it's unhealthy.
+        //
+        // To match old behavior until this is revisited, however, treat all
+        // Progenitor errors except for explicit error responses as signs of an
+        // unhealthy instance, and then only treat an instance as healthy if its
+        // sled returned a 400-level status code.
+        match &self.0 {
+            progenitor_client::Error::ErrorResponse(rv) => {
+                !rv.status().is_client_error()
+            }
+            _ => true,
+        }
+    }
+}
+
+/// An error that can be returned from an operation that changes the state of an
+/// instance on a specific sled.
+#[derive(Debug, thiserror::Error)]
+pub enum InstanceStateChangeError {
+    /// Sled agent returned an error from one of its instance endpoints.
+    #[error("sled agent client error: {0}")]
+    SledAgent(SledAgentInstancePutError),
+
+    /// Some other error occurred outside of the attempt to communicate with
+    /// sled agent.
+    #[error(transparent)]
+    Other(#[from] omicron_common::api::external::Error),
+}
+
+// Allow direct conversion of instance state change errors into API errors for
+// callers who don't care about the specific reason the update failed and just
+// need to return an API error.
+impl From<InstanceStateChangeError> for omicron_common::api::external::Error {
+    fn from(value: InstanceStateChangeError) -> Self {
+        match value {
+            InstanceStateChangeError::SledAgent(e) => e.into(),
+            InstanceStateChangeError::Other(e) => e,
+        }
+    }
+}
+
 /// The kinds of state changes that can be requested of an instance's current
 /// VMM (i.e. the VMM pointed to be the instance's `propolis_id` field).
 pub(crate) enum InstanceStateChangeRequest {
@@ -438,19 +515,30 @@ impl super::Nexus {
                 },
             )
             .await
-            .map(|res| Some(res.into_inner()));
+            .map(|res| Some(res.into_inner().into()))
+            .map_err(|e| SledAgentInstancePutError(e));
 
         // Write the updated instance runtime state back to CRDB. If this
         // outright fails, this operation fails. If the operation nominally
        // succeeds but nothing was updated, this action is outdated and the
        // caller should not proceed with migration.
-        let (updated, _) = self
-            .handle_instance_put_result(
-                &instance_id,
-                prev_instance_runtime,
-                instance_put_result.map(|state| state.map(Into::into)),
-            )
-            .await?;
+        let (updated, _) = match instance_put_result {
+            Ok(state) => {
+                self.write_returned_instance_state(&instance_id, state).await?
+            }
+            Err(e) => {
+                if e.instance_unhealthy() {
+                    let _ = self
+                        .mark_instance_failed(
+                            &instance_id,
+                            &prev_instance_runtime,
+                            &e,
+                        )
+                        .await;
+                }
+                return Err(e.into());
+            }
+        };
 
         if updated {
             Ok(self
@@ -498,14 +586,26 @@ impl super::Nexus {
                 },
             )
             .await
-            .map(|res| Some(res.into_inner()));
+            .map(|res| Some(res.into_inner().into()))
+            .map_err(|e| SledAgentInstancePutError(e));
 
-        self.handle_instance_put_result(
-            &instance_id,
-            prev_instance_runtime,
-            instance_put_result.map(|state| state.map(Into::into)),
-        )
-        .await?;
+        match instance_put_result {
+            Ok(state) => {
+                self.write_returned_instance_state(&instance_id, state).await?;
+            }
+            Err(e) => {
+                if e.instance_unhealthy() {
+                    let _ = self
+                        .mark_instance_failed(
+                            &instance_id,
+                            &prev_instance_runtime,
+                            &e,
+                        )
+                        .await;
+                }
+                return Err(e.into());
+            }
+        }
 
         Ok(())
     }
@@ -631,22 +731,18 @@ impl super::Nexus {
        opctx: &OpContext,
        authz_instance: &authz::Instance,
        sled_id: &Uuid,
-        prev_instance_runtime: &db::model::InstanceRuntimeState,
-    ) -> Result<(), Error> {
+    ) -> Result<Option<nexus::SledInstanceState>, InstanceStateChangeError>
+    {
        opctx.authorize(authz::Action::Modify, authz_instance).await?;
        let sa = self.sled_client(&sled_id).await?;
-        let result = sa
-            .instance_unregister(&authz_instance.id())
+        sa.instance_unregister(&authz_instance.id())
            .await
-            .map(|res| res.into_inner().updated_runtime);
-
-        self.handle_instance_put_result(
-            &authz_instance.id(),
-            prev_instance_runtime,
-            result.map(|state| state.map(Into::into)),
-        )
-        .await
-        .map(|_| ())
+            .map(|res| res.into_inner().updated_runtime.map(Into::into))
+            .map_err(|e| {
+                InstanceStateChangeError::SledAgent(SledAgentInstancePutError(
+                    e,
+                ))
+            })
    }
 
    /// Determines the action to take on an instance's active VMM given a
@@ -799,7 +895,7 @@ impl super::Nexus {
        prev_instance_state: &db::model::Instance,
        prev_vmm_state: &Option<db::model::Vmm>,
        requested: InstanceStateChangeRequest,
-    ) -> Result<(), Error> {
+    ) -> Result<(), InstanceStateChangeError> {
        opctx.authorize(authz::Action::Modify, authz_instance).await?;
 
        let instance_id = authz_instance.id();
@@ -817,16 +913,23 @@ impl super::Nexus {
                &InstancePutStateBody { state: requested.into() },
            )
            .await
-            .map(|res| res.into_inner().updated_runtime)
-            .map(|state| state.map(Into::into));
+            .map(|res| res.into_inner().updated_runtime.map(Into::into))
+            .map_err(|e| SledAgentInstancePutError(e));
 
-        self.handle_instance_put_result(
-            &instance_id,
-            prev_instance_state.runtime(),
-            instance_put_result,
-        )
-        .await
-        .map(|_| ())
+        // If the operation succeeded, write the instance state back,
+        // returning any subsequent errors that occurred during that
+        // write.
+        //
+        // If the operation failed, kick the sled agent error back up to
+        // the caller to let it decide how to handle it.
+        match instance_put_result {
+            Ok(state) => self
+                .write_returned_instance_state(&instance_id, state)
+                .await
+                .map(|_| ())
+                .map_err(Into::into),
+            Err(e) => Err(InstanceStateChangeError::SledAgent(e)),
+        }
    }
 }
 
@@ -1046,143 +1149,117 @@ impl super::Nexus {
                },
            )
            .await
-            .map(|res| Some(res.into_inner()));
+            .map(|res| Some(res.into_inner().into()))
+            .map_err(|e| SledAgentInstancePutError(e));
 
-        self.handle_instance_put_result(
-            &db_instance.id(),
-            db_instance.runtime(),
-            instance_register_result.map(|state| state.map(Into::into)),
-        )
-        .await
-        .map(|_| ())
+        match instance_register_result {
+            Ok(state) => {
+                self.write_returned_instance_state(&db_instance.id(), state)
+                    .await?;
+            }
+            Err(e) => {
+                if e.instance_unhealthy() {
+                    let _ = self
+                        .mark_instance_failed(
+                            &db_instance.id(),
+                            db_instance.runtime(),
+                            &e,
+                        )
+                        .await;
+                }
+                return Err(e.into());
+            }
+        }
+
+        Ok(())
    }
 
-    /// Updates an instance's CRDB record based on the result of a call to sled
-    /// agent that tried to update the instance's state.
-    ///
-    /// # Parameters
-    ///
-    /// - `db_instance`: The CRDB instance record observed by the caller before
-    ///   it attempted to update the instance's state.
-    /// - `result`: The result of the relevant sled agent operation. If this is
-    ///   `Ok`, the payload is the updated instance runtime state returned from
-    ///   sled agent, if there was one.
+    /// Takes an updated instance state returned from a call to sled agent and
+    /// writes it back to the database.
    ///
    /// # Return value
    ///
-    /// - `Ok(true)` if the caller supplied an updated instance record and this
-    ///   routine successfully wrote it to CRDB.
-    /// - `Ok(false)` if the sled agent call succeeded, but this routine did not
-    ///   update CRDB.
-    ///   This can happen either because sled agent didn't return an updated
-    ///   record or because the updated record was superseded by a state update
-    ///   with a more advanced generation number.
-    /// - `Err` if the sled agent operation failed or this routine received an
-    ///   error while trying to update CRDB.
-    async fn handle_instance_put_result(
+    /// - `Ok((instance_updated, vmm_updated))` if no failures occurred. The
+    ///   tuple fields indicate which database records (if any) were updated.
+    ///   Note that it is possible for sled agent not to return an updated
+    ///   instance state from a particular API call. In that case, the `state`
+    ///   parameter is `None` and this routine returns `Ok((false, false))`.
+    /// - `Err` if an error occurred while writing state to the database. A
+    ///   database operation that succeeds but doesn't update anything (e.g.
+    ///   owing to an outdated generation number) will return `Ok`.
+    async fn write_returned_instance_state(
        &self,
        instance_id: &Uuid,
-        prev_instance_runtime: &db::model::InstanceRuntimeState,
-        result: Result<
-            Option<nexus::SledInstanceState>,
-            sled_agent_client::Error<sled_agent_client::types::Error>,
-        >,
+        state: Option<nexus::SledInstanceState>,
    ) -> Result<(bool, bool), Error> {
-        slog::debug!(&self.log, "Handling sled agent instance PUT result";
+        slog::debug!(&self.log,
+                     "writing instance state returned from sled agent";
                     "instance_id" => %instance_id,
-                     "result" => ?result);
-
-        match result {
-            Ok(Some(new_state)) => {
-                let update_result = self
-                    .db_datastore
-                    .instance_and_vmm_update_runtime(
-                        instance_id,
-                        &new_state.instance_state.into(),
-                        &new_state.propolis_id,
-                        &new_state.vmm_state.into(),
-                    )
-                    .await;
+                     "new_state" => ?state);
 
-                slog::debug!(&self.log,
-                             "Attempted DB update after instance PUT";
-                             "instance_id" => %instance_id,
-                             "propolis_id" => %new_state.propolis_id,
-                             "result" => ?update_result);
-
-                update_result
-            }
-            Ok(None) => Ok((false, false)),
-            Err(e) => {
-                // The sled-agent has told us that it can't do what we
-                // requested, but does that mean a failure? One example would be
-                // if we try to "reboot" a stopped instance. That shouldn't
-                // transition the instance to failed. But if the sled-agent
-                // *can't* boot a stopped instance, that should transition
-                // to failed.
-                //
-                // Without a richer error type, let the sled-agent tell Nexus
-                // what to do with status codes.
-                error!(self.log, "received error from instance PUT";
-                       "instance_id" => %instance_id,
-                       "error" => ?e);
+        if let Some(state) = state {
+            let update_result = self
+                .db_datastore
+                .instance_and_vmm_update_runtime(
+                    instance_id,
+                    &state.instance_state.into(),
+                    &state.propolis_id,
+                    &state.vmm_state.into(),
+                )
+                .await;
 
-                // Convert to the Omicron API error type.
-                //
-                // TODO(#3238): This is an extremely lossy conversion: if the
-                // operation failed without getting a response from sled agent,
-                // this unconditionally converts to Error::InternalError.
-                let e = e.into();
-
-                match &e {
-                    // Bad request shouldn't change the instance state.
-                    Error::InvalidRequest { .. } => Err(e),
-
-                    // Internal server error (or anything else) should change
-                    // the instance state to failed, we don't know what state
-                    // the instance is in.
-                    //
-                    // TODO(#4226): This logic needs to be revisited:
-                    // - Some errors that don't get classified as
-                    //   Error::InvalidRequest (timeouts, disconnections due to
-                    //   network weather, etc.) are not necessarily fatal to the
-                    //   instance and shouldn't mark it as Failed.
-                    // - If the instance still has a running VMM, this operation
-                    //   won't terminate it or reclaim its resources. (The
-                    //   resources will be reclaimed if the sled later reports
-                    //   that the VMM is gone, however.)
-                    _ => {
-                        let new_runtime = db::model::InstanceRuntimeState {
-                            nexus_state: db::model::InstanceState::new(
-                                InstanceState::Failed,
-                            ),
+            slog::debug!(&self.log,
+                         "attempted to write instance state from sled agent";
+                         "instance_id" => %instance_id,
+                         "propolis_id" => %state.propolis_id,
+                         "result" => ?update_result);
 
-                            // TODO(#4226): Clearing the Propolis ID is required
-                            // to allow the instance to be deleted, but this
-                            // doesn't actually terminate the VMM (see above).
-                            propolis_id: None,
-                            gen: prev_instance_runtime.gen.next().into(),
-                            ..prev_instance_runtime.clone()
-                        };
+            update_result
+        } else {
+            Ok((false, false))
+        }
+    }
 
-                        // XXX what if this fails?
-                        let result = self
-                            .db_datastore
-                            .instance_update_runtime(&instance_id, &new_runtime)
-                            .await;
+    /// Attempts to move an instance from `prev_instance_runtime` to the
+    /// `Failed` state in response to an error returned from a call to a sled
+    /// agent instance API, supplied in `reason`.
+    pub(crate) async fn mark_instance_failed(
+        &self,
+        instance_id: &Uuid,
+        prev_instance_runtime: &db::model::InstanceRuntimeState,
+        reason: &SledAgentInstancePutError,
+    ) -> Result<(), Error> {
+        error!(self.log, "marking instance failed due to sled agent API error";
+               "instance_id" => %instance_id,
+               "error" => ?reason);
+
+        let new_runtime = db::model::InstanceRuntimeState {
+            nexus_state: db::model::InstanceState::new(InstanceState::Failed),
+
+            // TODO(#4226): Clearing the Propolis ID is required to allow the
+            // instance to be deleted, but this doesn't actually terminate the
+            // VMM.
+            propolis_id: None,
+            gen: prev_instance_runtime.gen.next().into(),
+            ..prev_instance_runtime.clone()
+        };
 
-                        error!(
-                            self.log,
-                            "attempted to set instance to Failed after bad put";
+        match self
+            .db_datastore
+            .instance_update_runtime(&instance_id, &new_runtime)
+            .await
+        {
+            Ok(_) => info!(self.log, "marked instance as Failed";
+                "instance_id" => %instance_id),
+            // XXX: It's not clear what to do with this error; should it be
+            // bubbled back up to the caller?
+            Err(e) => error!(self.log,
+                "failed to write Failed instance state to DB";
                "instance_id" => %instance_id,
-                            "result" => ?result,
-                        );
-
-                        Err(e)
-                    }
-                }
-            }
+                "error" => ?e),
        }
+
+        Ok(())
    }
 
    /// Lists disks attached to the instance.
diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs
index 5149825842..fd86e2052a 100644
--- a/nexus/src/app/sagas/instance_create.rs
+++ b/nexus/src/app/sagas/instance_create.rs
@@ -903,8 +903,7 @@ pub mod test {
    };
    use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection};
    use diesel::{
-        BoolExpressionMethods, ExpressionMethods, OptionalExtension, QueryDsl,
-        SelectableHelper,
+        ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper,
    };
    use dropshot::test_util::ClientTestContext;
    use nexus_db_queries::authn::saga::Serialized;
@@ -1073,68 +1072,6 @@ pub mod test {
            .unwrap()
    }
 
-    async fn no_virtual_provisioning_resource_records_exist(
-        datastore: &DataStore,
-    ) -> bool {
-        use nexus_db_queries::db::model::VirtualProvisioningResource;
-        use nexus_db_queries::db::schema::virtual_provisioning_resource::dsl;
-
-        let conn = datastore.pool_connection_for_tests().await.unwrap();
-
-        datastore
-            .transaction_retry_wrapper("no_virtual_provisioning_resource_records_exist")
-            .transaction(&conn, |conn| async move {
-                conn
-                    .batch_execute_async(nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL)
-                    .await
-                    .unwrap();
-
-                Ok(
-                    dsl::virtual_provisioning_resource
-                        .filter(dsl::resource_type.eq(nexus_db_queries::db::model::ResourceTypeProvisioned::Instance.to_string()))
-                        .select(VirtualProvisioningResource::as_select())
-                        .get_results_async::<VirtualProvisioningResource>(&conn)
-                        .await
-                        .unwrap()
-                        .is_empty()
-                )
-            }).await.unwrap()
-    }
-
-    async fn no_virtual_provisioning_collection_records_using_instances(
-        datastore: &DataStore,
-    ) -> bool {
-        use nexus_db_queries::db::model::VirtualProvisioningCollection;
-        use nexus_db_queries::db::schema::virtual_provisioning_collection::dsl;
-
-        let conn = datastore.pool_connection_for_tests().await.unwrap();
-
-        datastore
-            .transaction_retry_wrapper(
-                "no_virtual_provisioning_collection_records_using_instances",
-            )
-            .transaction(&conn, |conn| async move {
-                conn.batch_execute_async(
-                    nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL,
-                )
-                .await
-                .unwrap();
-                Ok(dsl::virtual_provisioning_collection
-                    .filter(
-                        dsl::cpus_provisioned
-                            .ne(0)
-                            .or(dsl::ram_provisioned.ne(0)),
-                    )
-                    .select(VirtualProvisioningCollection::as_select())
-                    .get_results_async::<VirtualProvisioningCollection>(&conn)
-                    .await
-                    .unwrap()
-                    .is_empty())
-            })
-            .await
-            .unwrap()
-    }
-
    async fn disk_is_detached(datastore: &DataStore) -> bool {
        use nexus_db_queries::db::model::Disk;
        use nexus_db_queries::db::schema::disk::dsl;
@@ -1170,11 +1107,14 @@ pub mod test {
        assert!(no_external_ip_records_exist(datastore).await);
        assert!(no_sled_resource_instance_records_exist(datastore).await);
        assert!(
-            no_virtual_provisioning_resource_records_exist(datastore).await
+            test_helpers::no_virtual_provisioning_resource_records_exist(
+                cptestctx
+            )
+            .await
        );
        assert!(
-            no_virtual_provisioning_collection_records_using_instances(
-                datastore
+            test_helpers::no_virtual_provisioning_collection_records_using_instances(
+                cptestctx
            )
            .await
        );
diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs
index d32a20bc40..7a417a5781 100644
--- a/nexus/src/app/sagas/instance_migrate.rs
+++ b/nexus/src/app/sagas/instance_migrate.rs
@@ -3,7 +3,9 @@
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
 use super::{NexusActionContext, NexusSaga, ACTION_GENERATE_ID};
-use crate::app::instance::InstanceStateChangeRequest;
+use crate::app::instance::{
+    InstanceStateChangeError, InstanceStateChangeRequest,
+};
 use crate::app::sagas::{
    declare_saga_actions, instance_common::allocate_sled_ipv6,
 };
@@ -387,28 +389,26 @@ async fn sim_ensure_destination_propolis_undo(
           "prev_runtime_state" => ?db_instance.runtime());
 
    // Ensure that the destination sled has no Propolis matching the description
-    // the saga previously generated.
-    //
-    // Sled agent guarantees that if an instance is unregistered from a sled
-    // that does not believe it holds the "active" Propolis for the instance,
-    // then the sled's copy of the instance record will not change during
-    // unregistration. This precondition always holds here because the "start
-    // migration" step is not allowed to unwind once migration has possibly
-    // started. Not changing the instance is important here because the next
-    // undo step (clearing migration IDs) needs to advance the instance's
-    // generation number to succeed.
-    osagactx
+    // the saga previously generated. If this succeeds, or if it fails because
+    // the destination sled no longer knows about this instance, allow the rest
+    // of unwind to take care of cleaning up the migration IDs in the instance
+    // record. Otherwise the unwind has failed and manual intervention is
+    // needed.
+    match osagactx
        .nexus()
-        .instance_ensure_unregistered(
-            &opctx,
-            &authz_instance,
-            &dst_sled_id,
-            db_instance.runtime(),
-        )
+        .instance_ensure_unregistered(&opctx, &authz_instance, &dst_sled_id)
        .await
-        .map_err(ActionError::action_failed)?;
-
-    Ok(())
+    {
+        Ok(_) => Ok(()),
+        Err(InstanceStateChangeError::SledAgent(inner)) => {
+            if !inner.instance_unhealthy() {
+                Ok(())
+            } else {
+                Err(inner.0.into())
+            }
+        }
+        Err(e) => Err(e.into()),
+    }
 }
 
 async fn sim_instance_migrate(
@@ -454,7 +454,7 @@ async fn sim_instance_migrate(
    //
    // Possibly sled agent can help with this by using state or Propolis
    // generation numbers to filter out stale destruction requests.
-    osagactx
+    match osagactx
        .nexus()
        .instance_request_state(
            &opctx,
@@ -469,9 +469,30 @@ async fn sim_instance_migrate(
            ),
        )
        .await
-        .map_err(ActionError::action_failed)?;
+    {
+        Ok(_) => Ok(()),
+        // Failure to initiate migration to a specific target doesn't entail
+        // that the entire instance has failed, so handle errors by unwinding
+        // the saga without otherwise touching the instance's state.
+        Err(InstanceStateChangeError::SledAgent(inner)) => {
+            info!(osagactx.log(),
+                  "migration saga: sled agent failed to start migration";
+                  "instance_id" => %db_instance.id(),
+                  "error" => ?inner);
+
+            Err(ActionError::action_failed(
+                omicron_common::api::external::Error::from(inner),
+            ))
+        }
+        Err(InstanceStateChangeError::Other(inner)) => {
+            info!(osagactx.log(),
+                  "migration saga: internal error changing instance state";
+                  "instance_id" => %db_instance.id(),
+                  "error" => ?inner);
 
-    Ok(())
+            Err(ActionError::action_failed(inner))
+        }
+    }
 }
 
 #[cfg(test)]
diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs
index 76773d6369..e6717b0164 100644
--- a/nexus/src/app/sagas/instance_start.rs
+++ b/nexus/src/app/sagas/instance_start.rs
@@ -10,6 +10,7 @@ use super::{
    instance_common::allocate_sled_ipv6, NexusActionContext, NexusSaga,
    SagaInitError, ACTION_GENERATE_ID,
 };
+use crate::app::instance::InstanceStateChangeError;
 use crate::app::sagas::declare_saga_actions;
 use chrono::Utc;
 use nexus_db_queries::db::{identity::Resource, lookup::LookupPath};
@@ -52,11 +53,6 @@ declare_saga_actions! {
        - sis_move_to_starting_undo
    }
 
-    ADD_VIRTUAL_RESOURCES -> "virtual_resources" {
-        + sis_account_virtual_resources
-        - sis_account_virtual_resources_undo
-    }
-
    // TODO(#3879) This can be replaced with an action that triggers the NAT RPW
    // once such an RPW is available.
    DPD_ENSURE -> "dpd_ensure" {
@@ -74,6 +70,17 @@ declare_saga_actions! {
        - sis_ensure_registered_undo
    }
 
+    // Only account for the instance's resource consumption when the saga is on
+    // the brink of actually starting it. This allows prior steps' undo actions
+    // to change the instance's generation number if warranted (e.g. by moving
+    // the instance to the Failed state) without disrupting this step's undo
+    // action (which depends on the instance bearing the same generation number
+    // at undo time that it had at resource accounting time).
+    ADD_VIRTUAL_RESOURCES -> "virtual_resources" {
+        + sis_account_virtual_resources
+        - sis_account_virtual_resources_undo
+    }
+
    ENSURE_RUNNING -> "ensure_running" {
        + sis_ensure_running
    }
@@ -103,10 +110,10 @@ impl NexusSaga for SagaInstanceStart {
        builder.append(alloc_propolis_ip_action());
        builder.append(create_vmm_record_action());
        builder.append(mark_as_starting_action());
-        builder.append(add_virtual_resources_action());
        builder.append(dpd_ensure_action());
        builder.append(v2p_ensure_action());
        builder.append(ensure_registered_action());
+        builder.append(add_virtual_resources_action());
        builder.append(ensure_running_action());
        Ok(builder.build()?)
    }
@@ -575,18 +582,87 @@ async fn sis_ensure_registered_undo(
        .await
        .map_err(ActionError::action_failed)?;
 
-    osagactx
+    // If the sled successfully unregistered the instance, allow the rest of
+    // saga unwind to restore the instance record to its prior state (without
+    // writing back the state returned from sled agent). Otherwise, try to
+    // reason about the next action from the specific kind of error that was
+    // returned.
+    if let Err(e) = osagactx
        .nexus()
-        .instance_ensure_unregistered(
-            &opctx,
-            &authz_instance,
-            &sled_id,
-            db_instance.runtime(),
-        )
+        .instance_ensure_unregistered(&opctx, &authz_instance, &sled_id)
        .await
-        .map_err(ActionError::action_failed)?;
+    {
+        error!(osagactx.log(),
+               "start saga: failed to unregister instance from sled";
+               "instance_id" => %instance_id,
+               "error" => ?e);
+
+        // If the failure came from talking to sled agent, and the error code
+        // indicates the instance or sled might be unhealthy, manual
+        // intervention is likely to be needed, so try to mark the instance as
+        // Failed and then bail on unwinding.
+        //
+        // If sled agent is in good shape but just doesn't know about the
+        // instance, this saga still owns the instance's state, so allow
+        // unwinding to continue.
+        //
+        // If some other Nexus error occurred, this saga is in bad shape, so
+        // return an error indicating that intervention is needed without trying
+        // to modify the instance further.
+        //
+        // TODO(#3238): `instance_unhealthy` does not take an especially nuanced
+        // view of the meanings of the error codes sled agent could return, so
+        // assuming that an error that isn't `instance_unhealthy` means
+        // that everything is hunky-dory and it's OK to continue unwinding may
+        // be a bit of a stretch. See the definition of `instance_unhealthy` for
+        // more details.
+        match e {
+            InstanceStateChangeError::SledAgent(inner)
+                if inner.instance_unhealthy() =>
+            {
+                error!(osagactx.log(),
+                       "start saga: failing instance after unregister failure";
+                       "instance_id" => %instance_id,
+                       "error" => ?inner);
+
+                if let Err(set_failed_error) = osagactx
+                    .nexus()
+                    .mark_instance_failed(
+                        &instance_id,
+                        db_instance.runtime(),
+                        &inner,
+                    )
+                    .await
+                {
+                    error!(osagactx.log(),
+                           "start saga: failed to mark instance as failed";
+                           "instance_id" => %instance_id,
+                           "error" => ?set_failed_error);
+
+                    Err(set_failed_error.into())
+                } else {
+                    Err(inner.0.into())
+                }
+            }
+            InstanceStateChangeError::SledAgent(_) => {
+                info!(osagactx.log(),
+                      "start saga: instance already unregistered from sled";
+                      "instance_id" => %instance_id);
 
-    Ok(())
+                Ok(())
+            }
+            InstanceStateChangeError::Other(inner) => {
+                error!(osagactx.log(),
+                       "start saga: internal error unregistering instance";
+                       "instance_id" => %instance_id,
+                       "error" => ?inner);
+
+                Err(inner.into())
+            }
+        }
+    } else {
+        Ok(())
+    }
 }
 
 async fn sis_ensure_running(
@@ -615,7 +691,7 @@ async fn sis_ensure_running(
        .await
        .map_err(ActionError::action_failed)?;
 
-    osagactx
+    match osagactx
        .nexus()
        .instance_request_state(
            &opctx,
@@ -625,9 +701,30 @@ async fn sis_ensure_running(
            crate::app::instance::InstanceStateChangeRequest::Run,
        )
        .await
-        .map_err(ActionError::action_failed)?;
+    {
+        Ok(_) => Ok(()),
+        Err(InstanceStateChangeError::SledAgent(inner)) => {
+            info!(osagactx.log(),
+                  "start saga: sled agent failed to set instance to running";
+                  "instance_id" => %instance_id,
+                  "sled_id" => %sled_id,
+                  "error" => ?inner);
+
+            // Don't set the instance to Failed in this case. Instead, allow
+            // the saga to unwind and restore the instance to the Stopped
+            // state (matching what would happen if there were a failure
+            // prior to this point).
+            Err(ActionError::action_failed(Error::from(inner)))
+        }
+        Err(InstanceStateChangeError::Other(inner)) => {
+            info!(osagactx.log(),
+                  "start saga: internal error changing instance state";
+                  "instance_id" => %instance_id,
+                  "error" => ?inner);
 
-    Ok(())
+            Err(ActionError::action_failed(inner))
+        }
+    }
 }
 
 #[cfg(test)]
@@ -776,6 +873,9 @@ mod test {
                        new_db_instance.runtime().nexus_state.0,
                        InstanceState::Stopped
                    );
+
+                    assert!(test_helpers::no_virtual_provisioning_resource_records_exist(cptestctx).await);
+                    assert!(test_helpers::no_virtual_provisioning_collection_records_using_instances(cptestctx).await);
                }
            })
        },
@@ -818,4 +918,80 @@ mod test {
 
        assert_eq!(vmm_state, InstanceState::Running);
    }
+
+    /// Tests that if a start saga unwinds because sled agent returned failure
+    /// from a call to ensure the instance was running, then the system returns
+    /// to the correct state.
+    ///
+    /// This is different from `test_action_failure_can_unwind` because that
+    /// test causes saga nodes to "fail" without actually executing anything,
+    /// whereas this test injects a failure into the normal operation of the
+    /// ensure-running node.
+    #[nexus_test(server = crate::Server)]
+    async fn test_ensure_running_unwind(cptestctx: &ControlPlaneTestContext) {
+        let client = &cptestctx.external_client;
+        let nexus = &cptestctx.server.apictx().nexus;
+        let _project_id = setup_test_project(&client).await;
+        let opctx = test_helpers::test_opctx(cptestctx);
+        let instance = create_instance(client).await;
+        let db_instance =
+            test_helpers::instance_fetch(cptestctx, instance.identity.id)
+                .await
+                .instance()
+                .clone();
+
+        let params = Params {
+            serialized_authn: authn::saga::Serialized::for_opctx(&opctx),
+            db_instance,
+        };
+
+        let dag = create_saga_dag::<SagaInstanceStart>(params).unwrap();
+
+        // The ensure_running node is last in the saga. This should be the node
+        // where the failure ultimately occurs.
+        let last_node_name = dag
+            .get_nodes()
+            .last()
+            .expect("saga should have at least one node")
+            .name()
+            .clone();
+
+        // Inject failure at the simulated sled agent level. This allows the
+        // ensure-running node to attempt to change the instance's state, but
+        // forces this operation to fail and produce whatever side effects
+        // result from that failure.
+        let sled_agent = &cptestctx.sled_agent.sled_agent;
+        sled_agent
+            .set_instance_ensure_state_error(Some(Error::internal_error(
+                "injected by test_ensure_running_unwind",
+            )))
+            .await;
+
+        let saga = nexus.create_runnable_saga(dag).await.unwrap();
+        let saga_error = nexus
+            .run_saga_raw_result(saga)
+            .await
+            .expect("saga execution should have started")
+            .kind
+            .expect_err("saga should fail due to injected error");
+
+        assert_eq!(saga_error.error_node_name, last_node_name);
+
+        let db_instance =
+            test_helpers::instance_fetch(cptestctx, instance.identity.id).await;
+
+        assert_eq!(
+            db_instance.instance().runtime_state.nexus_state,
+            nexus_db_model::InstanceState(InstanceState::Stopped)
+        );
+        assert!(db_instance.vmm().is_none());
+
+        assert!(
+            test_helpers::no_virtual_provisioning_resource_records_exist(
+                cptestctx
+            )
+            .await
+        );
+        assert!(test_helpers::no_virtual_provisioning_collection_records_using_instances(cptestctx).await);
+    }
 }
diff --git a/nexus/src/app/sagas/test_helpers.rs b/nexus/src/app/sagas/test_helpers.rs
index 3110bd318a..1b383d27bb 100644
--- a/nexus/src/app/sagas/test_helpers.rs
+++ b/nexus/src/app/sagas/test_helpers.rs
@@ -11,7 +11,9 @@ use crate::{
    Nexus,
 };
 use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection};
-use diesel::{ExpressionMethods, QueryDsl, SelectableHelper};
+use diesel::{
+    BoolExpressionMethods, ExpressionMethods, QueryDsl, SelectableHelper,
+};
 use futures::future::BoxFuture;
 use nexus_db_queries::{
    authz,
@@ -186,6 +188,68 @@ pub async fn instance_fetch(
    db_state
 }
 
+pub async fn no_virtual_provisioning_resource_records_exist(
+    cptestctx: &ControlPlaneTestContext,
+) -> bool {
+    use nexus_db_queries::db::model::VirtualProvisioningResource;
+    use nexus_db_queries::db::schema::virtual_provisioning_resource::dsl;
+
+    let datastore = cptestctx.server.apictx().nexus.datastore().clone();
+    let conn = datastore.pool_connection_for_tests().await.unwrap();
+
+    datastore
+        .transaction_retry_wrapper("no_virtual_provisioning_resource_records_exist")
+        .transaction(&conn, |conn| async move {
+            conn
+                .batch_execute_async(nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL)
+                .await
+                .unwrap();
+
+            Ok(
+                dsl::virtual_provisioning_resource
+                    .filter(dsl::resource_type.eq(nexus_db_queries::db::model::ResourceTypeProvisioned::Instance.to_string()))
+                    .select(VirtualProvisioningResource::as_select())
+                    .get_results_async::<VirtualProvisioningResource>(&conn)
+                    .await
+                    .unwrap()
+                    .is_empty()
+            )
+        }).await.unwrap()
+}
+
+pub async fn no_virtual_provisioning_collection_records_using_instances(
+    cptestctx: &ControlPlaneTestContext,
+) -> bool {
+    use nexus_db_queries::db::model::VirtualProvisioningCollection;
+    use nexus_db_queries::db::schema::virtual_provisioning_collection::dsl;
+
+    let datastore = cptestctx.server.apictx().nexus.datastore().clone();
+    let conn = datastore.pool_connection_for_tests().await.unwrap();
+
+    datastore
+        .transaction_retry_wrapper(
+            "no_virtual_provisioning_collection_records_using_instances",
+        )
+        .transaction(&conn, |conn| async move {
+            conn.batch_execute_async(
+                nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL,
+            )
+            .await
+            .unwrap();
+            Ok(dsl::virtual_provisioning_collection
+                .filter(
+                    dsl::cpus_provisioned.ne(0).or(dsl::ram_provisioned.ne(0)),
+                )
+                .select(VirtualProvisioningCollection::as_select())
+                .get_results_async::<VirtualProvisioningCollection>(&conn)
+                .await
+                .unwrap()
+                .is_empty())
+        })
+        .await
+        .unwrap()
+}
+
 /// Tests that the saga described by `dag` succeeds if each of its nodes is
 /// repeated.
 ///
diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs
index 8dae31863c..bbc3e440ab 100644
--- a/sled-agent/src/sim/collection.rs
+++ b/sled-agent/src/sim/collection.rs
@@ -217,6 +217,16 @@ impl SimCollection {
        }
    }
 
+    /// Forcibly removes the object `id` from the collection without simulating
+    /// any further state changes for it.
+    pub async fn sim_force_remove(&self, id: Uuid) {
+        let mut objects = self.objects.lock().await;
+        let object = objects.remove(&id).unwrap();
+        if let Some(mut tx) = object.channel_tx {
+            tx.close_channel();
+        }
+    }
+
    /// Complete a desired asynchronous state transition for object `id`.
    /// This is invoked either by `sim_step()` (if the simulation mode is
    /// `SimMode::Auto`) or `instance_finish_transition` (if the simulation mode
diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs
index c06ae96f2e..a16049dd2f 100644
--- a/sled-agent/src/sim/sled_agent.rs
+++ b/sled-agent/src/sim/sled_agent.rs
@@ -68,6 +68,8 @@ pub struct SledAgent {
    pub v2p_mappings: Mutex<HashMap<Uuid, Vec<SetVirtualNetworkInterfaceHost>>>,
    mock_propolis: Mutex>, PropolisClient)>>,
+
+    instance_ensure_state_error: Mutex<Option<Error>>,
 }
 
 fn extract_targets_from_volume_construction_request(
@@ -159,6 +161,7 @@ impl SledAgent {
            disk_id_to_region_ids: Mutex::new(HashMap::new()),
            v2p_mappings: Mutex::new(HashMap::new()),
            mock_propolis: Mutex::new(None),
+            instance_ensure_state_error: Mutex::new(None),
        })
    }
 
@@ -343,15 +346,7 @@ impl SledAgent {
            updated_runtime: Some(instance.terminate()),
        };
 
-        // Poke the now-destroyed instance to force it to be removed from the
-        // collection.
-        //
-        // TODO: In the real sled agent, this happens inline without publishing
-        // any other state changes, whereas this call causes any pending state
-        // changes to be published. This can be fixed by adding a simulated
-        // object collection function to forcibly remove an object from a
-        // collection.
-        self.instances.sim_poke(instance_id, PokeMode::Drain).await;
+        self.instances.sim_force_remove(instance_id).await;
 
        Ok(response)
    }
 
@@ -361,6 +356,11 @@ impl SledAgent {
        instance_id: Uuid,
        state: InstanceStateRequested,
    ) -> Result<InstancePutStateResponse, Error> {
+        if let Some(e) = self.instance_ensure_state_error.lock().await.as_ref()
+        {
+            return Err(e.clone());
+        }
+
        let current =
            match self.instances.sim_get_cloned_object(&instance_id).await {
                Ok(i) => i.current().clone(),
@@ -416,6 +416,10 @@ impl SledAgent {
        Ok(InstancePutStateResponse { updated_runtime: Some(new_state) })
    }
 
+    pub async fn set_instance_ensure_state_error(&self, error: Option<Error>) {
+        *self.instance_ensure_state_error.lock().await = error;
+    }
+
    async fn detach_disks_from_instance(
        &self,
        instance_id: Uuid,
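
The sketch below is not part of the diff; it is a minimal illustration of how a caller inside the nexus crate might consume the new `InstanceStateChangeError` / `instance_unhealthy()` API, mirroring the pattern the sagas above follow. The function name `example_stop_on_sled` and the exact import paths are assumptions for illustration only.

// Illustrative sketch only (assumes the types introduced in this change are in
// scope via the paths used elsewhere in the diff).
use crate::app::instance::InstanceStateChangeError;
use nexus_db_queries::{authz, context::OpContext, db};
use omicron_common::api::external::Error;
use uuid::Uuid;

async fn example_stop_on_sled(
    nexus: &crate::Nexus,
    opctx: &OpContext,
    authz_instance: &authz::Instance,
    sled_id: &Uuid,
    prev_runtime: &db::model::InstanceRuntimeState,
) -> Result<(), Error> {
    match nexus
        .instance_ensure_unregistered(opctx, authz_instance, sled_id)
        .await
    {
        // Sled agent accepted the request (it may or may not have returned an
        // updated instance state for the caller to write back).
        Ok(_) => Ok(()),
        // Sled agent failed in a way that casts doubt on the instance's
        // health: try to mark it Failed, then surface the original error.
        Err(InstanceStateChangeError::SledAgent(inner))
            if inner.instance_unhealthy() =>
        {
            let _ = nexus
                .mark_instance_failed(&authz_instance.id(), prev_runtime, &inner)
                .await;
            Err(inner.into())
        }
        // Anything else converts directly into an external API error.
        Err(e) => Err(e.into()),
    }
}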