From f242e0aff03129af7d93e87b9344420fb7f2f813 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 5 Jul 2024 11:40:24 -0700 Subject: [PATCH] Mark vmm failed --- sled-agent/src/common/instance.rs | 13 ++++++++++--- sled-agent/src/instance.rs | 28 +++++++++++++++++----------- sled-agent/src/instance_manager.rs | 6 ++++-- sled-agent/src/sim/instance.rs | 6 ++++-- 4 files changed, 35 insertions(+), 18 deletions(-) diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index ed0aceff82..0fe2e27698 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -486,9 +486,15 @@ impl InstanceStates { /// instance's state in Nexus may become inconsistent. This routine should /// therefore only be invoked by callers who know that an instance is not /// migrating. - pub(crate) fn terminate_rudely(&mut self) { + pub(crate) fn terminate_rudely(&mut self, mark_failed: bool) { + let vmm_state = if mark_failed { + PropolisInstanceState(PropolisApiState::Failed) + } else { + PropolisInstanceState(PropolisApiState::Destroyed) + }; + let fake_observed = ObservedPropolisState { - vmm_state: PropolisInstanceState(PropolisApiState::Destroyed), + vmm_state, migration_status: if self.instance.migration_id.is_some() { ObservedMigrationStatus::Failed } else { @@ -893,7 +899,8 @@ mod test { assert_eq!(state.propolis_role(), PropolisRole::MigrationTarget); let prev = state.clone(); - state.terminate_rudely(); + let mark_failed = false; + state.terminate_rudely(mark_failed); assert_state_change_has_gen_change(&prev, &state); assert_eq!(state.instance.gen, prev.instance.gen); diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 540811e412..38b97173fc 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -231,6 +231,7 @@ enum InstanceRequest { tx: oneshot::Sender>, }, Terminate { + mark_failed: bool, tx: oneshot::Sender>, }, IssueSnapshotRequest { @@ -395,7 +396,8 @@ impl InstanceRunner { // of the sender alive in "self.tx_monitor". None => { warn!(self.log, "Instance 'VMM monitor' channel closed; shutting down"); - self.terminate().await; + let mark_failed = true; + self.terminate(mark_failed).await; }, } @@ -432,9 +434,9 @@ impl InstanceRunner { ) .map_err(|_| Error::FailedSendClientClosed) }, - Some(Terminate { tx }) => { + Some(Terminate { mark_failed, tx }) => { tx.send(Ok(InstanceUnregisterResponse { - updated_runtime: Some(self.terminate().await) + updated_runtime: Some(self.terminate(mark_failed).await) })) .map_err(|_| Error::FailedSendClientClosed) }, @@ -457,7 +459,8 @@ impl InstanceRunner { }, None => { warn!(self.log, "Instance request channel closed; shutting down"); - self.terminate().await; + let mark_failed = false; + self.terminate(mark_failed).await; break; }, }; @@ -617,8 +620,8 @@ impl InstanceRunner { Some(InstanceAction::Destroy) => { info!(self.log, "terminating VMM that has exited"; "instance_id" => %self.id()); - - self.terminate().await; + let mark_failed = false; + self.terminate(mark_failed).await; Reaction::Terminate } None => Reaction::Continue, @@ -1132,9 +1135,10 @@ impl Instance { pub async fn terminate( &self, tx: oneshot::Sender>, + mark_failed: bool, ) -> Result<(), Error> { self.tx - .send(InstanceRequest::Terminate { tx }) + .send(InstanceRequest::Terminate { mark_failed, tx }) .await .map_err(|_| Error::FailedSendChannelClosed)?; Ok(()) @@ -1254,7 +1258,8 @@ impl InstanceRunner { // This case is morally equivalent to starting Propolis and then // rudely terminating it before asking it to do anything. Update // the VMM and instance states accordingly. - self.state.terminate_rudely(); + let mark_failed = false; + self.state.terminate_rudely(mark_failed); } setup_result?; } @@ -1281,7 +1286,8 @@ impl InstanceRunner { // this happens, generate an instance record bearing the // "Destroyed" state and return it to the caller. if self.running_state.is_none() { - self.terminate().await; + let mark_failed = false; + self.terminate(mark_failed).await; (None, None) } else { ( @@ -1481,9 +1487,9 @@ impl InstanceRunner { Ok(PropolisSetup { client, running_zone }) } - async fn terminate(&mut self) -> SledInstanceState { + async fn terminate(&mut self, mark_failed: bool) -> SledInstanceState { self.terminate_inner().await; - self.state.terminate_rudely(); + self.state.terminate_rudely(mark_failed); // This causes the "run" task to exit on the next iteration. self.should_terminate = true; diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index f40e37bf55..cfb96fb8c9 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -667,7 +667,8 @@ impl InstanceManagerRunner { // Otherwise, we pipeline the request, and send it to the instance, // where it can receive an appropriate response. - instance.terminate(tx).await?; + let mark_failed = false; + instance.terminate(tx, mark_failed).await?; Ok(()) } @@ -842,7 +843,8 @@ impl InstanceManagerRunner { info!(self.log, "use_only_these_disks: Removing instance"; "instance_id" => ?id); if let Some((_, instance)) = self.instances.remove(&id) { let (tx, rx) = oneshot::channel(); - if let Err(e) = instance.terminate(tx).await { + let mark_failed = true; + if let Err(e) = instance.terminate(tx, mark_failed).await { warn!(self.log, "use_only_these_disks: Failed to request instance removal"; "err" => ?e); continue; } diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index be6c63f53a..e94b3b4984 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -211,7 +211,8 @@ impl SimInstanceInner { InstanceStateRequested::Stopped => { match self.next_resting_state() { VmmState::Starting => { - self.state.terminate_rudely(); + let mark_failed = false; + self.state.terminate_rudely(mark_failed); } VmmState::Running => self.queue_graceful_stop(), // Idempotently allow requests to stop an instance that is @@ -363,7 +364,8 @@ impl SimInstanceInner { /// Simulates rude termination by moving the instance to the Destroyed state /// immediately and clearing the queue of pending state transitions. fn terminate(&mut self) -> SledInstanceState { - self.state.terminate_rudely(); + let mark_failed = false; + self.state.terminate_rudely(mark_failed); self.queue.clear(); self.destroyed = true; self.state.sled_instance_state()