Skip to content

Commit

Permalink
Mark vmm failed
Browse files Browse the repository at this point in the history
  • Loading branch information
smklein committed Jul 5, 2024
1 parent a818de2 commit f242e0a
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 18 deletions.
13 changes: 10 additions & 3 deletions sled-agent/src/common/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -486,9 +486,15 @@ impl InstanceStates {
/// instance's state in Nexus may become inconsistent. This routine should
/// therefore only be invoked by callers who know that an instance is not
/// migrating.
pub(crate) fn terminate_rudely(&mut self) {
pub(crate) fn terminate_rudely(&mut self, mark_failed: bool) {
let vmm_state = if mark_failed {
PropolisInstanceState(PropolisApiState::Failed)
} else {
PropolisInstanceState(PropolisApiState::Destroyed)
};

let fake_observed = ObservedPropolisState {
vmm_state: PropolisInstanceState(PropolisApiState::Destroyed),
vmm_state,
migration_status: if self.instance.migration_id.is_some() {
ObservedMigrationStatus::Failed
} else {
Expand Down Expand Up @@ -893,7 +899,8 @@ mod test {
assert_eq!(state.propolis_role(), PropolisRole::MigrationTarget);

let prev = state.clone();
state.terminate_rudely();
let mark_failed = false;
state.terminate_rudely(mark_failed);

assert_state_change_has_gen_change(&prev, &state);
assert_eq!(state.instance.gen, prev.instance.gen);
Expand Down
28 changes: 17 additions & 11 deletions sled-agent/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@ enum InstanceRequest {
tx: oneshot::Sender<Result<SledInstanceState, ManagerError>>,
},
Terminate {
mark_failed: bool,
tx: oneshot::Sender<Result<InstanceUnregisterResponse, ManagerError>>,
},
IssueSnapshotRequest {
Expand Down Expand Up @@ -395,7 +396,8 @@ impl InstanceRunner {
// of the sender alive in "self.tx_monitor".
None => {
warn!(self.log, "Instance 'VMM monitor' channel closed; shutting down");
self.terminate().await;
let mark_failed = true;
self.terminate(mark_failed).await;
},
}

Expand Down Expand Up @@ -432,9 +434,9 @@ impl InstanceRunner {
)
.map_err(|_| Error::FailedSendClientClosed)
},
Some(Terminate { tx }) => {
Some(Terminate { mark_failed, tx }) => {
tx.send(Ok(InstanceUnregisterResponse {
updated_runtime: Some(self.terminate().await)
updated_runtime: Some(self.terminate(mark_failed).await)
}))
.map_err(|_| Error::FailedSendClientClosed)
},
Expand All @@ -457,7 +459,8 @@ impl InstanceRunner {
},
None => {
warn!(self.log, "Instance request channel closed; shutting down");
self.terminate().await;
let mark_failed = false;
self.terminate(mark_failed).await;
break;
},
};
Expand Down Expand Up @@ -617,8 +620,8 @@ impl InstanceRunner {
Some(InstanceAction::Destroy) => {
info!(self.log, "terminating VMM that has exited";
"instance_id" => %self.id());

self.terminate().await;
let mark_failed = false;
self.terminate(mark_failed).await;
Reaction::Terminate
}
None => Reaction::Continue,
Expand Down Expand Up @@ -1132,9 +1135,10 @@ impl Instance {
pub async fn terminate(
&self,
tx: oneshot::Sender<Result<InstanceUnregisterResponse, ManagerError>>,
mark_failed: bool,
) -> Result<(), Error> {
self.tx
.send(InstanceRequest::Terminate { tx })
.send(InstanceRequest::Terminate { mark_failed, tx })
.await
.map_err(|_| Error::FailedSendChannelClosed)?;
Ok(())
Expand Down Expand Up @@ -1254,7 +1258,8 @@ impl InstanceRunner {
// This case is morally equivalent to starting Propolis and then
// rudely terminating it before asking it to do anything. Update
// the VMM and instance states accordingly.
self.state.terminate_rudely();
let mark_failed = false;
self.state.terminate_rudely(mark_failed);
}
setup_result?;
}
Expand All @@ -1281,7 +1286,8 @@ impl InstanceRunner {
// this happens, generate an instance record bearing the
// "Destroyed" state and return it to the caller.
if self.running_state.is_none() {
self.terminate().await;
let mark_failed = false;
self.terminate(mark_failed).await;
(None, None)
} else {
(
Expand Down Expand Up @@ -1481,9 +1487,9 @@ impl InstanceRunner {
Ok(PropolisSetup { client, running_zone })
}

async fn terminate(&mut self) -> SledInstanceState {
async fn terminate(&mut self, mark_failed: bool) -> SledInstanceState {
self.terminate_inner().await;
self.state.terminate_rudely();
self.state.terminate_rudely(mark_failed);

// This causes the "run" task to exit on the next iteration.
self.should_terminate = true;
Expand Down
6 changes: 4 additions & 2 deletions sled-agent/src/instance_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -667,7 +667,8 @@ impl InstanceManagerRunner {

// Otherwise, we pipeline the request, and send it to the instance,
// where it can receive an appropriate response.
instance.terminate(tx).await?;
let mark_failed = false;
instance.terminate(tx, mark_failed).await?;
Ok(())
}

Expand Down Expand Up @@ -842,7 +843,8 @@ impl InstanceManagerRunner {
info!(self.log, "use_only_these_disks: Removing instance"; "instance_id" => ?id);
if let Some((_, instance)) = self.instances.remove(&id) {
let (tx, rx) = oneshot::channel();
if let Err(e) = instance.terminate(tx).await {
let mark_failed = true;
if let Err(e) = instance.terminate(tx, mark_failed).await {
warn!(self.log, "use_only_these_disks: Failed to request instance removal"; "err" => ?e);
continue;
}
Expand Down
6 changes: 4 additions & 2 deletions sled-agent/src/sim/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,8 @@ impl SimInstanceInner {
InstanceStateRequested::Stopped => {
match self.next_resting_state() {
VmmState::Starting => {
self.state.terminate_rudely();
let mark_failed = false;
self.state.terminate_rudely(mark_failed);
}
VmmState::Running => self.queue_graceful_stop(),
// Idempotently allow requests to stop an instance that is
Expand Down Expand Up @@ -363,7 +364,8 @@ impl SimInstanceInner {
/// Simulates rude termination by moving the instance to the Destroyed state
/// immediately and clearing the queue of pending state transitions.
fn terminate(&mut self) -> SledInstanceState {
self.state.terminate_rudely();
let mark_failed = false;
self.state.terminate_rudely(mark_failed);
self.queue.clear();
self.destroyed = true;
self.state.sled_instance_state()
Expand Down

0 comments on commit f242e0a

Please sign in to comment.