From 71dc6e92315b451c58fd316de0c89277d5c4185c Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 3 Oct 2023 18:47:45 +0000 Subject: [PATCH] Remove instances from instance manager if terminated before starting Ensure that if an instance is terminated before its Propolis zone is started, it is removed from the instance manager's instance map so that the instance can be restarted on that sled. Found via omicron-stress, which is much happier with this fix in place. Also improve a little bit of tracing. --- nexus/src/app/instance.rs | 2 +- nexus/src/app/sagas/instance_start.rs | 3 ++- sled-agent/src/instance.rs | 4 ++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 645dc32d8d..977687d0bc 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1133,7 +1133,7 @@ impl super::Nexus { error!( self.log, - "failed to set instance to Failed after bad put"; + "attempted to set instance to Failed after bad put"; "instance_id" => %instance_id, "result" => ?result, ); diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index 8b48b185b4..ac3b60146a 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -203,7 +203,8 @@ async fn sis_move_to_starting( let instance_id = params.db_instance.id(); let propolis_id = sagactx.lookup::<Uuid>("propolis_id")?; info!(osagactx.log(), "moving instance to Starting state via saga"; - "instance_id" => %instance_id); + "instance_id" => %instance_id, + "propolis_id" => %propolis_id); let opctx = crate::context::op_context_for_saga_action( &sagactx, diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 3701e11fc0..2c542b3ac9 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -506,6 +506,10 @@ impl InstanceInner { self.log, "Instance::terminate() called with no running state" ); + + // Ensure the instance is removed from the instance 
manager's table + // so that a new instance can take its place. + self.instance_ticket.terminate(); return Ok(()); };