diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 863b77c8e8a..359b333dbb2 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -983,10 +983,35 @@ impl super::Nexus { // long -- say, InstanceRuntimeState::time_updated // plus the timeout, assuming time_updated is the // right point to measure from. - tokio::spawn(async { + let prev_instance_runtime = + prev_instance_state.runtime_state.clone(); + let db_datastore_weak = + Arc::downgrade(&self.db_datastore); + let log = self + .log + .new(o!("component" => "Instance timeout")); + tokio::spawn(async move { tokio::time::sleep(Duration::from_secs(120)) .await; - todo!("fail instance") + if let Some(db_datastore) = + db_datastore_weak.upgrade() + { + Self::mark_instance_failed_inner( + &db_datastore, + &instance_id, + &prev_instance_runtime, + "Timed out waiting for instance state change.", + &log, + ) + .await + .ok(); + } else { + error!( + log, + "DataStore no longer exists to mark instance failed after instance state change timeout."; + "instance_id" => %instance_id + ); + } }); } self.write_returned_instance_state(&instance_id, state) @@ -1043,7 +1068,8 @@ impl super::Nexus { &state.instance().runtime_state, error, ) - .await + .await?; + Ok(HandleInstancePutResultResult::Ok) } } } @@ -1343,7 +1369,24 @@ impl super::Nexus { prev_instance_runtime: &db::model::InstanceRuntimeState, reason: impl std::fmt::Debug, ) -> Result<(), Error> { - error!(self.log, "marking instance failed due to sled agent API error"; + Self::mark_instance_failed_inner( + &self.db_datastore, + instance_id, + prev_instance_runtime, + reason, + &self.log, + ) + .await + } + + async fn mark_instance_failed_inner( + db_datastore: &Arc, + instance_id: &Uuid, + prev_instance_runtime: &db::model::InstanceRuntimeState, + reason: impl std::fmt::Debug, + log: &slog::Logger, + ) -> Result<(), Error> { + error!(log, "marking instance failed due to sled agent API error"; "instance_id" => %instance_id, "error" => ?reason); @@ -1358,16 +1401,15 @@ impl super::Nexus { ..prev_instance_runtime.clone() }; - match self - .db_datastore + match db_datastore .instance_update_runtime(&instance_id, &new_runtime) .await { - Ok(_) => info!(self.log, "marked instance as Failed"; + Ok(_) => info!(log, "marked instance as Failed"; "instance_id" => %instance_id), // XXX: It's not clear what to do with this error; should it be // bubbled back up to the caller? - Err(e) => error!(self.log, + Err(e) => error!(log, "failed to write Failed instance state to DB"; "instance_id" => %instance_id, "error" => ?e), diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 641433d752d..70164cee6af 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -387,7 +387,11 @@ impl InstanceManager { { Ok(HandleInstancePutResultResult::Ok) => {} Ok(HandleInstancePutResultResult::TimedOut) => { - todo!("nexus doesn't want us any more, terminate instance") + error!(log, "Nexus gave up waiting for us to finish creating the propolis zone and abandoned the instance. Rudely terminating it from our side."; "instance_id" => %instance_id); + if let Err(err) = instance.terminate().await + { + error!(log, "Couldn't terminate instance whose creation was timed-out by Nexus"; "instance_id" => %instance_id, "err" => %err); + } } Err(err) => { error!(log, "Failed to inform Nexus of instance_put success"; @@ -409,7 +413,11 @@ impl InstanceManager { { Ok(HandleInstancePutResultResult::Ok) => {} Ok(HandleInstancePutResultResult::TimedOut) => { - todo!("well, i guess this is less awkward but clean up if we have to") + error!(log, "Nexus gave up waiting for us to finish creating the propolis zone and abandoned the instance, but the instance also explicitly failed on our side. Rudely terminating what remains of it."; "instance_id" => %instance_id); + if let Err(err) = instance.terminate().await + { + error!(log, "Couldn't terminate faulted instance (whose creation was also timed-out by Nexus)"; "instance_id" => %instance_id, "err" => %err); + } } Err(err) => { error!(log, "Failed to inform Nexus of instance_put failure";