diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs
index 053e3d600b7..abd633fec2e 100644
--- a/nexus-config/src/nexus_config.rs
+++ b/nexus-config/src/nexus_config.rs
@@ -27,6 +27,7 @@ use std::collections::HashMap;
 use std::fmt;
 use std::net::IpAddr;
 use std::net::SocketAddr;
+use std::num::NonZeroU32;
 use std::time::Duration;
 use uuid::Uuid;
 
@@ -527,6 +528,10 @@ pub struct InstanceWatcherConfig {
     /// period (in seconds) for periodic activations of this background task
     #[serde_as(as = "DurationSeconds")]
     pub period_secs: Duration,
+
+    /// maximum number of retries to attempt before considering a sled-agent
+    /// dead.
+    pub max_retries: NonZeroU32,
 }
 
 /// Configuration for a nexus server
@@ -895,6 +900,7 @@ mod test {
                 },
                 instance_watcher: InstanceWatcherConfig {
                     period_secs: Duration::from_secs(30),
+                    max_retries: NonZeroU32::new(5).unwrap(),
                 }
             },
             default_region_allocation_strategy:
@@ -963,6 +969,7 @@ mod test {
             switch_port_settings_manager.period_secs = 30
             region_replacement.period_secs = 30
             instance_watcher.period_secs = 30
+            instance_watcher.max_retries = 10
             [default_region_allocation_strategy]
             type = "random"
             "##,
diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs
index be770b9287b..384b55c2d97 100644
--- a/nexus/src/app/background/init.rs
+++ b/nexus/src/app/background/init.rs
@@ -349,6 +349,7 @@ impl BackgroundTasks {
         let watcher = instance_watcher::InstanceWatcher::new(
             datastore,
             resolver.clone(),
+            config.instance_watcher.max_retries,
         );
         driver.register(
             "instance_watcher".to_string(),
diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs
index f79c694250a..9decfff4e93 100644
--- a/nexus/src/app/background/instance_watcher.rs
+++ b/nexus/src/app/background/instance_watcher.rs
@@ -10,6 +10,7 @@ use nexus_db_model::{InvSledAgent, SledInstance};
 use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db::pagination::Paginator;
 use nexus_db_queries::db::DataStore;
+use omicron_common::backoff::{self, BackoffError};
 use omicron_uuid_kinds::GenericUuid;
 use serde_json::json;
 use sled_agent_client::Client as SledAgentClient;
@@ -22,6 +23,7 @@ use std::sync::Arc;
 pub(crate) struct InstanceWatcher {
     datastore: Arc<DataStore>,
     resolver: internal_dns::resolver::Resolver,
+    max_retries: NonZeroU32,
 }
 
 const MAX_SLED_AGENTS: NonZeroU32 = unsafe {
@@ -33,8 +35,9 @@ impl InstanceWatcher {
     pub(crate) fn new(
         datastore: Arc<DataStore>,
         resolver: internal_dns::resolver::Resolver,
+        max_retries: NonZeroU32,
     ) -> Self {
-        Self { datastore, resolver }
+        Self { datastore, resolver, max_retries }
     }
 
     fn check_instance(
@@ -55,22 +58,61 @@ impl InstanceWatcher {
         let client = client.clone();
 
         async move {
-            let InstanceWatcher { datastore, resolver } = watcher;
+            let InstanceWatcher { datastore, resolver, max_retries } = watcher;
             slog::trace!(opctx.log, "checking on instance...");
-            let rsp = client.instance_get_state(&instance.instance_id()).await;
+            let backoff = backoff::retry_policy_internal_service();
+            let mut retries = 0;
+            let rsp = backoff::retry_notify(
+                backoff,
+                || async {
+                    let rsp = client
+                        .instance_get_state(&instance.instance_id())
+                        .await;
+                    match rsp {
+                        Ok(rsp) => Ok(rsp.into_inner()),
+                        Err(e) if retries == max_retries.get() => {
+                            Err(BackoffError::Permanent(e))
+                        }
+                        Err(
+                            e @ ClientError::InvalidRequest(_)
+                            | e @ ClientError::InvalidUpgrade(_)
+                            | e @ ClientError::UnexpectedResponse(_)
+                            | e @ ClientError::PreHookError(_),
+                        ) => Err(BackoffError::Permanent(e)),
+                        Err(e) => Err(BackoffError::transient(e)),
+                    }
+                },
+                |err, duration| {
+                    slog::info!(
+                        opctx.log,
+                        "instance check failed; retrying: {err}";
+                        "duration" => ?duration,
+                        "retries_remaining" => max_retries.get() - retries,
+                    );
+                },
+            )
+            .await;
             let state = match rsp {
-                Ok(rsp) => rsp.into_inner(),
+                Ok(state) => state,
                 Err(error) => {
-                    // Here is where it gets interesting. This is where we
-                    // might learn that the sled-agent we were trying to
-                    // talk to is dead.
-                    slog::info!(
+                    // TODO(eliza): here is where it gets interesting --- if the
+                    // sled-agent is in a bad state, we need to:
+                    // 1. figure out whether the instance's VMM is reachable directly
+                    // 2. figure out whether we can recover the sled agent?
+                    // 3. if the instances' VMMs are also gone, mark them as
+                    //    "failed"
+                    // 4. this might mean that the whole sled is super gone,
+                    //    figure that out too.
+                    //
+                    // for now though, we'll just log a really big error.
+                    slog::error!(
                         opctx.log,
-                        "client error checking on instance: {error:?}"
+                        "instance seems to be in a bad state: {error}"
                     );
-                    todo!("eliza: implement the interesting parts!");
+                    return;
                 }
             };
+            slog::debug!(opctx.log, "updating instance state: {state:?}");
             let result = crate::app::instance::notify_instance_updated(
                 &datastore,
diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml
index d998a3c3960..6d69f4c8fa2 100644
--- a/smf/nexus/multi-sled/config-partial.toml
+++ b/smf/nexus/multi-sled/config-partial.toml
@@ -56,6 +56,7 @@ sync_service_zone_nat.period_secs = 30
 switch_port_settings_manager.period_secs = 30
 region_replacement.period_secs = 30
 instance_watcher.period_secs = 30
+instance_watcher.max_retries = 5
 
 [default_region_allocation_strategy]
 # by default, allocate across 3 distinct sleds
diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml
index 4f61609b4e5..607af64fcee 100644
--- a/smf/nexus/single-sled/config-partial.toml
+++ b/smf/nexus/single-sled/config-partial.toml
@@ -56,6 +56,7 @@ sync_service_zone_nat.period_secs = 30
 switch_port_settings_manager.period_secs = 30
 region_replacement.period_secs = 30
 instance_watcher.period_secs = 30
+instance_watcher.max_retries = 5
 
 [default_region_allocation_strategy]
 # by default, allocate without requirement for distinct sleds.
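
For reference, the retry classification introduced in `instance_watcher.rs` follows the usual `omicron_common::backoff` pattern: the operation closure maps errors that cannot be fixed by retrying to `BackoffError::Permanent`, and everything else to `BackoffError::transient`, while the notify callback observes each retry and its delay. The minimal sketch below illustrates that pattern outside the diff; `ProbeError`, `probe`, and `probe_with_retries` are hypothetical stand-ins for the sled-agent client call and its error type, while the `backoff` calls are the same ones the patch uses.

```rust
// Standalone sketch of transient-vs-permanent retry classification with
// `omicron_common::backoff`. Names other than the `backoff` items are
// hypothetical and exist only for illustration.
use omicron_common::backoff::{self, BackoffError};

#[derive(Debug)]
enum ProbeError {
    // e.g. a malformed request: retrying will never help.
    BadRequest,
    // e.g. a connection failure: worth retrying with backoff.
    Unreachable,
}

impl std::fmt::Display for ProbeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ProbeError::BadRequest => write!(f, "bad request"),
            ProbeError::Unreachable => write!(f, "sled-agent unreachable"),
        }
    }
}

// Stand-in for `client.instance_get_state(...)`.
async fn probe() -> Result<String, ProbeError> {
    Ok("running".to_string())
}

async fn probe_with_retries() -> Result<String, ProbeError> {
    backoff::retry_notify(
        // Same policy the watcher uses: backoff tuned for internal services.
        backoff::retry_policy_internal_service(),
        || async {
            match probe().await {
                Ok(state) => Ok(state),
                // Errors that retrying cannot fix stop the loop immediately...
                Err(e @ ProbeError::BadRequest) => {
                    Err(BackoffError::Permanent(e))
                }
                // ...while everything else is retried after the next delay.
                Err(e) => Err(BackoffError::transient(e)),
            }
        },
        // Called before each retry with the error and the chosen delay.
        |err, delay| {
            eprintln!("probe failed ({err}); retrying in {delay:?}");
        },
    )
    .await
}
```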