sketch out retry stuff
hawkw committed Apr 25, 2024
1 parent da54164 commit de01ff7
Showing 5 changed files with 62 additions and 10 deletions.
7 changes: 7 additions & 0 deletions nexus-config/src/nexus_config.rs
@@ -27,6 +27,7 @@ use std::collections::HashMap;
use std::fmt;
use std::net::IpAddr;
use std::net::SocketAddr;
use std::num::NonZeroU32;
use std::time::Duration;
use uuid::Uuid;

@@ -527,6 +528,10 @@ pub struct InstanceWatcherConfig {
/// period (in seconds) for periodic activations of this background task
#[serde_as(as = "DurationSeconds<u64>")]
pub period_secs: Duration,

/// maximum number of retries to attempt before considering a sled-agent
/// dead.
pub max_retries: NonZeroU32,
}

/// Configuration for a nexus server
@@ -895,6 +900,7 @@ mod test {
},
instance_watcher: InstanceWatcherConfig {
period_secs: Duration::from_secs(30),
max_retries: NonZeroU32::new(5).unwrap(),
}
},
default_region_allocation_strategy:
@@ -963,6 +969,7 @@
switch_port_settings_manager.period_secs = 30
region_replacement.period_secs = 30
instance_watcher.period_secs = 30
instance_watcher.max_retries = 10
[default_region_allocation_strategy]
type = "random"
"##,
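With this change, InstanceWatcherConfig carries the new max_retries knob next to period_secs, and it is set in the same TOML table as the existing instance_watcher.period_secs setting (as the test config above and the config-partial.toml files below show). A rough standalone sketch of how such a section deserializes, assuming the serde, serde_with, and toml crates — illustrative only, not Omicron's actual config loading:

// Standalone sketch: parse an instance_watcher-style config section.
use serde::Deserialize;
use serde_with::{serde_as, DurationSeconds};
use std::num::NonZeroU32;
use std::time::Duration;

#[serde_as]
#[derive(Debug, Deserialize)]
struct InstanceWatcherConfig {
    /// period (in seconds) for periodic activations of this background task
    #[serde_as(as = "DurationSeconds<u64>")]
    period_secs: Duration,
    /// maximum number of retries before a sled-agent is considered dead;
    /// NonZeroU32 makes `max_retries = 0` a deserialization error
    max_retries: NonZeroU32,
}

fn main() {
    let cfg: InstanceWatcherConfig =
        toml::from_str("period_secs = 30\nmax_retries = 5")
            .expect("config should parse");
    assert_eq!(cfg.period_secs, Duration::from_secs(30));
    assert_eq!(cfg.max_retries.get(), 5);
    println!("{cfg:?}");
}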
1 change: 1 addition & 0 deletions nexus/src/app/background/init.rs
@@ -349,6 +349,7 @@ impl BackgroundTasks {
let watcher = instance_watcher::InstanceWatcher::new(
datastore,
resolver.clone(),
config.instance_watcher.max_retries,
);
driver.register(
"instance_watcher".to_string(),
62 changes: 52 additions & 10 deletions nexus/src/app/background/instance_watcher.rs
@@ -10,6 +10,7 @@ use nexus_db_model::{InvSledAgent, SledInstance};
use nexus_db_queries::context::OpContext;
use nexus_db_queries::db::pagination::Paginator;
use nexus_db_queries::db::DataStore;
use omicron_common::backoff::{self, BackoffError};
use omicron_uuid_kinds::GenericUuid;
use serde_json::json;
use sled_agent_client::Client as SledAgentClient;
@@ -22,6 +23,7 @@ use std::sync::Arc;
pub(crate) struct InstanceWatcher {
datastore: Arc<DataStore>,
resolver: internal_dns::resolver::Resolver,
max_retries: NonZeroU32,
}

const MAX_SLED_AGENTS: NonZeroU32 = unsafe {
@@ -33,8 +35,9 @@ impl InstanceWatcher {
pub(crate) fn new(
datastore: Arc<DataStore>,
resolver: internal_dns::resolver::Resolver,
max_retries: NonZeroU32,
) -> Self {
- Self { datastore, resolver }
Self { datastore, resolver, max_retries }
}

fn check_instance(
@@ -55,22 +58,61 @@ impl InstanceWatcher {
let client = client.clone();

async move {
- let InstanceWatcher { datastore, resolver } = watcher;
let InstanceWatcher { datastore, resolver, max_retries } = watcher;
slog::trace!(opctx.log, "checking on instance...");
- let rsp = client.instance_get_state(&instance.instance_id()).await;
let backoff = backoff::retry_policy_internal_service();
let mut retries = 0;
let rsp = backoff::retry_notify(
backoff,
|| async {
let rsp = client
.instance_get_state(&instance.instance_id())
.await;
match rsp {
Ok(rsp) => Ok(rsp.into_inner()),
Err(e) if retries == max_retries.get() => {
Err(BackoffError::Permanent(e))
}
Err(
e @ ClientError::InvalidRequest(_)
| e @ ClientError::InvalidUpgrade(_)
| e @ ClientError::UnexpectedResponse(_)
| e @ ClientError::PreHookError(_),
) => Err(BackoffError::Permanent(e)),
Err(e) => Err(BackoffError::transient(e)),
}
},
|err, duration| {
slog::info!(
opctx.log,
"instance check failed; retrying: {err}";
"duration" => ?duration,
"retries_remaining" => max_retries.get() - retries,
);
},
)
.await;
let state = match rsp {
- Ok(rsp) => rsp.into_inner(),
Ok(state) => state,
Err(error) => {
- // Here is where it gets interesting. This is where we
- // might learn that the sled-agent we were trying to
- // talk to is dead.
- slog::info!(
// TODO(eliza): here is where it gets interesting --- if the
// sled-agent is in a bad state, we need to:
// 1. figure out whether the instance's VMM is reachable directly
// 2. figure out whether we can recover the sled agent?
// 3. if the instances' VMMs are also gone, mark them as
// "failed"
// 4. this might mean that the whole sled is super gone,
// figure that out too.
//
// for now though, we'll just log a really big error.
slog::error!(
opctx.log,
"client error checking on instance: {error:?}"
"instance seems to be in a bad state: {error}"
);
todo!("eliza: implement the interesting parts!");
return;
}
};

slog::debug!(opctx.log, "updating instance state: {state:?}");
let result = crate::app::instance::notify_instance_updated(
&datastore,
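The core of this change is the retry loop in check_instance: each failed instance_get_state call is classified as either permanent (invalid requests, unexpected responses, and similar client errors, where retrying cannot help) or transient (everything else, retried under the backoff policy), and once the retry budget from instance_watcher.max_retries is spent the sled-agent is treated as unreachable. A rough standalone sketch of that pattern follows; it uses only the standard library rather than omicron_common::backoff, and the names (CheckError, check_with_retries) and delay values are illustrative, not Omicron's:

// Standalone sketch: retry a flaky check with exponential backoff, giving
// up early on permanent errors and capping the number of retries.
use std::num::NonZeroU32;
use std::thread::sleep;
use std::time::Duration;

#[derive(Debug)]
enum CheckError {
    // A failure that retrying cannot fix (e.g. a malformed request).
    Permanent(String),
    // A failure that may resolve on its own (e.g. a connection error).
    Transient(String),
}

fn check_with_retries<T>(
    max_retries: NonZeroU32,
    mut check: impl FnMut() -> Result<T, CheckError>,
) -> Result<T, CheckError> {
    let mut delay = Duration::from_millis(250);
    for attempt in 0..=max_retries.get() {
        match check() {
            Ok(state) => return Ok(state),
            Err(e @ CheckError::Permanent(_)) => return Err(e),
            Err(e @ CheckError::Transient(_)) => {
                if attempt == max_retries.get() {
                    // Retry budget exhausted: report the last error.
                    return Err(e);
                }
                eprintln!(
                    "check failed; retrying in {delay:?} \
                     ({} retries remaining): {e:?}",
                    max_retries.get() - attempt
                );
                sleep(delay);
                // Exponential backoff with a cap on the delay.
                delay = (delay * 2).min(Duration::from_secs(8));
            }
        }
    }
    unreachable!("the final attempt always returns above");
}

fn main() {
    let mut calls = 0;
    let result = check_with_retries(NonZeroU32::new(5).unwrap(), || {
        calls += 1;
        if calls < 3 {
            Err(CheckError::Transient("connection refused".into()))
        } else {
            Ok("running")
        }
    });
    println!("instance state after {calls} calls: {result:?}");
}

In the commit itself the same classification is expressed with BackoffError::Permanent versus BackoffError::transient, and the notify callback logs each retry before the backoff delay.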
1 change: 1 addition & 0 deletions smf/nexus/multi-sled/config-partial.toml
@@ -56,6 +56,7 @@ sync_service_zone_nat.period_secs = 30
switch_port_settings_manager.period_secs = 30
region_replacement.period_secs = 30
instance_watcher.period_secs = 30
instance_watcher.max_retries = 5

[default_region_allocation_strategy]
# by default, allocate across 3 distinct sleds
1 change: 1 addition & 0 deletions smf/nexus/single-sled/config-partial.toml
@@ -56,6 +56,7 @@ sync_service_zone_nat.period_secs = 30
switch_port_settings_manager.period_secs = 30
region_replacement.period_secs = 30
instance_watcher.period_secs = 30
instance_watcher.max_retries = 5

[default_region_allocation_strategy]
# by default, allocate without requirement for distinct sleds.
