sketch out retry stuff
hawkw committed Apr 25, 2024
1 parent da54164 commit de01ff7
Showing 5 changed files with 62 additions and 10 deletions.
7 changes: 7 additions & 0 deletions nexus-config/src/nexus_config.rs
@@ -27,6 +27,7 @@ use std::collections::HashMap;
use std::fmt;
use std::net::IpAddr;
use std::net::SocketAddr;
use std::num::NonZeroU32;
use std::time::Duration;
use uuid::Uuid;

@@ -527,6 +528,10 @@ pub struct InstanceWatcherConfig {
/// period (in seconds) for periodic activations of this background task
#[serde_as(as = "DurationSeconds<u64>")]
pub period_secs: Duration,

/// maximum number of retries to attempt before considering a sled-agent
/// dead.
pub max_retries: NonZeroU32,
}

/// Configuration for a nexus server
@@ -895,6 +900,7 @@ mod test {
},
instance_watcher: InstanceWatcherConfig {
period_secs: Duration::from_secs(30),
max_retries: NonZeroU32::new(5).unwrap(),
}
},
default_region_allocation_strategy:
@@ -963,6 +969,7 @@
switch_port_settings_manager.period_secs = 30
region_replacement.period_secs = 30
instance_watcher.period_secs = 30
instance_watcher.max_retries = 10
[default_region_allocation_strategy]
type = "random"
"##,
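With this change, InstanceWatcherConfig carries the new max_retries knob next to period_secs, and it is set in the same TOML table as the existing instance_watcher.period_secs setting (as the test config above and the config-partial.toml files below show). A rough standalone sketch of how such a section deserializes, assuming the serde, serde_with, and toml crates — illustrative only, not Omicron's actual config loading:

// Standalone sketch: parse an instance_watcher-style config section.
use serde::Deserialize;
use serde_with::{serde_as, DurationSeconds};
use std::num::NonZeroU32;
use std::time::Duration;

#[serde_as]
#[derive(Debug, Deserialize)]
struct InstanceWatcherConfig {
    /// period (in seconds) for periodic activations of this background task
    #[serde_as(as = "DurationSeconds<u64>")]
    period_secs: Duration,
    /// maximum number of retries before a sled-agent is considered dead;
    /// NonZeroU32 makes `max_retries = 0` a deserialization error
    max_retries: NonZeroU32,
}

fn main() {
    let cfg: InstanceWatcherConfig =
        toml::from_str("period_secs = 30\nmax_retries = 5")
            .expect("config should parse");
    assert_eq!(cfg.period_secs, Duration::from_secs(30));
    assert_eq!(cfg.max_retries.get(), 5);
    println!("{cfg:?}");
}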
1 change: 1 addition & 0 deletions nexus/src/app/background/init.rs
@@ -349,6 +349,7 @@ impl BackgroundTasks {
let watcher = instance_watcher::InstanceWatcher::new(
datastore,
resolver.clone(),
config.instance_watcher.max_retries,
);
driver.register(
"instance_watcher".to_string(),
62 changes: 52 additions & 10 deletions nexus/src/app/background/instance_watcher.rs
@@ -10,6 +10,7 @@ use nexus_db_model::{InvSledAgent, SledInstance};
use nexus_db_queries::context::OpContext;
use nexus_db_queries::db::pagination::Paginator;
use nexus_db_queries::db::DataStore;
use omicron_common::backoff::{self, BackoffError};
use omicron_uuid_kinds::GenericUuid;
use serde_json::json;
use sled_agent_client::Client as SledAgentClient;
@@ -22,6 +23,7 @@ use std::sync::Arc;
pub(crate) struct InstanceWatcher {
datastore: Arc<DataStore>,
resolver: internal_dns::resolver::Resolver,
max_retries: NonZeroU32,
}

const MAX_SLED_AGENTS: NonZeroU32 = unsafe {
@@ -33,8 +35,9 @@ impl InstanceWatcher {
pub(crate) fn new(
datastore: Arc<DataStore>,
resolver: internal_dns::resolver::Resolver,
max_retries: NonZeroU32,
) -> Self {
- Self { datastore, resolver }
Self { datastore, resolver, max_retries }
}

fn check_instance(
@@ -55,22 +58,61 @@ impl InstanceWatcher {
let client = client.clone();

async move {
- let InstanceWatcher { datastore, resolver } = watcher;
let InstanceWatcher { datastore, resolver, max_retries } = watcher;
slog::trace!(opctx.log, "checking on instance...");
- let rsp = client.instance_get_state(&instance.instance_id()).await;
let backoff = backoff::retry_policy_internal_service();
let mut retries = 0;
let rsp = backoff::retry_notify(
backoff,
|| async {
let rsp = client
.instance_get_state(&instance.instance_id())
.await;
match rsp {
Ok(rsp) => Ok(rsp.into_inner()),
Err(e) if retries == max_retries.get() => {
Err(BackoffError::Permanent(e))
}
Err(
e @ ClientError::InvalidRequest(_)
| e @ ClientError::InvalidUpgrade(_)
| e @ ClientError::UnexpectedResponse(_)
| e @ ClientError::PreHookError(_),
) => Err(BackoffError::Permanent(e)),
Err(e) => Err(BackoffError::transient(e)),
}
},
|err, duration| {
slog::info!(
opctx.log,
"instance check failed; retrying: {err}";
"duration" => ?duration,
"retries_remaining" => max_retries.get() - retries,
);
},
)
.await;
let state = match rsp {
- Ok(rsp) => rsp.into_inner(),
Ok(state) => state,
Err(error) => {
- // Here is where it gets interesting. This is where we
- // might learn that the sled-agent we were trying to
- // talk to is dead.
- slog::info!(
// TODO(eliza): here is where it gets interesting --- if the
// sled-agent is in a bad state, we need to:
// 1. figure out whether the instance's VMM is reachable directly
// 2. figure out whether we can recover the sled agent?
// 3. if the instances' VMMs are also gone, mark them as
// "failed"
// 4. this might mean that the whole sled is super gone,
// figure that out too.
//
// for now though, we'll just log a really big error.
slog::error!(
opctx.log,
"client error checking on instance: {error:?}"
"instance seems to be in a bad state: {error}"
);
todo!("eliza: implement the interesting parts!");
return;
}
};

slog::debug!(opctx.log, "updating instance state: {state:?}");
let result = crate::app::instance::notify_instance_updated(
&datastore,
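The core of this change is the retry loop in check_instance: each failed instance_get_state call is classified as either permanent (invalid requests, unexpected responses, and similar client errors, where retrying cannot help) or transient (everything else, retried under the backoff policy), and once the retry budget from instance_watcher.max_retries is spent the sled-agent is treated as unreachable. A rough standalone sketch of that pattern follows; it uses only the standard library rather than omicron_common::backoff, and the names (CheckError, check_with_retries) and delay values are illustrative, not Omicron's:

// Standalone sketch: retry a flaky check with exponential backoff, giving
// up early on permanent errors and capping the number of retries.
use std::num::NonZeroU32;
use std::thread::sleep;
use std::time::Duration;

#[derive(Debug)]
enum CheckError {
    // A failure that retrying cannot fix (e.g. a malformed request).
    Permanent(String),
    // A failure that may resolve on its own (e.g. a connection error).
    Transient(String),
}

fn check_with_retries<T>(
    max_retries: NonZeroU32,
    mut check: impl FnMut() -> Result<T, CheckError>,
) -> Result<T, CheckError> {
    let mut delay = Duration::from_millis(250);
    for attempt in 0..=max_retries.get() {
        match check() {
            Ok(state) => return Ok(state),
            Err(e @ CheckError::Permanent(_)) => return Err(e),
            Err(e @ CheckError::Transient(_)) => {
                if attempt == max_retries.get() {
                    // Retry budget exhausted: report the last error.
                    return Err(e);
                }
                eprintln!(
                    "check failed; retrying in {delay:?} \
                     ({} retries remaining): {e:?}",
                    max_retries.get() - attempt
                );
                sleep(delay);
                // Exponential backoff with a cap on the delay.
                delay = (delay * 2).min(Duration::from_secs(8));
            }
        }
    }
    unreachable!("the final attempt always returns above");
}

fn main() {
    let mut calls = 0;
    let result = check_with_retries(NonZeroU32::new(5).unwrap(), || {
        calls += 1;
        if calls < 3 {
            Err(CheckError::Transient("connection refused".into()))
        } else {
            Ok("running")
        }
    });
    println!("instance state after {calls} calls: {result:?}");
}

In the commit itself the same classification is expressed with BackoffError::Permanent versus BackoffError::transient, and the notify callback logs each retry before the backoff delay.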
1 change: 1 addition & 0 deletions smf/nexus/multi-sled/config-partial.toml
@@ -56,6 +56,7 @@ sync_service_zone_nat.period_secs = 30
switch_port_settings_manager.period_secs = 30
region_replacement.period_secs = 30
instance_watcher.period_secs = 30
instance_watcher.max_retries = 5

[default_region_allocation_strategy]
# by default, allocate across 3 distinct sleds
1 change: 1 addition & 0 deletions smf/nexus/single-sled/config-partial.toml
@@ -56,6 +56,7 @@ sync_service_zone_nat.period_secs = 30
switch_port_settings_manager.period_secs = 30
region_replacement.period_secs = 30
instance_watcher.period_secs = 30
instance_watcher.max_retries = 5

[default_region_allocation_strategy]
# by default, allocate without requirement for distinct sleds.
