diff --git a/wicket-common/src/lib.rs b/wicket-common/src/lib.rs index 9e92d20c0a..aea0634ce7 100644 --- a/wicket-common/src/lib.rs +++ b/wicket-common/src/lib.rs @@ -4,6 +4,13 @@ // Copyright 2023 Oxide Computer Company +use std::time::Duration; + pub mod rack_setup; pub mod rack_update; pub mod update_events; + +// WICKETD_TIMEOUT used to be 1 second, but that might be too short (and in +// particular might be responsible for +// https://github.com/oxidecomputer/omicron/issues/3103). +pub const WICKETD_TIMEOUT: Duration = Duration::from_secs(5); diff --git a/wicket/src/cli/rack_update.rs b/wicket/src/cli/rack_update.rs index f539c22c35..fa41fa7b8c 100644 --- a/wicket/src/cli/rack_update.rs +++ b/wicket/src/cli/rack_update.rs @@ -22,6 +22,7 @@ use update_engine::{ }; use wicket_common::{ rack_update::ClearUpdateStateResponse, update_events::EventReport, + WICKETD_TIMEOUT, }; use wicketd_client::types::{ClearUpdateStateParams, StartUpdateParams}; @@ -31,7 +32,7 @@ use crate::{ parse_event_report_map, ComponentId, CreateClearUpdateStateOptions, CreateStartUpdateOptions, }, - wicketd::{create_wicketd_client, WICKETD_TIMEOUT}, + wicketd::create_wicketd_client, }; use super::command::CommandOutput; diff --git a/wicket/src/wicketd.rs b/wicket/src/wicketd.rs index ec1130a594..a951bf428b 100644 --- a/wicket/src/wicketd.rs +++ b/wicket/src/wicketd.rs @@ -10,6 +10,7 @@ use std::net::SocketAddrV6; use tokio::sync::mpsc::{self, Sender, UnboundedSender}; use tokio::time::{interval, Duration, MissedTickBehavior}; use wicket_common::rack_update::{SpIdentifier, SpType}; +use wicket_common::WICKETD_TIMEOUT; use wicketd_client::types::{ AbortUpdateOptions, ClearUpdateStateOptions, ClearUpdateStateParams, GetInventoryParams, GetInventoryResponse, GetLocationResponse, @@ -38,10 +39,6 @@ impl From<ComponentId> for SpIdentifier { } const WICKETD_POLL_INTERVAL: Duration = Duration::from_millis(500); -// WICKETD_TIMEOUT used to be 1 second, but that might be too short (and in -// particular might be 
responsible for -// https://github.com/oxidecomputer/omicron/issues/3103). -pub(crate) const WICKETD_TIMEOUT: Duration = Duration::from_secs(5); // Assume that these requests are periodic on the order of seconds or the // result of human interaction. In either case, this buffer should be plenty diff --git a/wicketd/src/http_entrypoints.rs b/wicketd/src/http_entrypoints.rs index d6cb6ebd6d..cff0436dac 100644 --- a/wicketd/src/http_entrypoints.rs +++ b/wicketd/src/http_entrypoints.rs @@ -51,6 +51,7 @@ use std::time::Duration; use tokio::io::AsyncWriteExt; use wicket_common::rack_setup::PutRssUserConfigInsensitive; use wicket_common::update_events::EventReport; +use wicket_common::WICKETD_TIMEOUT; use crate::ServerContext; @@ -896,21 +897,37 @@ async fn post_start_update( // 1. We haven't pulled its state in our inventory (most likely cause: the // cubby is empty; less likely cause: the SP is misbehaving, which will // make updating it very unlikely to work anyway) - // 2. We have pulled its state but our hardware manager says we can't update - // it (most likely cause: the target is the sled we're currently running - // on; less likely cause: our hardware manager failed to get our local - // identifying information, and it refuses to update this target out of - // an abundance of caution). + // 2. We have pulled its state but our hardware manager says we can't + // update it (most likely cause: the target is the sled we're currently + // running on; less likely cause: our hardware manager failed to get our + // local identifying information, and it refuses to update this target + // out of an abundance of caution). // - // First, get our most-recently-cached inventory view. - let inventory = match rqctx.mgs_handle.get_cached_inventory().await { - Ok(inventory) => inventory, - Err(ShutdownInProgress) => { + // First, get our most-recently-cached inventory view. 
(Only wait 80% of + // WICKETD_TIMEOUT for this -- if the inventory isn't available, then we + // should produce a useful error message rather than timing out on the + // client.) + let inventory = match tokio::time::timeout( + WICKETD_TIMEOUT.mul_f32(0.8), + rqctx.mgs_handle.get_cached_inventory(), + ) + .await + { + Ok(Ok(inventory)) => inventory, + Ok(Err(ShutdownInProgress)) => { return Err(HttpError::for_unavail( None, "Server is shutting down".into(), )); } + Err(_) => { + // Use 400 Bad Request instead of 503 Service Unavailable so that + // the error message is propagated to the client. + return Err(HttpError::for_bad_request( + None, + "Rack inventory not yet available (is MGS alive?)".into(), + )); + } }; // Error cases.