From a5be09fe0f20466ff66ae546df4895b785129fb0 Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Thu, 22 Feb 2024 11:06:14 -0800
Subject: [PATCH] "add sled" needs a longer timeout (#5116)

In #5111:

- the "add sled" Nexus external API call invokes `PUT /sleds` to some sled agent
- `PUT /sleds` itself blocks until the new sled's sled agent has started
- sled agent startup blocks on setting the reservoir
- on production hardware, setting the reservoir took 115s
- the default Progenitor (reqwest) timeout is only 15s

So as a result, the "add sled" request failed, even though the operation ultimately succeeded.

In this PR, I bump the timeout to 5 minutes. I do wonder if we should remove it altogether, or if we should consider the other changes mentioned in #5111 (like not blocking sled agent startup on this, or not blocking these API calls in this way). But for now, this seems like a low-risk way to improve this situation.
---
 nexus/src/app/rack.rs | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs
index 7a1ad0e6a9..a137f19434 100644
--- a/nexus/src/app/rack.rs
+++ b/nexus/src/app/rack.rs
@@ -61,6 +61,7 @@ use sled_agent_client::types::{
     BgpConfig, BgpPeerConfig as SledBgpPeerConfig, EarlyNetworkConfig,
     PortConfigV1, RackNetworkConfigV1, RouteConfig as SledRouteConfig,
 };
+use slog_error_chain::InlineErrorChain;
 use std::collections::BTreeMap;
 use std::collections::BTreeSet;
 use std::collections::HashMap;
@@ -647,7 +648,7 @@ impl super::Nexus {
         if rack.rack_subnet.is_some() {
             return Ok(());
         }
-        let sa = self.get_any_sled_agent(opctx).await?;
+        let sa = self.get_any_sled_agent_client(opctx).await?;
         let result = sa
             .read_network_bootstore_config_cache()
             .await
@@ -883,7 +884,27 @@ impl super::Nexus {
                 },
             },
         };
-        let sa = self.get_any_sled_agent(opctx).await?;
+
+        // This timeout value is fairly arbitrary (as they usually are). As of
+        // this writing, this operation is known to take close to two minutes on
+        // production hardware.
+        let dur = std::time::Duration::from_secs(300);
+        let sa_url = self.get_any_sled_agent_url(opctx).await?;
+        let reqwest_client = reqwest::ClientBuilder::new()
+            .connect_timeout(dur)
+            .timeout(dur)
+            .build()
+            .map_err(|e| {
+                Error::internal_error(&format!(
+                    "failed to create reqwest client for sled agent: {}",
+                    InlineErrorChain::new(&e)
+                ))
+            })?;
+        let sa = sled_agent_client::Client::new_with_client(
+            &sa_url,
+            reqwest_client,
+            self.log.new(o!("sled_agent_url" => sa_url.clone())),
+        );
         sa.sled_add(&req).await.map_err(|e| Error::InternalError {
             internal_message: format!(
                 "failed to add sled with baseboard {:?} to rack {}: {e}",
@@ -899,10 +920,10 @@ impl super::Nexus {
         Ok(())
     }
 
-    async fn get_any_sled_agent(
+    async fn get_any_sled_agent_url(
         &self,
         opctx: &OpContext,
-    ) -> Result<sled_agent_client::Client, Error> {
+    ) -> Result<String, Error> {
         let addr = self
             .sled_list(opctx, &DataPageParams::max_page())
             .await?
@@ -911,11 +932,15 @@ impl super::Nexus {
                 internal_message: "no sled agents available".into(),
             })?
             .address();
+        Ok(format!("http://{}", addr))
+    }
 
-        Ok(sled_agent_client::Client::new(
-            &format!("http://{}", addr),
-            self.log.clone(),
-        ))
+    async fn get_any_sled_agent_client(
+        &self,
+        opctx: &OpContext,
+    ) -> Result<sled_agent_client::Client, Error> {
+        let url = self.get_any_sled_agent_url(opctx).await?;
+        Ok(sled_agent_client::Client::new(&url, self.log.clone()))
     }
 }
 