From 717e6b0097c67087c2a840217a51a2cf303cc682 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 21 Feb 2024 21:47:52 -0800 Subject: [PATCH 1/2] "add sled" needs a longer timeout --- nexus/src/app/rack.rs | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 7a1ad0e6a9..41ca83d71b 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -61,6 +61,7 @@ use sled_agent_client::types::{ BgpConfig, BgpPeerConfig as SledBgpPeerConfig, EarlyNetworkConfig, PortConfigV1, RackNetworkConfigV1, RouteConfig as SledRouteConfig, }; +use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::collections::HashMap; @@ -647,7 +648,7 @@ impl super::Nexus { if rack.rack_subnet.is_some() { return Ok(()); } - let sa = self.get_any_sled_agent(opctx).await?; + let sa = self.get_any_sled_agent_client(opctx).await?; let result = sa .read_network_bootstore_config_cache() .await @@ -883,7 +884,24 @@ impl super::Nexus { }, }, }; - let sa = self.get_any_sled_agent(opctx).await?; + + let sa_url = self.get_any_sled_agent_url(opctx).await?; + let dur = std::time::Duration::from_secs(300); + let reqwest_client = reqwest::ClientBuilder::new() + .connect_timeout(dur) + .timeout(dur) + .build() + .map_err(|e| { + Error::internal_error(&format!( + "failed to create reqwest client for sled agent: {}", + InlineErrorChain::new(&e) + )) + })?; + let sa = sled_agent_client::Client::new_with_client( + &sa_url, + reqwest_client, + self.log.new(o!("sled_agent_url" => sa_url.clone())), + ); sa.sled_add(&req).await.map_err(|e| Error::InternalError { internal_message: format!( "failed to add sled with baseboard {:?} to rack {}: {e}", @@ -899,10 +917,10 @@ impl super::Nexus { Ok(()) } - async fn get_any_sled_agent( + async fn get_any_sled_agent_url( &self, opctx: &OpContext, - ) -> Result { + ) -> Result { let addr = self .sled_list(opctx, &DataPageParams::max_page()) .await? @@ -911,11 +929,15 @@ impl super::Nexus { internal_message: "no sled agents available".into(), })? .address(); + Ok(format!("http://{}", addr)) + } - Ok(sled_agent_client::Client::new( - &format!("http://{}", addr), - self.log.clone(), - )) + async fn get_any_sled_agent_client( + &self, + opctx: &OpContext, + ) -> Result { + let url = self.get_any_sled_agent_url(opctx).await?; + Ok(sled_agent_client::Client::new(&url, self.log.clone())) } } From f30d56b29f15288a53c3744f8b7af7e2a23931ac Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 21 Feb 2024 21:55:30 -0800 Subject: [PATCH 2/2] add a comment --- nexus/src/app/rack.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 41ca83d71b..a137f19434 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -885,8 +885,11 @@ impl super::Nexus { }, }; - let sa_url = self.get_any_sled_agent_url(opctx).await?; + // This timeout value is fairly arbitrary (as they usually are). As of + // this writing, this operation is known to take close to two minutes on + // production hardware. let dur = std::time::Duration::from_secs(300); + let sa_url = self.get_any_sled_agent_url(opctx).await?; let reqwest_client = reqwest::ClientBuilder::new() .connect_timeout(dur) .timeout(dur)