From 2afaae9d5002a1ae52b487a546c88b6dc4ac1eb6 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Fri, 21 Jun 2024 09:47:27 -0700 Subject: [PATCH] xtask download / releng / omicron-package: timeouts, keepalives, and retries (#5923) The HTTPS frontend for Buildomat has been hitting [illumos bug #16615](https://www.illumos.org/issues/16615), which resulted in a number of HTTP downloads from Buildomat artifact URLs not completing... and not timing out, either! This set of changes ensures that all of the HTTP downloads in `cargo xtask download`, `cargo xtask releng`, and `omicron-package` have timeouts and TCP keepalives configured, and have retries. Closes #5910. Closes #5911. --- dev-tools/releng/src/hubris.rs | 51 ++++++++++++++++++++++-------- dev-tools/releng/src/main.rs | 6 +++- dev-tools/xtask/src/download.rs | 45 +++++++++++++++++++------- package/src/bin/omicron-package.rs | 15 +++++++-- tools/softnpu_version | 2 ++ 5 files changed, 92 insertions(+), 27 deletions(-) create mode 100644 tools/softnpu_version diff --git a/dev-tools/releng/src/hubris.rs b/dev-tools/releng/src/hubris.rs index 685a729a9f..f46af4bfaf 100644 --- a/dev-tools/releng/src/hubris.rs +++ b/dev-tools/releng/src/hubris.rs @@ -14,12 +14,17 @@ use omicron_common::api::external::SemverVersion; use omicron_common::api::internal::nexus::KnownArtifactKind; use semver::Version; use serde::Deserialize; +use slog::warn; +use slog::Logger; use tufaceous_lib::assemble::DeserializedArtifactData; use tufaceous_lib::assemble::DeserializedArtifactSource; use tufaceous_lib::assemble::DeserializedFileArtifactSource; use tufaceous_lib::assemble::DeserializedManifest; +use crate::RETRY_ATTEMPTS; + pub(crate) async fn fetch_hubris_artifacts( + logger: Logger, base_url: &'static str, client: reqwest::Client, manifest_list: Utf8PathBuf, @@ -43,7 +48,7 @@ pub(crate) async fn fetch_hubris_artifacts( for line in fs::read_to_string(manifest_list).await?.lines() { if let Some(hash) = line.split_whitespace().next() { - let data = fetch_hash(base_url, &client, hash).await?; + let data = fetch_hash(&logger, base_url, &client, hash).await?; let str = String::from_utf8(data).with_context(|| { format!("hubris artifact manifest {} was not UTF-8", hash) })?; @@ -85,7 +90,9 @@ pub(crate) async fn fetch_hubris_artifacts( }, ); for hash in hashes { - let data = fetch_hash(base_url, &client, &hash).await?; + let data = + fetch_hash(&logger, base_url, &client, &hash) + .await?; fs::write(output_dir.join(zip!(hash)), data).await?; } } @@ -102,21 +109,39 @@ pub(crate) async fn fetch_hubris_artifacts( } async fn fetch_hash( + logger: &Logger, base_url: &'static str, client: &reqwest::Client, hash: &str, ) -> Result> { - client - .get(format!("{}/artifact/{}", base_url, hash)) - .send() - .and_then(|response| response.json()) - .await - .with_context(|| { - format!( - "failed to fetch hubris artifact {} from {}", - hash, base_url - ) - }) + let url = format!("{}/artifact/{}", base_url, hash); + for attempt in 1..=RETRY_ATTEMPTS { + let result = client + .get(&url) + .send() + .and_then(|response| { + futures::future::ready(response.error_for_status()) + }) + .and_then(|response| response.json()) + .await + .with_context(|| { + format!( + "failed to fetch hubris artifact {} from {}", + hash, base_url + ) + }); + match result { + Ok(data) => return Ok(data), + Err(err) => { + if attempt == RETRY_ATTEMPTS { + return Err(err); + } else { + warn!(logger, "fetching {} failed, retrying: {}", url, err); + } + } + } + } + unreachable!(); } // These structs are similar to `DeserializeManifest` and friends from diff --git a/dev-tools/releng/src/main.rs b/dev-tools/releng/src/main.rs index 9bb0cd33bb..9340de4961 100644 --- a/dev-tools/releng/src/main.rs +++ b/dev-tools/releng/src/main.rs @@ -43,6 +43,8 @@ use crate::job::Jobs; /// the future. const BASE_VERSION: Version = Version::new(9, 0, 0); +const RETRY_ATTEMPTS: usize = 3; + #[derive(Debug, Clone, Copy)] enum InstallMethod { /// Unpack the tarball to `/opt/oxide/`, and install @@ -234,7 +236,8 @@ async fn main() -> Result<()> { let client = reqwest::ClientBuilder::new() .connect_timeout(Duration::from_secs(15)) - .timeout(Duration::from_secs(15)) + .timeout(Duration::from_secs(120)) + .tcp_keepalive(Duration::from_secs(60)) .build() .context("failed to build reqwest client")?; @@ -565,6 +568,7 @@ async fn main() -> Result<()> { jobs.push( format!("hubris-{}", name), hubris::fetch_hubris_artifacts( + logger.clone(), base_url, client.clone(), WORKSPACE_DIR.join(format!("tools/permslip_{}", name)), diff --git a/dev-tools/xtask/src/download.rs b/dev-tools/xtask/src/download.rs index f7583d19a0..3002837507 100644 --- a/dev-tools/xtask/src/download.rs +++ b/dev-tools/xtask/src/download.rs @@ -15,6 +15,8 @@ use slog::{info, o, warn, Drain, Logger}; use std::collections::{BTreeSet, HashMap}; use std::io::Write; use std::os::unix::fs::PermissionsExt; +use std::sync::OnceLock; +use std::time::Duration; use strum::EnumIter; use strum::IntoEnumIterator; use tar::Archive; @@ -23,6 +25,7 @@ use tokio::process::Command; const BUILDOMAT_URL: &'static str = "https://buildomat.eng.oxide.computer/public/file"; +const RETRY_ATTEMPTS: usize = 3; /// What is being downloaded? #[derive( @@ -236,7 +239,17 @@ async fn get_values_from_file( /// /// Writes the response to the file as it is received. async fn streaming_download(url: &str, path: &Utf8Path) -> Result<()> { - let mut response = reqwest::get(url).await?; + static CLIENT: OnceLock = OnceLock::new(); + + let client = CLIENT.get_or_init(|| { + reqwest::ClientBuilder::new() + .timeout(Duration::from_secs(3600)) + .tcp_keepalive(Duration::from_secs(60)) + .connect_timeout(Duration::from_secs(15)) + .build() + .unwrap() + }); + let mut response = client.get(url).send().await?.error_for_status()?; let mut tarball = tokio::fs::File::create(&path).await?; while let Some(chunk) = response.chunk().await? { tarball.write_all(chunk.as_ref()).await?; @@ -410,8 +423,22 @@ async fn download_file_and_verify( }; if do_download { - info!(log, "Downloading {path}"); - streaming_download(&url, &path).await?; + for attempt in 1..=RETRY_ATTEMPTS { + info!( + log, + "Downloading {path} (attempt {attempt}/{RETRY_ATTEMPTS})" + ); + match streaming_download(&url, &path).await { + Ok(()) => break, + Err(err) => { + if attempt == RETRY_ATTEMPTS { + return Err(err); + } else { + warn!(log, "Download failed, retrying: {err}"); + } + } + } + } } let observed_checksum = algorithm.checksum(&path).await?; @@ -787,19 +814,15 @@ impl<'a> Downloader<'a> { let destination_dir = self.output_dir.join("npuzone"); tokio::fs::create_dir_all(&destination_dir).await?; - let repo = "oxidecomputer/softnpu"; + let checksums_path = self.versions_dir.join("softnpu_version"); + let [commit, sha2] = + get_values_from_file(["COMMIT", "SHA2"], &checksums_path).await?; - // TODO: This should probably live in a separate file, but - // at the moment we're just building parity with - // "ci_download_softnpu_machinery". - let commit = "3203c51cf4473d30991b522062ac0df2e045c2f2"; + let repo = "oxidecomputer/softnpu"; let filename = "npuzone"; let base_url = format!("{BUILDOMAT_URL}/{repo}/image/{commit}"); let artifact_url = format!("{base_url}/{filename}"); - let sha2_url = format!("{base_url}/{filename}.sha256.txt"); - let sha2 = reqwest::get(sha2_url).await?.text().await?; - let sha2 = sha2.trim(); let path = destination_dir.join(filename); download_file_and_verify( diff --git a/package/src/bin/omicron-package.rs b/package/src/bin/omicron-package.rs index 09fa7ab178..a8c5508b77 100644 --- a/package/src/bin/omicron-package.rs +++ b/package/src/bin/omicron-package.rs @@ -28,7 +28,8 @@ use std::env; use std::fs::create_dir_all; use std::io::Write; use std::str::FromStr; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; +use std::time::Duration; use swrite::{swrite, SWrite}; use tokio::io::{AsyncReadExt, AsyncWriteExt, BufReader}; use tokio::process::Command; @@ -350,6 +351,8 @@ async fn download_prebuilt( expected_digest: &Vec, path: &Utf8Path, ) -> Result<()> { + static CLIENT: OnceLock = OnceLock::new(); + progress.set_message("downloading prebuilt".into()); let url = format!( "https://buildomat.eng.oxide.computer/public/file/oxidecomputer/{}/image/{}/{}", @@ -357,7 +360,15 @@ async fn download_prebuilt( commit, path.file_name().unwrap(), ); - let response = reqwest::Client::new() + let client = CLIENT.get_or_init(|| { + reqwest::ClientBuilder::new() + .timeout(Duration::from_secs(3600)) + .tcp_keepalive(Duration::from_secs(60)) + .connect_timeout(Duration::from_secs(15)) + .build() + .unwrap() + }); + let response = client .get(&url) .send() .await diff --git a/tools/softnpu_version b/tools/softnpu_version new file mode 100644 index 0000000000..03f74d8865 --- /dev/null +++ b/tools/softnpu_version @@ -0,0 +1,2 @@ +COMMIT="3203c51cf4473d30991b522062ac0df2e045c2f2" +SHA2="36095c7f9d613b9208415aeb67335836a25f72eed2f7a41931ba7d91ddb00568"