Skip to content

Commit

Permalink
xtask download / releng / omicron-package: timeouts, keepalives, and …
Browse files Browse the repository at this point in the history
…retries (#5923)

The HTTPS frontend for Buildomat has been hitting [illumos bug
#16615](https://www.illumos.org/issues/16615), which resulted in a
number of HTTP downloads from Buildomat artifact URLs not completing...
and not timing out, either!

This set of changes ensures that all of the HTTP downloads in `cargo
xtask download`, `cargo xtask releng`, and `omicron-package` have
timeouts and TCP keepalives configured, and have retries.

Closes #5910. Closes #5911.
  • Loading branch information
iliana authored Jun 21, 2024
1 parent e9c2432 commit 2afaae9
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 27 deletions.
51 changes: 38 additions & 13 deletions dev-tools/releng/src/hubris.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,17 @@ use omicron_common::api::external::SemverVersion;
use omicron_common::api::internal::nexus::KnownArtifactKind;
use semver::Version;
use serde::Deserialize;
use slog::warn;
use slog::Logger;
use tufaceous_lib::assemble::DeserializedArtifactData;
use tufaceous_lib::assemble::DeserializedArtifactSource;
use tufaceous_lib::assemble::DeserializedFileArtifactSource;
use tufaceous_lib::assemble::DeserializedManifest;

use crate::RETRY_ATTEMPTS;

pub(crate) async fn fetch_hubris_artifacts(
logger: Logger,
base_url: &'static str,
client: reqwest::Client,
manifest_list: Utf8PathBuf,
Expand All @@ -43,7 +48,7 @@ pub(crate) async fn fetch_hubris_artifacts(

for line in fs::read_to_string(manifest_list).await?.lines() {
if let Some(hash) = line.split_whitespace().next() {
let data = fetch_hash(base_url, &client, hash).await?;
let data = fetch_hash(&logger, base_url, &client, hash).await?;
let str = String::from_utf8(data).with_context(|| {
format!("hubris artifact manifest {} was not UTF-8", hash)
})?;
Expand Down Expand Up @@ -85,7 +90,9 @@ pub(crate) async fn fetch_hubris_artifacts(
},
);
for hash in hashes {
let data = fetch_hash(base_url, &client, &hash).await?;
let data =
fetch_hash(&logger, base_url, &client, &hash)
.await?;
fs::write(output_dir.join(zip!(hash)), data).await?;
}
}
Expand All @@ -102,21 +109,39 @@ pub(crate) async fn fetch_hubris_artifacts(
}

async fn fetch_hash(
logger: &Logger,
base_url: &'static str,
client: &reqwest::Client,
hash: &str,
) -> Result<Vec<u8>> {
client
.get(format!("{}/artifact/{}", base_url, hash))
.send()
.and_then(|response| response.json())
.await
.with_context(|| {
format!(
"failed to fetch hubris artifact {} from {}",
hash, base_url
)
})
let url = format!("{}/artifact/{}", base_url, hash);
for attempt in 1..=RETRY_ATTEMPTS {
let result = client
.get(&url)
.send()
.and_then(|response| {
futures::future::ready(response.error_for_status())
})
.and_then(|response| response.json())
.await
.with_context(|| {
format!(
"failed to fetch hubris artifact {} from {}",
hash, base_url
)
});
match result {
Ok(data) => return Ok(data),
Err(err) => {
if attempt == RETRY_ATTEMPTS {
return Err(err);
} else {
warn!(logger, "fetching {} failed, retrying: {}", url, err);
}
}
}
}
unreachable!();
}

// These structs are similar to `DeserializeManifest` and friends from
Expand Down
6 changes: 5 additions & 1 deletion dev-tools/releng/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ use crate::job::Jobs;
/// the future.
const BASE_VERSION: Version = Version::new(9, 0, 0);

const RETRY_ATTEMPTS: usize = 3;

#[derive(Debug, Clone, Copy)]
enum InstallMethod {
/// Unpack the tarball to `/opt/oxide/<service-name>`, and install
Expand Down Expand Up @@ -234,7 +236,8 @@ async fn main() -> Result<()> {

let client = reqwest::ClientBuilder::new()
.connect_timeout(Duration::from_secs(15))
.timeout(Duration::from_secs(15))
.timeout(Duration::from_secs(120))
.tcp_keepalive(Duration::from_secs(60))
.build()
.context("failed to build reqwest client")?;

Expand Down Expand Up @@ -565,6 +568,7 @@ async fn main() -> Result<()> {
jobs.push(
format!("hubris-{}", name),
hubris::fetch_hubris_artifacts(
logger.clone(),
base_url,
client.clone(),
WORKSPACE_DIR.join(format!("tools/permslip_{}", name)),
Expand Down
45 changes: 34 additions & 11 deletions dev-tools/xtask/src/download.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ use slog::{info, o, warn, Drain, Logger};
use std::collections::{BTreeSet, HashMap};
use std::io::Write;
use std::os::unix::fs::PermissionsExt;
use std::sync::OnceLock;
use std::time::Duration;
use strum::EnumIter;
use strum::IntoEnumIterator;
use tar::Archive;
Expand All @@ -23,6 +25,7 @@ use tokio::process::Command;

const BUILDOMAT_URL: &'static str =
"https://buildomat.eng.oxide.computer/public/file";
const RETRY_ATTEMPTS: usize = 3;

/// What is being downloaded?
#[derive(
Expand Down Expand Up @@ -236,7 +239,17 @@ async fn get_values_from_file<const N: usize>(
///
/// Writes the response to the file as it is received.
async fn streaming_download(url: &str, path: &Utf8Path) -> Result<()> {
let mut response = reqwest::get(url).await?;
static CLIENT: OnceLock<reqwest::Client> = OnceLock::new();

let client = CLIENT.get_or_init(|| {
reqwest::ClientBuilder::new()
.timeout(Duration::from_secs(3600))
.tcp_keepalive(Duration::from_secs(60))
.connect_timeout(Duration::from_secs(15))
.build()
.unwrap()
});
let mut response = client.get(url).send().await?.error_for_status()?;
let mut tarball = tokio::fs::File::create(&path).await?;
while let Some(chunk) = response.chunk().await? {
tarball.write_all(chunk.as_ref()).await?;
Expand Down Expand Up @@ -410,8 +423,22 @@ async fn download_file_and_verify(
};

if do_download {
info!(log, "Downloading {path}");
streaming_download(&url, &path).await?;
for attempt in 1..=RETRY_ATTEMPTS {
info!(
log,
"Downloading {path} (attempt {attempt}/{RETRY_ATTEMPTS})"
);
match streaming_download(&url, &path).await {
Ok(()) => break,
Err(err) => {
if attempt == RETRY_ATTEMPTS {
return Err(err);
} else {
warn!(log, "Download failed, retrying: {err}");
}
}
}
}
}

let observed_checksum = algorithm.checksum(&path).await?;
Expand Down Expand Up @@ -787,19 +814,15 @@ impl<'a> Downloader<'a> {
let destination_dir = self.output_dir.join("npuzone");
tokio::fs::create_dir_all(&destination_dir).await?;

let repo = "oxidecomputer/softnpu";
let checksums_path = self.versions_dir.join("softnpu_version");
let [commit, sha2] =
get_values_from_file(["COMMIT", "SHA2"], &checksums_path).await?;

// TODO: This should probably live in a separate file, but
// at the moment we're just building parity with
// "ci_download_softnpu_machinery".
let commit = "3203c51cf4473d30991b522062ac0df2e045c2f2";
let repo = "oxidecomputer/softnpu";

let filename = "npuzone";
let base_url = format!("{BUILDOMAT_URL}/{repo}/image/{commit}");
let artifact_url = format!("{base_url}/{filename}");
let sha2_url = format!("{base_url}/{filename}.sha256.txt");
let sha2 = reqwest::get(sha2_url).await?.text().await?;
let sha2 = sha2.trim();

let path = destination_dir.join(filename);
download_file_and_verify(
Expand Down
15 changes: 13 additions & 2 deletions package/src/bin/omicron-package.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ use std::env;
use std::fs::create_dir_all;
use std::io::Write;
use std::str::FromStr;
use std::sync::Arc;
use std::sync::{Arc, OnceLock};
use std::time::Duration;
use swrite::{swrite, SWrite};
use tokio::io::{AsyncReadExt, AsyncWriteExt, BufReader};
use tokio::process::Command;
Expand Down Expand Up @@ -350,14 +351,24 @@ async fn download_prebuilt(
expected_digest: &Vec<u8>,
path: &Utf8Path,
) -> Result<()> {
static CLIENT: OnceLock<reqwest::Client> = OnceLock::new();

progress.set_message("downloading prebuilt".into());
let url = format!(
"https://buildomat.eng.oxide.computer/public/file/oxidecomputer/{}/image/{}/{}",
repo,
commit,
path.file_name().unwrap(),
);
let response = reqwest::Client::new()
let client = CLIENT.get_or_init(|| {
reqwest::ClientBuilder::new()
.timeout(Duration::from_secs(3600))
.tcp_keepalive(Duration::from_secs(60))
.connect_timeout(Duration::from_secs(15))
.build()
.unwrap()
});
let response = client
.get(&url)
.send()
.await
Expand Down
2 changes: 2 additions & 0 deletions tools/softnpu_version
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
COMMIT="3203c51cf4473d30991b522062ac0df2e045c2f2"
SHA2="36095c7f9d613b9208415aeb67335836a25f72eed2f7a41931ba7d91ddb00568"

0 comments on commit 2afaae9

Please sign in to comment.