Skip to content

Commit

Permalink
[sled-agent] Replace du(1) with a Rust function (#6482)
Browse files Browse the repository at this point in the history
Currently, the `zone_bundle::disk_usage` function in sled-agent shells
out to `du(1)` and parses its output. This works fine, but it's a bit
flaky, especially when running the corresponding tests on "weird"
platforms such as NixOS, where the `du(1)` binary doesn't live in
`/usr/bin/du` (see #6479).

This commit replaces the code that shells out to `/usr/bin/du` with a
pure-Rust rewrite. There's a unit test that asserts that the disk usage
calculated by the new implementation matches the value calculated by
`du(1)`, with code for handling the potential inaccuracy due to `du(1)`
operating in terms of 512B blocks on macOS.

Closes #6479
  • Loading branch information
hawkw authored Sep 3, 2024
1 parent 165d42a commit b6a098d
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 62 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions sled-agent/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ zone.workspace = true
static_assertions.workspace = true
omicron-workspace-hack.workspace = true
slog-error-chain.workspace = true
walkdir.workspace = true

[target.'cfg(target_os = "illumos")'.dependencies]
opte-ioctl.workspace = true
Expand Down
159 changes: 97 additions & 62 deletions sled-agent/src/zone_bundle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,11 @@ pub enum BundleError {

#[error("Instance is terminating")]
InstanceTerminating,

/// The `walkdir` crate's errors already include the path which could not be
/// read (if one exists), so we can just wrap them directly.
#[error(transparent)]
WalkDir(#[from] walkdir::Error),
}

// Helper function to write an array of bytes into the tar archive, with
Expand Down Expand Up @@ -1533,7 +1538,7 @@ async fn compute_bundle_utilization(
// TODO-correctness: This takes into account the directories themselves,
// and may be not quite what we want. But it is very easy and pretty
// close.
let bytes_used = disk_usage(dir).await?;
let bytes_used = dir_size(dir).await?;
debug!(log, "computed bytes used"; "bytes_used" => bytes_used);
out.insert(
dir.clone(),
Expand All @@ -1545,61 +1550,27 @@ async fn compute_bundle_utilization(

// Return the number of bytes occupied by the provided directory.
//
// This returns an error if:
//
// - The "du" command fails
// - Parsing stdout fails
// - Parsing the actual size as a u64 fails
async fn disk_usage(path: &Utf8PathBuf) -> Result<u64, BundleError> {
// Each OS implements slightly different `du` options.
//
// Linux and illumos support the "apparent" size in bytes, though using
// different options. macOS doesn't support bytes at all, and has a minimum
// block size of 512.
//
// We'll suffer the lower resolution on macOS, and get higher resolution on
// the others.
cfg_if::cfg_if! {
if #[cfg(target_os = "illumos")] {
const BLOCK_SIZE: u64 = 1;
const DU_ARG: &str = "-A";
} else if #[cfg(target_os = "linux")] {
const BLOCK_SIZE: u64 = 1;
const DU_ARG: &str = "-b";
} else if #[cfg(target_os = "macos")] {
const BLOCK_SIZE: u64 = 512;
const DU_ARG: &str = "-k";
} else {
compile_error!("unsupported target OS");
// This returns an error if reading any file or directory within the provided
// directory fails.
async fn dir_size(path: &Utf8PathBuf) -> Result<u64, BundleError> {
let path = path.clone();
// We could, alternatively, just implement this using `tokio::fs` to read
// the directory and so on. However, the `tokio::fs` APIs are basically just
// a wrapper around `spawn_blocking`-ing code that does the `std::fs`
// functions. So, putting the whole thing in one big `spawn_blocking`
// closure that just does all the blocking fs ops lets us spend less time
// creating and destroying tiny blocking tasks.
tokio::task::spawn_blocking(move || {
let mut sum = 0;
for entry in walkdir::WalkDir::new(&path).into_iter() {
let entry = entry?;
sum += entry.metadata()?.len()
}
}
const DU: &str = "/usr/bin/du";
let args = &[DU_ARG, "-s", path.as_str()];
let output = Command::new(DU).args(args).output().await.map_err(|err| {
BundleError::Command { cmd: format!("{DU} {}", args.join(" ")), err }
})?;
let err = |msg: &str| {
BundleError::Cleanup(anyhow!(
"failed to fetch disk usage for {}: {}",
path,
msg,
))
};
if !output.status.success() {
return Err(err("du command failed"));
}
let Ok(s) = std::str::from_utf8(&output.stdout) else {
return Err(err("non-UTF8 stdout"));
};
let Some(line) = s.lines().next() else {
return Err(err("no lines in du output"));
};
let Some(part) = line.trim().split_ascii_whitespace().next() else {
return Err(err("no disk usage size computed in output"));
};
part.parse()
.map(|x: u64| x.saturating_mul(BLOCK_SIZE))
.map_err(|_| err("failed to parse du output"))

Ok(sum)
})
.await
.expect("spawned blocking task should not be cancelled or panic")
}

// Return the quota for a ZFS dataset, or the available size.
Expand Down Expand Up @@ -1659,22 +1630,86 @@ async fn zfs_quota(path: &Utf8PathBuf) -> Result<u64, BundleError> {

#[cfg(test)]
mod tests {
use super::disk_usage;
use super::Utf8PathBuf;
use super::*;

#[tokio::test]
async fn test_disk_usage() {
async fn test_dir_size() {
let path =
Utf8PathBuf::from(concat!(env!("CARGO_MANIFEST_DIR"), "/src"));
let usage = disk_usage(&path).await.unwrap();
// Run `du -As /path/to/omicron/sled-agent/src`, which currently shows this
// directory is ~450 KiB.
let usage = dir_size(&path).await.unwrap();
assert!(
usage >= 1024 * 400,
"sled-agent manifest directory disk usage not correct?"
);
let path = Utf8PathBuf::from("/some/nonexistent/path");
assert!(disk_usage(&path).await.is_err());
assert!(dir_size(&path).await.is_err());
}

#[tokio::test]
// Different operating systems ship slightly different versions of `du(1)`,
// with differing behaviors. We really only care that the `dir_size`
// function behaves the same as the illumos `du(1)`, so skip this test on
// other systems.
#[cfg_attr(not(target_os = "illumos"), ignore)]
async fn test_dir_size_matches_du() {
const DU: &str = "du";
async fn dir_size_du(path: &Utf8PathBuf) -> Result<u64, BundleError> {
let args = &["-A", "-s", path.as_str()];
let output =
Command::new(DU).args(args).output().await.map_err(|err| {
BundleError::Command {
cmd: format!("{DU} {}", args.join(" ")),
err,
}
})?;
let err = |msg: &str| {
BundleError::Cleanup(anyhow!(
"failed to fetch disk usage for {}: {}",
path,
msg,
))
};
if !output.status.success() {
return Err(err("du command failed"));
}
let Ok(s) = std::str::from_utf8(&output.stdout) else {
return Err(err("non-UTF8 stdout"));
};
let Some(line) = s.lines().next() else {
return Err(err("no lines in du output"));
};
let Some(part) = line.trim().split_ascii_whitespace().next() else {
return Err(err("no disk usage size computed in output"));
};
part.parse().map_err(|_| err("failed to parse du output"))
}

let du_output =
Command::new(DU).arg("--version").output().await.unwrap();
eprintln!(
"du --version:\n{}\n",
String::from_utf8_lossy(&du_output.stdout)
);

let path =
Utf8PathBuf::from(concat!(env!("CARGO_MANIFEST_DIR"), "/src"));
let t0 = std::time::Instant::now();
let usage =
dbg!(dir_size(&path).await).expect("disk usage for src dir failed");
eprintln!("dir_size({path}) took {:?}", t0.elapsed());

let t0 = std::time::Instant::now();
// Run `du -As /path/to/omicron/sled-agent/src`, which currently shows this
// directory is ~450 KiB.
let du_usage =
dbg!(dir_size_du(&path).await).expect("running du failed!");
eprintln!("du -s {path} took {:?}", t0.elapsed());

assert_eq!(
usage, du_usage,
"expected `dir_size({path})` to == `du -s {path}`\n\
{usage}B (`dir_size`) != {du_usage}B (`du`)",
);
}
}

Expand Down

0 comments on commit b6a098d

Please sign in to comment.