From fadecf77e26fca6c31a1a10fb2887f361479e089 Mon Sep 17 00:00:00 2001 From: Andy Fiddaman Date: Tue, 7 Nov 2023 11:45:44 +0000 Subject: [PATCH 01/18] Ensure required subdirectories of /var/fm/fmd exist (#4445) When a blank ZFS dataset is created to back /var/fm/fmd, the fault management daemon creates the files it needs but it does not create subdirectories that are required to store data. sled-agent needs to ensure these exist after mounting the dataset Fixes #4444 --- sled-agent/src/backing_fs.rs | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/sled-agent/src/backing_fs.rs b/sled-agent/src/backing_fs.rs index 5014ac5999..6ecb9dac43 100644 --- a/sled-agent/src/backing_fs.rs +++ b/sled-agent/src/backing_fs.rs @@ -25,6 +25,7 @@ use camino::Utf8PathBuf; use illumos_utils::zfs::{ EnsureFilesystemError, GetValueError, Mountpoint, SizeDetails, Zfs, }; +use std::io; #[derive(Debug, thiserror::Error)] pub enum BackingFsError { @@ -36,9 +37,12 @@ pub enum BackingFsError { #[error("Error initializing dataset: {0}")] Mount(#[from] EnsureFilesystemError), + + #[error("Failed to ensure subdirectory {0}")] + EnsureSubdir(#[from] io::Error), } -struct BackingFs { +struct BackingFs<'a> { // Dataset name name: &'static str, // Mountpoint @@ -49,9 +53,11 @@ struct BackingFs { compression: Option<&'static str>, // Linked service service: Option<&'static str>, + // Subdirectories to ensure + subdirs: Option<&'a [&'static str]>, } -impl BackingFs { +impl<'a> BackingFs<'a> { const fn new(name: &'static str) -> Self { Self { name, @@ -59,6 +65,7 @@ impl BackingFs { quota: None, compression: None, service: None, + subdirs: None, } } @@ -81,10 +88,16 @@ impl BackingFs { self.service = Some(service); self } + + const fn subdirs(mut self, subdirs: &'a [&'static str]) -> Self { + self.subdirs = Some(subdirs); + self + } } const BACKING_FMD_DATASET: &'static str = "fmd"; const BACKING_FMD_MOUNTPOINT: &'static str = "/var/fm/fmd"; +const BACKING_FMD_SUBDIRS: [&'static str; 3] = ["rsrc", "ckpt", "xprt"]; const BACKING_FMD_SERVICE: &'static str = "svc:/system/fmd:default"; const BACKING_FMD_QUOTA: usize = 500 * (1 << 20); // 500 MiB @@ -94,6 +107,7 @@ const BACKINGFS_COUNT: usize = 1; static BACKINGFS: [BackingFs; BACKINGFS_COUNT] = [BackingFs::new(BACKING_FMD_DATASET) .mountpoint(BACKING_FMD_MOUNTPOINT) + .subdirs(&BACKING_FMD_SUBDIRS) .quota(BACKING_FMD_QUOTA) .compression(BACKING_COMPRESSION) .service(BACKING_FMD_SERVICE)]; @@ -165,6 +179,15 @@ pub(crate) fn ensure_backing_fs( Zfs::mount_overlay_dataset(&dataset, &mountpoint)?; + if let Some(subdirs) = bfs.subdirs { + for dir in subdirs { + let subdir = format!("{}/{}", mountpoint, dir); + + info!(log, "Ensuring directory {}", subdir); + std::fs::create_dir_all(subdir)?; + } + } + if let Some(service) = bfs.service { info!(log, "Starting service {}", service); smf::Adm::new() From 1264772ebb6c0cfd5f54677c81b69939997b4431 Mon Sep 17 00:00:00 2001 From: Augustus Mayo Date: Tue, 7 Nov 2023 07:22:26 -0600 Subject: [PATCH 02/18] Fix newline output in mgd update script (#4447) --- tools/update_maghemite.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/update_maghemite.sh b/tools/update_maghemite.sh index eebece1aa5..db7e482776 100755 --- a/tools/update_maghemite.sh +++ b/tools/update_maghemite.sh @@ -59,7 +59,7 @@ function update_mgd { fi echo "Updating Maghemite mgd from: $TARGET_COMMIT" set -x - echo "$OUTPUT\n$OUTPUT_LINUX" > $MGD_PATH + printf "$OUTPUT\n$OUTPUT_LINUX" > $MGD_PATH set +x } From 
e831c6a84731f4c7cdf98ef4b31c08ec8c99d3af Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Tue, 7 Nov 2023 09:35:33 -0800 Subject: [PATCH 03/18] fix(deps): update rust crate cargo_toml to 0.17 (#4457) --- Cargo.lock | 4 ++-- dev-tools/xtask/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cc10947083..fcc788f19e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -874,9 +874,9 @@ dependencies = [ [[package]] name = "cargo_toml" -version = "0.16.3" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3f9629bc6c4388ea699781dc988c2b99766d7679b151c81990b4fa1208fafd3" +checksum = "6ca592ad99e6a0fd4b95153406138b997cc26ccd3cd0aecdfd4fbdbf1519bd77" dependencies = [ "serde", "toml 0.8.6", diff --git a/dev-tools/xtask/Cargo.toml b/dev-tools/xtask/Cargo.toml index 30ffdc416b..d054d85646 100644 --- a/dev-tools/xtask/Cargo.toml +++ b/dev-tools/xtask/Cargo.toml @@ -7,6 +7,6 @@ license = "MPL-2.0" [dependencies] anyhow.workspace = true camino.workspace = true -cargo_toml = "0.16" +cargo_toml = "0.17" cargo_metadata = "0.18" clap.workspace = true From 4a95ef5d60ac75dc4c3e28c1dd2e94eca1026896 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Tue, 7 Nov 2023 09:36:44 -0800 Subject: [PATCH 04/18] chore(deps): update swatinem/rust-cache action to v2.7.1 (#4455) --- .github/workflows/rust.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 8cc98f192f..1b3a892338 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -28,7 +28,7 @@ jobs: - name: Disable packages.microsoft.com repo run: sudo rm -f /etc/apt/sources.list.d/microsoft-prod.list - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 - - uses: Swatinem/rust-cache@6fd3edff6979b79f87531400ad694fb7f2c84b1f # v2.2.1 + - uses: Swatinem/rust-cache@3cf7f8cc28d1b4e7d01e3783be10a97d55d483c8 # v2.7.1 if: ${{ github.ref != 'refs/heads/main' }} - name: Report cargo version run: cargo --version @@ -54,7 +54,7 @@ jobs: - name: Disable packages.microsoft.com repo run: sudo rm -f /etc/apt/sources.list.d/microsoft-prod.list - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 - - uses: Swatinem/rust-cache@6fd3edff6979b79f87531400ad694fb7f2c84b1f # v2.2.1 + - uses: Swatinem/rust-cache@3cf7f8cc28d1b4e7d01e3783be10a97d55d483c8 # v2.7.1 if: ${{ github.ref != 'refs/heads/main' }} - name: Report cargo version run: cargo --version @@ -80,7 +80,7 @@ jobs: - name: Disable packages.microsoft.com repo run: sudo rm -f /etc/apt/sources.list.d/microsoft-prod.list - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0 - - uses: Swatinem/rust-cache@6fd3edff6979b79f87531400ad694fb7f2c84b1f # v2.2.1 + - uses: Swatinem/rust-cache@3cf7f8cc28d1b4e7d01e3783be10a97d55d483c8 # v2.7.1 if: ${{ github.ref != 'refs/heads/main' }} - name: Report cargo version run: cargo --version From 7696cb18985a1232bde8991bb5fbe467d0a1f892 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Tue, 7 Nov 2023 09:37:36 -0800 Subject: [PATCH 05/18] chore(deps): update rust crate toml to 0.8.8 (#4453) --- Cargo.lock | 63 ++++++++++++++++++++++++++++++++---------------------- Cargo.toml | 2 +- 2 files changed, 38 insertions(+), 27 deletions(-) diff --git 
a/Cargo.lock b/Cargo.lock index fcc788f19e..b7c1bb70bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -879,7 +879,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ca592ad99e6a0fd4b95153406138b997cc26ccd3cd0aecdfd4fbdbf1519bd77" dependencies = [ "serde", - "toml 0.8.6", + "toml 0.8.8", ] [[package]] @@ -1439,7 +1439,7 @@ dependencies = [ "tokio", "tokio-rustls", "tokio-util", - "toml 0.8.6", + "toml 0.8.8", "tracing", "usdt", "uuid", @@ -1497,7 +1497,7 @@ dependencies = [ "tempfile", "thiserror", "tokio-rustls", - "toml 0.8.6", + "toml 0.8.8", "twox-hash", "uuid", "vergen", @@ -1767,7 +1767,7 @@ dependencies = [ "slog", "thiserror", "tokio", - "toml 0.8.6", + "toml 0.8.8", ] [[package]] @@ -2040,7 +2040,7 @@ dependencies = [ "tempfile", "thiserror", "tokio", - "toml 0.8.6", + "toml 0.8.8", "trust-dns-client", "trust-dns-proto", "trust-dns-resolver", @@ -2124,7 +2124,7 @@ dependencies = [ "serde", "serde_json", "slog", - "toml 0.8.6", + "toml 0.8.8", "uuid", ] @@ -2305,7 +2305,7 @@ dependencies = [ "russh-keys", "serde_json", "tokio", - "toml 0.8.6", + "toml 0.8.8", "trust-dns-resolver", "uuid", ] @@ -3400,7 +3400,7 @@ dependencies = [ "smf", "thiserror", "tokio", - "toml 0.8.6", + "toml 0.8.8", "uuid", "zone", ] @@ -3518,7 +3518,7 @@ dependencies = [ "thiserror", "tokio", "tokio-stream", - "toml 0.8.6", + "toml 0.8.8", "tufaceous-lib", "update-engine", "uuid", @@ -4143,7 +4143,7 @@ dependencies = [ "slog", "thiserror", "tokio", - "toml 0.8.6", + "toml 0.8.8", ] [[package]] @@ -4437,7 +4437,7 @@ dependencies = [ "thiserror", "tokio", "tokio-postgres", - "toml 0.8.6", + "toml 0.8.8", "usdt", "uuid", ] @@ -4911,7 +4911,7 @@ dependencies = [ "thiserror", "tokio", "tokio-postgres", - "toml 0.8.6", + "toml 0.8.8", "uuid", ] @@ -4967,7 +4967,7 @@ dependencies = [ "serde", "serde_derive", "thiserror", - "toml 0.8.6", + "toml 0.8.8", ] [[package]] @@ -5001,7 +5001,7 @@ dependencies = [ "subprocess", "tokio", "tokio-postgres", - "toml 0.8.6", + "toml 0.8.8", ] [[package]] @@ -5045,7 +5045,7 @@ dependencies = [ "tokio-stream", "tokio-tungstenite 0.18.0", "tokio-util", - "toml 0.8.6", + "toml 0.8.8", "uuid", ] @@ -5156,7 +5156,7 @@ dependencies = [ "thiserror", "tokio", "tokio-postgres", - "toml 0.8.6", + "toml 0.8.8", "tough", "trust-dns-resolver", "usdt", @@ -5239,7 +5239,7 @@ dependencies = [ "tempfile", "thiserror", "tokio", - "toml 0.8.6", + "toml 0.8.8", "topological-sort", "walkdir", ] @@ -5359,7 +5359,7 @@ dependencies = [ "tofino", "tokio", "tokio-tungstenite 0.18.0", - "toml 0.8.6", + "toml 0.8.8", "usdt", "uuid", "zeroize", @@ -5814,7 +5814,7 @@ dependencies = [ "subprocess", "thiserror", "tokio", - "toml 0.8.6", + "toml 0.8.8", "uuid", ] @@ -8459,7 +8459,7 @@ dependencies = [ "sprockets-rot", "thiserror", "tokio", - "toml 0.8.6", + "toml 0.8.8", ] [[package]] @@ -9226,14 +9226,14 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.6" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ff9e3abce27ee2c9a37f9ad37238c1bdd4e789c84ba37df76aa4d528f5072cc" +checksum = "a1a195ec8c9da26928f773888e0742ca3ca1040c6cd859c919c9f59c1954ab35" dependencies = [ "serde", "serde_spanned", "toml_datetime", - "toml_edit 0.20.7", + "toml_edit 0.21.0", ] [[package]] @@ -9263,6 +9263,17 @@ name = "toml_edit" version = "0.20.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70f427fce4d84c72b5b732388bf4a9f4531b53f74e2887e3ecb2481f68f66d81" +dependencies = [ + "indexmap 2.1.0", + "toml_datetime", + 
"winnow", +] + +[[package]] +name = "toml_edit" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d34d383cd00a163b4a5b85053df514d45bc330f6de7737edfe0a93311d1eaa03" dependencies = [ "indexmap 2.1.0", "serde", @@ -9517,7 +9528,7 @@ dependencies = [ "sha2", "slog", "tar", - "toml 0.8.6", + "toml 0.8.8", "tough", "url", "zip", @@ -10137,7 +10148,7 @@ dependencies = [ "textwrap 0.16.0", "tokio", "tokio-util", - "toml 0.8.6", + "toml 0.8.8", "toml_edit 0.20.7", "tui-tree-widget", "unicode-width", @@ -10245,7 +10256,7 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "toml 0.8.6", + "toml 0.8.8", "tough", "trust-dns-resolver", "tufaceous", diff --git a/Cargo.toml b/Cargo.toml index 63b18a9f9a..2680cf9193 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -363,7 +363,7 @@ tokio-postgres = { version = "0.7", features = [ "with-chrono-0_4", "with-uuid-1 tokio-stream = "0.1.14" tokio-tungstenite = "0.18" tokio-util = "0.7.10" -toml = "0.8.6" +toml = "0.8.8" toml_edit = "0.20.7" topological-sort = "0.2.2" tough = { version = "0.14", features = [ "http" ] } From 5b6f7df40250d994d8085d0c009b64615ffe7299 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Tue, 7 Nov 2023 11:02:16 -0800 Subject: [PATCH 06/18] Bump system version to 1.0.4 (#4437) In preparation for the next release, bump the system version. --- .github/buildomat/jobs/package.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/buildomat/jobs/package.sh b/.github/buildomat/jobs/package.sh index 5cfb6fcfd7..86937d1908 100755 --- a/.github/buildomat/jobs/package.sh +++ b/.github/buildomat/jobs/package.sh @@ -37,7 +37,7 @@ rustc --version # trampoline global zone images. # COMMIT=$(git rev-parse HEAD) -VERSION="1.0.3-0.ci+git${COMMIT:0:11}" +VERSION="1.0.4-0.ci+git${COMMIT:0:11}" echo "$VERSION" >/work/version.txt ptime -m ./tools/install_builder_prerequisites.sh -yp From bd05e19a0df601986d5c6346599a7931f9ba5595 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karen=20C=C3=A1rcamo?= Date: Wed, 8 Nov 2023 09:02:59 +1300 Subject: [PATCH 07/18] Implement ClickHouse replicated cluster option for simulated omicron (#4422) New `--replicated` flag for the `omicron-dev ch-run` command. This will allow us to spin up a ClickHouse replicated cluster containing 2 replicas and 3 keepers. 
```console $ cargo run --bin omicron-dev -- ch-run --replicated Finished dev [unoptimized + debuginfo] target(s) in 0.31s Running `target/debug/omicron-dev ch-run --replicated` omicron-dev: running ClickHouse cluster with configuration files: replicas: /home/{user}/src/omicron/oximeter/db/src/configs/replica_config.xml keepers: /home/{user}/src/omicron/oximeter/db/src/configs/keeper_config.xml omicron-dev: ClickHouse cluster is running with PIDs: 1113482, 1113681, 1113387, 1113451, 1113419 omicron-dev: ClickHouse HTTP servers listening on ports: 8123, 8124 omicron-dev: using /tmp/.tmpFH6v8h and /tmp/.tmpkUjDji for ClickHouse data storage ``` Related: https://github.com/oxidecomputer/omicron/issues/4148 --- dev-tools/omicron-dev/src/bin/omicron-dev.rs | 126 ++++++++++++++++++- docs/how-to-run-simulated.adoc | 14 +++ oximeter/db/src/client.rs | 9 +- test-utils/src/dev/clickhouse.rs | 53 ++++---- 4 files changed, 177 insertions(+), 25 deletions(-) diff --git a/dev-tools/omicron-dev/src/bin/omicron-dev.rs b/dev-tools/omicron-dev/src/bin/omicron-dev.rs index 66778d96e7..73d698f8d3 100644 --- a/dev-tools/omicron-dev/src/bin/omicron-dev.rs +++ b/dev-tools/omicron-dev/src/bin/omicron-dev.rs @@ -265,16 +265,28 @@ struct ChRunArgs { /// The HTTP port on which the server will listen #[clap(short, long, default_value = "8123", action)] port: u16, + /// Starts a ClickHouse replicated cluster of 2 replicas and 3 keeper nodes + #[clap(long, conflicts_with = "port", action)] + replicated: bool, } async fn cmd_clickhouse_run(args: &ChRunArgs) -> Result<(), anyhow::Error> { + if args.replicated { + start_replicated_cluster().await?; + } else { + start_single_node(args.port).await?; + } + Ok(()) +} + +async fn start_single_node(port: u16) -> Result<(), anyhow::Error> { // Start a stream listening for SIGINT let signals = Signals::new(&[SIGINT]).expect("failed to wait for SIGINT"); let mut signal_stream = signals.fuse(); // Start the database server process, possibly on a specific port let mut db_instance = - dev::clickhouse::ClickHouseInstance::new_single_node(args.port).await?; + dev::clickhouse::ClickHouseInstance::new_single_node(port).await?; println!( "omicron-dev: running ClickHouse with full command:\n\"clickhouse {}\"", db_instance.cmdline().join(" ") @@ -320,6 +332,118 @@ async fn cmd_clickhouse_run(args: &ChRunArgs) -> Result<(), anyhow::Error> { Ok(()) } +async fn start_replicated_cluster() -> Result<(), anyhow::Error> { + // Start a stream listening for SIGINT + let signals = Signals::new(&[SIGINT]).expect("failed to wait for SIGINT"); + let mut signal_stream = signals.fuse(); + + // Start the database server and keeper processes + let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let replica_config = manifest_dir + .as_path() + .join("../../oximeter/db/src/configs/replica_config.xml"); + let keeper_config = manifest_dir + .as_path() + .join("../../oximeter/db/src/configs/keeper_config.xml"); + + let mut cluster = + dev::clickhouse::ClickHouseCluster::new(replica_config, keeper_config) + .await?; + println!( + "omicron-dev: running ClickHouse cluster with configuration files:\n \ + replicas: {}\n keepers: {}", + cluster.replica_config_path().display(), + cluster.keeper_config_path().display() + ); + let pid_error_msg = "Failed to get process PID, it may not have started"; + println!( + "omicron-dev: ClickHouse cluster is running with: server PIDs = [{}, {}] \ + and keeper PIDs = [{}, {}, {}]", + cluster.replica_1 + .pid() + .expect(pid_error_msg), + cluster.replica_2 + .pid() + 
.expect(pid_error_msg), + cluster.keeper_1 + .pid() + .expect(pid_error_msg), + cluster.keeper_2 + .pid() + .expect(pid_error_msg), + cluster.keeper_3 + .pid() + .expect(pid_error_msg), + ); + println!( + "omicron-dev: ClickHouse HTTP servers listening on ports: {}, {}", + cluster.replica_1.port(), + cluster.replica_2.port() + ); + println!( + "omicron-dev: using {} and {} for ClickHouse data storage", + cluster.replica_1.data_path().display(), + cluster.replica_2.data_path().display() + ); + + // Wait for the replicas and keepers to exit themselves (an error), or for SIGINT + tokio::select! { + _ = cluster.replica_1.wait_for_shutdown() => { + cluster.replica_1.cleanup().await.context( + format!("clean up {} after shutdown", cluster.replica_1.data_path().display()) + )?; + bail!("omicron-dev: ClickHouse replica 1 shutdown unexpectedly"); + } + _ = cluster.replica_2.wait_for_shutdown() => { + cluster.replica_2.cleanup().await.context( + format!("clean up {} after shutdown", cluster.replica_2.data_path().display()) + )?; + bail!("omicron-dev: ClickHouse replica 2 shutdown unexpectedly"); + } + _ = cluster.keeper_1.wait_for_shutdown() => { + cluster.keeper_1.cleanup().await.context( + format!("clean up {} after shutdown", cluster.keeper_1.data_path().display()) + )?; + bail!("omicron-dev: ClickHouse keeper 1 shutdown unexpectedly"); + } + _ = cluster.keeper_2.wait_for_shutdown() => { + cluster.keeper_2.cleanup().await.context( + format!("clean up {} after shutdown", cluster.keeper_2.data_path().display()) + )?; + bail!("omicron-dev: ClickHouse keeper 2 shutdown unexpectedly"); + } + _ = cluster.keeper_3.wait_for_shutdown() => { + cluster.keeper_3.cleanup().await.context( + format!("clean up {} after shutdown", cluster.keeper_3.data_path().display()) + )?; + bail!("omicron-dev: ClickHouse keeper 3 shutdown unexpectedly"); + } + caught_signal = signal_stream.next() => { + assert_eq!(caught_signal.unwrap(), SIGINT); + eprintln!( + "omicron-dev: caught signal, shutting down and removing \ + temporary directories" + ); + + // Remove the data directories. + let mut instances = vec![ + cluster.replica_1, + cluster.replica_2, + cluster.keeper_1, + cluster.keeper_2, + cluster.keeper_3, + ]; + for instance in instances.iter_mut() { + instance + .wait_for_shutdown() + .await + .context(format!("clean up {} after SIGINT shutdown", instance.data_path().display()))?; + }; + } + } + Ok(()) +} + #[derive(Clone, Debug, Args)] struct RunAllArgs { /// Nexus external API listen port. Use `0` to request any available port. 
diff --git a/docs/how-to-run-simulated.adoc b/docs/how-to-run-simulated.adoc index 6b8efb0ed8..de19b70f04 100644 --- a/docs/how-to-run-simulated.adoc +++ b/docs/how-to-run-simulated.adoc @@ -159,6 +159,20 @@ $ cargo run --bin omicron-dev -- ch-run omicron-dev: running ClickHouse (PID: 2463), full command is "clickhouse server --log-file /var/folders/67/2tlym22x1r3d2kwbh84j298w0000gn/T/.tmpJ5nhot/clickhouse-server.log --errorlog-file /var/folders/67/2tlym22x1r3d2kwbh84j298w0000gn/T/.tmpJ5nhot/clickhouse-server.errlog -- --http_port 8123 --path /var/folders/67/2tlym22x1r3d2kwbh84j298w0000gn/T/.tmpJ5nhot" omicron-dev: using /var/folders/67/2tlym22x1r3d2kwbh84j298w0000gn/T/.tmpJ5nhot for ClickHouse data storage ---- ++ +If you wish to start a ClickHouse replicated cluster instead of a single node, run the following instead: +[source,text] +--- +$ cargo run --bin omicron-dev -- ch-run --replicated + Finished dev [unoptimized + debuginfo] target(s) in 0.31s + Running `target/debug/omicron-dev ch-run --replicated` +omicron-dev: running ClickHouse cluster with configuration files: + replicas: /home/{user}/src/omicron/oximeter/db/src/configs/replica_config.xml + keepers: /home/{user}/src/omicron/oximeter/db/src/configs/keeper_config.xml +omicron-dev: ClickHouse cluster is running with PIDs: 1113482, 1113681, 1113387, 1113451, 1113419 +omicron-dev: ClickHouse HTTP servers listening on ports: 8123, 8124 +omicron-dev: using /tmp/.tmpFH6v8h and /tmp/.tmpkUjDji for ClickHouse data storage +--- . `nexus` requires a configuration file to run. You can use `nexus/examples/config.toml` to start with. Build and run it like this: + diff --git a/oximeter/db/src/client.rs b/oximeter/db/src/client.rs index 69e91f888a..88b95c3764 100644 --- a/oximeter/db/src/client.rs +++ b/oximeter/db/src/client.rs @@ -1064,7 +1064,14 @@ mod tests { #[tokio::test] async fn test_replicated() { - let mut cluster = ClickHouseCluster::new() + let cur_dir = std::env::current_dir().unwrap(); + let replica_config = + cur_dir.as_path().join("src/configs/replica_config.xml"); + let cur_dir = std::env::current_dir().unwrap(); + let keeper_config = + cur_dir.as_path().join("src/configs/keeper_config.xml"); + + let mut cluster = ClickHouseCluster::new(replica_config, keeper_config) .await .expect("Failed to initialise ClickHouse Cluster"); diff --git a/test-utils/src/dev/clickhouse.rs b/test-utils/src/dev/clickhouse.rs index 6fb495627f..c35b871684 100644 --- a/test-utils/src/dev/clickhouse.rs +++ b/test-utils/src/dev/clickhouse.rs @@ -39,16 +39,6 @@ pub struct ClickHouseInstance { child: Option, } -/// A `ClickHouseCluster` is used to start and manage a 2 replica 3 keeper ClickHouse cluster. -#[derive(Debug)] -pub struct ClickHouseCluster { - pub replica_1: ClickHouseInstance, - pub replica_2: ClickHouseInstance, - pub keeper_1: ClickHouseInstance, - pub keeper_2: ClickHouseInstance, - pub keeper_3: ClickHouseInstance, -} - #[derive(Debug, Error)] pub enum ClickHouseError { #[error("Failed to open ClickHouse log file")] @@ -330,25 +320,32 @@ impl Drop for ClickHouseInstance { } } +/// A `ClickHouseCluster` is used to start and manage a 2 replica 3 keeper ClickHouse cluster. 
+#[derive(Debug)] +pub struct ClickHouseCluster { + pub replica_1: ClickHouseInstance, + pub replica_2: ClickHouseInstance, + pub keeper_1: ClickHouseInstance, + pub keeper_2: ClickHouseInstance, + pub keeper_3: ClickHouseInstance, + pub replica_config_path: PathBuf, + pub keeper_config_path: PathBuf, +} + impl ClickHouseCluster { - pub async fn new() -> Result<Self, anyhow::Error> { + pub async fn new( + replica_config: PathBuf, + keeper_config: PathBuf, + ) -> Result<Self, anyhow::Error> { // Start all Keeper coordinator nodes - let cur_dir = std::env::current_dir().unwrap(); - let keeper_config = - cur_dir.as_path().join("src/configs/keeper_config.xml"); - let keeper_amount = 3; let mut keepers = - Self::new_keeper_set(keeper_amount, keeper_config).await?; + Self::new_keeper_set(keeper_amount, &keeper_config).await?; // Start all replica nodes - let cur_dir = std::env::current_dir().unwrap(); - let replica_config = - cur_dir.as_path().join("src/configs/replica_config.xml"); - let replica_amount = 2; let mut replicas = - Self::new_replica_set(replica_amount, replica_config).await?; + Self::new_replica_set(replica_amount, &replica_config).await?; let r1 = replicas.swap_remove(0); let r2 = replicas.swap_remove(0); @@ -362,12 +359,14 @@ impl ClickHouseCluster { keeper_1: k1, keeper_2: k2, keeper_3: k3, + replica_config_path: replica_config, + keeper_config_path: keeper_config, }) } pub async fn new_keeper_set( keeper_amount: u16, - config_path: PathBuf, + config_path: &PathBuf, ) -> Result<Vec<ClickHouseInstance>, anyhow::Error> { let mut keepers = vec![]; @@ -392,7 +391,7 @@ impl ClickHouseCluster { pub async fn new_replica_set( replica_amount: u16, - config_path: PathBuf, + config_path: &PathBuf, ) -> Result<Vec<ClickHouseInstance>, anyhow::Error> { let mut replicas = vec![]; @@ -419,6 +418,14 @@ impl ClickHouseCluster { Ok(replicas) } + + pub fn replica_config_path(&self) -> &Path { + &self.replica_config_path + } + + pub fn keeper_config_path(&self) -> &Path { + &self.keeper_config_path + } } // Wait for the ClickHouse log file to become available, including the From 3445996b4f880dbaab3becdfd5461cff86a7c995 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Tue, 7 Nov 2023 15:36:46 -0500 Subject: [PATCH 08/18] Create a new version of StartSledAgentRequest (#4407) In preparation for sled-addition, a new version of `StartSledAgentRequest` was created and the old version converted to V0. Deterministic conversion from v0 to v1 is possible because we know that the `ntp_servers` and `dns_servers` fields are not in use. We also know the values of the new fields. `is_lrtq_learner` must be false because we only support RSS addition on existing sleds. We then implement a deserialize method that will return the new version regardless of what's on disk. This is similar to how we handled the bootstore upgrade for `EarlyNetworkConfig`. Also similar to that, we extended the format of `StartSledAgentRequest` to ease future deserializations. In those deserializations we can deserialize the "header" and use [serde_json::value::RawValue](https://docs.rs/serde_json/latest/serde_json/value/struct.RawValue.html) in order to defer deserialization of the `body` field until we know the `schema_version`.
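As a sketch of that scheme (not code from this patch; `Header` and `V1Body` are illustrative stand-ins for the ledgered request and `StartSledAgentRequestBody`, `RawValue` requires serde_json's `raw_value` feature, and `Uuid` the uuid crate's `serde` feature), the header is decoded first while the body stays raw:

```rust
use serde::Deserialize;
use serde_json::value::RawValue;
use uuid::Uuid;

#[derive(Deserialize)]
struct Header<'a> {
    generation: u64,
    schema_version: u32,
    // Keep the body as raw, unparsed JSON until the version is known.
    #[serde(borrow)]
    body: &'a RawValue,
}

// Illustrative stand-in for `StartSledAgentRequestBody` (one field only).
#[derive(Debug, Deserialize)]
struct V1Body {
    id: Uuid,
}

fn decode(s: &str) -> Result<(u64, V1Body), serde_json::Error> {
    let header: Header = serde_json::from_str(s)?;
    // A full implementation would match on `schema_version` and convert
    // older bodies forward; only v1 is handled in this sketch.
    assert_eq!(header.schema_version, 1);
    let body: V1Body = serde_json::from_str(header.body.get())?;
    Ok((header.generation, body))
}
```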
--- common/src/ledger.rs | 21 ++- schema/rss-sled-plan.json | 45 +++-- ...est.json => start-sled-agent-request.json} | 46 ++--- sled-agent/src/bootstrap/params.rs | 163 ++++++++++++++++-- sled-agent/src/bootstrap/server.rs | 70 +++----- sled-agent/src/rack_setup/plan/service.rs | 4 +- sled-agent/src/rack_setup/plan/sled.rs | 16 +- sled-agent/src/rack_setup/service.rs | 8 +- sled-agent/src/sled_agent.rs | 16 +- 9 files changed, 270 insertions(+), 119 deletions(-) rename schema/{persistent-sled-agent-request.json => start-sled-agent-request.json} (54%) diff --git a/common/src/ledger.rs b/common/src/ledger.rs index c07fb5e693..ae028998e2 100644 --- a/common/src/ledger.rs +++ b/common/src/ledger.rs @@ -7,7 +7,7 @@ use async_trait::async_trait; use camino::{Utf8Path, Utf8PathBuf}; use serde::{de::DeserializeOwned, Serialize}; -use slog::{debug, warn, Logger}; +use slog::{error, info, warn, Logger}; #[derive(thiserror::Error, Debug)] pub enum Error { @@ -84,8 +84,11 @@ impl<T: Ledgerable> Ledger<T> { // Read all the ledgers that we can. let mut ledgers = vec![]; for path in paths.iter() { - if let Ok(ledger) = T::read_from(log, &path).await { - ledgers.push(ledger); + match T::read_from(log, &path).await { + Ok(ledger) => ledgers.push(ledger), + Err(err) => { + error!(log, "Failed to read ledger: {err}"; "path" => %path) + } } } @@ -169,8 +172,8 @@ pub trait Ledgerable: DeserializeOwned + Serialize + Send + Sync { /// Reads from `path` as a json-serialized version of `Self`. async fn read_from(log: &Logger, path: &Utf8Path) -> Result<Self, Error> { if path.exists() { - debug!(log, "Reading ledger from {}", path); - serde_json::from_str( + info!(log, "Reading ledger from {}", path); + <Self as Ledgerable>::deserialize( &tokio::fs::read_to_string(&path) .await .map_err(|err| Error::io_path(&path, err))?, ) @@ -180,7 +183,7 @@ pub trait Ledgerable: DeserializeOwned + Serialize + Send + Sync { err, }) } else { - debug!(log, "No ledger in {path}"); + warn!(log, "No ledger in {path}"); Err(Error::NotFound) } } @@ -191,7 +194,7 @@ pub trait Ledgerable: DeserializeOwned + Serialize + Send + Sync { log: &Logger, path: &Utf8Path, ) -> Result<(), Error> { - debug!(log, "Writing ledger to {}", path); + info!(log, "Writing ledger to {}", path); let as_str = serde_json::to_string(&self).map_err(|err| { Error::JsonSerialize { path: path.to_path_buf(), err } })?; @@ -200,6 +203,10 @@ pub trait Ledgerable: DeserializeOwned + Serialize + Send + Sync { .map_err(|err| Error::io_path(&path, err))?; Ok(()) } + + fn deserialize(s: &str) -> Result<Self, serde_json::Error> { + serde_json::from_str(s) + } } #[cfg(test)] diff --git a/schema/rss-sled-plan.json b/schema/rss-sled-plan.json index 39a9a68acc..0534c79aef 100644 --- a/schema/rss-sled-plan.json +++ b/schema/rss-sled-plan.json @@ -588,33 +588,46 @@ "description": "Configuration information for launching a Sled Agent.", "type": "object", "required": [ - "dns_servers", + "body", + "generation", + "schema_version" ], "properties": { "body": { "$ref": "#/definitions/StartSledAgentRequestBody" }, "generation": { "description": "The current generation number of data as stored in CRDB.\n\nThe initial generation is set during RSS time and then only mutated by Nexus.
For now, we don't actually anticipate mutating this data, but we leave open the possiblity.", + "type": "integer", + "format": "uint64", + "minimum": 0.0 + }, + "schema_version": { + "type": "integer", + "format": "uint32", + "minimum": 0.0 + } + } + }, + "StartSledAgentRequestBody": { + "description": "This is the actual app level data of `StartSledAgentRequest`\n\nWe nest it below the \"header\" of `generation` and `schema_version` so that we can perform partial deserialization of `EarlyNetworkConfig` to only read the header and defer deserialization of the body once we know the schema version. This is possible via the use of [`serde_json::value::RawValue`] in future (post-v1) deserialization paths.", + "type": "object", + "required": [ "id", - "ntp_servers", + "is_lrtq_learner", "rack_id", "subnet", "use_trust_quorum" ], "properties": { - "dns_servers": { - "description": "The external DNS servers to use", - "type": "array", - "items": { - "type": "string", - "format": "ip" - } - }, "id": { "description": "Uuid of the Sled Agent to be created.", "type": "string", "format": "uuid" }, - "ntp_servers": { - "description": "The external NTP servers to use", - "type": "array", - "items": { - "type": "string" - } + "is_lrtq_learner": { + "description": "Is this node an LRTQ learner node?\n\nWe only put the node into learner mode if `use_trust_quorum` is also true.", + "type": "boolean" }, "rack_id": { "description": "Uuid of the rack to which this sled agent belongs.", diff --git a/schema/persistent-sled-agent-request.json b/schema/start-sled-agent-request.json similarity index 54% rename from schema/persistent-sled-agent-request.json rename to schema/start-sled-agent-request.json index 5679165c32..b03058d106 100644 --- a/schema/persistent-sled-agent-request.json +++ b/schema/start-sled-agent-request.json @@ -1,13 +1,27 @@ { "$schema": "http://json-schema.org/draft-07/schema#", - "title": "PersistentSledAgentRequest", + "title": "StartSledAgentRequest", + "description": "Configuration information for launching a Sled Agent.", "type": "object", "required": [ - "request" + "body", + "generation", + "schema_version" ], "properties": { - "request": { - "$ref": "#/definitions/StartSledAgentRequest" + "body": { + "$ref": "#/definitions/StartSledAgentRequestBody" + }, + "generation": { + "description": "The current generation number of data as stored in CRDB.\n\nThe initial generation is set during RSS time and then only mutated by Nexus. For now, we don't actually anticipate mutating this data, but we leave open the possiblity.", + "type": "integer", + "format": "uint64", + "minimum": 0.0 + }, + "schema_version": { + "type": "integer", + "format": "uint32", + "minimum": 0.0 } }, "definitions": { @@ -32,37 +46,25 @@ } } }, - "StartSledAgentRequest": { - "description": "Configuration information for launching a Sled Agent.", + "StartSledAgentRequestBody": { + "description": "This is the actual app level data of `StartSledAgentRequest`\n\nWe nest it below the \"header\" of `generation` and `schema_version` so that we can perform partial deserialization of `EarlyNetworkConfig` to only read the header and defer deserialization of the body once we know the schema version. 
This is possible via the use of [`serde_json::value::RawValue`] in future (post-v1) deserialization paths.", "type": "object", "required": [ - "dns_servers", "id", - "ntp_servers", + "is_lrtq_learner", "rack_id", "subnet", "use_trust_quorum" ], "properties": { - "dns_servers": { - "description": "The external DNS servers to use", - "type": "array", - "items": { - "type": "string", - "format": "ip" - } - }, "id": { "description": "Uuid of the Sled Agent to be created.", "type": "string", "format": "uuid" }, - "ntp_servers": { - "description": "The external NTP servers to use", - "type": "array", - "items": { - "type": "string" - } + "is_lrtq_learner": { + "description": "Is this node an LRTQ learner node?\n\nWe only put the node into learner mode if `use_trust_quorum` is also true.", + "type": "boolean" }, "rack_id": { "description": "Uuid of the rack to which this sled agent belongs.", diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index 4983383470..cef7bb13bb 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -5,8 +5,10 @@ //! Request types for the bootstrap agent use anyhow::{bail, Result}; +use async_trait::async_trait; use omicron_common::address::{self, Ipv6Subnet, SLED_PREFIX}; use omicron_common::api::internal::shared::RackNetworkConfig; +use omicron_common::ledger::Ledgerable; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use sha3::{Digest, Sha3_256}; @@ -172,9 +174,16 @@ impl TryFrom for RackInitializeRequest { pub type Certificate = nexus_client::types::Certificate; pub type RecoverySiloConfig = nexus_client::types::RecoverySiloConfig; -/// Configuration information for launching a Sled Agent. +// A wrapper around StartSledAgentRequestV0 that was used +// for the ledger format. #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] -pub struct StartSledAgentRequest { +struct PersistentSledAgentRequest { + request: StartSledAgentRequestV0, +} + +/// The version of `StartSledAgentRequest` we originally shipped with. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] +pub struct StartSledAgentRequestV0 { /// Uuid of the Sled Agent to be created. pub id: Uuid, @@ -197,13 +206,62 @@ pub struct StartSledAgentRequest { pub subnet: Ipv6Subnet, } +/// Configuration information for launching a Sled Agent. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] +pub struct StartSledAgentRequest { + /// The current generation number of data as stored in CRDB. + /// + /// The initial generation is set during RSS time and then only mutated + /// by Nexus. For now, we don't actually anticipate mutating this data, + /// but we leave open the possiblity. + pub generation: u64, + + // Which version of the data structure do we have. This is to help with + // deserialization and conversion in future updates. + pub schema_version: u32, + + // The actual configuration details + pub body: StartSledAgentRequestBody, +} + +/// This is the actual app level data of `StartSledAgentRequest` +/// +/// We nest it below the "header" of `generation` and `schema_version` so that +/// we can perform partial deserialization of `EarlyNetworkConfig` to only read +/// the header and defer deserialization of the body once we know the schema +/// version. This is possible via the use of [`serde_json::value::RawValue`] in +/// future (post-v1) deserialization paths. 
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] +pub struct StartSledAgentRequestBody { + /// Uuid of the Sled Agent to be created. + pub id: Uuid, + + /// Uuid of the rack to which this sled agent belongs. + pub rack_id: Uuid, + + /// Use trust quorum for key generation + pub use_trust_quorum: bool, + + /// Is this node an LRTQ learner node? + /// + /// We only put the node into learner mode if `use_trust_quorum` is also + /// true. + pub is_lrtq_learner: bool, + + // Note: The order of these fields is load bearing, because we serialize + // `SledAgentRequest`s as toml. `subnet` serializes as a TOML table, so it + // must come after non-table fields. + /// Portion of the IP space to be managed by the Sled Agent. + pub subnet: Ipv6Subnet<SLED_PREFIX>, +} + impl StartSledAgentRequest { pub fn sled_address(&self) -> SocketAddrV6 { - address::get_sled_address(self.subnet) + address::get_sled_address(self.body.subnet) } pub fn switch_zone_ip(&self) -> Ipv6Addr { - address::get_switch_zone_address(self.subnet) + address::get_switch_zone_address(self.body.subnet) } /// Compute the sha3_256 digest of `self.rack_id` to use as a `salt` @@ -212,7 +270,57 @@ impl StartSledAgentRequest { /// between sleds. pub fn hash_rack_id(&self) -> [u8; 32] { // We know the unwrap succeeds as a Sha3_256 digest is 32 bytes - Sha3_256::digest(self.rack_id.as_bytes()).as_slice().try_into().unwrap() + Sha3_256::digest(self.body.rack_id.as_bytes()) + .as_slice() + .try_into() + .unwrap() + } +} + +impl From<StartSledAgentRequestV0> for StartSledAgentRequest { + fn from(v0: StartSledAgentRequestV0) -> Self { + StartSledAgentRequest { + generation: 0, + schema_version: 1, + body: StartSledAgentRequestBody { + id: v0.id, + rack_id: v0.rack_id, + use_trust_quorum: v0.use_trust_quorum, + is_lrtq_learner: false, + subnet: v0.subnet, + }, + } + } +} + +#[async_trait] +impl Ledgerable for StartSledAgentRequest { + fn is_newer_than(&self, other: &Self) -> bool { + self.generation > other.generation + } + + fn generation_bump(&mut self) { + // DO NOTHING! + // + // Generation bumps must only ever come from nexus and will be encoded + // in the struct itself + } + + // Attempt to deserialize the v1 or v0 version and return + // the v1 version. + fn deserialize( + s: &str, + ) -> Result<Self, serde_json::Error> { + // Try to deserialize the latest version of the data structure (v1). If + // that succeeds we are done. + if let Ok(val) = serde_json::from_str::<Self>(s) { + return Ok(val); + } + + // We don't have the latest version. Try to deserialize v0 and then + // convert it to the latest version.
+ let v0 = serde_json::from_str::<PersistentSledAgentRequest>(s)?.request; + Ok(v0.into()) } } @@ -291,12 +399,15 @@ mod tests { version: 1, request: Request::StartSledAgentRequest(Cow::Owned( StartSledAgentRequest { - id: Uuid::new_v4(), - rack_id: Uuid::new_v4(), - ntp_servers: vec![String::from("test.pool.example.com")], - dns_servers: vec!["1.1.1.1".parse().unwrap()], - use_trust_quorum: false, - subnet: Ipv6Subnet::new(Ipv6Addr::LOCALHOST), + generation: 0, + schema_version: 1, + body: StartSledAgentRequestBody { + id: Uuid::new_v4(), + rack_id: Uuid::new_v4(), + use_trust_quorum: false, + is_lrtq_learner: false, + subnet: Ipv6Subnet::new(Ipv6Addr::LOCALHOST), + }, }, )), }; @@ -308,6 +419,36 @@ mod tests { assert!(envelope == deserialized, "serialization round trip failed"); } + #[test] + fn serialize_start_sled_agent_v0_deserialize_v1() { + let v0 = PersistentSledAgentRequest { + request: StartSledAgentRequestV0 { + id: Uuid::new_v4(), + rack_id: Uuid::new_v4(), + ntp_servers: vec![String::from("test.pool.example.com")], + dns_servers: vec!["1.1.1.1".parse().unwrap()], + use_trust_quorum: false, + subnet: Ipv6Subnet::new(Ipv6Addr::LOCALHOST), + }, + }; + let serialized = serde_json::to_string(&v0).unwrap(); + let expected = StartSledAgentRequest { + generation: 0, + schema_version: 1, + body: StartSledAgentRequestBody { + id: v0.request.id, + rack_id: v0.request.rack_id, + use_trust_quorum: v0.request.use_trust_quorum, + is_lrtq_learner: false, + subnet: v0.request.subnet, + }, + }; + + let actual: StartSledAgentRequest = + Ledgerable::deserialize(&serialized).unwrap(); + assert_eq!(expected, actual); + } + #[test] fn validate_external_dns_ips_must_be_in_internal_services_ip_pools() { // Conjure up a config; we'll tweak the internal services pools and diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 9ed3ad582d..242cfdd1c7 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -41,14 +41,9 @@ use illumos_utils::zone; use illumos_utils::zone::Zones; use omicron_common::ledger; use omicron_common::ledger::Ledger; -use omicron_common::ledger::Ledgerable; -use schemars::JsonSchema; -use serde::Deserialize; -use serde::Serialize; use sled_hardware::underlay; use sled_hardware::HardwareUpdate; use slog::Logger; -use std::borrow::Cow; use std::io; use std::net::SocketAddr; use std::net::SocketAddrV6; @@ -225,11 +220,8 @@ impl Server { async { let paths = sled_config_paths(storage_resources).await?; let maybe_ledger = - Ledger::<PersistentSledAgentRequest<'static>>::new( - &startup_log, - paths, - ) - .await; + Ledger::<StartSledAgentRequest>::new(&startup_log, paths) + .await; Ok::<_, StartError>(maybe_ledger) }, &mut hardware_monitor, ) @@ -288,11 +280,11 @@ impl Server { // Do we have a persistent sled-agent request that we need to restore? let state = if let Some(ledger) = maybe_ledger { - let sled_request = ledger.data(); + let start_sled_agent_request = ledger.into_inner(); let sled_agent_server = wait_while_handling_hardware_updates( start_sled_agent( &config, - &sled_request.request, + start_sled_agent_request, &bootstore_handles.node_handle, &managers, &ddm_admin_localhost_client, @@ -426,7 +418,7 @@ impl From<SledAgentServerStartError> for StartError { async fn start_sled_agent( config: &SledConfig, - request: &StartSledAgentRequest, + request: StartSledAgentRequest, bootstore: &bootstore::NodeHandle, managers: &BootstrapManagers, ddmd_client: &DdmAdminClient, @@ -440,7 +432,7 @@ async fn start_sled_agent( // storage manager about keys, advertising prefixes, ...).
// Initialize the secret retriever used by the `KeyManager` - if request.use_trust_quorum { + if request.body.use_trust_quorum { info!(log, "KeyManager: using lrtq secret retriever"); let salt = request.hash_rack_id(); LrtqOrHardcodedSecretRetriever::init_lrtq(salt, bootstore.clone()) @@ -463,7 +455,7 @@ async fn start_sled_agent( // we'll need to do something different here for underlay vs bootstrap // addrs (either talk to a differently-configured ddmd, or include info // indicating which kind of address we're advertising). - ddmd_client.advertise_prefix(request.subnet); + ddmd_client.advertise_prefix(request.body.subnet); // Server does not exist, initialize it. let server = SledAgentServer::start( @@ -483,11 +475,7 @@ async fn start_sled_agent( // initialized on the next boot. let paths = sled_config_paths(managers.storage.resources()).await?; - let mut ledger = Ledger::new_with( - &log, - paths, - PersistentSledAgentRequest { request: Cow::Borrowed(request) }, - ); + let mut ledger = Ledger::new_with(&log, paths, request); ledger.commit().await?; Ok(server) @@ -611,18 +599,6 @@ async fn wait_while_handling_hardware_updates, T>( } } -#[derive(Clone, Serialize, Deserialize, PartialEq, JsonSchema)] -struct PersistentSledAgentRequest<'a> { - request: Cow<'a, StartSledAgentRequest>, -} - -impl<'a> Ledgerable for PersistentSledAgentRequest<'a> { - fn is_newer_than(&self, _other: &Self) -> bool { - true - } - fn generation_bump(&mut self) {} -} - /// Runs the OpenAPI generator, emitting the spec to stdout. pub fn run_openapi() -> Result<(), String> { http_api() @@ -717,11 +693,12 @@ impl Inner { response_tx: oneshot::Sender>, log: &Logger, ) { + let request_id = request.body.id; match &self.state { SledAgentState::Bootstrapping => { let response = match start_sled_agent( &self.config, - &request, + request, &self.bootstore_handles.node_handle, &self.managers, &self.ddm_admin_localhost_client, @@ -742,7 +719,7 @@ impl Inner { .await; self.state = SledAgentState::ServerStarted(server); - Ok(SledAgentResponse { id: request.id }) + Ok(SledAgentResponse { id: request_id }) } Err(err) => Err(format!("{err:#}")), }; @@ -752,12 +729,12 @@ impl Inner { info!(log, "Sled Agent already loaded"); let sled_address = request.sled_address(); - let response = if server.id() != request.id { + let response = if server.id() != request_id { Err(format!( "Sled Agent already running with UUID {}, \ but {} was requested", server.id(), - request.id, + request_id, )) } else if &server.address().ip() != sled_address.ip() { Err(format!( @@ -873,6 +850,8 @@ impl Inner { #[cfg(test)] mod tests { + use crate::bootstrap::params::StartSledAgentRequestBody; + use super::*; use omicron_common::address::Ipv6Subnet; use omicron_test_utils::dev::test_setup_log; @@ -880,20 +859,21 @@ mod tests { use uuid::Uuid; #[tokio::test] - async fn persistent_sled_agent_request_serialization() { + async fn start_sled_agent_request_serialization() { let logctx = test_setup_log("persistent_sled_agent_request_serialization"); let log = &logctx.log; - let request = PersistentSledAgentRequest { - request: Cow::Owned(StartSledAgentRequest { + let request = StartSledAgentRequest { + generation: 0, + schema_version: 1, + body: StartSledAgentRequestBody { id: Uuid::new_v4(), rack_id: Uuid::new_v4(), - ntp_servers: vec![String::from("test.pool.example.com")], - dns_servers: vec!["1.1.1.1".parse().unwrap()], use_trust_quorum: false, + is_lrtq_learner: false, subnet: Ipv6Subnet::new(Ipv6Addr::LOCALHOST), - }), + }, }; let tempdir = 
camino_tempfile::Utf8TempDir::new().unwrap(); @@ -902,7 +882,7 @@ mod tests { let mut ledger = Ledger::new_with(log, paths.clone(), request.clone()); ledger.commit().await.expect("Failed to write to ledger"); - let ledger = Ledger::::new(log, paths) + let ledger = Ledger::::new(log, paths) .await .expect("Failed to read request"); @@ -912,9 +892,9 @@ mod tests { #[test] fn test_persistent_sled_agent_request_schema() { - let schema = schemars::schema_for!(PersistentSledAgentRequest<'_>); + let schema = schemars::schema_for!(StartSledAgentRequest); expectorate::assert_contents( - "../schema/persistent-sled-agent-request.json", + "../schema/start-sled-agent-request.json", &serde_json::to_string_pretty(&schema).unwrap(), ); } diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 3dac5d7d1e..8cd815e7fb 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -249,7 +249,7 @@ impl Plan { let result: Result, PlanError> = futures::future::try_join_all(sleds.values().map( |sled_request| async { - let subnet = sled_request.subnet; + let subnet = sled_request.body.subnet; let sled_address = get_sled_address(subnet); let u2_zpools = Self::get_u2_zpools_from_sled(log, sled_address) @@ -257,7 +257,7 @@ impl Plan { let is_scrimlet = Self::is_sled_scrimlet(log, sled_address).await?; Ok(SledInfo::new( - sled_request.id, + sled_request.body.id, subnet, sled_address, u2_zpools, diff --git a/sled-agent/src/rack_setup/plan/sled.rs b/sled-agent/src/rack_setup/plan/sled.rs index ea12f0db32..163b24cd45 100644 --- a/sled-agent/src/rack_setup/plan/sled.rs +++ b/sled-agent/src/rack_setup/plan/sled.rs @@ -4,6 +4,7 @@ //! Plan generation for "how should sleds be initialized". +use crate::bootstrap::params::StartSledAgentRequestBody; use crate::bootstrap::{ config::BOOTSTRAP_AGENT_RACK_INIT_PORT, params::StartSledAgentRequest, }; @@ -99,12 +100,15 @@ impl Plan { ( bootstrap_addr, StartSledAgentRequest { - id: Uuid::new_v4(), - subnet, - ntp_servers: config.ntp_servers.clone(), - dns_servers: config.dns_servers.clone(), - use_trust_quorum, - rack_id, + generation: 0, + schema_version: 1, + body: StartSledAgentRequestBody { + id: Uuid::new_v4(), + subnet, + use_trust_quorum, + is_lrtq_learner: false, + rack_id, + }, }, ) }); diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 7f6469d2c0..5657c7e69a 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -535,8 +535,10 @@ impl ServiceInner { // We need the ID when passing info to Nexus. let mut id_map = HashMap::new(); for (_, sled_request) in sled_plan.sleds.iter() { - id_map - .insert(get_sled_address(sled_request.subnet), sled_request.id); + id_map.insert( + get_sled_address(sled_request.body.subnet), + sled_request.body.id, + ); } // Convert all the information we have about services and datasets into @@ -925,7 +927,7 @@ impl ServiceInner { .sleds .values() .map(|initialization_request| { - get_sled_address(initialization_request.subnet) + get_sled_address(initialization_request.body.subnet) }) .collect(); let service_plan = if let Some(plan) = diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 6655a732a0..b29c7e1af4 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -278,7 +278,7 @@ impl SledAgent { // Use "log" for ourself. 
let log = log.new(o!( "component" => "SledAgent", "sled_id" => request.body.id.to_string(), )); info!(&log, "SledAgent::new(..) starting"); @@ -347,7 +347,7 @@ impl SledAgent { storage .setup_underlay_access(storage_manager::UnderlayAccess { nexus_client: nexus_client.clone(), - sled_id: request.id, + sled_id: request.body.id, }) .await?; @@ -391,8 +391,10 @@ impl SledAgent { }; let updates = UpdateManager::new(update_config); - let svc_config = - services::Config::new(request.id, config.sidecar_revision.clone()); + let svc_config = services::Config::new( + request.body.id, + config.sidecar_revision.clone(), + ); // Get our rack network config from the bootstore; we cannot proceed // until we have this, as we need to know which switches have uplinks to @@ -437,15 +439,15 @@ impl SledAgent { svc_config, port_manager.clone(), *sled_address.ip(), - request.rack_id, + request.body.rack_id, rack_network_config.clone(), )?; let zone_bundler = storage.zone_bundler().clone(); let sled_agent = SledAgent { inner: Arc::new(SledAgentInner { - id: request.id, - subnet: request.subnet, + id: request.body.id, + subnet: request.body.subnet, storage, instances, hardware, From 3db5ae80f856c8136e8becf15cbebd444ddfb25a Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Tue, 7 Nov 2023 20:43:27 -0500 Subject: [PATCH 09/18] Implement sled-agent API for adding a sled (#4415) This code largely reuses the existing code paths for starting sled-agent with a small change to configure the bootstore as a learner node if trust quorum is in use. This code path does not attempt any retries, as it should be idempotent as long as the same sled and rack UUIDs are used. This is a command driven by an operator and we want to know right away if it succeeded or not. Multiple sleds can be added by Nexus with individual calls, so that individual errors can be returned without added complexity at the sled-agent level. These requests can also be sent to different sled-agents, since the sled-agent is just acting as a proxy to a remote bootstrap agent.
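For reference, a request to the new endpoint would be shaped roughly like the sketch below, following the `AddSledRequest` schema in the openapi diff that follows; every identifier, serial, and subnet here is a placeholder:

```rust
use serde_json::json;

// Hypothetical `PUT /sleds` body; all IDs, serials, and addresses are
// placeholders, shaped per the `AddSledRequest` schema.
fn example_add_sled_request() -> serde_json::Value {
    json!({
        "sled_id": {
            "type": "gimlet",
            "identifier": "SERIAL-PLACEHOLDER",
            "model": "MODEL-PLACEHOLDER",
            "revision": 0
        },
        "start_request": {
            "generation": 0,
            "schema_version": 1,
            "body": {
                "id": "00000000-0000-0000-0000-000000000000",
                "rack_id": "00000000-0000-0000-0000-000000000000",
                "use_trust_quorum": true,
                // New sleds must be learners when trust quorum is in use,
                // matching the validation in `add_sled_to_initialized_rack`.
                "is_lrtq_learner": true,
                "subnet": { "net": "fd00:1122:3344:0101::/64" }
            }
        }
    })
}
```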
--- openapi/sled-agent.json | 187 +++++++++++++++++++++++++++++ sled-agent/src/bootstrap/params.rs | 7 ++ sled-agent/src/bootstrap/server.rs | 34 +++--- sled-agent/src/http_entrypoints.rs | 43 +++++++ sled-agent/src/sled_agent.rs | 104 +++++++++++++++- 5 files changed, 360 insertions(+), 15 deletions(-) diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 486662853c..20cadd3054 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -387,6 +387,33 @@ } } }, + "/sleds": { + "put": { + "summary": "Add a sled to a rack that was already initialized via RSS", + "operationId": "add_sled_to_initialized_rack", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/AddSledRequest" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/switch-ports": { "post": { "operationId": "uplink_ensure", @@ -938,6 +965,90 @@ } }, "schemas": { + "AddSledRequest": { + "description": "A request to Add a given sled after rack initialization has occurred", + "type": "object", + "properties": { + "sled_id": { + "$ref": "#/components/schemas/Baseboard" + }, + "start_request": { + "$ref": "#/components/schemas/StartSledAgentRequest" + } + }, + "required": [ + "sled_id", + "start_request" + ] + }, + "Baseboard": { + "description": "Describes properties that should uniquely identify a Gimlet.", + "oneOf": [ + { + "type": "object", + "properties": { + "identifier": { + "type": "string" + }, + "model": { + "type": "string" + }, + "revision": { + "type": "integer", + "format": "int64" + }, + "type": { + "type": "string", + "enum": [ + "gimlet" + ] + } + }, + "required": [ + "identifier", + "model", + "revision", + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "unknown" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "identifier": { + "type": "string" + }, + "model": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "pc" + ] + } + }, + "required": [ + "identifier", + "model", + "type" + ] + } + ] + }, "BgpConfig": { "type": "object", "properties": { @@ -2365,6 +2476,18 @@ "type": "string", "pattern": "^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\")[/](12[0-8]|1[0-1][0-9]|[0-9]?[0-9])$" }, + "Ipv6Subnet": { + "description": "Wraps an [`Ipv6Network`] with a compile-time prefix length.", + "type": "object", + "properties": { + "net": { + "$ref": "#/components/schemas/Ipv6Net" + } + }, + "required": [ + "net" + ] + }, "KnownArtifactKind": { "description": "Kinds of update artifacts, as used by Nexus to determine what updates are available and by sled-agent to determine how to apply an update when asked.", "type": "string", @@ -3184,6 +3307,70 @@ 
"last_port" ] }, + "StartSledAgentRequest": { + "description": "Configuration information for launching a Sled Agent.", + "type": "object", + "properties": { + "body": { + "$ref": "#/components/schemas/StartSledAgentRequestBody" + }, + "generation": { + "description": "The current generation number of data as stored in CRDB.\n\nThe initial generation is set during RSS time and then only mutated by Nexus. For now, we don't actually anticipate mutating this data, but we leave open the possiblity.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "schema_version": { + "type": "integer", + "format": "uint32", + "minimum": 0 + } + }, + "required": [ + "body", + "generation", + "schema_version" + ] + }, + "StartSledAgentRequestBody": { + "description": "This is the actual app level data of `StartSledAgentRequest`\n\nWe nest it below the \"header\" of `generation` and `schema_version` so that we can perform partial deserialization of `EarlyNetworkConfig` to only read the header and defer deserialization of the body once we know the schema version. This is possible via the use of [`serde_json::value::RawValue`] in future (post-v1) deserialization paths.", + "type": "object", + "properties": { + "id": { + "description": "Uuid of the Sled Agent to be created.", + "type": "string", + "format": "uuid" + }, + "is_lrtq_learner": { + "description": "Is this node an LRTQ learner node?\n\nWe only put the node into learner mode if `use_trust_quorum` is also true.", + "type": "boolean" + }, + "rack_id": { + "description": "Uuid of the rack to which this sled agent belongs.", + "type": "string", + "format": "uuid" + }, + "subnet": { + "description": "Portion of the IP space to be managed by the Sled Agent.", + "allOf": [ + { + "$ref": "#/components/schemas/Ipv6Subnet" + } + ] + }, + "use_trust_quorum": { + "description": "Use trust quorum for key generation", + "type": "boolean" + } + }, + "required": [ + "id", + "is_lrtq_learner", + "rack_id", + "subnet", + "use_trust_quorum" + ] + }, "StorageLimit": { "description": "The limit on space allowed for zone bundles, as a percentage of the overall dataset's quota.", "type": "integer", diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index cef7bb13bb..ab85915dc1 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -174,6 +174,13 @@ impl TryFrom for RackInitializeRequest { pub type Certificate = nexus_client::types::Certificate; pub type RecoverySiloConfig = nexus_client::types::RecoverySiloConfig; +/// A request to Add a given sled after rack initialization has occurred +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] +pub struct AddSledRequest { + pub sled_id: Baseboard, + pub start_request: StartSledAgentRequest, +} + // A wrapper around StartSledAgentRequestV0 that was used // for the ledger format. #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 242cfdd1c7..e57e8318e4 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -148,6 +148,9 @@ pub enum StartError { #[error("Failed to bind sprocket server")] BindSprocketsServer(#[source] io::Error), + + #[error("Failed to initialize lrtq node as learner: {0}")] + FailedLearnerInit(bootstore::NodeRequestError), } /// Server for the bootstrap agent. 
@@ -398,6 +401,9 @@ pub enum SledAgentServerStartError { #[error("Failed to commit sled agent request to ledger")] CommitToLedger(#[from] ledger::Error), + + #[error("Failed to initialize this lrtq node as a learner: {0}")] + FailedLearnerInit(#[from] bootstore::NodeRequestError), } impl From<SledAgentServerStartError> for StartError { @@ -412,6 +418,9 @@ impl From<SledAgentServerStartError> for StartError { SledAgentServerStartError::CommitToLedger(err) => { Self::CommitToLedger(err) } + SledAgentServerStartError::FailedLearnerInit(err) => { + Self::FailedLearnerInit(err) + } } } } @@ -441,6 +450,11 @@ async fn start_sled_agent( LrtqOrHardcodedSecretRetriever::init_hardcoded(); } + if request.body.use_trust_quorum && request.body.is_lrtq_learner { + info!(log, "Initializing sled as learner"); + bootstore.init_learner().await?; + } + // Inform the storage service that the key manager is available managers.storage.key_manager_ready().await; @@ -727,21 +741,13 @@ impl Inner { } SledAgentState::ServerStarted(server) => { info!(log, "Sled Agent already loaded"); - - let sled_address = request.sled_address(); - let response = if server.id() != request_id { - Err(format!( - "Sled Agent already running with UUID {}, \ - but {} was requested", - server.id(), - request_id, - )) - } else if &server.address().ip() != sled_address.ip() { + let initial = server.sled_agent().start_request(); + let response = if initial != &request { Err(format!( - "Sled Agent already running on address {}, \ - but {} was requested", - server.address().ip(), - sled_address.ip(), + "Sled Agent already running: initial request = {:?}, current request = {:?}", + initial, request )) } else { Ok(SledAgentResponse { id: server.id() }) diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 68330d0c0e..92149b6475 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -6,6 +6,7 @@ use super::sled_agent::SledAgent; use crate::bootstrap::early_networking::EarlyNetworkConfig; +use crate::bootstrap::params::AddSledRequest; use crate::params::{ CleanupContextUpdate, DiskEnsureBody, InstanceEnsureBody, InstancePutMigrationIdsBody, InstancePutStateBody, @@ -68,6 +69,7 @@ pub fn api() -> SledApiDescription { api.register(uplink_ensure)?; api.register(read_network_bootstore_config_cache)?; api.register(write_network_bootstore_config)?; + api.register(add_sled_to_initialized_rack)?; Ok(()) } @@ -706,3 +708,44 @@ async fn write_network_bootstore_config( Ok(HttpResponseUpdatedNoContent()) } + +/// Add a sled to a rack that was already initialized via RSS +#[endpoint { + method = PUT, + path = "/sleds" +}] +async fn add_sled_to_initialized_rack( + rqctx: RequestContext<SledAgent>, + body: TypedBody<AddSledRequest>, +) -> Result<HttpResponseUpdatedNoContent, HttpError> { + let sa = rqctx.context(); + let request = body.into_inner(); + + // Perform some minimal validation + if request.start_request.body.use_trust_quorum + && !request.start_request.body.is_lrtq_learner + { + return Err(HttpError::for_bad_request( + None, + "New sleds must be LRTQ learners if trust quorum is in use" + .to_string(), + )); + } + + crate::sled_agent::add_sled_to_initialized_rack( + sa.logger().clone(), + request.sled_id, + request.start_request, + ) + .await + .map_err(|e| { + let message = format!("Failed to add sled to rack cluster: {e}"); + HttpError { + status_code: http::StatusCode::INTERNAL_SERVER_ERROR, + error_code: None, + external_message: message.clone(), + internal_message: message, + } + })?; + Ok(HttpResponseUpdatedNoContent()) +} diff --git a/sled-agent/src/sled_agent.rs
b/sled-agent/src/sled_agent.rs index b29c7e1af4..08569c6529 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -4,6 +4,7 @@ //! Sled agent implementation +use crate::bootstrap::config::BOOTSTRAP_AGENT_RACK_INIT_PORT; use crate::bootstrap::early_networking::{ EarlyNetworkConfig, EarlyNetworkSetupError, }; @@ -24,7 +25,11 @@ use crate::zone_bundle; use crate::zone_bundle::BundleError; use bootstore::schemes::v0 as bootstore; use camino::Utf8PathBuf; +use ddm_admin_client::Client as DdmAdminClient; +use derive_more::From; use dropshot::HttpError; +use futures::stream::FuturesUnordered; +use futures::StreamExt; use illumos_utils::opte::params::{ DeleteVirtualNetworkInterfaceHost, SetVirtualNetworkInterfaceHost, }; @@ -49,8 +54,8 @@ use omicron_common::backoff::{ retry_notify, retry_notify_ext, retry_policy_internal_service_aggressive, BackoffError, }; -use sled_hardware::underlay; use sled_hardware::HardwareManager; +use sled_hardware::{underlay, underlay::BootstrapInterface, Baseboard}; use slog::Logger; use std::collections::BTreeMap; use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; @@ -210,6 +215,10 @@ struct SledAgentInner { // The Sled Agent's address can be derived from this value. subnet: Ipv6Subnet, + // The request that was used to start the sled-agent + // This is used for idempotence checks during RSS/Add-Sled internal APIs + start_request: StartSledAgentRequest, + // Component of Sled Agent responsible for storage and dataset management. storage: StorageManager, @@ -448,6 +457,7 @@ impl SledAgent { inner: Arc::new(SledAgentInner { id: request.body.id, subnet: request.body.subnet, + start_request: request, storage, instances, hardware, @@ -524,6 +534,10 @@ impl SledAgent { &self.log } + pub fn start_request(&self) -> &StartSledAgentRequest { + &self.inner.start_request + } + // Sends a request to Nexus informing it that the current sled exists. 
pub(crate) fn notify_nexus_about_self(&self, log: &Logger) { let sled_id = self.inner.id; @@ -942,3 +956,91 @@ impl SledAgent { self.inner.bootstore.clone() } } + +#[derive(From, thiserror::Error, Debug)] +pub enum AddSledError { + #[error("Failed to learn bootstrap ip for {sled_id}")] + BootstrapAgentClient { + sled_id: Baseboard, + #[source] + err: bootstrap_agent_client::Error, + }, + #[error("Failed to connect to DDM")] + DdmAdminClient(#[source] ddm_admin_client::DdmError), + #[error("Failed to learn bootstrap ip for {0}")] + NotFound(Baseboard), + #[error("Failed to initialize {sled_id}: {err}")] + BootstrapTcpClient { + sled_id: Baseboard, + err: crate::bootstrap::client::Error, + }, +} + +/// Add a sled to an initialized rack. +pub async fn add_sled_to_initialized_rack( + log: Logger, + sled_id: Baseboard, + request: StartSledAgentRequest, +) -> Result<(), AddSledError> { + // Get all known bootstrap addresses via DDM + let ddm_admin_client = DdmAdminClient::localhost(&log)?; + let addrs = ddm_admin_client + .derive_bootstrap_addrs_from_prefixes(&[BootstrapInterface::GlobalZone]) + .await?; + + // Create a set of futures to concurrently map the baseboard to bootstrap ip + // for each sled + let mut addrs_to_sleds = addrs + .map(|ip| { + let log = log.clone(); + async move { + let client = bootstrap_agent_client::Client::new( + &format!("http://[{ip}]"), + log, + ); + let result = client.baseboard_get().await; + + (ip, result) + } + }) + .collect::<FuturesUnordered<_>>(); + + // Execute the futures until we find our matching sled or are done searching + let mut target_ip = None; + while let Some((ip, result)) = addrs_to_sleds.next().await { + match result { + Ok(baseboard) => { + // Convert from progenitor type back to `sled-hardware` + // type. + let found = baseboard.into_inner().into(); + if sled_id == found { + target_ip = Some(ip); + break; + } + } + Err(err) => { + warn!( + log, "Failed to get baseboard for {ip}"; + "err" => #%err, + ); + } + } + } + + // Contact the sled and initialize it + let bootstrap_addr = + target_ip.ok_or_else(|| AddSledError::NotFound(sled_id.clone()))?; + let bootstrap_addr = + SocketAddrV6::new(bootstrap_addr, BOOTSTRAP_AGENT_RACK_INIT_PORT, 0, 0); + let client = crate::bootstrap::client::Client::new( + bootstrap_addr, + log.new(o!("BootstrapAgentClient" => bootstrap_addr.to_string())), + ); + + client.start_sled_agent(&request).await.map_err(|err| { + AddSledError::BootstrapTcpClient { sled_id: sled_id.clone(), err } + })?; + + info!(log, "Peer agent initialized"; "peer_bootstrap_addr" => %bootstrap_addr, "peer_id" => %sled_id); + Ok(()) +}

From 88b5a1d82e6fc7ab2624e30dc59a297cbb6a10e8 Mon Sep 17 00:00:00 2001
From: Rain
Date: Tue, 7 Nov 2023 20:17:27 -0800
Subject: [PATCH 10/18] [wicket] command-line interface to mupdate (#4062)

## Overview

Implement a command-line interface to mupdates, with an eye to using this in CI. To achieve this, add two commands to wicket:

### `rack-update start`

This command starts mupdates on one or more components.
The help looks like:

```
Start one or more running updates

Usage: wicket rack-update start [OPTIONS]

Options:
  -h, --help  Print help (see more with '--help')

Component selectors:
      --sled <SLED>      The sleds to operate on
      --switch <SWITCH>  The switches to operate on
      --psc <PSC>        The PSCs to operate on

Update options:
      --force-update-rot  Force update the RoT even if the version is the same
      --force-update-sp   Force update the SP even if the version is the same
  -d, --detach            Detach after starting the update

Global options:
      --color <COLOR>  Color output [default: auto] [possible values: auto, always, never]
```

### `rack-update attach`

This command attaches to any existing mupdates on one or more components. The help looks like:

```
Attach to one or more running updates

Usage: wicket rack-update attach [OPTIONS]

Options:
  -h, --help  Print help (see more with '--help')

Component selectors:
      --sled <SLED>      The sleds to operate on
      --switch <SWITCH>  The switches to operate on
      --psc <PSC>        The PSCs to operate on

Global options:
      --color <COLOR>  Color output [default: auto] [possible values: auto, always, never]
```

### What does this look like?

Here's a screenshot:

![image](https://github.com/oxidecomputer/omicron/assets/180618/11cad6ec-cc29-48d8-ba74-c0b3e63adc9d)

Buildomat doesn't yet support ANSI escape codes (https://github.com/oxidecomputer/buildomat/issues/40), but if we add support for them then we'd get full color support. Otherwise, it'll look like this without colors.

## Implementation

This is a somewhat large PR, but it is almost all added code. The bulk of the added code is in the update-engine, which has two new displayers:

### 1. Line display

This displayer shows events line by line. Since this is meant to be used in CI, we have a hard constraint that we can't go back and change old lines.

### 2. Group display

This displayer shows events across several possible executions. This is just a slightly more involved version of the line display, and shares most of its code (via `line_display_shared.rs`).

The rest of this PR:

* Hooks up a group displayer to wicket, which is pretty straightforward.
* Adds some more to the `update-engine-basic` example.

## Testing this

The wicket side can be tested with:

```
SSH_ORIGINAL_COMMAND='rack-update start --sled 1' cargo run -p wicket
```

For a more comprehensive example, run `cargo run --example update-engine-basic -- --display-style group`.

Depends on #4429 (and a number of other PRs that have already landed).

## Future work

One bit of work that hasn't been done yet is an integration test with wicketd, wicket, MGS and sp-sim. One issue is that currently, we don't generate machine-readable output from wicket (maybe we should!), and given that that's not the case, we only know how to exit with either exit code 0 (success) or 1 (failure).
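As a rough sketch of how the new group-display API is driven (this follows the `update-engine-basic` example below rather than wicket's exact code; the `MySpec` step spec, the `usize` key type, and the report channel are illustrative stand-ins):

```
use tokio::sync::mpsc;
use update_engine::display::{GroupDisplay, LineDisplayStyles};
use update_engine::events::EventReport;

// Drive a `GroupDisplay` from a stream of (key, report) pairs.
// `MySpec` stands in for an application-defined `StepSpec` impl.
async fn drive_display(
    mut reports: mpsc::Receiver<(usize, EventReport<MySpec>)>,
) -> std::io::Result<()> {
    // One entry per execution being tracked; prefixes are
    // right-aligned to the widest one.
    let mut display = GroupDisplay::new(
        [(0usize, "sled 0"), (1usize, "sled 1")],
        std::io::stdout(),
    );
    // Colorize only when stdout actually supports ANSI colors.
    if supports_color::on(supports_color::Stream::Stdout).is_some() {
        display.set_styles(LineDisplayStyles::colorized());
    }
    while let Some((key, report)) = reports.recv().await {
        // Reports for keys not registered above are rejected.
        if display.add_event_report(&key, report).is_ok() {
            // Flush any newly generated lines; lines that were
            // already printed are never rewritten.
            display.write_events()?;
        }
    }
    // Finish with a one-line summary ("N/M: ... completed, ...").
    display.write_stats("Summary")
}
```

wicket's `rack-update` code does essentially this, keyed by component (sled/switch/PSC), with the reports polled from wicketd.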
--- Cargo.lock | 33 + Cargo.toml | 1 + update-engine/Cargo.toml | 6 + .../examples/update-engine-basic/display.rs | 123 +- .../examples/update-engine-basic/main.rs | 290 +++-- update-engine/src/buffer.rs | 90 +- update-engine/src/display/group_display.rs | 512 +++++++++ update-engine/src/display/line_display.rs | 137 +++ .../src/display/line_display_shared.rs | 1011 +++++++++++++++++ update-engine/src/display/mod.rs | 21 + update-engine/src/errors.rs | 14 + update-engine/src/events.rs | 21 + update-engine/src/lib.rs | 1 + wicket/Cargo.toml | 3 + wicket/src/cli/command.rs | 84 +- wicket/src/cli/mod.rs | 3 +- wicket/src/cli/rack_update.rs | 313 +++++ wicket/src/dispatch.rs | 39 +- wicket/src/helpers.rs | 70 ++ wicket/src/lib.rs | 1 + wicket/src/runner.rs | 105 +- wicket/src/state/inventory.rs | 88 +- wicket/src/state/mod.rs | 4 +- wicket/src/state/update.rs | 105 +- wicket/src/ui/panes/update.rs | 2 +- wicket/src/wicketd.rs | 2 +- 26 files changed, 2819 insertions(+), 260 deletions(-) create mode 100644 update-engine/src/display/group_display.rs create mode 100644 update-engine/src/display/line_display.rs create mode 100644 update-engine/src/display/line_display_shared.rs create mode 100644 update-engine/src/display/mod.rs create mode 100644 wicket/src/cli/rack_update.rs create mode 100644 wicket/src/helpers.rs diff --git a/Cargo.lock b/Cargo.lock index b7c1bb70bd..3516f72fac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3722,6 +3722,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "is_ci" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "616cde7c720bb2bb5824a224687d8f77bfd38922027f01d825cd7453be5099fb" + [[package]] name = "itertools" version = "0.10.5" @@ -3926,6 +3932,15 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "libsw" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "610cd929d24f634af855498b575263c44d541a0e28c21d595968a6e25fe190f9" +dependencies = [ + "tokio", +] + [[package]] name = "libtest-mimic" version = "0.6.1" @@ -8714,6 +8729,16 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +[[package]] +name = "supports-color" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6398cde53adc3c4557306a96ce67b302968513830a77a95b2b17305d9719a89" +dependencies = [ + "is-terminal", + "is_ci", +] + [[package]] name = "swrite" version = "0.1.0" @@ -9744,12 +9769,14 @@ dependencies = [ "camino", "camino-tempfile", "cancel-safe-futures", + "clap 4.4.3", "debug-ignore", "derive-where", "either", "futures", "indexmap 2.1.0", "indicatif", + "libsw", "linear-map", "omicron-test-utils", "omicron-workspace-hack", @@ -9760,8 +9787,11 @@ dependencies = [ "serde_json", "serde_with", "slog", + "supports-color", + "swrite", "tokio", "tokio-stream", + "unicode-width", "uuid", ] @@ -10121,6 +10151,7 @@ dependencies = [ "ciborium", "clap 4.4.3", "crossterm 0.27.0", + "debug-ignore", "futures", "hex", "humantime", @@ -10144,6 +10175,7 @@ dependencies = [ "slog-async", "slog-envlogger", "slog-term", + "supports-color", "tempfile", "textwrap 0.16.0", "tokio", @@ -10153,6 +10185,7 @@ dependencies = [ "tui-tree-widget", "unicode-width", "update-engine", + "uuid", "wicket-common", "wicketd-client", "zeroize", diff --git a/Cargo.toml b/Cargo.toml index 2680cf9193..e915a2d113 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -345,6 +345,7 @@ 
static_assertions = "1.1.0" steno = "0.4.0" strum = { version = "0.25", features = [ "derive" ] } subprocess = "0.2.9" +supports-color = "2.1.0" swrite = "0.1.0" libsw = { version = "3.3.0", features = ["tokio"] } syn = { version = "2.0" } diff --git a/update-engine/Cargo.toml b/update-engine/Cargo.toml index af988bf091..5bc3672f54 100644 --- a/update-engine/Cargo.toml +++ b/update-engine/Cargo.toml @@ -12,14 +12,18 @@ derive-where.workspace = true either.workspace = true futures.workspace = true indexmap.workspace = true +libsw.workspace = true linear-map.workspace = true +owo-colors.workspace = true petgraph.workspace = true serde.workspace = true serde_json.workspace = true serde_with.workspace = true schemars = { workspace = true, features = ["uuid1"] } slog.workspace = true +swrite.workspace = true tokio = { workspace = true, features = ["macros", "sync", "time", "rt-multi-thread"] } +unicode-width.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true @@ -28,8 +32,10 @@ buf-list.workspace = true bytes.workspace = true camino.workspace = true camino-tempfile.workspace = true +clap.workspace = true indicatif.workspace = true omicron-test-utils.workspace = true owo-colors.workspace = true +supports-color.workspace = true tokio = { workspace = true, features = ["io-util"] } tokio-stream.workspace = true diff --git a/update-engine/examples/update-engine-basic/display.rs b/update-engine/examples/update-engine-basic/display.rs index e6b80e3637..122777211b 100644 --- a/update-engine/examples/update-engine-basic/display.rs +++ b/update-engine/examples/update-engine-basic/display.rs @@ -12,28 +12,135 @@ use indexmap::{map::Entry, IndexMap}; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use owo_colors::OwoColorize; use tokio::{sync::mpsc, task::JoinHandle}; -use update_engine::events::ProgressCounter; +use update_engine::{ + display::{GroupDisplay, LineDisplay, LineDisplayStyles}, + events::ProgressCounter, +}; -use crate::spec::{ - Event, ExampleComponent, ExampleStepId, ExampleStepMetadata, ProgressEvent, - ProgressEventKind, StepEventKind, StepInfoWithMetadata, StepOutcome, +use crate::{ + spec::{ + Event, EventBuffer, ExampleComponent, ExampleStepId, + ExampleStepMetadata, ProgressEvent, ProgressEventKind, StepEventKind, + StepInfoWithMetadata, StepOutcome, + }, + DisplayStyle, }; /// An example that displays an event stream on the command line. pub(crate) fn make_displayer( log: &slog::Logger, + display_style: DisplayStyle, + prefix: Option, ) -> (JoinHandle>, mpsc::Sender) { let (sender, receiver) = mpsc::channel(512); let log = log.clone(); let join_handle = - tokio::task::spawn( - async move { display_messages(&log, receiver).await }, - ); + match display_style { + DisplayStyle::ProgressBar => tokio::task::spawn(async move { + display_progress_bar(&log, receiver).await + }), + DisplayStyle::Line => tokio::task::spawn(async move { + display_line(&log, receiver, prefix).await + }), + DisplayStyle::Group => tokio::task::spawn(async move { + display_group(&log, receiver).await + }), + }; (join_handle, sender) } -async fn display_messages( +async fn display_line( + log: &slog::Logger, + mut receiver: mpsc::Receiver, + prefix: Option, +) -> Result<()> { + slog::info!(log, "setting up display"); + let mut buffer = EventBuffer::new(8); + let mut display = LineDisplay::new(std::io::stdout()); + // For now, always colorize. TODO: figure out whether colorization should be + // done based on always/auto/never etc. 
+ if supports_color::on(supports_color::Stream::Stdout).is_some() { + display.set_styles(LineDisplayStyles::colorized()); + } + if let Some(prefix) = prefix { + display.set_prefix(prefix); + } + display.set_progress_interval(Duration::from_millis(50)); + while let Some(event) = receiver.recv().await { + buffer.add_event(event); + display.write_event_buffer(&buffer)?; + } + + Ok(()) +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Ord)] +enum GroupDisplayKey { + Example, + Other, +} + +async fn display_group( + log: &slog::Logger, + mut receiver: mpsc::Receiver, +) -> Result<()> { + slog::info!(log, "setting up display"); + + let mut display = GroupDisplay::new( + [ + (GroupDisplayKey::Example, "example"), + (GroupDisplayKey::Other, "other"), + ], + std::io::stdout(), + ); + // For now, always colorize. TODO: figure out whether colorization should be + // done based on always/auto/never etc. + if supports_color::on(supports_color::Stream::Stdout).is_some() { + display.set_styles(LineDisplayStyles::colorized()); + } + + display.set_progress_interval(Duration::from_millis(50)); + + let mut example_buffer = EventBuffer::default(); + let mut example_buffer_last_seen = None; + let mut other_buffer = EventBuffer::default(); + let mut other_buffer_last_seen = None; + + let mut interval = tokio::time::interval(Duration::from_secs(2)); + interval.tick().await; + + loop { + tokio::select! { + _ = interval.tick() => { + // Print out status lines every 2 seconds. + display.write_stats("Status")?; + } + event = receiver.recv() => { + let Some(event) = event else { break }; + example_buffer.add_event(event.clone()); + other_buffer.add_event(event); + + display.add_event_report( + &GroupDisplayKey::Example, + example_buffer.generate_report_since(&mut example_buffer_last_seen), + )?; + display.add_event_report( + &GroupDisplayKey::Other, + other_buffer.generate_report_since(&mut other_buffer_last_seen), + )?; + display.write_events()?; + } + } + } + + // Print status at the end. 
+ display.write_stats("Summary")?; + + Ok(()) +} + +async fn display_progress_bar( log: &slog::Logger, mut receiver: mpsc::Receiver, ) -> Result<()> { diff --git a/update-engine/examples/update-engine-basic/main.rs b/update-engine/examples/update-engine-basic/main.rs index 260473edde..95fe3c54dc 100644 --- a/update-engine/examples/update-engine-basic/main.rs +++ b/update-engine/examples/update-engine-basic/main.rs @@ -4,85 +4,139 @@ // Copyright 2023 Oxide Computer Company -use std::time::Duration; +use std::{io::IsTerminal, time::Duration}; -use anyhow::{bail, Context}; +use anyhow::{bail, Context, Result}; use buf_list::BufList; use bytes::Buf; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; +use clap::{Parser, ValueEnum}; use display::make_displayer; use omicron_test_utils::dev::test_setup_log; use spec::{ - ComponentRegistrar, ExampleCompletionMetadata, ExampleComponent, - ExampleSpec, ExampleStepId, ExampleStepMetadata, ExampleWriteSpec, - ExampleWriteStepId, StepHandle, StepProgress, StepSkipped, StepWarning, - UpdateEngine, + ComponentRegistrar, EventBuffer, ExampleCompletionMetadata, + ExampleComponent, ExampleSpec, ExampleStepId, ExampleStepMetadata, + ExampleWriteSpec, ExampleWriteStepId, StepHandle, StepProgress, + StepSkipped, StepWarning, UpdateEngine, +}; +use tokio::{io::AsyncWriteExt, sync::mpsc}; +use update_engine::{ + events::{Event, ProgressUnits}, + StepContext, StepSuccess, }; -use tokio::io::AsyncWriteExt; -use update_engine::{events::ProgressUnits, StepContext, StepSuccess}; mod display; mod spec; #[tokio::main(worker_threads = 2)] -async fn main() { - let logctx = test_setup_log("update_engine_basic_example"); - - let context = ExampleContext::new(&logctx.log); - let (display_handle, sender) = make_displayer(&logctx.log); - - let engine = UpdateEngine::new(&logctx.log, sender); - - // Download component 1. - let component_1 = engine.for_component(ExampleComponent::Component1); - let download_handle_1 = context.register_download_step( - &component_1, - "https://www.example.org".to_owned(), - 1_048_576, - ); - - // An example of a skipped step for component 1. - context.register_skipped_step(&component_1); - - // Create temporary directories for component 1. - let temp_dirs_handle_1 = - context.register_create_temp_dirs_step(&component_1, 2); - - // Write component 1 out to disk. - context.register_write_step( - &component_1, - download_handle_1, - temp_dirs_handle_1, - None, - ); - - // Download component 2. - let component_2 = engine.for_component(ExampleComponent::Component2); - let download_handle_2 = context.register_download_step( - &component_2, - "https://www.example.com".to_owned(), - 1_048_576 * 8, - ); - - // Create temporary directories for component 2. - let temp_dirs_handle_2 = - context.register_create_temp_dirs_step(&component_2, 3); - - // Now write component 2 out to disk. - context.register_write_step( - &component_2, - download_handle_2, - temp_dirs_handle_2, - Some(1), - ); - - _ = engine.execute().await; - - // Wait until all messages have been received by the displayer. - _ = display_handle.await; - - // Do not clean up the log file so people can inspect it. +async fn main() -> Result<()> { + let app = App::parse(); + app.exec().await +} + +#[derive(Debug, Parser)] +struct App { + /// Display style to use. + #[clap(long, short = 's', default_value_t, value_enum)] + display_style: DisplayStyleOpt, + + /// Prefix to set on all log messages with display-style=line. 
+ #[clap(long, short = 'p')] + prefix: Option, +} + +impl App { + async fn exec(self) -> Result<()> { + let logctx = test_setup_log("update_engine_basic_example"); + + let display_style = match self.display_style { + DisplayStyleOpt::ProgressBar => DisplayStyle::ProgressBar, + DisplayStyleOpt::Line => DisplayStyle::Line, + DisplayStyleOpt::Group => DisplayStyle::Group, + DisplayStyleOpt::Auto => { + if std::io::stdout().is_terminal() { + DisplayStyle::ProgressBar + } else { + DisplayStyle::Line + } + } + }; + + let context = ExampleContext::new(&logctx.log); + let (display_handle, sender) = + make_displayer(&logctx.log, display_style, self.prefix); + + let engine = UpdateEngine::new(&logctx.log, sender); + + // Download component 1. + let component_1 = engine.for_component(ExampleComponent::Component1); + let download_handle_1 = context.register_download_step( + &component_1, + "https://www.example.org".to_owned(), + 1_048_576, + ); + + // An example of a skipped step for component 1. + context.register_skipped_step(&component_1); + + // Create temporary directories for component 1. + let temp_dirs_handle_1 = + context.register_create_temp_dirs_step(&component_1, 2); + + // Write component 1 out to disk. + context.register_write_step( + &component_1, + download_handle_1, + temp_dirs_handle_1, + None, + ); + + // Download component 2. + let component_2 = engine.for_component(ExampleComponent::Component2); + let download_handle_2 = context.register_download_step( + &component_2, + "https://www.example.com".to_owned(), + 1_048_576 * 8, + ); + + // Create temporary directories for component 2. + let temp_dirs_handle_2 = + context.register_create_temp_dirs_step(&component_2, 3); + + // Now write component 2 out to disk. + context.register_write_step( + &component_2, + download_handle_2, + temp_dirs_handle_2, + Some(1), + ); + + _ = engine.execute().await; + + // Wait until all messages have been received by the displayer. + _ = display_handle.await; + + // Do not clean up the log file so people can inspect it. + + Ok(()) + } +} + +#[derive(Copy, Clone, Debug, Default, ValueEnum)] +enum DisplayStyleOpt { + ProgressBar, + Line, + Group, + #[default] + Auto, +} + +#[derive(Copy, Clone, Debug)] +enum DisplayStyle { + ProgressBar, + Line, + Group, } /// Context shared across steps. This forms the lifetime "'a" defined by the @@ -146,9 +200,30 @@ impl ExampleContext { ({num_bytes} bytes)", ); - // Try a second time, and this time go all the way to 100%. + // Try a second time, and this time go to 80%. let mut buf_list = BufList::new(); - for i in 0..10 { + for i in 0..8 { + tokio::time::sleep(Duration::from_millis(100)).await; + cx.send_progress(StepProgress::with_current_and_total( + num_bytes * i / 10, + num_bytes, + ProgressUnits::BYTES, + serde_json::Value::Null, + )) + .await; + buf_list.push_chunk(&b"downloaded-data"[..]); + } + + // Now indicate a progress reset. + cx.send_progress(StepProgress::reset( + serde_json::Value::Null, + "Progress reset", + )) + .await; + + // Try again, and this time succeed. 
+ let mut buf_list = BufList::new(); + for i in 0..=10 { tokio::time::sleep(Duration::from_millis(100)).await; cx.send_progress(StepProgress::with_current_and_total( num_bytes * i / 10, @@ -243,6 +318,7 @@ impl ExampleContext { cx.with_nested_engine(|engine| { register_nested_write_steps( + &self.log, engine, component, &destinations, @@ -282,6 +358,7 @@ impl ExampleContext { } fn register_nested_write_steps<'a>( + log: &'a slog::Logger, engine: &mut UpdateEngine<'a, ExampleWriteSpec>, component: ExampleComponent, destinations: &'a [Utf8PathBuf], @@ -307,6 +384,38 @@ fn register_nested_write_steps<'a>( Default::default(), )) .await; + + let mut remote_engine_receiver = create_remote_engine( + log, + component, + buf_list.clone(), + destination.clone(), + ); + let mut buffer = EventBuffer::default(); + let mut last_seen = None; + while let Some(event) = remote_engine_receiver.recv().await + { + // Only send progress up to 50% to demonstrate + // not receiving full progress. + if let Event::Progress(event) = &event { + if let Some(counter) = event.kind.progress_counter() + { + if let Some(total) = counter.total { + if counter.current > total / 2 { + break; + } + } + } + } + + buffer.add_event(event); + let report = + buffer.generate_report_since(&mut last_seen); + cx.send_nested_report(report) + .await + .expect("this engine should never fail"); + } + let mut file = tokio::fs::File::create(destination) .await @@ -345,3 +454,50 @@ fn register_nested_write_steps<'a>( .register(); } } + +/// Sets up a remote engine that can be used to execute steps. +fn create_remote_engine( + log: &slog::Logger, + component: ExampleComponent, + mut buf_list: BufList, + destination: Utf8PathBuf, +) -> mpsc::Receiver> { + let (sender, receiver) = tokio::sync::mpsc::channel(128); + let engine = UpdateEngine::new(log, sender); + engine + .for_component(component) + .new_step( + ExampleWriteStepId::Write { destination: destination.clone() }, + format!("Writing to {destination} (remote, fake)"), + move |cx| async move { + let num_bytes = buf_list.num_bytes(); + let mut total_written = 0; + + while buf_list.has_remaining() { + tokio::time::sleep(Duration::from_millis(20)).await; + // Don't actually write these bytes -- this engine is just + // for demoing. + let written_bytes = + (num_bytes / 10).min(buf_list.num_bytes()); + total_written += written_bytes; + buf_list.advance(written_bytes); + cx.send_progress(StepProgress::with_current_and_total( + total_written as u64, + num_bytes as u64, + ProgressUnits::new_const("fake bytes"), + (), + )) + .await; + } + + StepSuccess::new(()).into() + }, + ) + .register(); + + tokio::spawn(async move { + engine.execute().await.expect("remote engine succeeded") + }); + + receiver +} diff --git a/update-engine/src/buffer.rs b/update-engine/src/buffer.rs index 3cb2e0849b..6e0e66d6d0 100644 --- a/update-engine/src/buffer.rs +++ b/update-engine/src/buffer.rs @@ -107,6 +107,15 @@ impl EventBuffer { self.event_store.root_execution_id } + /// Returns an execution summary for the root execution ID, if this event buffer is aware of any + /// events. + pub fn root_execution_summary(&self) -> Option { + // XXX: more efficient algorithm + let root_execution_id = self.root_execution_id()?; + let mut summary = self.steps().summarize(); + summary.remove(&root_execution_id) + } + /// Returns information about each step, as currently tracked by the buffer, /// in order of when the events were first defined. 
pub fn steps(&self) -> EventBufferSteps<'_, S> { @@ -249,6 +258,7 @@ impl EventStore { &event, 0, None, + None, root_event_index, event.total_elapsed, ); @@ -256,6 +266,26 @@ impl EventStore { if new_execution.nest_level == 0 { self.root_execution_id = Some(new_execution.execution_id); } + // If there's a parent key, then what's the child index? + let parent_key_and_child_index = + if let Some(parent_key) = new_execution.parent_key { + match self.map.get_mut(&parent_key) { + Some(parent_data) => { + let child_index = parent_data.child_executions_seen; + parent_data.child_executions_seen += 1; + Some((parent_key, child_index)) + } + None => { + // This should never happen -- it indicates that the + // parent key was unknown. This can happen if we + // didn't receive an event regarding a parent + // execution being started. + None + } + } + } else { + None + }; let total_steps = new_execution.steps_to_add.len(); for (new_step_key, new_step, sort_key) in new_execution.steps_to_add { @@ -264,6 +294,7 @@ impl EventStore { self.map.entry(new_step_key).or_insert_with(|| { EventBufferStepData::new( new_step, + parent_key_and_child_index, sort_key, new_execution.nest_level, total_steps, @@ -320,6 +351,7 @@ impl EventStore { &mut self, event: &StepEvent, nest_level: usize, + parent_key: Option, parent_sort_key: Option<&StepSortKey>, root_event_index: RootEventIndex, root_total_elapsed: Duration, @@ -348,6 +380,7 @@ impl EventStore { } new_execution = Some(NewExecutionAction { execution_id: event.execution_id, + parent_key, nest_level, steps_to_add, }); @@ -498,6 +531,7 @@ impl EventStore { let actions = self.recurse_for_step_event( nested_event, nest_level + 1, + Some(parent_key), parent_sort_key.as_ref(), root_event_index, root_total_elapsed, @@ -796,6 +830,9 @@ struct NewExecutionAction { // An execution ID corresponding to a new run, if seen. execution_id: ExecutionId, + // The parent key for this execution, if this is a nested step. + parent_key: Option, + // The nest level for this execution. nest_level: usize, @@ -857,12 +894,16 @@ impl<'buf, S: StepSpec> EventBufferSteps<'buf, S> { #[derive_where(Clone, Debug)] pub struct EventBufferStepData { step_info: StepInfo, + sort_key: StepSortKey, - // XXX: nest_level and total_steps are common to each execution, but are - // stored separately here. Should we store them in a separate map - // indexed by execution ID? + + // TODO: These steps are common to each execution, but are stored separately + // here. These should likely move into EventBufferExecutionData. + parent_key_and_child_index: Option<(StepKey, usize)>, nest_level: usize, total_steps: usize, + child_executions_seen: usize, + // Invariant: stored in order sorted by leaf event index. 
high_priority: Vec>, step_status: StepStatus, @@ -874,6 +915,7 @@ pub struct EventBufferStepData { impl EventBufferStepData { fn new( step_info: StepInfo, + parent_key_and_child_index: Option<(StepKey, usize)>, sort_key: StepSortKey, nest_level: usize, total_steps: usize, @@ -881,9 +923,11 @@ impl EventBufferStepData { ) -> Self { Self { step_info, + parent_key_and_child_index, sort_key, nest_level, total_steps, + child_executions_seen: 0, high_priority: Vec::new(), step_status: StepStatus::NotStarted, last_root_event_index: root_event_index, @@ -895,6 +939,11 @@ impl EventBufferStepData { &self.step_info } + #[inline] + pub fn parent_key_and_child_index(&self) -> Option<(StepKey, usize)> { + self.parent_key_and_child_index + } + #[inline] pub fn nest_level(&self) -> usize { self.nest_level @@ -905,6 +954,11 @@ impl EventBufferStepData { self.total_steps } + #[inline] + pub fn child_executions_seen(&self) -> usize { + self.child_executions_seen + } + #[inline] pub fn step_status(&self) -> &StepStatus { &self.step_status @@ -1406,8 +1460,17 @@ impl ExecutionSummary { StepStatus::NotStarted => { // This step hasn't been started yet. Skip over it. } - StepStatus::Running { .. } => { - execution_status = ExecutionStatus::Running { step_key }; + StepStatus::Running { low_priority, progress_event } => { + let root_total_elapsed = low_priority + .iter() + .map(|event| event.total_elapsed) + .chain(std::iter::once(progress_event.total_elapsed)) + .max() + .expect("at least one value was provided"); + execution_status = ExecutionStatus::Running { + step_key, + root_total_elapsed, + }; } StepStatus::Completed { reason } => { let (root_total_elapsed, leaf_total_elapsed) = @@ -1513,6 +1576,9 @@ pub enum ExecutionStatus { /// /// Use [`EventBuffer::get`] to get more information about this step. step_key: StepKey, + + /// The maximum root_total_elapsed seen. + root_total_elapsed: Duration, }, /// Execution has finished. @@ -1561,6 +1627,20 @@ pub enum TerminalKind { Aborted, } +impl ExecutionStatus { + /// Returns the terminal status and the total amount of time elapsed, or + /// None if the execution has not reached a terminal state. + /// + /// The time elapsed might be None if the execution was interrupted and + /// completion information wasn't available. + pub fn terminal_info(&self) -> Option<&ExecutionTerminalInfo> { + match self { + Self::NotStarted | Self::Running { .. } => None, + Self::Terminal(info) => Some(info), + } + } +} + /// Keys for the event tree. #[derive(Copy, Clone, Debug, Hash, Eq, PartialEq, Ord, PartialOrd)] enum EventTreeNode { diff --git a/update-engine/src/display/group_display.rs b/update-engine/src/display/group_display.rs new file mode 100644 index 0000000000..0d50489a9f --- /dev/null +++ b/update-engine/src/display/group_display.rs @@ -0,0 +1,512 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +// Copyright 2023 Oxide Computer Company + +use std::{borrow::Borrow, collections::BTreeMap, fmt, time::Duration}; + +use libsw::TokioSw; +use owo_colors::OwoColorize; +use swrite::{swrite, SWrite}; +use unicode_width::UnicodeWidthStr; + +use crate::{ + errors::UnknownReportKey, events::EventReport, EventBuffer, + ExecutionStatus, ExecutionTerminalInfo, StepSpec, TerminalKind, +}; + +use super::{ + line_display_shared::{LineDisplayFormatter, LineDisplayOutput}, + LineDisplayShared, LineDisplayStyles, HEADER_WIDTH, +}; + +/// A displayer that simultaneously manages and shows line-based output for +/// several event buffers. +/// +/// `K` is the key type for each element in the group. Its [`fmt::Display`] impl +/// is called to obtain the prefix, and `Eq + Ord` is used for keys. +#[derive(Debug)] +pub struct GroupDisplay { + // We don't need to add any buffering here because we already write data to + // the writer in a line-buffered fashion (see Self::write_events). + writer: W, + max_width: usize, + // This is set to the highest value of root_total_elapsed seen from any event reports. + start_sw: TokioSw, + single_states: BTreeMap>, + formatter: LineDisplayFormatter, + stats: GroupDisplayStats, +} + +impl GroupDisplay { + /// Creates a new `GroupDisplay` with the provided report keys and + /// prefixes. + /// + /// The function passed in is expected to create a writer. + pub fn new( + keys_and_prefixes: impl IntoIterator, + writer: W, + ) -> Self + where + Str: Into, + { + // Right-align prefixes to their maximum width -- this helps keep the + // output organized. + let mut max_width = 0; + let keys_and_prefixes: Vec<_> = keys_and_prefixes + .into_iter() + .map(|(k, prefix)| { + let prefix = prefix.into(); + max_width = + max_width.max(UnicodeWidthStr::width(prefix.as_str())); + (k, prefix) + }) + .collect(); + let single_states: BTreeMap<_, _> = keys_and_prefixes + .into_iter() + .map(|(k, prefix)| (k, SingleState::new(prefix, max_width))) + .collect(); + + let not_started = single_states.len(); + Self { + writer, + max_width, + // This creates the stopwatch in the stopped state with duration 0 -- i.e. a minimal + // value that will be replaced as soon as an event comes in. + start_sw: TokioSw::new(), + single_states, + formatter: LineDisplayFormatter::new(), + stats: GroupDisplayStats::new(not_started), + } + } + + /// Creates a new `GroupDisplay` with the provided report keys, using the + /// `Display` impl to obtain the respective prefixes. + pub fn new_with_display( + keys: impl IntoIterator, + writer: W, + ) -> Self + where + K: fmt::Display, + { + Self::new( + keys.into_iter().map(|k| { + let prefix = k.to_string(); + (k, prefix) + }), + writer, + ) + } + + /// Sets the styles for all future lines. + #[inline] + pub fn set_styles(&mut self, styles: LineDisplayStyles) { + self.formatter.set_styles(styles); + } + + /// Sets the amount of time before new progress events are shown. + #[inline] + pub fn set_progress_interval(&mut self, interval: Duration) { + self.formatter.set_progress_interval(interval); + } + + /// Returns true if this `GroupDisplay` is producing reports corresponding + /// to the given key. + pub fn contains_key(&self, key: &Q) -> bool + where + K: Borrow, + Q: Ord, + { + self.single_states.contains_key(key) + } + + /// Adds an event report to the display, keyed by the index, and updates + /// internal state. + /// + /// Returns `Ok(())` if the report was accepted because the key was + /// known to this `GroupDisplay`, and an error if it was not. 
+ pub fn add_event_report( + &mut self, + key: &Q, + event_report: EventReport, + ) -> Result<(), UnknownReportKey> + where + K: Borrow, + Q: Ord, + { + if let Some(state) = self.single_states.get_mut(key) { + let result = state.add_event_report(event_report); + // Set self.start_sw to the max of root_total_elapsed and the current value. + if let Some(root_total_elapsed) = result.root_total_elapsed { + if self.start_sw.elapsed() < root_total_elapsed { + self.start_sw = + TokioSw::with_elapsed_started(root_total_elapsed); + } + } + self.stats.apply_result(result); + Ok(()) + } else { + Err(UnknownReportKey {}) + } + } + + /// Writes a "Status" or "Summary" line to the writer with statistics. + pub fn write_stats(&mut self, header: &str) -> std::io::Result<()> { + // Add a blank prefix which is equal to the maximum width of known prefixes. + let prefix = " ".repeat(self.max_width); + let mut line = + self.formatter.start_line(&prefix, Some(self.start_sw.elapsed())); + self.stats.format_line(&mut line, header, &self.formatter); + writeln!(self.writer, "{line}") + } + + /// Writes all pending events to the writer. + pub fn write_events(&mut self) -> std::io::Result<()> { + let mut out = LineDisplayOutput::new(); + for state in self.single_states.values_mut() { + state.format_events(&self.formatter, &mut out); + } + for line in out.iter() { + writeln!(self.writer, "{line}")?; + } + Ok(()) + } + + /// Returns the current statistics for this `GroupDisplay`. + pub fn stats(&self) -> &GroupDisplayStats { + &self.stats + } +} + +#[derive(Clone, Copy, Debug)] +pub struct GroupDisplayStats { + /// The total number of reports. + pub total: usize, + + /// The number of reports that have not yet started. + pub not_started: usize, + + /// The number of reports that are currently running. + pub running: usize, + + /// The number of reports that indicate successful completion. + pub completed: usize, + + /// The number of reports that indicate failure. + pub failed: usize, + + /// The number of reports that indicate being aborted. + pub aborted: usize, + + /// The number of reports where we didn't receive a final state and it got + /// overwritten by another report. + /// + /// Overwritten reports are considered failures since we don't know what + /// happened. + pub overwritten: usize, +} + +impl GroupDisplayStats { + fn new(total: usize) -> Self { + Self { + total, + not_started: total, + completed: 0, + failed: 0, + aborted: 0, + overwritten: 0, + running: 0, + } + } + + /// Returns the number of terminal reports. + pub fn terminal_count(&self) -> usize { + self.completed + self.failed + self.aborted + self.overwritten + } + + /// Returns true if all reports have reached a terminal state. + pub fn is_terminal(&self) -> bool { + self.not_started == 0 && self.running == 0 + } + + /// Returns true if there are any failures. + pub fn has_failures(&self) -> bool { + self.failed > 0 || self.aborted > 0 || self.overwritten > 0 + } + + fn apply_result(&mut self, result: AddEventReportResult) { + // Process result.after first to avoid integer underflow. 
+ match result.after { + SingleStateTag::NotStarted => self.not_started += 1, + SingleStateTag::Running => self.running += 1, + SingleStateTag::Terminal(TerminalKind::Completed) => { + self.completed += 1 + } + SingleStateTag::Terminal(TerminalKind::Failed) => self.failed += 1, + SingleStateTag::Terminal(TerminalKind::Aborted) => { + self.aborted += 1 + } + SingleStateTag::Overwritten => self.overwritten += 1, + } + + match result.before { + SingleStateTag::NotStarted => self.not_started -= 1, + SingleStateTag::Running => self.running -= 1, + SingleStateTag::Terminal(TerminalKind::Completed) => { + self.completed -= 1 + } + SingleStateTag::Terminal(TerminalKind::Failed) => self.failed -= 1, + SingleStateTag::Terminal(TerminalKind::Aborted) => { + self.aborted -= 1 + } + SingleStateTag::Overwritten => self.overwritten -= 1, + } + } + + fn format_line( + &self, + line: &mut String, + header: &str, + formatter: &LineDisplayFormatter, + ) { + let header_style = if self.has_failures() { + formatter.styles().error_style + } else { + formatter.styles().progress_style + }; + + swrite!(line, "{:>HEADER_WIDTH$} ", header.style(header_style)); + let terminal_count = self.terminal_count(); + swrite!( + line, + "{terminal_count}/{}: {} running, {} {}", + self.total, + self.running.style(formatter.styles().meta_style), + self.completed.style(formatter.styles().meta_style), + "completed".style(formatter.styles().progress_style), + ); + if self.failed > 0 { + swrite!( + line, + ", {} {}", + self.failed.style(formatter.styles().meta_style), + "failed".style(formatter.styles().error_style), + ); + } + if self.aborted > 0 { + swrite!( + line, + ", {} {}", + self.aborted.style(formatter.styles().meta_style), + "aborted".style(formatter.styles().error_style), + ); + } + if self.overwritten > 0 { + swrite!( + line, + ", {} {}", + self.overwritten.style(formatter.styles().meta_style), + "overwritten".style(formatter.styles().error_style), + ); + } + } +} + +#[derive(Debug)] +struct SingleState { + shared: LineDisplayShared, + kind: SingleStateKind, + prefix: String, +} + +impl SingleState { + fn new(prefix: String, max_width: usize) -> Self { + // Right-align the prefix to the maximum width. + let prefix = format!("{:>max_width$}", prefix); + Self { + shared: LineDisplayShared::default(), + kind: SingleStateKind::NotStarted { displayed: false }, + prefix, + } + } + + /// Adds an event report and updates the internal state. + fn add_event_report( + &mut self, + event_report: EventReport, + ) -> AddEventReportResult { + let before = match &self.kind { + SingleStateKind::NotStarted { .. } => { + self.kind = SingleStateKind::Running { + event_buffer: EventBuffer::new(8), + }; + SingleStateTag::NotStarted + } + SingleStateKind::Running { .. } => SingleStateTag::Running, + + SingleStateKind::Terminal { info, .. } => { + // Once we've reached a terminal state, we don't record any more + // events. + return AddEventReportResult::unchanged( + SingleStateTag::Terminal(info.kind), + info.root_total_elapsed, + ); + } + SingleStateKind::Overwritten { .. } => { + // This update has already completed -- assume that the event + // buffer is for a new update, which we don't show. 
+ return AddEventReportResult::unchanged( + SingleStateTag::Overwritten, + None, + ); + } + }; + + let SingleStateKind::Running { event_buffer } = &mut self.kind else { + unreachable!("other branches were handled above"); + }; + + if let Some(root_execution_id) = event_buffer.root_execution_id() { + if event_report.root_execution_id != Some(root_execution_id) { + // The report is for a different execution ID -- assume that + // this event is completed and mark our current execution as + // completed. + self.kind = SingleStateKind::Overwritten { displayed: false }; + return AddEventReportResult { + before, + after: SingleStateTag::Overwritten, + root_total_elapsed: None, + }; + } + } + + event_buffer.add_event_report(event_report); + let (after, max_total_elapsed) = + match event_buffer.root_execution_summary() { + Some(summary) => { + match summary.execution_status { + ExecutionStatus::NotStarted => { + (SingleStateTag::NotStarted, None) + } + ExecutionStatus::Running { + root_total_elapsed: max_total_elapsed, + .. + } => (SingleStateTag::Running, Some(max_total_elapsed)), + ExecutionStatus::Terminal(info) => { + // Grab the event buffer to store it in the terminal state. + let event_buffer = std::mem::replace( + event_buffer, + EventBuffer::new(0), + ); + let terminal_kind = info.kind; + let root_total_elapsed = info.root_total_elapsed; + self.kind = SingleStateKind::Terminal { + info, + pending_event_buffer: Some(event_buffer), + }; + ( + SingleStateTag::Terminal(terminal_kind), + root_total_elapsed, + ) + } + } + } + None => { + // We don't have a summary yet. + (SingleStateTag::NotStarted, None) + } + }; + + AddEventReportResult { + before, + after, + root_total_elapsed: max_total_elapsed, + } + } + + pub(super) fn format_events( + &mut self, + formatter: &LineDisplayFormatter, + out: &mut LineDisplayOutput, + ) { + let mut cx = self.shared.with_context(&self.prefix, formatter); + match &mut self.kind { + SingleStateKind::NotStarted { displayed } => { + if !*displayed { + let line = + cx.format_generic("Update not started, waiting..."); + out.add_line(line); + *displayed = true; + } + } + SingleStateKind::Running { event_buffer } => { + cx.format_event_buffer(event_buffer, out); + } + SingleStateKind::Terminal { info, pending_event_buffer } => { + // Are any remaining events left? This also sets pending_event_buffer + // to None after displaying remaining events. + if let Some(event_buffer) = pending_event_buffer.take() { + cx.format_event_buffer(&event_buffer, out); + // Also show a line to wrap up the terminal status. + let line = cx.format_terminal_info(info); + out.add_line(line); + } + + // Nothing to do, the terminal status was already printed above. + } + SingleStateKind::Overwritten { displayed } => { + if !*displayed { + let line = cx.format_generic( + "Update overwritten (a different update was started): \ + assuming failure", + ); + out.add_line(line); + *displayed = true; + } + } + } + } +} + +#[derive(Debug)] +enum SingleStateKind { + NotStarted { + displayed: bool, + }, + Running { + event_buffer: EventBuffer, + }, + Terminal { + info: ExecutionTerminalInfo, + // The event buffer is kept around so that we can display any remaining + // lines. 
+ pending_event_buffer: Option>, + }, + Overwritten { + displayed: bool, + }, +} + +struct AddEventReportResult { + before: SingleStateTag, + after: SingleStateTag, + root_total_elapsed: Option, +} + +impl AddEventReportResult { + fn unchanged( + tag: SingleStateTag, + root_total_elapsed: Option, + ) -> Self { + Self { before: tag, after: tag, root_total_elapsed } + } +} + +#[derive(Copy, Clone, Debug)] +enum SingleStateTag { + NotStarted, + Running, + Terminal(TerminalKind), + Overwritten, +} diff --git a/update-engine/src/display/line_display.rs b/update-engine/src/display/line_display.rs new file mode 100644 index 0000000000..5321ec017c --- /dev/null +++ b/update-engine/src/display/line_display.rs @@ -0,0 +1,137 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2023 Oxide Computer Company + +use debug_ignore::DebugIgnore; +use derive_where::derive_where; +use owo_colors::Style; +use std::time::Duration; + +use crate::{EventBuffer, ExecutionTerminalInfo, StepSpec}; + +use super::{ + line_display_shared::LineDisplayOutput, LineDisplayFormatter, + LineDisplayShared, +}; + +/// A line-oriented display. +/// +/// This display produces output to the provided writer. +#[derive_where(Debug)] +pub struct LineDisplay { + writer: DebugIgnore, + shared: LineDisplayShared, + formatter: LineDisplayFormatter, + prefix: String, +} + +impl LineDisplay { + /// Creates a new LineDisplay. + pub fn new(writer: W) -> Self { + Self { + writer: DebugIgnore(writer), + shared: LineDisplayShared::default(), + formatter: LineDisplayFormatter::new(), + prefix: String::new(), + } + } + + /// Sets the prefix for all future lines. + #[inline] + pub fn set_prefix(&mut self, prefix: impl Into) { + self.prefix = prefix.into(); + } + + /// Sets the styles for all future lines. + #[inline] + pub fn set_styles(&mut self, styles: LineDisplayStyles) { + self.formatter.set_styles(styles); + } + + /// Sets the amount of time before the next progress event is shown. + #[inline] + pub fn set_progress_interval(&mut self, interval: Duration) { + self.formatter.set_progress_interval(interval); + } + + /// Writes an event buffer to the writer, incrementally. + /// + /// This is a stateful method that will only display events that have not + /// been displayed before. + pub fn write_event_buffer( + &mut self, + buffer: &EventBuffer, + ) -> std::io::Result<()> { + let mut out = LineDisplayOutput::new(); + self.shared + .with_context(&self.prefix, &self.formatter) + .format_event_buffer(buffer, &mut out); + for line in out.iter() { + writeln!(self.writer, "{line}")?; + } + + Ok(()) + } + + /// Writes terminal information to the writer. + pub fn write_terminal_info( + &mut self, + info: &ExecutionTerminalInfo, + ) -> std::io::Result<()> { + let line = self + .shared + .with_context(&self.prefix, &self.formatter) + .format_terminal_info(info); + writeln!(self.writer, "{line}") + } + + /// Writes a generic line to the writer, with prefix attached if provided. + pub fn write_generic(&mut self, message: &str) -> std::io::Result<()> { + let line = self + .shared + .with_context(&self.prefix, &self.formatter) + .format_generic(message); + writeln!(self.writer, "{line}") + } +} + +/// Styles for [`LineDisplay`]. +/// +/// By default this isn't colorized, but it can be if so chosen. 
+#[derive(Debug, Default)] +#[non_exhaustive] +pub struct LineDisplayStyles { + pub prefix_style: Style, + pub meta_style: Style, + pub step_name_style: Style, + pub progress_style: Style, + pub progress_message_style: Style, + pub warning_style: Style, + pub warning_message_style: Style, + pub error_style: Style, + pub error_message_style: Style, + pub skipped_style: Style, + pub retry_style: Style, +} + +impl LineDisplayStyles { + /// Returns a default set of colorized styles with ANSI colors. + pub fn colorized() -> Self { + let mut ret = Self::default(); + ret.prefix_style = Style::new().bold(); + ret.meta_style = Style::new().bold(); + ret.step_name_style = Style::new().cyan(); + ret.progress_style = Style::new().bold().green(); + ret.progress_message_style = Style::new().green(); + ret.warning_style = Style::new().bold().yellow(); + ret.warning_message_style = Style::new().yellow(); + ret.error_style = Style::new().bold().red(); + ret.error_message_style = Style::new().red(); + ret.skipped_style = Style::new().bold().yellow(); + ret.retry_style = Style::new().bold().yellow(); + + ret + } +} diff --git a/update-engine/src/display/line_display_shared.rs b/update-engine/src/display/line_display_shared.rs new file mode 100644 index 0000000000..99b03b13f7 --- /dev/null +++ b/update-engine/src/display/line_display_shared.rs @@ -0,0 +1,1011 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2023 Oxide Computer Company + +//! Types and code shared between `LineDisplay` and `GroupDisplay`. + +use std::{ + collections::HashMap, + fmt::{self, Write as _}, + time::Duration, +}; + +use owo_colors::OwoColorize; +use swrite::{swrite, SWrite as _}; + +use crate::{ + events::{ + ProgressCounter, ProgressEvent, ProgressEventKind, StepEvent, + StepEventKind, StepInfo, StepOutcome, + }, + EventBuffer, ExecutionId, ExecutionTerminalInfo, StepKey, StepSpec, + TerminalKind, +}; + +use super::LineDisplayStyles; + +// This is chosen to leave enough room for all possible headers: "Completed" at +// 9 characters is the longest. +pub(super) const HEADER_WIDTH: usize = 9; + +#[derive(Debug, Default)] +pub(super) struct LineDisplayShared { + // This is a map from root execution ID to data about it. + execution_data: HashMap, +} + +impl LineDisplayShared { + pub(super) fn with_context<'a>( + &'a mut self, + prefix: &'a str, + formatter: &'a LineDisplayFormatter, + ) -> LineDisplaySharedContext<'a> { + LineDisplaySharedContext { shared: self, prefix, formatter } + } +} + +#[derive(Debug)] +pub(super) struct LineDisplaySharedContext<'a> { + shared: &'a mut LineDisplayShared, + prefix: &'a str, + formatter: &'a LineDisplayFormatter, +} + +impl<'a> LineDisplaySharedContext<'a> { + /// Produces a generic line from the prefix and message. + /// + /// This line does not have a trailing newline; adding one is the caller's + /// responsibility. + pub(super) fn format_generic(&self, message: &str) -> String { + let mut line = self.formatter.start_line(self.prefix, None); + line.push_str(message); + line + } + + /// Produces lines for this event buffer, and advances internal state. + /// + /// Returned lines do not have a trailing newline; adding them is the + /// caller's responsibility. 
+ pub(super) fn format_event_buffer( + &mut self, + buffer: &EventBuffer, + out: &mut LineDisplayOutput, + ) { + let Some(execution_id) = buffer.root_execution_id() else { + // No known events, so nothing to display. + return; + }; + let execution_data = + self.shared.execution_data.entry(execution_id).or_default(); + let prev_progress_event_at = execution_data.last_progress_event_at; + let mut current_progress_event_at = prev_progress_event_at; + + let report = + buffer.generate_report_since(&mut execution_data.last_seen); + + for event in &report.step_events { + self.format_step_event(buffer, event, out); + } + + // Update progress events. + for event in &report.progress_events { + if Some(event.total_elapsed) > prev_progress_event_at { + self.format_progress_event(buffer, event, out); + current_progress_event_at = + current_progress_event_at.max(Some(event.total_elapsed)); + } + } + + // Finally, write to last_progress_event_at. (Need to re-fetch execution data.) + let execution_data = self + .shared + .execution_data + .get_mut(&execution_id) + .expect("we created this execution data above"); + execution_data.last_progress_event_at = current_progress_event_at; + } + + /// Format this step event. + fn format_step_event( + &self, + buffer: &EventBuffer, + step_event: &StepEvent, + out: &mut LineDisplayOutput, + ) { + self.format_step_event_impl( + buffer, + step_event, + Default::default(), + step_event.total_elapsed, + out, + ); + } + + fn format_step_event_impl( + &self, + buffer: &EventBuffer, + step_event: &StepEvent, + mut nest_data: NestData, + root_total_elapsed: Duration, + out: &mut LineDisplayOutput, + ) { + match &step_event.kind { + StepEventKind::NoStepsDefined => { + let mut line = self + .formatter + .start_line(self.prefix, Some(step_event.total_elapsed)); + swrite!( + line, + "{}", + "No steps defined" + .style(self.formatter.styles.progress_style), + ); + out.add_line(line); + } + StepEventKind::ExecutionStarted { first_step, .. } => { + let ld_step_info = LineDisplayStepInfo::new( + buffer, + step_event.execution_id, + &first_step.info, + &nest_data, + ); + let mut line = self + .formatter + .start_line(self.prefix, Some(root_total_elapsed)); + + swrite!( + line, + "{:>HEADER_WIDTH$} ", + "Running".style(self.formatter.styles.progress_style), + ); + self.formatter.add_step_info(&mut line, ld_step_info); + out.add_line(line); + } + StepEventKind::AttemptRetry { + step, + next_attempt, + attempt_elapsed, + message, + .. + } => { + let ld_step_info = LineDisplayStepInfo::new( + buffer, + step_event.execution_id, + &step.info, + &nest_data, + ); + + let mut line = self + .formatter + .start_line(self.prefix, Some(root_total_elapsed)); + + swrite!( + line, + "{:>HEADER_WIDTH$} ", + "Retry".style(self.formatter.styles.warning_style) + ); + self.formatter.add_step_info(&mut line, ld_step_info); + swrite!( + line, + ": after {:.2?}", + attempt_elapsed.style(self.formatter.styles.meta_style), + ); + if *next_attempt > 1 { + swrite!( + line, + " (at attempt {})", + next_attempt + .saturating_sub(1) + .style(self.formatter.styles.meta_style), + ); + } + swrite!( + line, + " with message: {}", + message.style(self.formatter.styles.warning_message_style) + ); + + out.add_line(line); + } + StepEventKind::ProgressReset { + step, + attempt, + attempt_elapsed, + message, + .. 
+ } => { + let ld_step_info = LineDisplayStepInfo::new( + buffer, + step_event.execution_id, + &step.info, + &nest_data, + ); + + let mut line = self + .formatter + .start_line(self.prefix, Some(root_total_elapsed)); + + swrite!( + line, + "{:>HEADER_WIDTH$} ", + "Reset".style(self.formatter.styles.warning_style) + ); + self.formatter.add_step_info(&mut line, ld_step_info); + swrite!( + line, + ": after {:.2?}", + attempt_elapsed.style(self.formatter.styles.meta_style), + ); + if *attempt > 1 { + swrite!( + line, + " (at attempt {})", + attempt.style(self.formatter.styles.meta_style), + ); + } + swrite!( + line, + " with message: {}", + message.style(self.formatter.styles.warning_message_style) + ); + + out.add_line(line); + } + StepEventKind::StepCompleted { + step, + attempt, + outcome, + next_step, + attempt_elapsed, + .. + } => { + // --- Add completion info about this step. + + let ld_step_info = LineDisplayStepInfo::new( + buffer, + step_event.execution_id, + &step.info, + &nest_data, + ); + let mut line = self + .formatter + .start_line(self.prefix, Some(root_total_elapsed)); + + self.formatter.add_completion_and_step_info( + &mut line, + ld_step_info, + *attempt_elapsed, + *attempt, + outcome, + ); + + out.add_line(line); + + // --- Add information about the next step. + + let ld_step_info = LineDisplayStepInfo::new( + buffer, + step_event.execution_id, + &next_step.info, + &nest_data, + ); + + let mut line = self + .formatter + .start_line(self.prefix, Some(root_total_elapsed)); + + self.format_step_running(&mut line, ld_step_info); + + out.add_line(line); + } + StepEventKind::ExecutionCompleted { + last_step, + last_attempt, + last_outcome, + attempt_elapsed, + .. + } => { + let ld_step_info = LineDisplayStepInfo::new( + buffer, + step_event.execution_id, + &last_step.info, + &nest_data, + ); + + let mut line = self + .formatter + .start_line(self.prefix, Some(root_total_elapsed)); + + self.formatter.add_completion_and_step_info( + &mut line, + ld_step_info, + *attempt_elapsed, + *last_attempt, + last_outcome, + ); + + out.add_line(line); + } + StepEventKind::ExecutionFailed { + failed_step, + total_attempts, + attempt_elapsed, + message, + causes, + .. + } => { + let ld_step_info = LineDisplayStepInfo::new( + buffer, + step_event.execution_id, + &failed_step.info, + &nest_data, + ); + + let mut line = self + .formatter + .start_line(self.prefix, Some(root_total_elapsed)); + // The prefix is used for "Caused by" lines below. Add + // the requisite amount of spacing here. + let mut caused_by_prefix = line.clone(); + swrite!(caused_by_prefix, "{:>HEADER_WIDTH$} ", ""); + nest_data.add_prefix(&mut caused_by_prefix); + + swrite!( + line, + "{:>HEADER_WIDTH$} ", + "Failed".style(self.formatter.styles.error_style) + ); + + self.formatter.add_step_info(&mut line, ld_step_info); + line.push_str(": "); + + self.formatter.add_failure_info( + &mut line, + &caused_by_prefix, + *attempt_elapsed, + *total_attempts, + message, + causes, + ); + + out.add_line(line); + } + StepEventKind::ExecutionAborted { + aborted_step, + attempt, + attempt_elapsed, + message, + .. 
+ } => { + let ld_step_info = LineDisplayStepInfo::new( + buffer, + step_event.execution_id, + &aborted_step.info, + &nest_data, + ); + + let mut line = self + .formatter + .start_line(self.prefix, Some(root_total_elapsed)); + + swrite!( + line, + "{:>HEADER_WIDTH$} ", + "Aborted".style(self.formatter.styles.error_style) + ); + self.formatter.add_step_info(&mut line, ld_step_info); + line.push_str(": "); + + self.formatter.add_abort_info( + &mut line, + *attempt_elapsed, + *attempt, + message, + ); + + out.add_line(line); + } + StepEventKind::Nested { step, event, .. } => { + // Look up the child event's ID to add to the nest data. + let child_step_key = StepKey { + execution_id: event.execution_id, + // XXX: we currently look up index 0 because that should + // always exist (unless no steps are defined, in which case + // we skip this). The child index is actually shared by all + // steps within an execution. Fix this by changing + // EventBuffer to also track general per-execution data. + index: 0, + }; + let Some(child_step_data) = buffer.get(&child_step_key) else { + // This should only happen if no steps are defined. See TODO + // above. + return; + }; + let (_, child_index) = child_step_data + .parent_key_and_child_index() + .expect("child steps should have a child index"); + + nest_data.add_nest_level(step.info.index, child_index); + + self.format_step_event_impl( + buffer, + &**event, + nest_data, + root_total_elapsed, + out, + ); + } + StepEventKind::Unknown => {} + } + } + + fn format_step_running( + &self, + line: &mut String, + ld_step_info: LineDisplayStepInfo<'_, S>, + ) { + swrite!( + line, + "{:>HEADER_WIDTH$} ", + "Running".style(self.formatter.styles.progress_style), + ); + self.formatter.add_step_info(line, ld_step_info); + } + + /// Formats this terminal information. + /// + /// This line does not have a trailing newline; adding one is the caller's + /// responsibility. + pub(super) fn format_terminal_info( + &self, + info: &ExecutionTerminalInfo, + ) -> String { + let mut line = + self.formatter.start_line(self.prefix, info.leaf_total_elapsed); + match info.kind { + TerminalKind::Completed => { + swrite!( + line, + "{:>HEADER_WIDTH$} Execution {}", + "Terminal".style(self.formatter.styles.progress_style), + "completed".style(self.formatter.styles.progress_style), + ); + } + TerminalKind::Failed => { + swrite!( + line, + "{:>HEADER_WIDTH$} Execution {}", + "Terminal".style(self.formatter.styles.error_style), + "failed".style(self.formatter.styles.error_style), + ); + } + TerminalKind::Aborted => { + swrite!( + line, + "{:>HEADER_WIDTH$} Execution {}", + "Terminal".style(self.formatter.styles.error_style), + "aborted".style(self.formatter.styles.error_style), + ); + } + } + line + } + + fn format_progress_event( + &self, + buffer: &EventBuffer, + progress_event: &ProgressEvent, + out: &mut LineDisplayOutput, + ) { + self.format_progress_event_impl( + buffer, + progress_event, + NestData::default(), + progress_event.total_elapsed, + out, + ) + } + + fn format_progress_event_impl( + &self, + buffer: &EventBuffer, + progress_event: &ProgressEvent, + mut nest_data: NestData, + root_total_elapsed: Duration, + out: &mut LineDisplayOutput, + ) { + match &progress_event.kind { + ProgressEventKind::WaitingForProgress { .. } => { + // Don't need to show this because "Running" is shown within + // step events. + } + ProgressEventKind::Progress { + step, + progress, + attempt_elapsed, + .. 
+ } => { + let step_key = StepKey { + execution_id: progress_event.execution_id, + index: step.info.index, + }; + let step_data = + buffer.get(&step_key).expect("step key must exist"); + let ld_step_info = LineDisplayStepInfo { + step_info: &step.info, + total_steps: step_data.total_steps(), + nest_data: &nest_data, + }; + + let mut line = self + .formatter + .start_line(self.prefix, Some(root_total_elapsed)); + + let (before, after) = match progress { + Some(counter) => { + let progress_str = format_progress_counter(counter); + ( + format!( + "{:>HEADER_WIDTH$} ", + "Progress".style( + self.formatter.styles.progress_style + ) + ), + format!( + "{progress_str} after {:.2?}", + attempt_elapsed + .style(self.formatter.styles.meta_style), + ), + ) + } + None => { + let before = format!( + "{:>HEADER_WIDTH$} ", + "Running" + .style(self.formatter.styles.progress_style), + ); + + // If the attempt elapsed is non-zero, show it. + let after = if *attempt_elapsed > Duration::ZERO { + format!( + "after {:.2?}", + attempt_elapsed + .style(self.formatter.styles.meta_style), + ) + } else { + String::new() + }; + + (before, after) + } + }; + + swrite!(line, "{}", before); + self.formatter.add_step_info(&mut line, ld_step_info); + if !after.is_empty() { + swrite!(line, ": {}", after); + } + + out.add_line(line); + } + ProgressEventKind::Nested { step, event, .. } => { + // Look up the child event's ID to add to the nest data. + let child_step_key = StepKey { + execution_id: event.execution_id, + // XXX: we currently look up index 0 because that should + // always exist (unless no steps are defined, in which case + // we skip this). The child index is actually shared by all + // steps within an execution. Fix this by changing + // EventBuffer to also track general per-execution data. + index: 0, + }; + let Some(child_step_data) = buffer.get(&child_step_key) else { + // This should only happen if no steps are defined. See TODO + // above. + return; + }; + let (_, child_index) = child_step_data + .parent_key_and_child_index() + .expect("child steps should have a child index"); + + nest_data.add_nest_level(step.info.index, child_index); + + self.format_progress_event_impl( + buffer, + &**event, + nest_data, + root_total_elapsed, + out, + ); + } + ProgressEventKind::Unknown => {} + } + } +} + +fn format_progress_counter(counter: &ProgressCounter) -> String { + match counter.total { + Some(total) => { + // Show a percentage value. Correct alignment requires converting to + // a string in the middle like this. + let percent = (counter.current as f64 / total as f64) * 100.0; + // <12.34> is 5 characters wide. + let percent_width = 5; + let counter_width = total.to_string().len(); + format!( + "{:>percent_width$.2}% ({:>counter_width$}/{} {})", + percent, counter.current, total, counter.units, + ) + } + None => format!("{} {}", counter.current, counter.units), + } +} + +/// State that tracks line display formatting. +/// +/// Each `LineDisplay` and `GroupDisplay` has one of these. 
+#[derive(Debug)] +pub(super) struct LineDisplayFormatter { + styles: LineDisplayStyles, + progress_interval: Duration, +} + +impl LineDisplayFormatter { + pub(super) fn new() -> Self { + Self { + styles: LineDisplayStyles::default(), + progress_interval: Duration::from_secs(1), + } + } + + #[inline] + pub(super) fn styles(&self) -> &LineDisplayStyles { + &self.styles + } + + #[inline] + pub(super) fn set_styles(&mut self, styles: LineDisplayStyles) { + self.styles = styles; + } + + #[inline] + pub(super) fn set_progress_interval(&mut self, interval: Duration) { + self.progress_interval = interval; + } + + // --- + // Internal helpers + // --- + + pub(super) fn start_line( + &self, + prefix: &str, + total_elapsed: Option, + ) -> String { + let mut line = format!("[{}", prefix.style(self.styles.prefix_style)); + + if !prefix.is_empty() { + line.push(' '); + } + + // Show total elapsed time in an hh:mm:ss format. + if let Some(total_elapsed) = total_elapsed { + let total_elapsed_secs = total_elapsed.as_secs(); + let hours = total_elapsed_secs / 3600; + let minutes = (total_elapsed_secs % 3600) / 60; + let seconds = total_elapsed_secs % 60; + swrite!(line, "{:02}:{:02}:{:02}", hours, minutes, seconds); + // To show total_elapsed more accurately, use: + // swrite!(line, "{:.2?}", total_elapsed); + } else { + // Add 8 spaces to align with hh:mm:ss. + line.push_str(" "); + } + + line.push_str("] "); + + line + } + + fn add_step_info( + &self, + line: &mut String, + ld_step_info: LineDisplayStepInfo<'_, S>, + ) { + ld_step_info.nest_data.add_prefix(line); + + // Print out "/)". Leave space such that we + // print out e.g. "1/8)" and " 3/14)". + // Add 1 to the index to make it 1-based. + let step_index = ld_step_info.step_info.index + 1; + let step_index_width = ld_step_info.total_steps.to_string().len(); + swrite!( + line, + "{:width$}/{:width$}) ", + step_index, + ld_step_info.total_steps, + width = step_index_width + ); + + swrite!( + line, + "{}", + ld_step_info + .step_info + .description + .style(self.styles.step_name_style) + ); + } + + pub(super) fn add_completion_and_step_info( + &self, + line: &mut String, + ld_step_info: LineDisplayStepInfo<'_, S>, + attempt_elapsed: Duration, + attempt: usize, + outcome: &StepOutcome, + ) { + let mut meta = format!( + "after {:.2?}", + attempt_elapsed.style(self.styles.meta_style) + ); + if attempt > 1 { + swrite!( + meta, + " (at attempt {})", + attempt.style(self.styles.meta_style) + ); + } + + match &outcome { + StepOutcome::Success { message, .. } => { + swrite!( + line, + "{:>HEADER_WIDTH$} ", + "Completed".style(self.styles.progress_style), + ); + self.add_step_info(line, ld_step_info); + match message { + Some(message) => { + swrite!( + line, + ": {meta} with message: {}", + message.style(self.styles.progress_message_style) + ); + } + None => { + swrite!(line, ": {meta}"); + } + } + } + StepOutcome::Warning { message, .. } => { + swrite!( + line, + "{:>HEADER_WIDTH$} ", + "Completed".style(self.styles.warning_style), + ); + self.add_step_info(line, ld_step_info); + swrite!( + line, + ": {meta} with warning: {}", + message.style(self.styles.warning_message_style) + ); + } + StepOutcome::Skipped { message, .. 
} => { + swrite!( + line, + "{:>HEADER_WIDTH$} ", + "Skipped".style(self.styles.skipped_style), + ); + self.add_step_info(line, ld_step_info); + swrite!( + line, + ": {}", + message.style(self.styles.warning_message_style) + ); + } + }; + } + + pub(super) fn add_failure_info( + &self, + line: &mut String, + line_prefix: &str, + attempt_elapsed: Duration, + total_attempts: usize, + message: &str, + causes: &[String], + ) { + let mut meta = format!( + "after {:.2?}", + attempt_elapsed.style(self.styles.meta_style) + ); + if total_attempts > 1 { + swrite!( + meta, + " (after {} attempts)", + total_attempts.style(self.styles.meta_style) + ); + } + + swrite!( + line, + "{meta}: {}", + message.style(self.styles.error_message_style) + ); + if !causes.is_empty() { + swrite!( + line, + "\n{line_prefix}{}", + " Caused by:".style(self.styles.meta_style) + ); + for cause in causes { + swrite!(line, "\n{line_prefix} - {}", cause); + } + } + + // The last newline is added by the caller. + } + + pub(super) fn add_abort_info( + &self, + line: &mut String, + attempt_elapsed: Duration, + attempt: usize, + message: &str, + ) { + let mut meta = format!( + "after {:.2?}", + attempt_elapsed.style(self.styles.meta_style) + ); + if attempt > 1 { + swrite!( + meta, + " (at attempt {})", + attempt.style(self.styles.meta_style) + ); + } + + swrite!(line, "{meta} with message \"{}\"", message); + } +} + +#[derive(Clone, Debug)] +pub(super) struct LineDisplayOutput { + lines: Vec, +} + +impl LineDisplayOutput { + pub(super) fn new() -> Self { + Self { lines: Vec::new() } + } + + pub(super) fn add_line(&mut self, line: String) { + self.lines.push(line); + } + + pub(super) fn iter(&self) -> impl Iterator { + self.lines.iter().map(|line| line.as_str()) + } +} + +#[derive(Clone, Copy, Debug)] +pub(super) struct LineDisplayStepInfo<'a, S: StepSpec> { + pub(super) step_info: &'a StepInfo, + pub(super) total_steps: usize, + pub(super) nest_data: &'a NestData, +} + +impl<'a, S: StepSpec> LineDisplayStepInfo<'a, S> { + fn new( + buffer: &'a EventBuffer, + execution_id: ExecutionId, + step_info: &'a StepInfo, + nest_data: &'a NestData, + ) -> Self { + let step_key = StepKey { execution_id, index: step_info.index }; + let step_data = buffer.get(&step_key).expect("step key must exist"); + LineDisplayStepInfo { + step_info, + total_steps: step_data.total_steps(), + nest_data, + } + } +} + +/// Per-step stateful data tracked by the line displayer. +#[derive(Debug, Default)] +struct ExecutionData { + /// The last seen root event index. + /// + /// This is used to avoid displaying the same event twice. + last_seen: Option, + + /// The last `root_total_elapsed` at which a progress event was displayed for + /// this execution. + last_progress_event_at: Option, +} + +#[derive(Clone, Debug, Default)] +pub(super) struct NestData { + nest_indexes: Vec, +} + +impl NestData { + fn add_nest_level(&mut self, parent_step_index: usize, child_index: usize) { + self.nest_indexes.push(NestIndex { parent_step_index, child_index }); + } + + fn add_prefix(&self, line: &mut String) { + if !self.nest_indexes.is_empty() { + line.push_str(&"..".repeat(self.nest_indexes.len())); + line.push_str(" "); + } + + for nest_index in &self.nest_indexes { + swrite!( + line, + "{}{} ", + // Add 1 to the index to make it 1-based. 
+ nest_index.parent_step_index + 1, + AsLetters(nest_index.child_index) + ); + } + } +} + +#[derive(Clone, Debug)] +struct NestIndex { + parent_step_index: usize, + // If a parent has multiple nested executions, this counts which execution + // this is, up from 0. + child_index: usize, +} + +/// A display impl that converts a 0-based index into a letter or a series of +/// letters. +/// +/// This is effectively a conversion to base 26. +struct AsLetters(usize); + +impl fmt::Display for AsLetters { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut index = self.0; + loop { + let letter = (b'a' + (index % 26) as u8) as char; + f.write_char(letter)?; + index /= 26; + if index == 0 { + break; + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_format_progress_counter() { + let tests = vec![ + (ProgressCounter::new(5, 20, "units"), "25.00% ( 5/20 units)"), + (ProgressCounter::new(0, 20, "bytes"), " 0.00% ( 0/20 bytes)"), + (ProgressCounter::new(20, 20, "cubes"), "100.00% (20/20 cubes)"), + // NaN is a weird case that is a buggy update engine impl in practice + (ProgressCounter::new(0, 0, "units"), " NaN% (0/0 units)"), + (ProgressCounter::current(5, "units"), "5 units"), + ]; + for (input, output) in tests { + assert_eq!( + format_progress_counter(&input), + output, + "format matches for input: {:?}", + input + ); + } + } +} diff --git a/update-engine/src/display/mod.rs b/update-engine/src/display/mod.rs new file mode 100644 index 0000000000..c58a4535a0 --- /dev/null +++ b/update-engine/src/display/mod.rs @@ -0,0 +1,21 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2023 Oxide Computer Company + +//! Displayers for the update engine. +//! +//! Currently implemented are: +//! +//! * [`LineDisplay`]: a line-oriented display suitable for the command line. +//! * [`GroupDisplay`]: manages state and shows the results of several +//! [`LineDisplay`]s at once. + +mod group_display; +mod line_display; +mod line_display_shared; + +pub use group_display::GroupDisplay; +pub use line_display::{LineDisplay, LineDisplayStyles}; +use line_display_shared::*; diff --git a/update-engine/src/errors.rs b/update-engine/src/errors.rs index abb0d4cd22..0607ad6e27 100644 --- a/update-engine/src/errors.rs +++ b/update-engine/src/errors.rs @@ -185,3 +185,17 @@ pub enum ConvertGenericPathElement { Path(&'static str), ArrayIndex(&'static str, usize), } + +/// The +/// [`GroupDisplay::add_event_report`](crate::display::GroupDisplay::add_event_report) +/// method was called with an unknown key. +#[derive(Clone, Debug)] +pub struct UnknownReportKey {} + +impl fmt::Display for UnknownReportKey { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("unknown report key") + } +} + +impl error::Error for UnknownReportKey {} diff --git a/update-engine/src/events.rs b/update-engine/src/events.rs index 3816157d0d..900a9776f5 100644 --- a/update-engine/src/events.rs +++ b/update-engine/src/events.rs @@ -1143,6 +1143,8 @@ impl ProgressEventKind { /// Returns `step_elapsed` for the leaf event, recursing into nested events /// as necessary. + /// + /// Returns None for unknown events. pub fn leaf_step_elapsed(&self) -> Option { match self { ProgressEventKind::WaitingForProgress { step_elapsed, .. 
} @@ -1156,6 +1158,25 @@ impl ProgressEventKind { } } + /// Returns `attempt_elapsed` for the leaf event, recursing into nested + /// events as necessary. + /// + /// Returns None for unknown events. + pub fn leaf_attempt_elapsed(&self) -> Option { + match self { + ProgressEventKind::WaitingForProgress { + attempt_elapsed, .. + } + | ProgressEventKind::Progress { attempt_elapsed, .. } => { + Some(*attempt_elapsed) + } + ProgressEventKind::Nested { event, .. } => { + event.kind.leaf_attempt_elapsed() + } + ProgressEventKind::Unknown => None, + } + } + /// Converts a generic version into self. /// /// This version can be used to convert a generic type into a more concrete diff --git a/update-engine/src/lib.rs b/update-engine/src/lib.rs index f753fa738a..fea92d3b73 100644 --- a/update-engine/src/lib.rs +++ b/update-engine/src/lib.rs @@ -57,6 +57,7 @@ mod buffer; mod context; +pub mod display; mod engine; pub mod errors; pub mod events; diff --git a/wicket/Cargo.toml b/wicket/Cargo.toml index 5392e72e9f..11f476d98c 100644 --- a/wicket/Cargo.toml +++ b/wicket/Cargo.toml @@ -13,6 +13,7 @@ camino.workspace = true ciborium.workspace = true clap.workspace = true crossterm.workspace = true +debug-ignore.workspace = true futures.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true @@ -33,6 +34,7 @@ slog.workspace = true slog-async.workspace = true slog-envlogger.workspace = true slog-term.workspace = true +supports-color.workspace = true textwrap.workspace = true tokio = { workspace = true, features = ["full"] } tokio-util.workspace = true @@ -40,6 +42,7 @@ toml.workspace = true toml_edit.workspace = true tui-tree-widget = "0.13.0" unicode-width.workspace = true +uuid.workspace = true zeroize.workspace = true omicron-passwords.workspace = true diff --git a/wicket/src/cli/command.rs b/wicket/src/cli/command.rs index 34f041b203..928e54306a 100644 --- a/wicket/src/cli/command.rs +++ b/wicket/src/cli/command.rs @@ -6,32 +6,64 @@ use std::net::SocketAddrV6; -use anyhow::{Context, Result}; -use clap::Parser; +use anyhow::Result; +use clap::{Args, ColorChoice, Parser, Subcommand}; use super::{ - preflight::PreflightArgs, rack_setup::SetupArgs, upload::UploadArgs, + preflight::PreflightArgs, rack_setup::SetupArgs, + rack_update::RackUpdateArgs, upload::UploadArgs, }; -pub fn exec( - log: slog::Logger, - args: &str, - wicketd_addr: SocketAddrV6, -) -> Result<()> { - // The argument is in a quoted form, so split it using Unix shell semantics. - let args = shell_words::split(&args).with_context(|| { - format!("could not parse shell arguments from input {args}") - })?; - - // parse_from uses the the first argument as the command name. Insert "wicket" as - // the command name. - let args = ShellCommand::parse_from( - std::iter::once("wicket".to_owned()).chain(args), - ); - match args { - ShellCommand::UploadRepo(args) => args.exec(log, wicketd_addr), - ShellCommand::Setup(args) => args.exec(log, wicketd_addr), - ShellCommand::Preflight(args) => args.exec(log, wicketd_addr), +/// An app that represents wicket started with arguments over ssh. +#[derive(Debug, Parser)] +pub(crate) struct ShellApp { + /// Global options. + #[clap(flatten)] + pub(crate) global_opts: GlobalOpts, + + /// The command to run. 
+ #[clap(subcommand)] + command: ShellCommand, +} + +impl ShellApp { + pub(crate) fn exec( + self, + log: slog::Logger, + wicketd_addr: SocketAddrV6, + ) -> Result<()> { + match self.command { + ShellCommand::UploadRepo(args) => args.exec(log, wicketd_addr), + ShellCommand::RackUpdate(args) => { + args.exec(log, wicketd_addr, self.global_opts) + } + ShellCommand::Setup(args) => args.exec(log, wicketd_addr), + ShellCommand::Preflight(args) => args.exec(log, wicketd_addr), + } + } +} + +#[derive(Debug, Args)] +#[clap(next_help_heading = "Global options")] +pub(crate) struct GlobalOpts { + /// Color output + /// + /// This may not be obeyed everywhere at the moment. + #[clap(long, value_enum, global = true, default_value_t)] + pub(crate) color: ColorChoice, +} + +impl GlobalOpts { + /// Returns true if color should be used on standard error. + pub(crate) fn use_color(&self) -> bool { + match self.color { + ColorChoice::Auto => { + supports_color::on_cached(supports_color::Stream::Stderr) + .is_some() + } + ColorChoice::Always => true, + ColorChoice::Never => false, + } } } @@ -41,14 +73,20 @@ pub fn exec( /// ForceCommand. If no arguments are specified, wicket behaves like a TUI. /// However, if arguments are specified via SSH_ORIGINAL_COMMAND, wicketd /// accepts an upload command. -#[derive(Debug, Parser)] +#[derive(Debug, Subcommand)] enum ShellCommand { /// Upload a TUF repository to wicketd. #[command(visible_alias = "upload")] UploadRepo(UploadArgs), + + /// Perform a rack update. + #[command(subcommand)] + RackUpdate(RackUpdateArgs), + /// Interact with rack setup configuration. #[command(subcommand)] Setup(SetupArgs), + /// Run checks prior to setting up the rack. #[command(subcommand)] Preflight(PreflightArgs), diff --git a/wicket/src/cli/mod.rs b/wicket/src/cli/mod.rs index 7e8f6540ea..e844e3d247 100644 --- a/wicket/src/cli/mod.rs +++ b/wicket/src/cli/mod.rs @@ -13,6 +13,7 @@ mod command; mod preflight; mod rack_setup; +mod rack_update; mod upload; -pub use command::exec; +pub(super) use command::{GlobalOpts, ShellApp}; diff --git a/wicket/src/cli/rack_update.rs b/wicket/src/cli/rack_update.rs new file mode 100644 index 0000000000..7e0d1a9b31 --- /dev/null +++ b/wicket/src/cli/rack_update.rs @@ -0,0 +1,313 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Command-line driven rack update. +//! +//! This is an alternative to using the Wicket UI to perform a rack update. + +use std::{ + collections::{BTreeMap, BTreeSet}, + net::SocketAddrV6, + time::Duration, +}; + +use anyhow::{anyhow, bail, Context, Result}; +use clap::{Args, Subcommand}; +use slog::Logger; +use tokio::{sync::watch, task::JoinHandle}; +use update_engine::display::{GroupDisplay, LineDisplayStyles}; +use wicket_common::update_events::EventReport; +use wicketd_client::types::StartUpdateParams; + +use crate::{ + cli::GlobalOpts, + state::{parse_event_report_map, ComponentId, CreateStartUpdateOptions}, + wicketd::{create_wicketd_client, WICKETD_TIMEOUT}, +}; + +#[derive(Debug, Subcommand)] +pub(crate) enum RackUpdateArgs { + /// Start one or more updates. + Start(StartRackUpdateArgs), + /// Attach to one or more running updates. 
+ Attach(AttachArgs), +} + +impl RackUpdateArgs { + pub(crate) fn exec( + self, + log: Logger, + wicketd_addr: SocketAddrV6, + global_opts: GlobalOpts, + ) -> Result<()> { + let runtime = + tokio::runtime::Runtime::new().context("creating tokio runtime")?; + runtime.block_on(self.exec_impl(log, wicketd_addr, global_opts)) + } + + async fn exec_impl( + self, + log: Logger, + wicketd_addr: SocketAddrV6, + global_opts: GlobalOpts, + ) -> Result<()> { + match self { + RackUpdateArgs::Start(args) => { + args.exec(log, wicketd_addr, global_opts).await + } + RackUpdateArgs::Attach(args) => { + args.exec(log, wicketd_addr, global_opts).await + } + } + } +} + +#[derive(Debug, Args)] +pub(crate) struct StartRackUpdateArgs { + #[clap(flatten)] + component_ids: ComponentIdSelector, + + /// Force update the RoT even if the version is the same. + #[clap(long, help_heading = "Update options")] + force_update_rot: bool, + + /// Force update the SP even if the version is the same. + #[clap(long, help_heading = "Update options")] + force_update_sp: bool, + + /// Detach after starting the update. + /// + /// The `attach` command can be used to reattach to the running update. + #[clap(short, long, help_heading = "Update options")] + detach: bool, +} + +impl StartRackUpdateArgs { + async fn exec( + self, + log: Logger, + wicketd_addr: SocketAddrV6, + global_opts: GlobalOpts, + ) -> Result<()> { + let client = create_wicketd_client(&log, wicketd_addr, WICKETD_TIMEOUT); + + let update_ids = self.component_ids.to_component_ids()?; + let options = CreateStartUpdateOptions { + force_update_rot: self.force_update_rot, + force_update_sp: self.force_update_sp, + } + .to_start_update_options()?; + + let num_update_ids = update_ids.len(); + + let params = StartUpdateParams { + targets: update_ids.iter().copied().map(Into::into).collect(), + options, + }; + + slog::debug!(log, "Sending post_start_update"; "num_update_ids" => num_update_ids); + match client.post_start_update(¶ms).await { + Ok(_) => { + slog::info!(log, "Update started for {num_update_ids} targets"); + } + Err(error) => { + // Error responses can be printed out more clearly. + if let wicketd_client::Error::ErrorResponse(rv) = &error { + slog::error!( + log, + "Error response from wicketd: {}", + rv.message + ); + bail!("Received error from wicketd while starting update"); + } else { + bail!(error); + } + } + } + + if self.detach { + return Ok(()); + } + + // Now, attach to the update by printing out update logs. 
+ do_attach_to_updates(log, client, update_ids, global_opts).await?; + + Ok(()) + } +} + +#[derive(Debug, Args)] +pub(crate) struct AttachArgs { + #[clap(flatten)] + component_ids: ComponentIdSelector, +} + +impl AttachArgs { + async fn exec( + self, + log: Logger, + wicketd_addr: SocketAddrV6, + global_opts: GlobalOpts, + ) -> Result<()> { + let client = create_wicketd_client(&log, wicketd_addr, WICKETD_TIMEOUT); + + let update_ids = self.component_ids.to_component_ids()?; + do_attach_to_updates(log, client, update_ids, global_opts).await + } +} + +async fn do_attach_to_updates( + log: Logger, + client: wicketd_client::Client, + update_ids: BTreeSet, + global_opts: GlobalOpts, +) -> Result<()> { + let mut display = GroupDisplay::new_with_display( + update_ids.iter().copied(), + std::io::stderr(), + ); + if global_opts.use_color() { + display.set_styles(LineDisplayStyles::colorized()); + } + + let (mut rx, handle) = start_fetch_reports_task(&log, client.clone()).await; + let mut status_timer = tokio::time::interval(Duration::from_secs(5)); + status_timer.tick().await; + + while !display.stats().is_terminal() { + tokio::select! { + res = rx.changed() => { + if res.is_err() { + // The sending end is closed, which means that the task + // created by start_fetch_reports_task died... this can + // happen either due to a panic or due to an error. + match handle.await { + Ok(Ok(())) => { + // The task exited normally, which means that the + // sending end was closed normally. This cannot + // happen. + bail!("fetch_reports task exited with Ok(()) \ + -- this should never happen here"); + } + Ok(Err(error)) => { + // The task exited with an error. + return Err(error).context("fetch_reports task errored out"); + } + Err(error) => { + // The task panicked. + return Err(anyhow!(error)).context("fetch_reports task panicked"); + } + } + } + + let event_reports = rx.borrow_and_update(); + // TODO: parallelize this computation? + for (id, event_report) in &*event_reports { + // If display.add_event_report errors out, it's for a report for a + // component we weren't interested in. Ignore it. + _ = display.add_event_report(&id, event_report.clone()); + } + + // Print out status for each component ID at the end -- do it here so + // that we also consider components for which we haven't seen status + // yet. + display.write_events()?; + } + _ = status_timer.tick() => { + display.write_stats("Status")?; + } + } + } + + // Show any remaining events. + display.write_events()?; + // And also show a summary. + display.write_stats("Summary")?; + + std::mem::drop(rx); + handle + .await + .context("fetch_reports task panicked after rx dropped")? + .context("fetch_reports task errored out after rx dropped")?; + + if display.stats().has_failures() { + bail!("one or more failures occurred"); + } + + Ok(()) +} + +async fn start_fetch_reports_task( + log: &Logger, + client: wicketd_client::Client, +) -> (watch::Receiver>, JoinHandle>) +{ + // Since reports are always cumulative, we can use a watch receiver here + // rather than an mpsc receiver. If we start using incremental reports at + // some point this would need to be changed to be an mpsc receiver. 
+ let (tx, rx) = watch::channel(BTreeMap::new()); + let log = log.new(slog::o!("task" => "fetch_reports")); + + let handle = tokio::spawn(async move { + loop { + let response = client.get_artifacts_and_event_reports().await?; + let reports = response.into_inner().event_reports; + let reports = parse_event_report_map(&log, reports); + if tx.send(reports).is_err() { + // The receiving end is closed, exit. + break; + } + tokio::select! { + _ = tokio::time::sleep(Duration::from_secs(1)) => {}, + _ = tx.closed() => { + // The receiving end is closed, exit. + break; + } + } + } + + Ok(()) + }); + (rx, handle) +} + +/// Command-line arguments for selecting component IDs. +#[derive(Debug, Args)] +#[clap(next_help_heading = "Component selectors")] +struct ComponentIdSelector { + /// The sleds to operate on. + #[clap(long, value_delimiter = ',')] + sled: Vec, + + /// The switches to operate on. + #[clap(long, value_delimiter = ',')] + switch: Vec, + + /// The PSCs to operate on. + #[clap(long, value_delimiter = ',')] + psc: Vec, +} + +impl ComponentIdSelector { + /// Validates that all the sleds, switches, and PSCs are reasonable (though + /// they might not exist on the actual hardware), then return the set of + /// selected component IDs. + fn to_component_ids(&self) -> Result> { + let mut component_ids = BTreeSet::new(); + for sled in &self.sled { + component_ids.insert(ComponentId::new_sled(*sled)?); + } + for switch in &self.switch { + component_ids.insert(ComponentId::new_switch(*switch)?); + } + for psc in &self.psc { + component_ids.insert(ComponentId::new_psc(*psc)?); + } + if component_ids.is_empty() { + bail!("at least one component ID must be selected via --sled, --switch or --psc"); + } + + Ok(component_ids) + } +} diff --git a/wicket/src/dispatch.rs b/wicket/src/dispatch.rs index 3ef04ee302..39fad0dab6 100644 --- a/wicket/src/dispatch.rs +++ b/wicket/src/dispatch.rs @@ -8,10 +8,11 @@ use std::net::{Ipv6Addr, SocketAddrV6}; use anyhow::{bail, Context, Result}; use camino::{Utf8Path, Utf8PathBuf}; +use clap::Parser; use omicron_common::{address::WICKETD_PORT, FileKv}; use slog::Drain; -use crate::Runner; +use crate::{cli::ShellApp, Runner}; pub fn exec() -> Result<()> { let wicketd_addr = @@ -19,8 +20,21 @@ pub fn exec() -> Result<()> { // SSH_ORIGINAL_COMMAND contains additional arguments, if any. if let Ok(ssh_args) = std::env::var("SSH_ORIGINAL_COMMAND") { - let log = setup_log(&log_path()?, WithStderr::Yes)?; - crate::cli::exec(log, &ssh_args, wicketd_addr) + // The argument is in a quoted form, so split it using Unix shell semantics. + let args = shell_words::split(&ssh_args).with_context(|| { + format!("could not parse shell arguments from input {ssh_args}") + })?; + // parse_from uses the the first argument as the command name. Insert "wicket" as + // the command name. + let app = ShellApp::parse_from( + std::iter::once("wicket".to_owned()).chain(args), + ); + + let log = setup_log( + &log_path()?, + WithStderr::Yes { use_color: app.global_opts.use_color() }, + )?; + app.exec(log, wicketd_addr) } else { // Do not expose log messages via standard error since they'll show up // on top of the TUI. 
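A note on the channel choice in `start_fetch_reports_task` above: because event reports are cumulative, a `tokio::sync::watch` channel suffices, since the consumer only ever needs the latest snapshot. Below is a minimal, self-contained sketch of that polling pattern using only plain tokio types; `Snapshot` and `start_poller` are illustrative stand-ins, not the wicket API.

```rust
use std::time::Duration;
use tokio::{sync::watch, task::JoinHandle};

// Hypothetical snapshot type; the real code publishes a map of event reports.
type Snapshot = u64;

fn start_poller() -> (watch::Receiver<Snapshot>, JoinHandle<anyhow::Result<()>>) {
    let (tx, rx) = watch::channel(0);
    let handle = tokio::spawn(async move {
        let mut n = 0;
        loop {
            n += 1; // stand-in for fetching a fresh cumulative report
            if tx.send(n).is_err() {
                break; // all receivers dropped
            }
            tokio::select! {
                _ = tokio::time::sleep(Duration::from_secs(1)) => {}
                _ = tx.closed() => break, // receiver side shut down
            }
        }
        Ok(())
    });
    (rx, handle)
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let (mut rx, handle) = start_poller();
    for _ in 0..3 {
        rx.changed().await?; // wait until a newer snapshot is published
        // borrow_and_update marks the latest value as seen; intermediate
        // snapshots may be skipped, which is fine for cumulative data.
        println!("latest: {}", *rx.borrow_and_update());
    }
    drop(rx); // closes the channel; the poller notices via tx.closed()
    handle.await??;
    Ok(())
}
```

With an mpsc channel every intermediate report would be queued and delivered; watch instead coalesces updates, which is the right semantics for cumulative state, exactly as the comment in the patch notes.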
@@ -44,8 +58,8 @@ fn setup_log( let drain = slog_term::FullFormat::new(decorator).build().fuse(); let drain = match with_stderr { - WithStderr::Yes => { - let stderr_drain = stderr_env_drain("RUST_LOG"); + WithStderr::Yes { use_color } => { + let stderr_drain = stderr_env_drain("RUST_LOG", use_color); let drain = slog::Duplicate::new(drain, stderr_drain).fuse(); slog_async::Async::new(drain).build().fuse() } @@ -57,7 +71,7 @@ fn setup_log( #[derive(Copy, Clone, Debug)] enum WithStderr { - Yes, + Yes { use_color: bool }, No, } @@ -71,8 +85,17 @@ fn log_path() -> Result { } } -fn stderr_env_drain(env_var: &str) -> impl Drain { - let stderr_decorator = slog_term::TermDecorator::new().build(); +fn stderr_env_drain( + env_var: &str, + use_color: bool, +) -> impl Drain { + let mut builder = slog_term::TermDecorator::new(); + if use_color { + builder = builder.force_color(); + } else { + builder = builder.force_plain(); + } + let stderr_decorator = builder.build(); let stderr_drain = slog_term::FullFormat::new(stderr_decorator).build().fuse(); let mut builder = slog_envlogger::LogBuilder::new(stderr_drain); diff --git a/wicket/src/helpers.rs b/wicket/src/helpers.rs new file mode 100644 index 0000000000..564b7e9348 --- /dev/null +++ b/wicket/src/helpers.rs @@ -0,0 +1,70 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Utility functions. + +use std::env::VarError; + +use anyhow::{bail, Context}; +use wicketd_client::types::{UpdateSimulatedResult, UpdateTestError}; + +pub(crate) fn get_update_test_error( + env_var: &str, +) -> Result, anyhow::Error> { + // 30 seconds should always be enough to cause a timeout. (The default + // timeout for progenitor is 15 seconds, and in wicket we set an even + // shorter timeout.) + const DEFAULT_TEST_TIMEOUT_SECS: u64 = 30; + + let test_error = match std::env::var(env_var) { + Ok(v) if v == "fail" => Some(UpdateTestError::Fail), + Ok(v) if v == "timeout" => { + Some(UpdateTestError::Timeout { secs: DEFAULT_TEST_TIMEOUT_SECS }) + } + Ok(v) if v.starts_with("timeout:") => { + // Extended start_timeout syntax with a custom + // number of seconds. 
+            let suffix = v.strip_prefix("timeout:").unwrap();
+            match suffix.parse::<u64>() {
+                Ok(secs) => Some(UpdateTestError::Timeout { secs }),
+                Err(error) => {
+                    return Err(error).with_context(|| {
+                        format!(
+                            "could not parse {env_var} \
+                             in the form `timeout:<secs>`: {v}"
+                        )
+                    });
+                }
+            }
+        }
+        Ok(value) => {
+            bail!("unrecognized value for {env_var}: {value}");
+        }
+        Err(VarError::NotPresent) => None,
+        Err(VarError::NotUnicode(value)) => {
+            bail!("invalid Unicode for {env_var}: {}", value.to_string_lossy());
+        }
+    };
+    Ok(test_error)
+}
+
+pub(crate) fn get_update_simulated_result(
+    env_var: &str,
+) -> Result<Option<UpdateSimulatedResult>, anyhow::Error> {
+    let result = match std::env::var(env_var) {
+        Ok(v) if v == "success" => Some(UpdateSimulatedResult::Success),
+        Ok(v) if v == "warning" => Some(UpdateSimulatedResult::Warning),
+        Ok(v) if v == "skipped" => Some(UpdateSimulatedResult::Skipped),
+        Ok(v) if v == "failure" => Some(UpdateSimulatedResult::Failure),
+        Ok(value) => {
+            bail!("unrecognized value for {env_var}: {value}");
+        }
+        Err(VarError::NotPresent) => None,
+        Err(VarError::NotUnicode(value)) => {
+            bail!("invalid Unicode for {env_var}: {}", value.to_string_lossy());
+        }
+    };
+
+    Ok(result)
+}
diff --git a/wicket/src/lib.rs b/wicket/src/lib.rs
index 6e760968f8..5e09cb91f4 100644
--- a/wicket/src/lib.rs
+++ b/wicket/src/lib.rs
@@ -10,6 +10,7 @@ use std::time::Duration;
 mod cli;
 mod dispatch;
 mod events;
+mod helpers;
 mod keymap;
 mod runner;
 mod state;
diff --git a/wicket/src/runner.rs b/wicket/src/runner.rs
index c37b16d5d9..2d988814dd 100644
--- a/wicket/src/runner.rs
+++ b/wicket/src/runner.rs
@@ -2,8 +2,6 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
-use anyhow::bail;
-use anyhow::Context;
 use crossterm::event::Event as TermEvent;
 use crossterm::event::EventStream;
 use crossterm::event::{KeyCode, KeyEvent, KeyModifiers};
@@ -17,7 +15,6 @@ use ratatui::backend::CrosstermBackend;
 use ratatui::Terminal;
 use slog::Logger;
 use slog::{debug, error, info};
-use std::env::VarError;
 use std::io::{stdout, Stdout};
 use std::net::SocketAddrV6;
 use std::time::Instant;
@@ -27,11 +24,10 @@ use tokio::sync::mpsc::{
 use tokio::time::{interval, Duration};
 use wicketd_client::types::AbortUpdateOptions;
 use wicketd_client::types::ClearUpdateStateOptions;
-use wicketd_client::types::StartUpdateOptions;
-use wicketd_client::types::UpdateSimulatedResult;
-use wicketd_client::types::UpdateTestError;
 
 use crate::events::EventReportMap;
+use crate::helpers::get_update_test_error;
+use crate::state::CreateStartUpdateOptions;
 use crate::ui::Screen;
 use crate::wicketd::{self, WicketdHandle, WicketdManager};
 use crate::{Action, Cmd, Event, KeyHandler, Recorder, State, TICK_INTERVAL};
@@ -180,43 +176,18 @@ impl RunnerCore {
                 }
                 Action::StartUpdate(component_id) => {
                     if let Some(wicketd) = wicketd {
-                        let test_error = get_update_test_error(
-                            "WICKET_TEST_START_UPDATE_ERROR",
-                        )?;
-
-                        // This is a debug environment variable used to
-                        // add a test step.
- let test_step_seconds = - std::env::var("WICKET_UPDATE_TEST_STEP_SECONDS") - .ok() - .map(|v| { - v.parse().expect( - "parsed WICKET_UPDATE_TEST_STEP_SECONDS \ - as a u64", - ) - }); - - let test_simulate_rot_result = get_update_simulated_result( - "WICKET_UPDATE_TEST_SIMULATE_ROT_RESULT", - )?; - let test_simulate_sp_result = get_update_simulated_result( - "WICKET_UPDATE_TEST_SIMULATE_SP_RESULT", - )?; - - let options = StartUpdateOptions { - test_error, - test_step_seconds, - test_simulate_rot_result, - test_simulate_sp_result, - skip_rot_version_check: self + let options = CreateStartUpdateOptions { + force_update_rot: self .state .force_update_state .force_update_rot, - skip_sp_version_check: self + force_update_sp: self .state .force_update_state .force_update_sp, - }; + } + .to_start_update_options()?; + wicketd.tx.blocking_send( wicketd::Request::StartUpdate { component_id, options }, )?; @@ -281,66 +252,6 @@ impl RunnerCore { } } -fn get_update_test_error( - env_var: &str, -) -> Result, anyhow::Error> { - // 30 seconds should always be enough to cause a timeout. (The default - // timeout for progenitor is 15 seconds, and in wicket we set an even - // shorter timeout.) - const DEFAULT_TEST_TIMEOUT_SECS: u64 = 30; - - let test_error = match std::env::var(env_var) { - Ok(v) if v == "fail" => Some(UpdateTestError::Fail), - Ok(v) if v == "timeout" => { - Some(UpdateTestError::Timeout { secs: DEFAULT_TEST_TIMEOUT_SECS }) - } - Ok(v) if v.starts_with("timeout:") => { - // Extended start_timeout syntax with a custom - // number of seconds. - let suffix = v.strip_prefix("timeout:").unwrap(); - match suffix.parse::() { - Ok(secs) => Some(UpdateTestError::Timeout { secs }), - Err(error) => { - return Err(error).with_context(|| { - format!( - "could not parse {env_var} \ - in the form `timeout:`: {v}" - ) - }); - } - } - } - Ok(value) => { - bail!("unrecognized value for {env_var}: {value}"); - } - Err(VarError::NotPresent) => None, - Err(VarError::NotUnicode(value)) => { - bail!("invalid Unicode for {env_var}: {}", value.to_string_lossy()); - } - }; - Ok(test_error) -} - -fn get_update_simulated_result( - env_var: &str, -) -> Result, anyhow::Error> { - let result = match std::env::var(env_var) { - Ok(v) if v == "success" => Some(UpdateSimulatedResult::Success), - Ok(v) if v == "warning" => Some(UpdateSimulatedResult::Warning), - Ok(v) if v == "skipped" => Some(UpdateSimulatedResult::Skipped), - Ok(v) if v == "failure" => Some(UpdateSimulatedResult::Failure), - Ok(value) => { - bail!("unrecognized value for {env_var}: {value}"); - } - Err(VarError::NotPresent) => None, - Err(VarError::NotUnicode(value)) => { - bail!("invalid Unicode for {env_var}: {}", value.to_string_lossy()); - } - }; - - Ok(result) -} - /// The `Runner` owns the main UI thread, and starts a tokio runtime /// for interaction with downstream services. pub struct Runner { diff --git a/wicket/src/state/inventory.rs b/wicket/src/state/inventory.rs index 23a0e244cf..1c232e653b 100644 --- a/wicket/src/state/inventory.rs +++ b/wicket/src/state/inventory.rs @@ -4,7 +4,8 @@ //! 
Information about all top-level Oxide components (sleds, switches, PSCs) -use anyhow::anyhow; +use anyhow::{bail, Result}; +use omicron_common::api::internal::nexus::KnownArtifactKind; use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; @@ -64,26 +65,13 @@ impl Inventory { }; // Validate and get a ComponentId - let (id, component) = match type_ { - SpType::Sled => { - if i > 31 { - return Err(anyhow!("Invalid sled slot: {}", i)); - } - (ComponentId::Sled(i as u8), Component::Sled(sp)) - } - SpType::Switch => { - if i > 1 { - return Err(anyhow!("Invalid switch slot: {}", i)); - } - (ComponentId::Switch(i as u8), Component::Switch(sp)) - } - SpType::Power => { - if i > 1 { - return Err(anyhow!("Invalid power shelf slot: {}", i)); - } - (ComponentId::Psc(i as u8), Component::Psc(sp)) - } + let id = ComponentId::from_sp_type_and_slot(type_, i as u8)?; + let component = match type_ { + SpType::Sled => Component::Sled(sp), + SpType::Switch => Component::Switch(sp), + SpType::Power => Component::Psc(sp), }; + new_inventory.inventory.insert(id, component); // TODO: Plumb through real power state @@ -204,6 +192,66 @@ pub enum ComponentId { } impl ComponentId { + /// The maximum possible sled ID. + pub const MAX_SLED_ID: u8 = 31; + + /// The maximum possible switch ID. + pub const MAX_SWITCH_ID: u8 = 1; + + /// The maximum possible power shelf ID. + /// + /// Currently shipping racks don't have PSC 1. + pub const MAX_PSC_ID: u8 = 0; + + pub fn new_sled(slot: u8) -> Result { + if slot > Self::MAX_SLED_ID { + bail!("Invalid sled slot: {}", slot); + } + Ok(Self::Sled(slot)) + } + + pub fn new_switch(slot: u8) -> Result { + if slot > Self::MAX_SWITCH_ID { + bail!("Invalid switch slot: {}", slot); + } + Ok(Self::Switch(slot)) + } + + pub fn new_psc(slot: u8) -> Result { + if slot > Self::MAX_PSC_ID { + bail!("Invalid power shelf slot: {}", slot); + } + Ok(Self::Psc(slot)) + } + + pub fn from_sp_type_and_slot(sp_type: SpType, slot: u8) -> Result { + match sp_type { + SpType::Sled => Self::new_sled(slot), + SpType::Switch => Self::new_switch(slot), + SpType::Power => Self::new_psc(slot), + } + } + + pub fn name(&self) -> String { + self.to_string() + } + + pub fn sp_known_artifact_kind(&self) -> KnownArtifactKind { + match self { + ComponentId::Sled(_) => KnownArtifactKind::GimletSp, + ComponentId::Switch(_) => KnownArtifactKind::SwitchSp, + ComponentId::Psc(_) => KnownArtifactKind::PscSp, + } + } + + pub fn rot_known_artifact_kind(&self) -> KnownArtifactKind { + match self { + ComponentId::Sled(_) => KnownArtifactKind::GimletRot, + ComponentId::Switch(_) => KnownArtifactKind::SwitchRot, + ComponentId::Psc(_) => KnownArtifactKind::PscRot, + } + } + pub fn to_string_uppercase(&self) -> String { let mut s = self.to_string(); s.make_ascii_uppercase(); diff --git a/wicket/src/state/mod.rs b/wicket/src/state/mod.rs index ac9cbcae2f..246c4c31ae 100644 --- a/wicket/src/state/mod.rs +++ b/wicket/src/state/mod.rs @@ -18,8 +18,8 @@ pub use inventory::{ pub use rack::{KnightRiderMode, RackState}; pub use status::{Liveness, ServiceStatus}; pub use update::{ - update_component_title, RackUpdateState, UpdateItemState, - UpdateRunningState, + parse_event_report_map, update_component_title, CreateStartUpdateOptions, + RackUpdateState, UpdateItemState, UpdateRunningState, }; use serde::{Deserialize, Serialize}; diff --git a/wicket/src/state/update.rs b/wicket/src/state/update.rs index 1a0aafb9cf..2fde662b2f 100644 --- a/wicket/src/state/update.rs +++ b/wicket/src/state/update.rs 
@@ -2,21 +2,23 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use anyhow::Result; use ratatui::style::Style; use wicket_common::update_events::{ EventReport, ProgressEventKind, StepEventKind, UpdateComponent, UpdateStepId, }; +use crate::helpers::{get_update_simulated_result, get_update_test_error}; use crate::{events::EventReportMap, ui::defaults::style}; use super::{ComponentId, ParsableComponentId, ALL_COMPONENT_IDS}; use omicron_common::api::internal::nexus::KnownArtifactKind; use serde::{Deserialize, Serialize}; -use slog::{warn, Logger}; -use std::collections::{BTreeMap, HashSet}; +use slog::Logger; +use std::collections::BTreeMap; use std::fmt::Display; -use wicketd_client::types::{ArtifactId, SemverVersion}; +use wicketd_client::types::{ArtifactId, SemverVersion, StartUpdateOptions}; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct RackUpdateState { @@ -102,34 +104,18 @@ impl RackUpdateState { } } - let mut updated_component_ids = HashSet::new(); - - for (sp_type, logs) in reports { - for (i, log) in logs { - let Ok(id) = ComponentId::try_from(ParsableComponentId { - sp_type: &sp_type, - i: &i, - }) else { - warn!( - logger, - "Invalid ComponentId in EventReport: {} {}", - &sp_type, - &i - ); - continue; - }; - let item_state = self.items.get_mut(&id).unwrap(); - item_state.update(log); - updated_component_ids.insert(id); - } - } - - // Reset all component IDs that weren't updated. + let reports = parse_event_report_map(logger, reports); + // Reset all component IDs that aren't in the event report map. for (id, item) in &mut self.items { - if !updated_component_ids.contains(id) { + if !reports.contains_key(id) { item.reset(); } } + + for (id, report) in reports { + let item_state = self.items.get_mut(&id).unwrap(); + item_state.update(report); + } } } @@ -445,3 +431,68 @@ pub fn update_component_title(component: UpdateComponent) -> &'static str { UpdateComponent::Host => "HOST", } } + +pub struct CreateStartUpdateOptions { + pub(crate) force_update_rot: bool, + pub(crate) force_update_sp: bool, +} + +impl CreateStartUpdateOptions { + pub fn to_start_update_options(&self) -> Result { + let test_error = + get_update_test_error("WICKET_TEST_START_UPDATE_ERROR")?; + + // This is a debug environment variable used to + // add a test step. + let test_step_seconds = + std::env::var("WICKET_UPDATE_TEST_STEP_SECONDS").ok().map(|v| { + v.parse().expect( + "parsed WICKET_UPDATE_TEST_STEP_SECONDS \ + as a u64", + ) + }); + + let test_simulate_rot_result = get_update_simulated_result( + "WICKET_UPDATE_TEST_SIMULATE_ROT_RESULT", + )?; + let test_simulate_sp_result = get_update_simulated_result( + "WICKET_UPDATE_TEST_SIMULATE_SP_RESULT", + )?; + + Ok(StartUpdateOptions { + test_error, + test_step_seconds, + test_simulate_rot_result, + test_simulate_sp_result, + skip_rot_version_check: self.force_update_rot, + skip_sp_version_check: self.force_update_sp, + }) + } +} + +/// Converts an `EventReportMap` to a map by component ID. 
+pub fn parse_event_report_map(
+    log: &Logger,
+    reports: EventReportMap,
+) -> BTreeMap<ComponentId, EventReport> {
+    let mut component_id_map = BTreeMap::new();
+    for (sp_type, logs) in reports {
+        for (i, event_report) in logs {
+            let Ok(id) = ComponentId::try_from(ParsableComponentId {
+                sp_type: &sp_type,
+                i: &i,
+            }) else {
+                slog::warn!(
+                    log,
+                    "Invalid ComponentId in EventReportMap: {} {}",
+                    &sp_type,
+                    &i
+                );
+                continue;
+            };
+            component_id_map.insert(id, event_report);
+        }
+    }
+
+    component_id_map
+}
diff --git a/wicket/src/ui/panes/update.rs b/wicket/src/ui/panes/update.rs
index d76c6c3b49..d14b90dfab 100644
--- a/wicket/src/ui/panes/update.rs
+++ b/wicket/src/ui/panes/update.rs
@@ -1895,7 +1895,7 @@ impl ComponentUpdateListState {
                 ));
                 None
             }
-            ExecutionStatus::Running { step_key } => {
+            ExecutionStatus::Running { step_key, .. } => {
                 status_text
                     .push(Span::styled("Update ", style::plain_text()));
                 status_text.push(Span::styled(
diff --git a/wicket/src/wicketd.rs b/wicket/src/wicketd.rs
index 33c80410d8..98f4d0f72b 100644
--- a/wicket/src/wicketd.rs
+++ b/wicket/src/wicketd.rs
@@ -41,7 +41,7 @@ const WICKETD_POLL_INTERVAL: Duration = Duration::from_millis(500);
 // WICKETD_TIMEOUT used to be 1 second, but that might be too short (and in
 // particular might be responsible for
 // https://github.com/oxidecomputer/omicron/issues/3103).
-const WICKETD_TIMEOUT: Duration = Duration::from_secs(5);
+pub(crate) const WICKETD_TIMEOUT: Duration = Duration::from_secs(5);
 
 // Assume that these requests are periodic on the order of seconds or the
 // result of human interaction. In either case, this buffer should be plenty

From 45468ac052ba97f0cb034d69fe5f3588f70e0e5d Mon Sep 17 00:00:00 2001
From: Laura Abbott
Date: Tue, 7 Nov 2023 20:30:56 -0800
Subject: [PATCH 11/18] Bump RoT to 1.0.4 (#4463)

---
 .github/buildomat/jobs/tuf-repo.sh | 4 ++--
 tools/dvt_dock_version             | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/buildomat/jobs/tuf-repo.sh b/.github/buildomat/jobs/tuf-repo.sh
index 29cf7fa85e..5e7a2d4a91 100644
--- a/.github/buildomat/jobs/tuf-repo.sh
+++ b/.github/buildomat/jobs/tuf-repo.sh
@@ -278,8 +278,8 @@ EOF
 done
 }
 # usage: SERIES ROT_DIR ROT_VERSION BOARDS...
-add_hubris_artifacts rot-staging-dev staging/dev cert-staging-dev-v1.0.2 "${ALL_BOARDS[@]}"
-add_hubris_artifacts rot-prod-rel prod/rel cert-prod-rel-v1.0.2 "${ALL_BOARDS[@]}"
+add_hubris_artifacts rot-staging-dev staging/dev cert-staging-dev-v1.0.4 "${ALL_BOARDS[@]}"
+add_hubris_artifacts rot-prod-rel prod/rel cert-prod-rel-v1.0.4 "${ALL_BOARDS[@]}"
 
 for series in "${SERIES_LIST[@]}"; do
     /work/tufaceous assemble --no-generate-key /work/manifest-"$series".toml /work/repo-"$series".zip
diff --git a/tools/dvt_dock_version b/tools/dvt_dock_version
index b52dded7d0..f7fef543f4 100644
--- a/tools/dvt_dock_version
+++ b/tools/dvt_dock_version
@@ -1 +1 @@
-COMMIT=9cb2b40cea90ad40f5c0d2c3da96d26913253e06
+COMMIT=ad874c11ecd0c45bdc1e4c2ac35c2bcbe472d55f

From 09b7b4dd17fb663ec598d3bfa2c2895afcbd11aa Mon Sep 17 00:00:00 2001
From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com>
Date: Wed, 8 Nov 2023 05:29:35 +0000
Subject: [PATCH 12/18] chore(deps): update taiki-e/install-action digest to
 b4f94d4 (#4464)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR contains the following updates:

| Package | Type | Update | Change |
|---|---|---|---|
| [taiki-e/install-action](https://togithub.com/taiki-e/install-action) | action | digest | [`11dea51` -> `b4f94d4`](https://togithub.com/taiki-e/install-action/compare/11dea51...b4f94d4) |

---

### Configuration

📅 **Schedule**: Branch creation - "after 8pm,before 6am" in timezone America/Los_Angeles, Automerge - "after 8pm,before 6am" in timezone America/Los_Angeles.

🚦 **Automerge**: Enabled.

♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox.

🔕 **Ignore**: Close this PR and you won't be reminded about this update again.

---

 - [ ] If you want to rebase/retry this PR, check this box

---

This PR has been generated by [Renovate Bot](https://togithub.com/renovatebot/renovate).

Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com>
---
 .github/workflows/hakari.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml
index 9d5f7444de..3fda700f3a 100644
--- a/.github/workflows/hakari.yml
+++ b/.github/workflows/hakari.yml
@@ -22,7 +22,7 @@ jobs:
         with:
           toolchain: stable
       - name: Install cargo-hakari
-        uses: taiki-e/install-action@11dea51b35bc2bfa42820716c6cabb14fd4c3266 # v2
+        uses: taiki-e/install-action@b4f94d444931c93a5a6d9c6bd65a585838d0f765 # v2
         with:
           tool: cargo-hakari
       - name: Check workspace-hack Cargo.toml is up-to-date

From 9ff5aa357eebaa9cc57380e4d9caff3247dce134 Mon Sep 17 00:00:00 2001
From: Rain
Date: Tue, 7 Nov 2023 23:45:49 -0800
Subject: [PATCH 13/18] [wicket] make CLI more usable in wicketd integration
 tests (#4450)

Make it so that wicket's CLI is usable in wicketd's integration tests.

To achieve this, make a few changes:

1. Factor out `exec` into `exec_with_args`.
2. Create the Tokio runtime outside of `exec_with_args`, and make
   `exec_with_args` async. This is so that wicketd's integration tests can
   run `exec_with_args` with a preexisting runtime.
3. Allow capturing any output not written to logs. For now this is only
   used by the `rack_update` commands.

Depends on #4062 (mostly because it is more convenient for me that way).
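The `exec_with_args` refactor described above centers on one idiom: `SSH_ORIGINAL_COMMAND` arrives as a single quoted string with no argv[0], so wicket splits it using Unix shell semantics and prepends a program name before handing the words to clap. A rough standalone sketch of that idiom follows, with a hypothetical `App` in place of the real `ShellApp` (dependencies assumed: clap with the derive feature, shell-words, anyhow):

```rust
use clap::Parser;

/// Stand-in for wicket's ShellApp; only the parsing idiom matters here.
#[derive(Debug, Parser)]
#[clap(name = "wicket")]
struct App {
    /// Illustrative flag, standing in for the real global options.
    #[clap(long, default_value = "auto")]
    color: String,
}

fn main() -> anyhow::Result<()> {
    // What sshd might place in SSH_ORIGINAL_COMMAND for:
    //   ssh wicket@host -- --color always
    let ssh_args = "--color always";

    // Split using Unix shell semantics (handles quoting and escapes).
    let words = shell_words::split(ssh_args)?;

    // parse_from treats the first item as argv[0], so insert "wicket".
    let app = App::parse_from(
        std::iter::once("wicket".to_owned()).chain(words),
    );
    println!("{app:?}");
    Ok(())
}
```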
--- wicket/src/cli/command.rs | 19 ++++++-- wicket/src/cli/mod.rs | 2 +- wicket/src/cli/preflight.rs | 13 +---- wicket/src/cli/rack_setup.rs | 13 +---- wicket/src/cli/rack_update.rs | 30 +++++------- wicket/src/cli/upload.rs | 6 +-- wicket/src/dispatch.rs | 92 ++++++++++++++++++++++++++--------- 7 files changed, 102 insertions(+), 73 deletions(-) diff --git a/wicket/src/cli/command.rs b/wicket/src/cli/command.rs index 928e54306a..63d41ed642 100644 --- a/wicket/src/cli/command.rs +++ b/wicket/src/cli/command.rs @@ -14,6 +14,12 @@ use super::{ rack_update::RackUpdateArgs, upload::UploadArgs, }; +pub(crate) struct CommandOutput<'a> { + #[allow(dead_code)] + pub(crate) stdout: &'a mut dyn std::io::Write, + pub(crate) stderr: &'a mut dyn std::io::Write, +} + /// An app that represents wicket started with arguments over ssh. #[derive(Debug, Parser)] pub(crate) struct ShellApp { @@ -27,18 +33,21 @@ pub(crate) struct ShellApp { } impl ShellApp { - pub(crate) fn exec( + pub(crate) async fn exec( self, log: slog::Logger, wicketd_addr: SocketAddrV6, + output: CommandOutput<'_>, ) -> Result<()> { match self.command { - ShellCommand::UploadRepo(args) => args.exec(log, wicketd_addr), + ShellCommand::UploadRepo(args) => { + args.exec(log, wicketd_addr).await + } ShellCommand::RackUpdate(args) => { - args.exec(log, wicketd_addr, self.global_opts) + args.exec(log, wicketd_addr, self.global_opts, output).await } - ShellCommand::Setup(args) => args.exec(log, wicketd_addr), - ShellCommand::Preflight(args) => args.exec(log, wicketd_addr), + ShellCommand::Setup(args) => args.exec(log, wicketd_addr).await, + ShellCommand::Preflight(args) => args.exec(log, wicketd_addr).await, } } } diff --git a/wicket/src/cli/mod.rs b/wicket/src/cli/mod.rs index e844e3d247..e63ef467e7 100644 --- a/wicket/src/cli/mod.rs +++ b/wicket/src/cli/mod.rs @@ -16,4 +16,4 @@ mod rack_setup; mod rack_update; mod upload; -pub(super) use command::{GlobalOpts, ShellApp}; +pub(super) use command::{CommandOutput, GlobalOpts, ShellApp}; diff --git a/wicket/src/cli/preflight.rs b/wicket/src/cli/preflight.rs index ddbbf95c1a..29b6d2a5cb 100644 --- a/wicket/src/cli/preflight.rs +++ b/wicket/src/cli/preflight.rs @@ -47,18 +47,7 @@ pub(crate) enum PreflightArgs { } impl PreflightArgs { - pub(crate) fn exec( - self, - log: Logger, - wicketd_addr: SocketAddrV6, - ) -> Result<()> { - let runtime = - tokio::runtime::Runtime::new().context("creating tokio runtime")?; - - runtime.block_on(self.exec_impl(log, wicketd_addr)) - } - - async fn exec_impl( + pub(crate) async fn exec( self, log: Logger, wicketd_addr: SocketAddrV6, diff --git a/wicket/src/cli/rack_setup.rs b/wicket/src/cli/rack_setup.rs index c678d14e11..8841046813 100644 --- a/wicket/src/cli/rack_setup.rs +++ b/wicket/src/cli/rack_setup.rs @@ -61,18 +61,7 @@ pub(crate) enum SetupArgs { } impl SetupArgs { - pub(crate) fn exec( - self, - log: Logger, - wicketd_addr: SocketAddrV6, - ) -> Result<()> { - let runtime = - tokio::runtime::Runtime::new().context("creating tokio runtime")?; - - runtime.block_on(self.exec_impl(log, wicketd_addr)) - } - - async fn exec_impl( + pub(crate) async fn exec( self, log: Logger, wicketd_addr: SocketAddrV6, diff --git a/wicket/src/cli/rack_update.rs b/wicket/src/cli/rack_update.rs index 7e0d1a9b31..cb4018956a 100644 --- a/wicket/src/cli/rack_update.rs +++ b/wicket/src/cli/rack_update.rs @@ -26,6 +26,8 @@ use crate::{ wicketd::{create_wicketd_client, WICKETD_TIMEOUT}, }; +use super::command::CommandOutput; + #[derive(Debug, Subcommand)] pub(crate) enum RackUpdateArgs { 
/// Start one or more updates.
@@ -35,29 +37,19 @@ pub(crate) enum RackUpdateArgs {
 }
 
 impl RackUpdateArgs {
-    pub(crate) fn exec(
-        self,
-        log: Logger,
-        wicketd_addr: SocketAddrV6,
-        global_opts: GlobalOpts,
-    ) -> Result<()> {
-        let runtime =
-            tokio::runtime::Runtime::new().context("creating tokio runtime")?;
-        runtime.block_on(self.exec_impl(log, wicketd_addr, global_opts))
-    }
-
-    async fn exec_impl(
+    pub(crate) async fn exec(
         self,
         log: Logger,
         wicketd_addr: SocketAddrV6,
         global_opts: GlobalOpts,
+        output: CommandOutput<'_>,
     ) -> Result<()> {
         match self {
             RackUpdateArgs::Start(args) => {
-                args.exec(log, wicketd_addr, global_opts).await
+                args.exec(log, wicketd_addr, global_opts, output).await
             }
             RackUpdateArgs::Attach(args) => {
-                args.exec(log, wicketd_addr, global_opts).await
+                args.exec(log, wicketd_addr, global_opts, output).await
             }
         }
     }
@@ -89,6 +81,7 @@ impl StartRackUpdateArgs {
         log: Logger,
         wicketd_addr: SocketAddrV6,
         global_opts: GlobalOpts,
+        output: CommandOutput<'_>,
     ) -> Result<()> {
         let client = create_wicketd_client(&log, wicketd_addr, WICKETD_TIMEOUT);
 
@@ -131,7 +124,8 @@ impl StartRackUpdateArgs {
         }
 
         // Now, attach to the update by printing out update logs.
-        do_attach_to_updates(log, client, update_ids, global_opts).await?;
+        do_attach_to_updates(log, client, update_ids, global_opts, output)
+            .await?;
 
         Ok(())
     }
@@ -149,11 +143,12 @@ impl AttachArgs {
         log: Logger,
         wicketd_addr: SocketAddrV6,
         global_opts: GlobalOpts,
+        output: CommandOutput<'_>,
     ) -> Result<()> {
         let client = create_wicketd_client(&log, wicketd_addr, WICKETD_TIMEOUT);
         let update_ids = self.component_ids.to_component_ids()?;
-        do_attach_to_updates(log, client, update_ids, global_opts).await
+        do_attach_to_updates(log, client, update_ids, global_opts, output).await
     }
 }
 
@@ -162,10 +157,11 @@ async fn do_attach_to_updates(
     client: wicketd_client::Client,
     update_ids: BTreeSet<ComponentId>,
     global_opts: GlobalOpts,
+    output: CommandOutput<'_>,
 ) -> Result<()> {
     let mut display = GroupDisplay::new_with_display(
         update_ids.iter().copied(),
-        std::io::stderr(),
+        output.stderr,
     );
     if global_opts.use_color() {
         display.set_styles(LineDisplayStyles::colorized());
diff --git a/wicket/src/cli/upload.rs b/wicket/src/cli/upload.rs
index 6e154fd4d9..c9ec4ea2bb 100644
--- a/wicket/src/cli/upload.rs
+++ b/wicket/src/cli/upload.rs
@@ -31,14 +31,12 @@ pub(crate) struct UploadArgs {
 }
 
 impl UploadArgs {
-    pub(crate) fn exec(
+    pub(crate) async fn exec(
         self,
         log: slog::Logger,
         wicketd_addr: SocketAddrV6,
     ) -> Result<()> {
-        let runtime =
-            tokio::runtime::Runtime::new().context("creating tokio runtime")?;
-        runtime.block_on(self.do_upload(log, wicketd_addr))
+        self.do_upload(log, wicketd_addr).await
     }
 
     async fn do_upload(
diff --git a/wicket/src/dispatch.rs b/wicket/src/dispatch.rs
index 39fad0dab6..fd6f7cd290 100644
--- a/wicket/src/dispatch.rs
+++ b/wicket/src/dispatch.rs
@@ -12,34 +12,82 @@ use clap::Parser;
 use omicron_common::{address::WICKETD_PORT, FileKv};
 use slog::Drain;
 
-use crate::{cli::ShellApp, Runner};
+use crate::{
+    cli::{CommandOutput, ShellApp},
+    Runner,
+};
 
 pub fn exec() -> Result<()> {
     let wicketd_addr =
         SocketAddrV6::new(Ipv6Addr::LOCALHOST, WICKETD_PORT, 0, 0);
 
     // SSH_ORIGINAL_COMMAND contains additional arguments, if any.
-    if let Ok(ssh_args) = std::env::var("SSH_ORIGINAL_COMMAND") {
-        // The argument is in a quoted form, so split it using Unix shell semantics.
-        let args = shell_words::split(&ssh_args).with_context(|| {
-            format!("could not parse shell arguments from input {ssh_args}")
-        })?;
-        // parse_from uses the first argument as the command name. Insert "wicket" as
-        // the command name.
-        let app = ShellApp::parse_from(
-            std::iter::once("wicket".to_owned()).chain(args),
-        );
-
-        let log = setup_log(
-            &log_path()?,
-            WithStderr::Yes { use_color: app.global_opts.use_color() },
-        )?;
-        app.exec(log, wicketd_addr)
-    } else {
-        // Do not expose log messages via standard error since they'll show up
-        // on top of the TUI.
-        let log = setup_log(&log_path()?, WithStderr::No)?;
-        Runner::new(log, wicketd_addr).run()
+    match std::env::var("SSH_ORIGINAL_COMMAND") {
+        Ok(ssh_args) => {
+            let args = shell_words::split(&ssh_args).with_context(|| {
+                format!("could not parse shell arguments from input {ssh_args}")
+            })?;
+
+            let runtime = tokio::runtime::Runtime::new()
+                .context("creating tokio runtime")?;
+            runtime.block_on(exec_with_args(
+                wicketd_addr,
+                args,
+                OutputKind::Terminal,
+            ))
+        }
+        Err(_) => {
+            // Do not expose log messages via standard error since they'll show up
+            // on top of the TUI.
+            let log = setup_log(&log_path()?, WithStderr::No)?;
+            Runner::new(log, wicketd_addr).run()
+        }
+    }
+}
+
+/// Enables capturing of wicket's output.
+pub enum OutputKind<'a> {
+    /// Captures output to the provided log, as well as a buffer.
+    Captured {
+        log: slog::Logger,
+        stdout: &'a mut Vec<u8>,
+        stderr: &'a mut Vec<u8>,
+    },
+
+    /// Writes output to a terminal.
+    Terminal,
+}
+
+pub async fn exec_with_args<S>(
+    wicketd_addr: SocketAddrV6,
+    args: Vec<S>,
+    output: OutputKind<'_>,
+) -> Result<()>
+where
+    S: AsRef<str>,
+{
+    // parse_from uses the first argument as the command name. Insert "wicket" as
+    // the command name.
+    let app = ShellApp::parse_from(
+        std::iter::once("wicket").chain(args.iter().map(|s| s.as_ref())),
+    );
+
+    match output {
+        OutputKind::Captured { log, stdout, stderr } => {
+            let output = CommandOutput { stdout, stderr };
+            app.exec(log, wicketd_addr, output).await
+        }
+        OutputKind::Terminal => {
+            let log = setup_log(
+                &log_path()?,
+                WithStderr::Yes { use_color: app.global_opts.use_color() },
+            )?;
+            let mut stdout = std::io::stdout();
+            let mut stderr = std::io::stderr();
+            let output =
+                CommandOutput { stdout: &mut stdout, stderr: &mut stderr };
+            app.exec(log, wicketd_addr, output).await
+        }
+    }
+}

From 03c7f12cf7a70ed7b360a310e7a6b46d0075ac72 Mon Sep 17 00:00:00 2001
From: Michael Zeller
Date: Wed, 8 Nov 2023 09:45:26 -0500
Subject: [PATCH 14/18] CmdError error messages should provide more
 information (#4428)

Fixes: #4419

This commit is a little invasive at various call sites so I would
appreciate some feedback.
Before this commit you would see: ``` sled-agent: Failed to delete all XDE devices ``` After this commit we now see: ``` sled-agent: Failed to delete all XDE devices Caused by: 0: Failure interacting with the OPTE ioctl(2) interface: command ListPorts failed: BadApiVersion { user: 25, kernel: 23 } 1: command ListPorts failed: BadApiVersion { user: 25, kernel: 23 } ``` --------- Co-authored-by: John Gallagher --- common/src/cmd.rs | 4 +- dev-tools/omicron-dev/src/bin/omicron-dev.rs | 5 +- gateway/src/bin/mgs.rs | 99 ++++++++++--------- .../src/bin/installinator-artifactd.rs | 7 +- nexus/src/bin/nexus.rs | 9 +- nexus/tests/integration_tests/commands.rs | 17 ++-- oximeter/collector/src/bin/oximeter.rs | 19 ++-- sled-agent/src/bin/sled-agent-sim.rs | 12 +-- sled-agent/src/bin/sled-agent.rs | 24 ++--- sp-sim/src/bin/sp-sim.rs | 12 +-- wicketd/src/bin/wicketd.rs | 41 ++++---- 11 files changed, 130 insertions(+), 119 deletions(-) diff --git a/common/src/cmd.rs b/common/src/cmd.rs index d92ebe4c98..f7ede8d127 100644 --- a/common/src/cmd.rs +++ b/common/src/cmd.rs @@ -13,7 +13,7 @@ pub enum CmdError { /// incorrect command-line arguments Usage(String), /// all other errors - Failure(String), + Failure(anyhow::Error), } /// Exits the current process on a fatal error. @@ -26,7 +26,7 @@ pub fn fatal(cmd_error: CmdError) -> ! { .unwrap_or("command"); let (exit_code, message) = match cmd_error { CmdError::Usage(m) => (2, m), - CmdError::Failure(m) => (1, m), + CmdError::Failure(e) => (1, format!("{e:?}")), }; eprintln!("{}: {}", arg0, message); exit(exit_code); diff --git a/dev-tools/omicron-dev/src/bin/omicron-dev.rs b/dev-tools/omicron-dev/src/bin/omicron-dev.rs index 73d698f8d3..f8deae30d0 100644 --- a/dev-tools/omicron-dev/src/bin/omicron-dev.rs +++ b/dev-tools/omicron-dev/src/bin/omicron-dev.rs @@ -4,8 +4,7 @@ //! Developer tool for easily running bits of Omicron -use anyhow::bail; -use anyhow::Context; +use anyhow::{bail, Context}; use camino::Utf8Path; use camino::Utf8PathBuf; use clap::Args; @@ -34,7 +33,7 @@ async fn main() -> Result<(), anyhow::Error> { OmicronDb::CertCreate { ref args } => cmd_cert_create(args).await, }; if let Err(error) = result { - fatal(CmdError::Failure(format!("{:#}", error))); + fatal(CmdError::Failure(error)); } Ok(()) } diff --git a/gateway/src/bin/mgs.rs b/gateway/src/bin/mgs.rs index 81b10ef669..6917d4f174 100644 --- a/gateway/src/bin/mgs.rs +++ b/gateway/src/bin/mgs.rs @@ -4,6 +4,7 @@ //! 
Executable program to run gateway, the management gateway service +use anyhow::{anyhow, Context}; use clap::Parser; use futures::StreamExt; use omicron_common::cmd::{fatal, CmdError}; @@ -70,26 +71,24 @@ async fn do_run() -> Result<(), CmdError> { let args = Args::parse(); match args { - Args::Openapi => run_openapi().map_err(CmdError::Failure), + Args::Openapi => { + run_openapi().map_err(|e| CmdError::Failure(anyhow!(e))) + } Args::Run { config_file_path, id_and_address_from_smf, id, address, } => { - let config = Config::from_file(&config_file_path).map_err(|e| { - CmdError::Failure(format!( - "failed to parse {}: {}", - config_file_path.display(), - e - )) - })?; - - let mut signals = Signals::new([signal::SIGUSR1]).map_err(|e| { - CmdError::Failure(format!( - "failed to set up signal handler: {e}" - )) - })?; + let config = Config::from_file(&config_file_path) + .with_context(|| { + format!("failed to parse {}", config_file_path.display()) + }) + .map_err(CmdError::Failure)?; + + let mut signals = Signals::new([signal::SIGUSR1]) + .context("failed to set up signal handler") + .map_err(CmdError::Failure)?; let (id, addresses, rack_id) = if id_and_address_from_smf { let config = read_smf_config()?; @@ -102,8 +101,9 @@ async fn do_run() -> Result<(), CmdError> { (id.unwrap(), vec![address.unwrap()], rack_id) }; let args = MgsArguments { id, addresses, rack_id }; - let mut server = - start_server(config, args).await.map_err(CmdError::Failure)?; + let mut server = start_server(config, args) + .await + .map_err(|e| CmdError::Failure(anyhow!(e)))?; loop { tokio::select! { @@ -111,18 +111,13 @@ async fn do_run() -> Result<(), CmdError> { Some(signal::SIGUSR1) => { let new_config = read_smf_config()?; if new_config.id != id { - return Err(CmdError::Failure( - "cannot change server ID on refresh" - .to_string() - )); + return Err(CmdError::Failure(anyhow!("cannot change server ID on refresh"))); } server.set_rack_id(new_config.rack_id); server .adjust_dropshot_addresses(&new_config.addresses) .await - .map_err(|err| CmdError::Failure( - format!("config refresh failed: {err}") - ))?; + .map_err(|err| CmdError::Failure(anyhow!("config refresh failed: {err}")))?; } // We only register `SIGUSR1` and never close the // handle, so we never expect `None` or any other @@ -130,7 +125,7 @@ async fn do_run() -> Result<(), CmdError> { _ => unreachable!("invalid signal: {signal:?}"), }, result = server.wait_for_finish() => { - return result.map_err(CmdError::Failure) + return result.map_err(|err| CmdError::Failure(anyhow!(err))) } } } @@ -141,7 +136,7 @@ async fn do_run() -> Result<(), CmdError> { #[cfg(target_os = "illumos")] fn read_smf_config() -> Result { fn scf_to_cmd_err(err: illumos_utils::scf::ScfError) -> CmdError { - CmdError::Failure(err.to_string()) + CmdError::Failure(anyhow!(err)) } use illumos_utils::scf::ScfHandle; @@ -165,12 +160,14 @@ fn read_smf_config() -> Result { let prop_id = config.value_as_string(PROP_ID).map_err(scf_to_cmd_err)?; - let prop_id = Uuid::try_parse(&prop_id).map_err(|err| { - CmdError::Failure(format!( - "failed to parse `{CONFIG_PG}/{PROP_ID}` \ - ({prop_id:?}) as a UUID: {err}" - )) - })?; + let prop_id = Uuid::try_parse(&prop_id) + .with_context(|| { + format!( + "failed to parse `{CONFIG_PG}/{PROP_ID}` ({prop_id:?}) as a \ + UUID" + ) + }) + .map_err(CmdError::Failure)?; let prop_rack_id = config.value_as_string(PROP_RACK_ID).map_err(scf_to_cmd_err)?; @@ -178,12 +175,16 @@ fn read_smf_config() -> Result { let rack_id = if prop_rack_id == "unknown" { None } else { - 
Some(Uuid::try_parse(&prop_rack_id).map_err(|err| { - CmdError::Failure(format!( - "failed to parse `{CONFIG_PG}/{PROP_RACK_ID}` \ - ({prop_rack_id:?}) as a UUID: {err}" - )) - })?) + Some( + Uuid::try_parse(&prop_rack_id) + .with_context(|| { + format!( + "failed to parse `{CONFIG_PG}/{PROP_RACK_ID}` \ + ({prop_rack_id:?}) as a UUID" + ) + }) + .map_err(CmdError::Failure)?, + ) }; let prop_addr = @@ -192,16 +193,20 @@ fn read_smf_config() -> Result { let mut addresses = Vec::with_capacity(prop_addr.len()); for addr in prop_addr { - addresses.push(addr.parse().map_err(|err| { - CmdError::Failure(format!( - "failed to parse `{CONFIG_PG}/{PROP_ADDR}` \ - ({addr:?}) as a socket address: {err}" - )) - })?); + addresses.push( + addr.parse() + .with_context(|| { + format!( + "failed to parse `{CONFIG_PG}/{PROP_ADDR}` ({addr:?}) \ + as a socket address" + ) + }) + .map_err(CmdError::Failure)?, + ); } if addresses.is_empty() { - Err(CmdError::Failure(format!( + Err(CmdError::Failure(anyhow!( "no addresses specified by `{CONFIG_PG}/{PROP_ADDR}`" ))) } else { @@ -211,7 +216,7 @@ fn read_smf_config() -> Result { #[cfg(not(target_os = "illumos"))] fn read_smf_config() -> Result { - Err(CmdError::Failure( - "SMF configuration only available on illumos".to_string(), - )) + Err(CmdError::Failure(anyhow!( + "SMF configuration only available on illumos" + ))) } diff --git a/installinator-artifactd/src/bin/installinator-artifactd.rs b/installinator-artifactd/src/bin/installinator-artifactd.rs index b09dc82acd..abe63bbe31 100644 --- a/installinator-artifactd/src/bin/installinator-artifactd.rs +++ b/installinator-artifactd/src/bin/installinator-artifactd.rs @@ -29,10 +29,9 @@ fn do_run() -> Result<(), CmdError> { match args { Args::Openapi => { installinator_artifactd::run_openapi().map_err(|error| { - CmdError::Failure(format!( - "failed to generate OpenAPI spec: {:?}", - error - )) + CmdError::Failure( + error.context("failed to generate OpenAPI spec"), + ) }) } } diff --git a/nexus/src/bin/nexus.rs b/nexus/src/bin/nexus.rs index 76e4f7aaca..24fef5c8d2 100644 --- a/nexus/src/bin/nexus.rs +++ b/nexus/src/bin/nexus.rs @@ -10,6 +10,7 @@ // - General networking and runtime tuning for availability and security: see // omicron#2184, omicron#2414. 
+use anyhow::anyhow; use clap::Parser; use omicron_common::cmd::fatal; use omicron_common::cmd::CmdError; @@ -53,13 +54,13 @@ async fn do_run() -> Result<(), CmdError> { let args = Args::parse(); let config = Config::from_file(args.config_file_path) - .map_err(|e| CmdError::Failure(e.to_string()))?; + .map_err(|e| CmdError::Failure(anyhow!(e)))?; if args.openapi { - run_openapi_external().map_err(CmdError::Failure) + run_openapi_external().map_err(|err| CmdError::Failure(anyhow!(err))) } else if args.openapi_internal { - run_openapi_internal().map_err(CmdError::Failure) + run_openapi_internal().map_err(|err| CmdError::Failure(anyhow!(err))) } else { - run_server(&config).await.map_err(CmdError::Failure) + run_server(&config).await.map_err(|err| CmdError::Failure(anyhow!(err))) } } diff --git a/nexus/tests/integration_tests/commands.rs b/nexus/tests/integration_tests/commands.rs index 66006e0bdf..02d938b2ac 100644 --- a/nexus/tests/integration_tests/commands.rs +++ b/nexus/tests/integration_tests/commands.rs @@ -56,10 +56,9 @@ fn test_nexus_bad_config() { let (exit_status, stdout_text, stderr_text) = run_command(exec); assert_exit_code(exit_status, EXIT_FAILURE, &stderr_text); assert_contents("tests/output/cmd-nexus-badconfig-stdout", &stdout_text); - assert_eq!( - stderr_text, - format!("nexus: read \"nonexistent\": {}\n", error_for_enoent()) - ); + let expected_err = + format!("nexus: read \"nonexistent\": {}\n", error_for_enoent()); + assert!(&stderr_text.starts_with(&expected_err)); } #[test] @@ -73,13 +72,11 @@ fn test_nexus_invalid_config() { "tests/output/cmd-nexus-invalidconfig-stdout", &stdout_text, ); - assert_eq!( - stderr_text, - format!( - "nexus: parse \"{}\": missing field `deployment`\n", - config_path.display() - ), + let expected_err = format!( + "nexus: parse \"{}\": missing field `deployment`\n", + config_path.display() ); + assert!(&stderr_text.starts_with(&expected_err)); } #[track_caller] diff --git a/oximeter/collector/src/bin/oximeter.rs b/oximeter/collector/src/bin/oximeter.rs index 8c4bf0e27c..d97ae5e72e 100644 --- a/oximeter/collector/src/bin/oximeter.rs +++ b/oximeter/collector/src/bin/oximeter.rs @@ -6,6 +6,7 @@ // Copyright 2023 Oxide Computer Company +use anyhow::{anyhow, Context}; use clap::Parser; use omicron_common::cmd::fatal; use omicron_common::cmd::CmdError; @@ -132,7 +133,9 @@ async fn main() { async fn do_run() -> Result<(), CmdError> { let args = Args::parse(); match args { - Args::Openapi => run_openapi().map_err(CmdError::Failure), + Args::Openapi => { + run_openapi().map_err(|err| CmdError::Failure(anyhow!(err))) + } Args::Run { config_file, id, address } => { let config = Config::from_file(config_file).unwrap(); let args = OximeterArguments { id, address }; @@ -141,13 +144,15 @@ async fn do_run() -> Result<(), CmdError> { .unwrap() .serve_forever() .await - .map_err(|e| CmdError::Failure(e.to_string())) + .context("Failed to create oximeter") + .map_err(CmdError::Failure) } Args::Standalone { id, address, nexus, clickhouse, log_level } => { // Start the standalone Nexus server, for registration of both the // collector and producers. 
let nexus_server = StandaloneNexus::new(nexus.into(), log_level) - .map_err(|e| CmdError::Failure(e.to_string()))?; + .context("Failed to create nexus") + .map_err(CmdError::Failure)?; let args = OximeterArguments { id, address }; Oximeter::new_standalone( nexus_server.log(), @@ -159,10 +164,10 @@ async fn do_run() -> Result<(), CmdError> { .unwrap() .serve_forever() .await - .map_err(|e| CmdError::Failure(e.to_string())) - } - Args::StandaloneOpenapi => { - run_standalone_openapi().map_err(CmdError::Failure) + .context("Failed to create standalone oximeter") + .map_err(CmdError::Failure) } + Args::StandaloneOpenapi => run_standalone_openapi() + .map_err(|err| CmdError::Failure(anyhow!(err))), } } diff --git a/sled-agent/src/bin/sled-agent-sim.rs b/sled-agent/src/bin/sled-agent-sim.rs index adba8eab6b..ee0ebda71e 100644 --- a/sled-agent/src/bin/sled-agent-sim.rs +++ b/sled-agent/src/bin/sled-agent-sim.rs @@ -6,7 +6,7 @@ // TODO see the TODO for nexus. -use anyhow::Context; +use anyhow::{anyhow, Context}; use camino::Utf8PathBuf; use clap::Parser; use dropshot::ConfigDropshot; @@ -96,7 +96,7 @@ async fn do_run() -> Result<(), CmdError> { let args = Args::parse(); let tmp = camino_tempfile::tempdir() - .map_err(|e| CmdError::Failure(e.to_string()))?; + .map_err(|e| CmdError::Failure(anyhow!(e)))?; let config = Config { id: args.uuid, sim_mode: args.sim_mode, @@ -125,10 +125,10 @@ async fn do_run() -> Result<(), CmdError> { (Some(cert_path), Some(key_path)) => { let cert_bytes = std::fs::read_to_string(&cert_path) .with_context(|| format!("read {:?}", &cert_path)) - .map_err(|e| CmdError::Failure(e.to_string()))?; + .map_err(CmdError::Failure)?; let key_bytes = std::fs::read_to_string(&key_path) .with_context(|| format!("read {:?}", &key_path)) - .map_err(|e| CmdError::Failure(e.to_string()))?; + .map_err(CmdError::Failure)?; Some(NexusTypes::Certificate { cert: cert_bytes, key: key_bytes }) } _ => { @@ -145,7 +145,5 @@ async fn do_run() -> Result<(), CmdError> { tls_certificate, }; - run_standalone_server(&config, &rss_args) - .await - .map_err(|e| CmdError::Failure(e.to_string())) + run_standalone_server(&config, &rss_args).await.map_err(CmdError::Failure) } diff --git a/sled-agent/src/bin/sled-agent.rs b/sled-agent/src/bin/sled-agent.rs index 5764bf8309..b8b5abf07f 100644 --- a/sled-agent/src/bin/sled-agent.rs +++ b/sled-agent/src/bin/sled-agent.rs @@ -4,6 +4,7 @@ //! Executable program to run the sled agent +use anyhow::anyhow; use camino::Utf8PathBuf; use clap::{Parser, Subcommand}; use omicron_common::cmd::fatal; @@ -51,16 +52,14 @@ async fn do_run() -> Result<(), CmdError> { match args { Args::Openapi(flavor) => match flavor { - OpenapiFlavor::Sled => { - sled_server::run_openapi().map_err(CmdError::Failure) - } - OpenapiFlavor::Bootstrap => { - bootstrap_server::run_openapi().map_err(CmdError::Failure) - } + OpenapiFlavor::Sled => sled_server::run_openapi() + .map_err(|err| CmdError::Failure(anyhow!(err))), + OpenapiFlavor::Bootstrap => bootstrap_server::run_openapi() + .map_err(|err| CmdError::Failure(anyhow!(err))), }, Args::Run { config_path } => { let config = SledConfig::from_file(&config_path) - .map_err(|e| CmdError::Failure(e.to_string()))?; + .map_err(|e| CmdError::Failure(anyhow!(e)))?; // - Sled agent starts with the normal config file - typically // called "config.toml". 
@@ -83,7 +82,7 @@ async fn do_run() -> Result<(), CmdError> { let rss_config = if rss_config_path.exists() { Some( RssConfig::from_file(rss_config_path) - .map_err(|e| CmdError::Failure(e.to_string()))?, + .map_err(|e| CmdError::Failure(anyhow!(e)))?, ) } else { None @@ -91,7 +90,7 @@ async fn do_run() -> Result<(), CmdError> { let server = bootstrap_server::Server::start(config) .await - .map_err(|err| CmdError::Failure(format!("{err:#}")))?; + .map_err(|err| CmdError::Failure(anyhow!(err)))?; // If requested, automatically supply the RSS configuration. // @@ -103,12 +102,15 @@ async fn do_run() -> Result<(), CmdError> { // abandon the server. Ok(_) | Err(RssAccessError::AlreadyInitialized) => {} Err(e) => { - return Err(CmdError::Failure(e.to_string())); + return Err(CmdError::Failure(anyhow!(e))); } } } - server.wait_for_finish().await.map_err(CmdError::Failure)?; + server + .wait_for_finish() + .await + .map_err(|err| CmdError::Failure(anyhow!(err)))?; Ok(()) } diff --git a/sp-sim/src/bin/sp-sim.rs b/sp-sim/src/bin/sp-sim.rs index 76cb851528..ee0dd3b70c 100644 --- a/sp-sim/src/bin/sp-sim.rs +++ b/sp-sim/src/bin/sp-sim.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use anyhow::Result; +use anyhow::{anyhow, Result}; use clap::Parser; use omicron_common::cmd::{fatal, CmdError}; use sp_sim::config::Config; @@ -27,14 +27,12 @@ async fn main() { async fn do_run() -> Result<(), CmdError> { let args = Args::parse(); let config = Config::from_file(args.config_file_path) - .map_err(|e| CmdError::Failure(e.to_string()))?; + .map_err(|e| CmdError::Failure(anyhow!(e)))?; - let log = sp_sim::logger(&config) - .map_err(|e| CmdError::Failure(e.to_string()))?; + let log = sp_sim::logger(&config).map_err(CmdError::Failure)?; - let _rack = SimRack::start(&config, &log) - .await - .map_err(|e| CmdError::Failure(e.to_string()))?; + let _rack = + SimRack::start(&config, &log).await.map_err(CmdError::Failure)?; // for now, do nothing except let the spawned tasks run. in the future // (or when used as a library), the expectation is that a caller can diff --git a/wicketd/src/bin/wicketd.rs b/wicketd/src/bin/wicketd.rs index 2e6d51c0f0..887ac496e0 100644 --- a/wicketd/src/bin/wicketd.rs +++ b/wicketd/src/bin/wicketd.rs @@ -4,6 +4,7 @@ //! 
Executable for wicketd: technician port based management service +use anyhow::{anyhow, Context}; use clap::Parser; use omicron_common::{ address::Ipv6Subnet, @@ -69,7 +70,9 @@ async fn do_run() -> Result<(), CmdError> { let args = Args::parse(); match args { - Args::Openapi => run_openapi().map_err(CmdError::Failure), + Args::Openapi => { + run_openapi().map_err(|err| CmdError::Failure(anyhow!(err))) + } Args::Run { config_file_path, address, @@ -82,10 +85,10 @@ async fn do_run() -> Result<(), CmdError> { } => { let baseboard = if let Some(baseboard_file) = baseboard_file { let baseboard_file = std::fs::read_to_string(baseboard_file) - .map_err(|e| CmdError::Failure(e.to_string()))?; + .map_err(|e| CmdError::Failure(anyhow!(e)))?; let baseboard: Baseboard = serde_json::from_str(&baseboard_file) - .map_err(|e| CmdError::Failure(e.to_string()))?; + .map_err(|e| CmdError::Failure(anyhow!(e)))?; // TODO-correctness `Baseboard::unknown()` is slated for removal // after some refactoring in sled-agent, at which point we'll @@ -100,19 +103,17 @@ async fn do_run() -> Result<(), CmdError> { None }; - let config = Config::from_file(&config_file_path).map_err(|e| { - CmdError::Failure(format!( - "failed to parse {}: {}", - config_file_path.display(), - e - )) - })?; + let config = Config::from_file(&config_file_path) + .with_context(|| { + format!("failed to parse {}", config_file_path.display()) + }) + .map_err(CmdError::Failure)?; let rack_subnet = match rack_subnet { Some(addr) => Some(Ipv6Subnet::new(addr)), None if read_smf_config => { let smf_values = SmfConfigValues::read_current() - .map_err(|e| CmdError::Failure(e.to_string()))?; + .map_err(CmdError::Failure)?; smf_values.rack_subnet } None => None, @@ -126,12 +127,18 @@ async fn do_run() -> Result<(), CmdError> { baseboard, rack_subnet, }; - let log = config.log.to_logger("wicketd").map_err(|msg| { - CmdError::Failure(format!("initializing logger: {}", msg)) - })?; - let server = - Server::start(log, args).await.map_err(CmdError::Failure)?; - server.wait_for_finish().await.map_err(CmdError::Failure) + let log = config + .log + .to_logger("wicketd") + .context("failed to initialize logger") + .map_err(CmdError::Failure)?; + let server = Server::start(log, args) + .await + .map_err(|err| CmdError::Failure(anyhow!(err)))?; + server + .wait_for_finish() + .await + .map_err(|err| CmdError::Failure(anyhow!(err))) } } } From 9b22b47de93f906b0f9702c7d8ff0e4fc4341354 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Wed, 8 Nov 2023 11:44:40 -0800 Subject: [PATCH 15/18] fix(deps): update rust crate russh to 0.39.0 (#4467) --- Cargo.lock | 4 ++-- end-to-end-tests/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3516f72fac..0f51a0aa33 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7406,9 +7406,9 @@ dependencies = [ [[package]] name = "russh" -version = "0.38.0" +version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae0efcc0f4cd6c062c07e572ce4b806e3967fa029fcbfcc0aa98fb5910a37925" +checksum = "7878311587d0353a854d5be954fbe68bdf6e77873933b484d1e45db12bb2f8cf" dependencies = [ "aes", "aes-gcm", diff --git a/end-to-end-tests/Cargo.toml b/end-to-end-tests/Cargo.toml index 732a4a2091..87f63ea1df 100644 --- a/end-to-end-tests/Cargo.toml +++ b/end-to-end-tests/Cargo.toml @@ -17,7 +17,7 @@ omicron-test-utils.workspace = true oxide-client.workspace = true rand.workspace = true reqwest.workspace = 
true -russh = "0.38.0" +russh = "0.39.0" russh-keys = "0.38.0" serde_json.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } From cb3e01c704811a16a1fbdac098ef7af72a317ee3 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Wed, 8 Nov 2023 11:46:18 -0800 Subject: [PATCH 16/18] fix(deps): update rust crate reedline to 0.25.0 (#4465) --- Cargo.lock | 31 ++++++++----------------------- wicket-dbg/Cargo.toml | 2 +- workspace-hack/Cargo.toml | 2 ++ 3 files changed, 11 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0f51a0aa33..defbd46367 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1358,23 +1358,6 @@ dependencies = [ "cfg-if 1.0.0", ] -[[package]] -name = "crossterm" -version = "0.26.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a84cda67535339806297f1b331d6dd6320470d2a0fe65381e79ee9e156dd3d13" -dependencies = [ - "bitflags 1.3.2", - "crossterm_winapi", - "libc", - "mio", - "parking_lot 0.12.1", - "serde", - "signal-hook", - "signal-hook-mio", - "winapi", -] - [[package]] name = "crossterm" version = "0.27.0" @@ -1387,6 +1370,7 @@ dependencies = [ "libc", "mio", "parking_lot 0.12.1", + "serde", "signal-hook", "signal-hook-mio", "winapi", @@ -5437,6 +5421,7 @@ dependencies = [ "const-oid", "crossbeam-epoch", "crossbeam-utils", + "crossterm", "crypto-common", "diesel", "digest", @@ -6992,7 +6977,7 @@ checksum = "2e2e4cd95294a85c3b4446e63ef054eea43e0205b1fd60120c16b74ff7ff96ad" dependencies = [ "bitflags 2.4.0", "cassowary", - "crossterm 0.27.0", + "crossterm", "indoc 2.0.3", "itertools 0.11.0", "paste", @@ -7073,12 +7058,12 @@ dependencies = [ [[package]] name = "reedline" -version = "0.23.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3defc4467a8909614bcb02cb304a3e8472c31f7a44e6c6c287eb9575b212bc4d" +checksum = "e7dc1d1d369c194cf79acc204397aca1fecc4248df3e1c1eabb15e5ef2d16991" dependencies = [ "chrono", - "crossterm 0.26.1", + "crossterm", "fd-lock", "itertools 0.10.5", "nu-ansi-term", @@ -10150,7 +10135,7 @@ dependencies = [ "camino", "ciborium", "clap 4.4.3", - "crossterm 0.27.0", + "crossterm", "debug-ignore", "futures", "hex", @@ -10215,7 +10200,7 @@ dependencies = [ "camino", "ciborium", "clap 4.4.3", - "crossterm 0.27.0", + "crossterm", "omicron-workspace-hack", "ratatui", "reedline", diff --git a/wicket-dbg/Cargo.toml b/wicket-dbg/Cargo.toml index e7e8a58468..d546c41e44 100644 --- a/wicket-dbg/Cargo.toml +++ b/wicket-dbg/Cargo.toml @@ -21,7 +21,7 @@ tokio = { workspace = true, features = ["full"] } wicket.workspace = true # used only by wicket-dbg binary -reedline = "0.23.0" +reedline = "0.25.0" omicron-workspace-hack.workspace = true [[bin]] diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 43276ef15b..b940b1daa6 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -31,6 +31,7 @@ console = { version = "0.15.7" } const-oid = { version = "0.9.5", default-features = false, features = ["db", "std"] } crossbeam-epoch = { version = "0.9.15" } crossbeam-utils = { version = "0.8.16" } +crossterm = { version = "0.27.0", features = ["event-stream", "serde"] } crypto-common = { version = "0.1.6", default-features = false, features = ["getrandom", "std"] } diesel = { version = "2.1.3", features = ["chrono", "i-implement-a-third-party-backend-and-opt-into-breaking-changes", "network-address", "postgres", "r2d2", "serde_json", "uuid"] } digest = { 
version = "0.10.7", features = ["mac", "oid", "std"] } @@ -123,6 +124,7 @@ console = { version = "0.15.7" } const-oid = { version = "0.9.5", default-features = false, features = ["db", "std"] } crossbeam-epoch = { version = "0.9.15" } crossbeam-utils = { version = "0.8.16" } +crossterm = { version = "0.27.0", features = ["event-stream", "serde"] } crypto-common = { version = "0.1.6", default-features = false, features = ["getrandom", "std"] } diesel = { version = "2.1.3", features = ["chrono", "i-implement-a-third-party-backend-and-opt-into-breaking-changes", "network-address", "postgres", "r2d2", "serde_json", "uuid"] } digest = { version = "0.10.7", features = ["mac", "oid", "std"] } From 0587f767f7bcd3ab15e148b405dd0a027e131271 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Wed, 8 Nov 2023 11:47:31 -0800 Subject: [PATCH 17/18] chore(deps): update rust crate toml_edit to 0.21.0 (#4454) --- Cargo.lock | 14 ++------------ Cargo.toml | 2 +- workspace-hack/Cargo.toml | 6 ++++-- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index defbd46367..1a9c93ee9a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5493,6 +5493,7 @@ dependencies = [ "toml 0.7.8", "toml_datetime", "toml_edit 0.19.15", + "toml_edit 0.21.0", "tracing", "trust-dns-proto", "unicode-bidi", @@ -9268,17 +9269,6 @@ dependencies = [ "winnow", ] -[[package]] -name = "toml_edit" -version = "0.20.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70f427fce4d84c72b5b732388bf4a9f4531b53f74e2887e3ecb2481f68f66d81" -dependencies = [ - "indexmap 2.1.0", - "toml_datetime", - "winnow", -] - [[package]] name = "toml_edit" version = "0.21.0" @@ -10166,7 +10156,7 @@ dependencies = [ "tokio", "tokio-util", "toml 0.8.8", - "toml_edit 0.20.7", + "toml_edit 0.21.0", "tui-tree-widget", "unicode-width", "update-engine", diff --git a/Cargo.toml b/Cargo.toml index e915a2d113..cce4438211 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -365,7 +365,7 @@ tokio-stream = "0.1.14" tokio-tungstenite = "0.18" tokio-util = "0.7.10" toml = "0.8.8" -toml_edit = "0.20.7" +toml_edit = "0.21.0" topological-sort = "0.2.2" tough = { version = "0.14", features = [ "http" ] } trust-dns-client = "0.22" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index b940b1daa6..621f1a8a99 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -95,6 +95,7 @@ tokio = { version = "1.33.0", features = ["full", "test-util"] } tokio-postgres = { version = "0.7.10", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } tokio-stream = { version = "0.1.14", features = ["net"] } toml = { version = "0.7.8" } +toml_edit-647d43efb71741da = { package = "toml_edit", version = "0.21.0", features = ["serde"] } tracing = { version = "0.1.37", features = ["log"] } trust-dns-proto = { version = "0.22.0" } unicode-bidi = { version = "0.3.13" } @@ -190,6 +191,7 @@ tokio = { version = "1.33.0", features = ["full", "test-util"] } tokio-postgres = { version = "0.7.10", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } tokio-stream = { version = "0.1.14", features = ["net"] } toml = { version = "0.7.8" } +toml_edit-647d43efb71741da = { package = "toml_edit", version = "0.21.0", features = ["serde"] } tracing = { version = "0.1.37", features = ["log"] } trust-dns-proto = { version = "0.22.0" } unicode-bidi = { version = "0.3.13" } @@ -250,7 +252,7 @@ mio = { version = "0.8.8", features = ["net", "os-ext"] } 
once_cell = { version = "1.18.0", features = ["unstable"] } rustix = { version = "0.38.9", features = ["fs", "termios"] } toml_datetime = { version = "0.6.5", default-features = false, features = ["serde"] } -toml_edit = { version = "0.19.15", features = ["serde"] } +toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", features = ["serde"] } [target.x86_64-unknown-illumos.build-dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } @@ -259,6 +261,6 @@ mio = { version = "0.8.8", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } rustix = { version = "0.38.9", features = ["fs", "termios"] } toml_datetime = { version = "0.6.5", default-features = false, features = ["serde"] } -toml_edit = { version = "0.19.15", features = ["serde"] } +toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", features = ["serde"] } ### END HAKARI SECTION From 6478a97d37799934b7d56a1a4cd4f055eca0f3e6 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 8 Nov 2023 12:27:27 -0800 Subject: [PATCH 18/18] update database schema update docs (#4393) --- schema/crdb/README.adoc | 107 +++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 51 deletions(-) diff --git a/schema/crdb/README.adoc b/schema/crdb/README.adoc index f92748f101..fba36ed73b 100644 --- a/schema/crdb/README.adoc +++ b/schema/crdb/README.adoc @@ -7,25 +7,31 @@ This directory describes the schema(s) used by CockroachDB. We use the following conventions: -* `schema/crdb/VERSION/up.sql`: The necessary idempotent migrations to transition from the - previous version of CockroachDB to this version. These migrations will always be placed - within one transaction per file. -** If more than one change is needed per version, any number of files starting with `up` - and ending with `.sql` may be used. These files will be sorted in lexicographic order - before being executed, and each will be executed in a separate transaction. -** CockroachDB documentation recommends the following: "Execute schema changes... in a single - explicit transaction consisting of the single schema change statement". - Practically this means: If you want to change multiple tables, columns, - types, indices, or constraints, do so in separate files. -** More information can be found here: https://www.cockroachlabs.com/docs/stable/online-schema-changes -* `schema/crdb/dbinit.sql`: The necessary operations to create the latest version - of the schema. Should be equivalent to running all `up.sql` migrations, in-order. -* `schema/crdb/dbwipe.sql`: The necessary operations to delete the latest version - of the schema. - -Note that to upgrade from version N to version N+2, we always need to apply the -N+1 upgrade first, before applying the N+2 upgrade. This simplifies our model -of DB schema changes as an incremental linear history. +* `schema/crdb/VERSION/up*.sql`: Files containing the necessary idempotent + statements to transition from the previous version of the database schema to + this version. All of the statements in a given file will be executed + together in one transaction; however, usually only one statement should + appear in each file. More on this below. +** If there's only one statement required, we put it into `up.sql`. +** If more than one change is needed, any number of files starting with `up` + and ending with `.sql` may be used. 
These files will be sorted in + lexicographic order before being executed. Each will be executed in a + separate transaction. +** CockroachDB documentation recommends the following: "Execute schema + changes ... in an explicit transaction consisting of the single schema + change statement.". Practically this means: If you want to change multiple + tables, columns, types, indices, or constraints, do so in separate files. + See https://www.cockroachlabs.com/docs/stable/online-schema-changes for + more. +* `schema/crdb/dbinit.sql`: The necessary operations to create the latest + version of the schema. Should be equivalent to running all `up.sql` + migrations, in-order. +* `schema/crdb/dbwipe.sql`: The necessary operations to delete the latest + version of the schema. + +Note that to upgrade from version N to version N+2, we always apply the N+1 +upgrade first, before applying the N+2 upgrade. This simplifies our model of DB +schema changes by ensuring an incremental, linear history. == Offline Upgrade @@ -34,9 +40,9 @@ This means we're operating with the following constraints: * We assume that downtime is acceptable to perform an update. * We assume that while an update is occuring, all Nexus services -are running the same version of software. + are running the same version of software. * We assume that no (non-upgrade) concurrent database requests will happen for -the duration of the migration. + the duration of the migration. This is not an acceptable long-term solution - we must be able to update without downtime - but it is an interim solution, and one which provides a @@ -44,7 +50,7 @@ fall-back pathway for performing upgrades. See RFD 319 for more discussion of the online upgrade plans. -=== How to change the schema +== How to change the schema Assumptions: @@ -53,11 +59,22 @@ Assumptions: Process: -* Choose a `NEW_VERSION` number. This should almost certainly be a major version bump over `OLD_VERSION`. -* Add a file to `schema/crdb/NEW_VERSION/up.sql` with your changes to the schema. -** This file should validate the expected current version transactionally. -** This file should only issue a single schema-modifying statement per transaction. -** This file should not issue any data-modifying operations within the schema-modifying transactions. +* Choose a `NEW_VERSION` number. This should almost certainly be a major + version bump over `OLD_VERSION`. +* Create directory `schema/crdb/NEW_VERSION`. +* If only one SQL statement is necessary to get from `OLD_VERSION` to + `NEW_VERSION`, put that statement into `schema/crdb/NEW_VERSION/up.sql`. If + multiple statements are required, put each one into a separate file, naming + these `schema/crdb/NEW_VERSION/upN.sql` for as many `N` as you need. +** Each file should contain _either_ one schema-modifying statement _or_ some + number of data-modifying statements. You can combine multiple data-modifying + statements. But you should not mix schema-modifying statements and + data-modifying statements in one file. And you should not include multiple + schema-modifying statements in one file. +** Beware that the entire file will be run in one transaction. Expensive data- + modifying operations leading to long-running transactions are generally + to-be-avoided; however, there's no better way to do this today if you really + do need to update thousands of rows as part of the update. * Update `schema/crdb/dbinit.sql` to match what the database should look like after your update is applied. 
@@ -68,35 +85,23 @@ Process:
 
 SQL Validation, via Automated Tests:
 
 * The `SCHEMA_VERSION` matches the version used in `dbinit.sql`
-* The combination of all `up.sql` files results in the same schema as `dbinit.sql`
+* The combination of all `up.sql` files results in the same schema as
+  `dbinit.sql`
 * All `up.sql` files can be applied twice without error
 
-==== Handling common schema changes
+=== General notes
 
-Although CockroachDB's schema includes some opaque internally-generated fields
-that are order dependent - such as the names of anonymous CHECK constraints -
-our schema comparison tools intentionally ignore these values. As a result,
-when performing schema changes, the order of new tables and constraints should
-generally not be important.
+CockroachDB's representation of the schema includes some opaque
+internally-generated fields that are order dependent, like the names of
+anonymous CHECK constraints. Our schema comparison tools intentionally ignore
+these values. As a result, when performing schema changes, the order of new
+tables and constraints should generally not be important.
 
-As convention, however, we recommend keeping the `db_metadata` file at the end of
-`dbinit.sql`, so that the database does not contain a version until it is fully
-populated.
+As convention, however, we recommend keeping the `db_metadata` file at the end
+of `dbinit.sql`, so that the database does not contain a version until it is
+fully populated.
 
-==== Adding new source tables to an existing view
-
-An upgrade can add a new table and then use a `CREATE OR REPLACE VIEW` statement
-to make an existing view depend on that table. To do this in `dbinit.sql` while
-maintaining table and view ordering, use `CREATE VIEW` to create a "placeholder"
-view in the correct position, then add the table to the bottom of `dbinit.sql`
-and use `CREATE OR REPLACE VIEW` to "fill out" the placeholder definition to
-refer to the new table. (You may need to do the `CREATE OR REPLACE VIEW` in a
-separate transaction from the original `CREATE VIEW`.)
-
-Note that `CREATE OR REPLACE VIEW` requires that the new view maintain all of
-the columns of the old view with the same type and same order (though the query
-used to populate them can change. See
-https://www.postgresql.org/docs/15/sql-createview.html.
+=== Scenario-specific gotchas
 
 ==== Renaming columns