From cf3bdaee3885dc34c838c5587e92787b772133a9 Mon Sep 17 00:00:00 2001 From: Patrick Mooney Date: Fri, 6 Oct 2023 16:40:00 -0500 Subject: [PATCH 01/13] Update Propolis to include UART fix This updates the Propolis packaging to use a version with the fix to oxidecomputer/propolis#540. --- Cargo.lock | 22 +++++++++++----------- Cargo.toml | 6 +++--- package-manifest.toml | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 18e0e15c3b..ec4efec0fc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -485,7 +485,7 @@ dependencies = [ [[package]] name = "bhyve_api" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=42c878b71a58d430dfc306126af5d40ca816d70f#42c878b71a58d430dfc306126af5d40ca816d70f" +source = "git+https://github.com/oxidecomputer/propolis?rev=901b710b6e5bd05a94a323693c2b971e7e7b240e#901b710b6e5bd05a94a323693c2b971e7e7b240e" dependencies = [ "bhyve_api_sys", "libc", @@ -495,7 +495,7 @@ dependencies = [ [[package]] name = "bhyve_api_sys" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=42c878b71a58d430dfc306126af5d40ca816d70f#42c878b71a58d430dfc306126af5d40ca816d70f" +source = "git+https://github.com/oxidecomputer/propolis?rev=901b710b6e5bd05a94a323693c2b971e7e7b240e#901b710b6e5bd05a94a323693c2b971e7e7b240e" dependencies = [ "libc", "strum", @@ -1225,7 +1225,7 @@ dependencies = [ [[package]] name = "cpuid_profile_config" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=42c878b71a58d430dfc306126af5d40ca816d70f#42c878b71a58d430dfc306126af5d40ca816d70f" +source = "git+https://github.com/oxidecomputer/propolis?rev=901b710b6e5bd05a94a323693c2b971e7e7b240e#901b710b6e5bd05a94a323693c2b971e7e7b240e" dependencies = [ "propolis", "serde", @@ -2016,7 +2016,7 @@ checksum = "7e1a8646b2c125eeb9a84ef0faa6d2d102ea0d5da60b824ade2743263117b848" [[package]] name = "dladm" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=42c878b71a58d430dfc306126af5d40ca816d70f#42c878b71a58d430dfc306126af5d40ca816d70f" +source = "git+https://github.com/oxidecomputer/propolis?rev=901b710b6e5bd05a94a323693c2b971e7e7b240e#901b710b6e5bd05a94a323693c2b971e7e7b240e" dependencies = [ "libc", "strum", @@ -6593,7 +6593,7 @@ dependencies = [ [[package]] name = "propolis" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=42c878b71a58d430dfc306126af5d40ca816d70f#42c878b71a58d430dfc306126af5d40ca816d70f" +source = "git+https://github.com/oxidecomputer/propolis?rev=901b710b6e5bd05a94a323693c2b971e7e7b240e#901b710b6e5bd05a94a323693c2b971e7e7b240e" dependencies = [ "anyhow", "bhyve_api", @@ -6626,7 +6626,7 @@ dependencies = [ [[package]] name = "propolis-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=42c878b71a58d430dfc306126af5d40ca816d70f#42c878b71a58d430dfc306126af5d40ca816d70f" +source = "git+https://github.com/oxidecomputer/propolis?rev=901b710b6e5bd05a94a323693c2b971e7e7b240e#901b710b6e5bd05a94a323693c2b971e7e7b240e" dependencies = [ "async-trait", "base64 0.21.4", @@ -6650,7 +6650,7 @@ dependencies = [ [[package]] name = "propolis-server" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=42c878b71a58d430dfc306126af5d40ca816d70f#42c878b71a58d430dfc306126af5d40ca816d70f" +source = "git+https://github.com/oxidecomputer/propolis?rev=901b710b6e5bd05a94a323693c2b971e7e7b240e#901b710b6e5bd05a94a323693c2b971e7e7b240e" dependencies = [ "anyhow", "async-trait", @@ -6702,7 +6702,7 @@ 
dependencies = [ [[package]] name = "propolis-server-config" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=42c878b71a58d430dfc306126af5d40ca816d70f#42c878b71a58d430dfc306126af5d40ca816d70f" +source = "git+https://github.com/oxidecomputer/propolis?rev=901b710b6e5bd05a94a323693c2b971e7e7b240e#901b710b6e5bd05a94a323693c2b971e7e7b240e" dependencies = [ "cpuid_profile_config", "serde", @@ -6714,7 +6714,7 @@ dependencies = [ [[package]] name = "propolis_types" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=42c878b71a58d430dfc306126af5d40ca816d70f#42c878b71a58d430dfc306126af5d40ca816d70f" +source = "git+https://github.com/oxidecomputer/propolis?rev=901b710b6e5bd05a94a323693c2b971e7e7b240e#901b710b6e5bd05a94a323693c2b971e7e7b240e" dependencies = [ "schemars", "serde", @@ -9761,7 +9761,7 @@ checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" [[package]] name = "viona_api" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=42c878b71a58d430dfc306126af5d40ca816d70f#42c878b71a58d430dfc306126af5d40ca816d70f" +source = "git+https://github.com/oxidecomputer/propolis?rev=901b710b6e5bd05a94a323693c2b971e7e7b240e#901b710b6e5bd05a94a323693c2b971e7e7b240e" dependencies = [ "libc", "viona_api_sys", @@ -9770,7 +9770,7 @@ dependencies = [ [[package]] name = "viona_api_sys" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=42c878b71a58d430dfc306126af5d40ca816d70f#42c878b71a58d430dfc306126af5d40ca816d70f" +source = "git+https://github.com/oxidecomputer/propolis?rev=901b710b6e5bd05a94a323693c2b971e7e7b240e#901b710b6e5bd05a94a323693c2b971e7e7b240e" dependencies = [ "libc", ] diff --git a/Cargo.toml b/Cargo.toml index 3b83b2f7c5..9388b2c7d6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -277,9 +277,9 @@ pretty-hex = "0.3.0" proc-macro2 = "1.0" progenitor = { git = "https://github.com/oxidecomputer/progenitor", branch = "main" } progenitor-client = { git = "https://github.com/oxidecomputer/progenitor", branch = "main" } -bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "42c878b71a58d430dfc306126af5d40ca816d70f" } -propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "42c878b71a58d430dfc306126af5d40ca816d70f", features = [ "generated-migration" ] } -propolis-server = { git = "https://github.com/oxidecomputer/propolis", rev = "42c878b71a58d430dfc306126af5d40ca816d70f", default-features = false, features = ["mock-only"] } +bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "901b710b6e5bd05a94a323693c2b971e7e7b240e" } +propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "901b710b6e5bd05a94a323693c2b971e7e7b240e", features = [ "generated-migration" ] } +propolis-server = { git = "https://github.com/oxidecomputer/propolis", rev = "901b710b6e5bd05a94a323693c2b971e7e7b240e", default-features = false, features = ["mock-only"] } proptest = "1.2.0" quote = "1.0" rand = "0.8.5" diff --git a/package-manifest.toml b/package-manifest.toml index a7f8683eee..7cf235c24a 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -406,10 +406,10 @@ service_name = "propolis-server" only_for_targets.image = "standard" source.type = "prebuilt" source.repo = "propolis" -source.commit = "42c878b71a58d430dfc306126af5d40ca816d70f" +source.commit = "901b710b6e5bd05a94a323693c2b971e7e7b240e" # The SHA256 digest is automatically posted to: # 
https://buildomat.eng.oxide.computer/public/file/oxidecomputer/propolis/image//propolis-server.sha256.txt -source.sha256 = "dce4d82bb936e990262abcaa279eee7e33a19930880b23f49fa3851cded18567" +source.sha256 = "0f681cdbe7312f66fd3c99fe033b379e49c59fa4ad04d307f68b12514307e976" output.type = "zone" [package.maghemite] From 9f004d2759b7ae827bc5e49d6cc8b5c5956573a8 Mon Sep 17 00:00:00 2001 From: Rain Date: Fri, 6 Oct 2023 21:11:29 -0700 Subject: [PATCH 02/13] [crdb-seed] use a tarball, fix omicron-dev run-all (#4208) Several changes: 1. In https://github.com/oxidecomputer/omicron/issues/4193, @david-crespo observed some missing files in the crdb-seed generated directory. My suspicion is that that is due to the `/tmp` cleaner that runs on macOS. @davepacheco suggested using a tarball to get atomicity (either the file exists or it doesn't), and it ended up being pretty simple to do that at the end. 2. Teach nexus-test-utils to ensure that the seed tarball exists, fixing `omicron-dev run-all` and anything else that uses nexus-test-utils (and avoiding a dependency on the environment). 3. Move `crdb-seed` to `dev-tools` (thanks Dave for pointing it out!) 4. Add a way to invalidate the cache with `CRDB_SEED_INVALIDATE=1` in the environment. 5. Add a readme for `crdb-seed`. Fixes #4206. Hopefully addresses #4193. --- .config/nextest.toml | 4 +- Cargo.lock | 27 +- Cargo.toml | 5 +- crdb-seed/src/main.rs | 92 ------- {crdb-seed => dev-tools/crdb-seed}/Cargo.toml | 8 +- dev-tools/crdb-seed/README.md | 11 + dev-tools/crdb-seed/src/main.rs | 39 +++ dev-tools/omicron-dev/Cargo.toml | 2 +- dev-tools/omicron-dev/src/bin/omicron-dev.rs | 10 +- .../omicron-dev/tests/test_omicron_dev.rs | 11 + nexus/test-utils/Cargo.toml | 3 + nexus/test-utils/src/db.rs | 35 ++- nexus/test-utils/src/lib.rs | 105 +++++++- test-utils/Cargo.toml | 11 +- test-utils/src/dev/mod.rs | 107 +++----- test-utils/src/dev/seed.rs | 239 ++++++++++++++++++ workspace-hack/Cargo.toml | 24 +- 17 files changed, 518 insertions(+), 215 deletions(-) delete mode 100644 crdb-seed/src/main.rs rename {crdb-seed => dev-tools/crdb-seed}/Cargo.toml (60%) create mode 100644 dev-tools/crdb-seed/README.md create mode 100644 dev-tools/crdb-seed/src/main.rs create mode 100644 test-utils/src/dev/seed.rs diff --git a/.config/nextest.toml b/.config/nextest.toml index b2a8b360bb..ba07186c8a 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -8,7 +8,9 @@ nextest-version = { required = "0.9.59", recommended = "0.9.59" } experimental = ["setup-scripts"] [[profile.default.scripts]] -filter = 'rdeps(nexus-test-utils)' +# Exclude omicron-dev tests from crdb-seed as we explicitly want to simulate an +# environment where the seed file doesn't exist. 
+filter = 'rdeps(nexus-test-utils) - package(omicron-dev)' setup = 'crdb-seed' [profile.ci] diff --git a/Cargo.lock b/Cargo.lock index ec4efec0fc..306e953049 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -366,6 +366,17 @@ version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1181e1e0d1fce796a03db1ae795d67167da795f9cf4a39c37589e85ef57f26d3" +[[package]] +name = "atomicwrites" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1163d9d7c51de51a2b79d6df5e8888d11e9df17c752ce4a285fb6ca1580734e" +dependencies = [ + "rustix 0.37.23", + "tempfile", + "windows-sys 0.48.0", +] + [[package]] name = "atty" version = "0.2.14" @@ -1268,13 +1279,10 @@ dependencies = [ name = "crdb-seed" version = "0.1.0" dependencies = [ - "camino", - "camino-tempfile", + "anyhow", "dropshot", - "hex", "omicron-test-utils", "omicron-workspace-hack", - "ring", "slog", "tokio", ] @@ -5338,11 +5346,15 @@ name = "omicron-test-utils" version = "0.1.0" dependencies = [ "anyhow", + "atomicwrites", "camino", + "camino-tempfile", "dropshot", "expectorate", + "filetime", "futures", "headers", + "hex", "http", "libc", "omicron-common 0.1.0", @@ -5351,9 +5363,11 @@ dependencies = [ "rcgen", "regex", "reqwest", + "ring", "rustls", "slog", "subprocess", + "tar", "tempfile", "thiserror", "tokio", @@ -5436,6 +5450,7 @@ dependencies = [ "regex-syntax 0.7.5", "reqwest", "ring", + "rustix 0.37.23", "rustix 0.38.9", "schemars", "semver 1.0.18", @@ -9456,8 +9471,8 @@ version = "1.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" dependencies = [ - "cfg-if 0.1.10", - "rand 0.4.6", + "cfg-if 1.0.0", + "rand 0.8.5", "static_assertions", ] diff --git a/Cargo.toml b/Cargo.toml index 9388b2c7d6..da7b582fe3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ members = [ "clients/sled-agent-client", "clients/wicketd-client", "common", - "crdb-seed", + "dev-tools/crdb-seed", "dev-tools/omdb", "dev-tools/omicron-dev", "dev-tools/thing-flinger", @@ -83,6 +83,7 @@ default-members = [ "clients/sled-agent-client", "clients/wicketd-client", "common", + "dev-tools/crdb-seed", "dev-tools/omdb", "dev-tools/omicron-dev", "dev-tools/thing-flinger", @@ -137,6 +138,7 @@ assert_matches = "1.5.0" assert_cmd = "2.0.12" async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", rev = "da04c087f835a51e0441addb19c5ef4986e1fcf2" } async-trait = "0.1.73" +atomicwrites = "0.4.1" authz-macros = { path = "nexus/authz-macros" } backoff = { version = "0.4.0", features = [ "tokio" ] } base64 = "0.21.4" @@ -182,6 +184,7 @@ dropshot = { git = "https://github.com/oxidecomputer/dropshot", branch = "main", either = "1.9.0" expectorate = "1.1.0" fatfs = "0.3.6" +filetime = "0.2.22" flate2 = "1.0.27" flume = "0.11.0" foreign-types = "0.3.2" diff --git a/crdb-seed/src/main.rs b/crdb-seed/src/main.rs deleted file mode 100644 index b8572bd886..0000000000 --- a/crdb-seed/src/main.rs +++ /dev/null @@ -1,92 +0,0 @@ -use camino::Utf8PathBuf; -use dropshot::{test_util::LogContext, ConfigLogging, ConfigLoggingLevel}; -use omicron_test_utils::dev; -use slog::Logger; -use std::io::Write; - -// Creates a string identifier for the current DB schema and version. -// -// The goal here is to allow to create different "seed" directories -// for each revision of the DB. 
-fn digest_unique_to_schema() -> String { - let schema = include_str!("../../schema/crdb/dbinit.sql"); - let crdb_version = include_str!("../../tools/cockroachdb_version"); - let mut ctx = ring::digest::Context::new(&ring::digest::SHA256); - ctx.update(&schema.as_bytes()); - ctx.update(&crdb_version.as_bytes()); - let digest = ctx.finish(); - hex::encode(digest.as_ref()) -} - -enum SeedDirectoryStatus { - Created, - Existing, -} - -async fn ensure_seed_directory_exists( - log: &Logger, -) -> (Utf8PathBuf, SeedDirectoryStatus) { - let base_seed_dir = Utf8PathBuf::from_path_buf(std::env::temp_dir()) - .expect("Not a UTF-8 path") - .join("crdb-base"); - std::fs::create_dir_all(&base_seed_dir).unwrap(); - let desired_seed_dir = base_seed_dir.join(digest_unique_to_schema()); - - if desired_seed_dir.exists() { - return (desired_seed_dir, SeedDirectoryStatus::Existing); - } - - // The directory didn't exist when we started, so try to create it. - // - // Nextest will execute it just once, but it is possible for a user to start - // up multiple nextest processes to be running at the same time. So we - // should consider it possible for another caller to create this seed - // directory before we finish setting it up ourselves. - let tmp_seed_dir = - camino_tempfile::Utf8TempDir::new_in(base_seed_dir).unwrap(); - dev::test_setup_database_seed(log, tmp_seed_dir.path()).await; - - // If we can successfully perform the rename, there was either no - // contention or we won a creation race. - // - // If we couldn't perform the rename, the directory might already exist. - // Check that this is the error we encountered -- otherwise, we're - // struggling. - if let Err(err) = std::fs::rename(tmp_seed_dir.path(), &desired_seed_dir) { - if !desired_seed_dir.exists() { - panic!("Cannot rename seed directory for CockroachDB: {err}"); - } - } - - (desired_seed_dir, SeedDirectoryStatus::Created) -} - -#[tokio::main] -async fn main() { - // TODO: dropshot is v heavyweight for this, we should be able to pull in a - // smaller binary - let logctx = LogContext::new( - "crdb_seeding", - &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Info }, - ); - let (dir, status) = ensure_seed_directory_exists(&logctx.log).await; - match status { - SeedDirectoryStatus::Created => { - slog::info!(logctx.log, "Created seed directory: `{dir}`"); - } - SeedDirectoryStatus::Existing => { - slog::info!(logctx.log, "Using existing seed directory: `{dir}`"); - } - } - if let Ok(env_path) = std::env::var("NEXTEST_ENV") { - let mut file = std::fs::File::create(&env_path) - .expect("failed to open NEXTEST_ENV file"); - writeln!(file, "CRDB_SEED_DIR={dir}") - .expect("failed to write to NEXTEST_ENV file"); - } else { - slog::warn!( - logctx.log, - "NEXTEST_ENV not set (is this script running under nextest?)" - ); - } -} diff --git a/crdb-seed/Cargo.toml b/dev-tools/crdb-seed/Cargo.toml similarity index 60% rename from crdb-seed/Cargo.toml rename to dev-tools/crdb-seed/Cargo.toml index 8d6d570d08..aff26995dc 100644 --- a/crdb-seed/Cargo.toml +++ b/dev-tools/crdb-seed/Cargo.toml @@ -3,14 +3,12 @@ name = "crdb-seed" version = "0.1.0" edition = "2021" license = "MPL-2.0" +readme = "README.md" [dependencies] -camino.workspace = true -camino-tempfile.workspace = true +anyhow.workspace = true dropshot.workspace = true -hex.workspace = true -omicron-test-utils.workspace = true -ring.workspace = true +omicron-test-utils = { workspace = true, features = ["seed-gen"] } slog.workspace = true tokio.workspace = true 
omicron-workspace-hack.workspace = true diff --git a/dev-tools/crdb-seed/README.md b/dev-tools/crdb-seed/README.md new file mode 100644 index 0000000000..3b77f23066 --- /dev/null +++ b/dev-tools/crdb-seed/README.md @@ -0,0 +1,11 @@ +# crdb-seed + +This is a small utility that creates a seed tarball for our CockroachDB instance +in the temporary directory. It is used as a setup script for nextest (see +`.config/nextest.rs`). + +This utility hashes inputs and attempts to reuse a tarball if it already exists +(see `digest_unique_to_schema` in `omicron/test-utils/src/dev/seed.rs`). + +To invalidate the tarball and cause it to be recreated from scratch, set +`CRDB_SEED_INVALIDATE=1` in the environment. diff --git a/dev-tools/crdb-seed/src/main.rs b/dev-tools/crdb-seed/src/main.rs new file mode 100644 index 0000000000..26b0e19410 --- /dev/null +++ b/dev-tools/crdb-seed/src/main.rs @@ -0,0 +1,39 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use anyhow::{Context, Result}; +use dropshot::{test_util::LogContext, ConfigLogging, ConfigLoggingLevel}; +use omicron_test_utils::dev::seed::{ + ensure_seed_tarball_exists, should_invalidate_seed, +}; +use omicron_test_utils::dev::CRDB_SEED_TAR_ENV; +use std::io::Write; + +#[tokio::main] +async fn main() -> Result<()> { + // TODO: dropshot is v heavyweight for this, we should be able to pull in a + // smaller binary + let logctx = LogContext::new( + "crdb_seeding", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Info }, + ); + let (seed_tar, status) = + ensure_seed_tarball_exists(&logctx.log, should_invalidate_seed()) + .await?; + status.log(&logctx.log, &seed_tar); + + if let Ok(env_path) = std::env::var("NEXTEST_ENV") { + let mut file = std::fs::File::create(&env_path) + .context("failed to open NEXTEST_ENV file")?; + writeln!(file, "{CRDB_SEED_TAR_ENV}={seed_tar}") + .context("failed to write to NEXTEST_ENV file")?; + } else { + slog::warn!( + logctx.log, + "NEXTEST_ENV not set (is this script running under nextest?)" + ); + } + + Ok(()) +} diff --git a/dev-tools/omicron-dev/Cargo.toml b/dev-tools/omicron-dev/Cargo.toml index 251ee16c01..ec7cafb559 100644 --- a/dev-tools/omicron-dev/Cargo.toml +++ b/dev-tools/omicron-dev/Cargo.toml @@ -16,7 +16,7 @@ futures.workspace = true gateway-messages.workspace = true gateway-test-utils.workspace = true libc.workspace = true -nexus-test-utils.workspace = true +nexus-test-utils = { workspace = true, features = ["omicron-dev"] } nexus-test-interface.workspace = true omicron-common.workspace = true omicron-nexus.workspace = true diff --git a/dev-tools/omicron-dev/src/bin/omicron-dev.rs b/dev-tools/omicron-dev/src/bin/omicron-dev.rs index 9107766d8a..e79184f7e5 100644 --- a/dev-tools/omicron-dev/src/bin/omicron-dev.rs +++ b/dev-tools/omicron-dev/src/bin/omicron-dev.rs @@ -14,7 +14,6 @@ use futures::stream::StreamExt; use nexus_test_interface::NexusServer; use omicron_common::cmd::fatal; use omicron_common::cmd::CmdError; -use omicron_sled_agent::sim; use omicron_test_utils::dev; use signal_hook::consts::signal::SIGINT; use signal_hook_tokio::Signals; @@ -348,13 +347,12 @@ async fn cmd_run_all(args: &RunAllArgs) -> Result<(), anyhow::Error> { config.deployment.dropshot_external.dropshot.bind_address.set_port(p); } - // Start up a ControlPlaneTestContext, which tautologically sets up - // everything needed for a simulated control plane. 
println!("omicron-dev: setting up all services ... "); - let cptestctx = nexus_test_utils::test_setup_with_config::< + let cptestctx = nexus_test_utils::omicron_dev_setup_with_config::< omicron_nexus::Server, - >("omicron-dev", &mut config, sim::SimMode::Auto, None) - .await; + >(&mut config) + .await + .context("error setting up services")?; println!("omicron-dev: services are running."); // Print out basic information about what was started. diff --git a/dev-tools/omicron-dev/tests/test_omicron_dev.rs b/dev-tools/omicron-dev/tests/test_omicron_dev.rs index f855d8935d..f1e8177243 100644 --- a/dev-tools/omicron-dev/tests/test_omicron_dev.rs +++ b/dev-tools/omicron-dev/tests/test_omicron_dev.rs @@ -13,6 +13,7 @@ use omicron_test_utils::dev::test_cmds::path_to_executable; use omicron_test_utils::dev::test_cmds::run_command; use omicron_test_utils::dev::test_cmds::EXIT_SUCCESS; use omicron_test_utils::dev::test_cmds::EXIT_USAGE; +use omicron_test_utils::dev::CRDB_SEED_TAR_ENV; use oxide_client::ClientHiddenExt; use std::io::BufRead; use std::path::Path; @@ -389,6 +390,16 @@ async fn test_db_run() { // This mirrors the `test_db_run()` test. #[tokio::test] async fn test_run_all() { + // Ensure that the CRDB_SEED_TAR environment variable is not set. We want to + // simulate a user running omicron-dev without the test environment. + // Check if CRDB_SEED_TAR_ENV is set and panic if it is + if let Ok(val) = std::env::var(CRDB_SEED_TAR_ENV) { + panic!( + "CRDB_SEED_TAR_ENV should not be set here, but is set to {}", + val + ); + } + let cmd_path = path_to_omicron_dev(); let cmdstr = format!( diff --git a/nexus/test-utils/Cargo.toml b/nexus/test-utils/Cargo.toml index 8eb8df4a5b..8cd25582be 100644 --- a/nexus/test-utils/Cargo.toml +++ b/nexus/test-utils/Cargo.toml @@ -39,3 +39,6 @@ trust-dns-proto.workspace = true trust-dns-resolver.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true + +[features] +omicron-dev = ["omicron-test-utils/seed-gen"] diff --git a/nexus/test-utils/src/db.rs b/nexus/test-utils/src/db.rs index 37d7128c49..ff23f35df0 100644 --- a/nexus/test-utils/src/db.rs +++ b/nexus/test-utils/src/db.rs @@ -8,7 +8,7 @@ use camino::Utf8PathBuf; use omicron_test_utils::dev; use slog::Logger; -/// Path to the "seed" CockroachDB directory. +/// Path to the "seed" CockroachDB tarball. /// /// Populating CockroachDB unfortunately isn't free - creation of /// tables, indices, and users takes several seconds to complete. @@ -16,20 +16,39 @@ use slog::Logger; /// By creating a "seed" version of the database, we can cut down /// on the time spent performing this operation. Instead, we opt /// to copy the database from this seed location. -fn seed_dir() -> Utf8PathBuf { +fn seed_tar() -> Utf8PathBuf { // The setup script should set this environment variable. - let seed_dir = std::env::var("CRDB_SEED_DIR") - .expect("CRDB_SEED_DIR missing -- are you running this test with `cargo nextest run`?"); + let seed_dir = std::env::var(dev::CRDB_SEED_TAR_ENV).unwrap_or_else(|_| { + panic!( + "{} missing -- are you running this test \ + with `cargo nextest run`?", + dev::CRDB_SEED_TAR_ENV, + ) + }); seed_dir.into() } -/// Wrapper around [`dev::test_setup_database`] which uses a a -/// seed directory provided at build-time. +/// Wrapper around [`dev::test_setup_database`] which uses a seed tarball +/// provided from the environment. 
pub async fn test_setup_database(log: &Logger) -> dev::db::CockroachInstance { - let dir = seed_dir(); + let input_tar = seed_tar(); dev::test_setup_database( log, - dev::StorageSource::CopyFromSeed { input_dir: dir }, + dev::StorageSource::CopyFromSeed { input_tar }, + ) + .await +} + +/// Wrapper around [`dev::test_setup_database`] which uses a seed tarball +/// provided as an argument. +#[cfg(feature = "omicron-dev")] +pub async fn test_setup_database_from_seed( + log: &Logger, + input_tar: Utf8PathBuf, +) -> dev::db::CockroachInstance { + dev::test_setup_database( + log, + dev::StorageSource::CopyFromSeed { input_tar }, ) .await } diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index d219da7e96..34c218b3e2 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -5,6 +5,7 @@ //! Integration testing facilities for Nexus use anyhow::Context; +use anyhow::Result; use camino::Utf8Path; use dns_service_client::types::DnsConfigParams; use dropshot::test_util::ClientTestContext; @@ -284,14 +285,30 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { } pub async fn start_crdb(&mut self, populate: bool) { + let populate = if populate { + PopulateCrdb::FromEnvironmentSeed + } else { + PopulateCrdb::Empty + }; + self.start_crdb_impl(populate).await; + } + + /// Private implementation of `start_crdb` that allows for a seed tarball to + /// be passed in. See [`PopulateCrdb`] for more details. + async fn start_crdb_impl(&mut self, populate: PopulateCrdb) { let log = &self.logctx.log; debug!(log, "Starting CRDB"); // Start up CockroachDB. - let database = if populate { - db::test_setup_database(log).await - } else { - db::test_setup_database_empty(log).await + let database = match populate { + PopulateCrdb::FromEnvironmentSeed => { + db::test_setup_database(log).await + } + #[cfg(feature = "omicron-dev")] + PopulateCrdb::FromSeed { input_tar } => { + db::test_setup_database_from_seed(log, input_tar).await + } + PopulateCrdb::Empty => db::test_setup_database_empty(log).await, }; eprintln!("DB URL: {}", database.pg_config()); @@ -759,17 +776,89 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { } } +/// How to populate CockroachDB. +/// +/// This is private because we want to ensure that tests use the setup script +/// rather than trying to create their own seed tarballs. This may need to be +/// revisited if circumstances change. +#[derive(Clone, Debug)] +enum PopulateCrdb { + /// Populate Cockroach from the `CRDB_SEED_TAR_ENV` environment variable. + /// + /// Any tests that depend on nexus-test-utils should have this environment + /// variable available. + FromEnvironmentSeed, + + /// Populate Cockroach from the seed located at this path. + #[cfg(feature = "omicron-dev")] + FromSeed { input_tar: camino::Utf8PathBuf }, + + /// Do not populate Cockroach. + Empty, +} + +/// Setup routine to use for `omicron-dev`. Use [`test_setup_with_config`] for +/// tests. +/// +/// The main difference from tests is that this routine ensures the seed tarball +/// exists (or creates a seed tarball if it doesn't exist). For tests, this +/// should be done in the `crdb-seed` setup script. 
+#[cfg(feature = "omicron-dev")] +pub async fn omicron_dev_setup_with_config( + config: &mut omicron_common::nexus_config::Config, +) -> Result> { + let builder = + ControlPlaneTestContextBuilder::::new("omicron-dev", config); + + let log = &builder.logctx.log; + debug!(log, "Ensuring seed tarball exists"); + + // Start up a ControlPlaneTestContext, which tautologically sets up + // everything needed for a simulated control plane. + let why_invalidate = + omicron_test_utils::dev::seed::should_invalidate_seed(); + let (seed_tar, status) = + omicron_test_utils::dev::seed::ensure_seed_tarball_exists( + log, + why_invalidate, + ) + .await + .context("error ensuring seed tarball exists")?; + status.log(log, &seed_tar); + + Ok(setup_with_config_impl( + builder, + PopulateCrdb::FromSeed { input_tar: seed_tar }, + sim::SimMode::Auto, + None, + ) + .await) +} + +/// Setup routine to use for tests. pub async fn test_setup_with_config( test_name: &str, config: &mut omicron_common::nexus_config::Config, sim_mode: sim::SimMode, initial_cert: Option, ) -> ControlPlaneTestContext { - let mut builder = - ControlPlaneTestContextBuilder::::new(test_name, config); + let builder = ControlPlaneTestContextBuilder::::new(test_name, config); + setup_with_config_impl( + builder, + PopulateCrdb::FromEnvironmentSeed, + sim_mode, + initial_cert, + ) + .await +} - let populate = true; - builder.start_crdb(populate).await; +async fn setup_with_config_impl( + mut builder: ControlPlaneTestContextBuilder<'_, N>, + populate: PopulateCrdb, + sim_mode: sim::SimMode, + initial_cert: Option, +) -> ControlPlaneTestContext { + builder.start_crdb_impl(populate).await; builder.start_clickhouse().await; builder.start_dendrite(SwitchLocation::Switch0).await; builder.start_dendrite(SwitchLocation::Switch1).await; diff --git a/test-utils/Cargo.toml b/test-utils/Cargo.toml index 9e21f3ca12..7b1f70c79e 100644 --- a/test-utils/Cargo.toml +++ b/test-utils/Cargo.toml @@ -6,20 +6,26 @@ license = "MPL-2.0" [dependencies] anyhow.workspace = true +atomicwrites.workspace = true camino.workspace = true +camino-tempfile.workspace = true dropshot.workspace = true +filetime = { workspace = true, optional = true } futures.workspace = true headers.workspace = true +hex.workspace = true http.workspace = true libc.workspace = true omicron-common.workspace = true pem.workspace = true +ring.workspace = true rustls.workspace = true slog.workspace = true subprocess.workspace = true tempfile.workspace = true thiserror.workspace = true -tokio = { workspace = true, features = [ "full" ] } +tar.workspace = true +tokio = { workspace = true, features = ["full"] } tokio-postgres.workspace = true usdt.workspace = true rcgen.workspace = true @@ -29,3 +35,6 @@ omicron-workspace-hack.workspace = true [dev-dependencies] expectorate.workspace = true + +[features] +seed-gen = ["dep:filetime"] diff --git a/test-utils/src/dev/mod.rs b/test-utils/src/dev/mod.rs index ea95a1de76..dbd66fe1f8 100644 --- a/test-utils/src/dev/mod.rs +++ b/test-utils/src/dev/mod.rs @@ -9,55 +9,21 @@ pub mod clickhouse; pub mod db; pub mod dendrite; pub mod poll; +#[cfg(feature = "seed-gen")] +pub mod seed; pub mod test_cmds; -use anyhow::Context; -use camino::Utf8Path; +use anyhow::{Context, Result}; use camino::Utf8PathBuf; pub use dropshot::test_util::LogContext; use dropshot::ConfigLogging; use dropshot::ConfigLoggingIfExists; use dropshot::ConfigLoggingLevel; use slog::Logger; -use std::path::Path; +use std::io::BufReader; -// Helper for copying all the files in one directory to another. 
-fn copy_dir( - src: impl AsRef, - dst: impl AsRef, -) -> Result<(), anyhow::Error> { - let src = src.as_ref(); - let dst = dst.as_ref(); - std::fs::create_dir_all(&dst) - .with_context(|| format!("Failed to create dst {}", dst.display()))?; - for entry in std::fs::read_dir(src) - .with_context(|| format!("Failed to read_dir {}", src.display()))? - { - let entry = entry.with_context(|| { - format!("Failed to read entry in {}", src.display()) - })?; - let ty = entry.file_type().context("Failed to access file type")?; - let target = dst.join(entry.file_name()); - if ty.is_dir() { - copy_dir(entry.path(), &target).with_context(|| { - format!( - "Failed to copy subdirectory {} to {}", - entry.path().display(), - target.display() - ) - })?; - } else { - std::fs::copy(entry.path(), &target).with_context(|| { - format!( - "Failed to copy file at {} to {}", - entry.path().display(), - target.display() - ) - })?; - } - } - Ok(()) -} +/// The environment variable via which the path to the seed tarball is passed. +pub static CRDB_SEED_TAR_ENV: &str = "CRDB_SEED_TAR"; /// Set up a [`dropshot::test_util::LogContext`] appropriate for a test named /// `test_name` @@ -80,36 +46,9 @@ pub enum StorageSource { DoNotPopulate, /// Populate the latest version of the database. PopulateLatest { output_dir: Utf8PathBuf }, - /// Copy the database from a seed directory, which has previously + /// Copy the database from a seed tarball, which has previously /// been created with `PopulateLatest`. - CopyFromSeed { input_dir: Utf8PathBuf }, -} - -/// Creates a [`db::CockroachInstance`] with a populated storage directory. -/// -/// This is intended to optimize subsequent calls to [`test_setup_database`] -/// by reducing the latency of populating the storage directory. -pub async fn test_setup_database_seed(log: &Logger, dir: &Utf8Path) { - let _ = std::fs::remove_dir_all(dir); - std::fs::create_dir_all(dir).unwrap(); - let mut db = setup_database( - log, - StorageSource::PopulateLatest { output_dir: dir.to_owned() }, - ) - .await; - db.cleanup().await.unwrap(); - - // See https://github.com/cockroachdb/cockroach/issues/74231 for context on - // this. We use this assertion to check that our seed directory won't point - // back to itself, even if it is copied elsewhere. - assert_eq!( - 0, - dir.join("temp-dirs-record.txt") - .metadata() - .expect("Cannot access metadata") - .len(), - "Temporary directory record should be empty after graceful shutdown", - ); + CopyFromSeed { input_tar: Utf8PathBuf }, } /// Set up a [`db::CockroachInstance`] for running tests. @@ -118,13 +57,15 @@ pub async fn test_setup_database( source: StorageSource, ) -> db::CockroachInstance { usdt::register_probes().expect("Failed to register USDT DTrace probes"); - setup_database(log, source).await + setup_database(log, source).await.unwrap() } +// TODO: switch to anyhow entirely -- this function is currently a mishmash of +// anyhow and unwrap/expect calls. async fn setup_database( log: &Logger, storage_source: StorageSource, -) -> db::CockroachInstance { +) -> Result { let builder = db::CockroachStarterBuilder::new(); let mut builder = match &storage_source { StorageSource::DoNotPopulate | StorageSource::CopyFromSeed { .. 
} => { @@ -135,7 +76,7 @@ async fn setup_database( } }; builder.redirect_stdio_to_files(); - let starter = builder.build().unwrap(); + let starter = builder.build().context("error building CockroachStarter")?; info!( &log, "cockroach temporary directory: {}", @@ -147,13 +88,22 @@ async fn setup_database( match &storage_source { StorageSource::DoNotPopulate | StorageSource::PopulateLatest { .. } => { } - StorageSource::CopyFromSeed { input_dir } => { + StorageSource::CopyFromSeed { input_tar } => { info!(&log, - "cockroach: copying from seed directory ({}) to storage directory ({})", - input_dir, starter.store_dir().to_string_lossy(), + "cockroach: copying from seed tarball ({}) to storage directory ({})", + input_tar, starter.store_dir().to_string_lossy(), ); - copy_dir(input_dir, starter.store_dir()) - .expect("Cannot copy storage from seed directory"); + let reader = std::fs::File::open(input_tar).with_context(|| { + format!("cannot open input tar {}", input_tar) + })?; + let mut tar = tar::Archive::new(BufReader::new(reader)); + tar.unpack(starter.store_dir()).with_context(|| { + format!( + "cannot unpack input tar {} into {}", + input_tar, + starter.store_dir().display() + ) + })?; } } @@ -184,7 +134,8 @@ async fn setup_database( info!(&log, "cockroach: populated"); } } - database + + Ok(database) } /// Returns whether the given process is currently running diff --git a/test-utils/src/dev/seed.rs b/test-utils/src/dev/seed.rs new file mode 100644 index 0000000000..841ecd5f35 --- /dev/null +++ b/test-utils/src/dev/seed.rs @@ -0,0 +1,239 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use std::io::{BufWriter, Write}; + +use anyhow::{ensure, Context, Result}; +use camino::{Utf8Path, Utf8PathBuf}; +use filetime::FileTime; +use slog::Logger; + +use super::CRDB_SEED_TAR_ENV; + +/// Creates a string identifier for the current DB schema and version. +// +/// The goal here is to allow to create different "seed" tarballs +/// for each revision of the DB. +pub fn digest_unique_to_schema() -> String { + let schema = include_str!("../../../schema/crdb/dbinit.sql"); + let crdb_version = include_str!("../../../tools/cockroachdb_version"); + let mut ctx = ring::digest::Context::new(&ring::digest::SHA256); + ctx.update(&schema.as_bytes()); + ctx.update(&crdb_version.as_bytes()); + let digest = ctx.finish(); + hex::encode(digest.as_ref()) +} + +/// Looks up the standard environment variable `CRDB_SEED_INVALIDATE` to check +/// if a seed should be invalidated. Returns a string to pass in as the +/// `why_invalidate` argument of [`ensure_seed_tarball_exists`]. +pub fn should_invalidate_seed() -> Option<&'static str> { + (std::env::var("CRDB_SEED_INVALIDATE").as_deref() == Ok("1")) + .then_some("CRDB_SEED_INVALIDATE=1 set in environment") +} + +/// The return value of [`ensure_seed_tarball_exists`]. 
+#[derive(Clone, Copy, Debug)] +pub enum SeedTarballStatus { + Created, + Invalidated, + Existing, +} + +impl SeedTarballStatus { + pub fn log(self, log: &Logger, seed_tar: &Utf8Path) { + match self { + SeedTarballStatus::Created => { + info!(log, "Created CRDB seed tarball: `{seed_tar}`"); + } + SeedTarballStatus::Invalidated => { + info!( + log, + "Invalidated and created new CRDB seed tarball: `{seed_tar}`", + ); + } + SeedTarballStatus::Existing => { + info!(log, "Using existing CRDB seed tarball: `{seed_tar}`"); + } + } + } +} + +/// Ensures that a seed tarball corresponding to the schema returned by +/// [`digest_unique_to_schema`] exists, recreating it if necessary. +/// +/// This used to create a directory rather than a tarball, but that was changed +/// due to [Omicron issue +/// #4193](https://github.com/oxidecomputer/omicron/issues/4193). +/// +/// If `why_invalidate` is `Some`, then if the seed tarball exists, it will be +/// deleted before being recreated. +/// +/// # Notes +/// +/// This method should _not_ be used by tests. Instead, rely on the `crdb-seed` +/// setup script. +pub async fn ensure_seed_tarball_exists( + log: &Logger, + why_invalidate: Option<&str>, +) -> Result<(Utf8PathBuf, SeedTarballStatus)> { + // If the CRDB_SEED_TAR_ENV variable is set, return an error. + // + // Even though this module is gated behind a feature flag, omicron-dev needs + // this function -- and so, if you're doing a top-level `cargo nextest run` + // like CI does, feature unification would mean this gets included in test + // binaries anyway. So this acts as a belt-and-suspenders check. + if let Ok(val) = std::env::var(CRDB_SEED_TAR_ENV) { + anyhow::bail!( + "{CRDB_SEED_TAR_ENV} is set to `{val}` -- implying that a test called \ + ensure_seed_tarball_exists. Instead, tests should rely on the `crdb-seed` \ + setup script." + ); + } + + // XXX: we aren't considering cross-user permissions for this file. Might be + // worth setting more restrictive permissions on it, or using a per-user + // cache dir. + let base_seed_dir = Utf8PathBuf::from_path_buf(std::env::temp_dir()) + .expect("Not a UTF-8 path") + .join("crdb-base"); + std::fs::create_dir_all(&base_seed_dir).unwrap(); + let mut desired_seed_tar = base_seed_dir.join(digest_unique_to_schema()); + desired_seed_tar.set_extension("tar"); + + let invalidated = match (desired_seed_tar.exists(), why_invalidate) { + (true, Some(why)) => { + slog::info!( + log, + "{why}: invalidating seed tarball: `{desired_seed_tar}`", + ); + std::fs::remove_file(&desired_seed_tar) + .context("failed to remove seed tarball")?; + true + } + (true, None) => { + // The tarball exists. Update its atime and mtime (i.e. `touch` it) + // to ensure that it doesn't get deleted by a /tmp cleaner. + let now = FileTime::now(); + filetime::set_file_times(&desired_seed_tar, now, now) + .context("failed to update seed tarball atime and mtime")?; + return Ok((desired_seed_tar, SeedTarballStatus::Existing)); + } + (false, Some(why)) => { + slog::info!( + log, + "{why}, but seed tarball does not exist: `{desired_seed_tar}`", + ); + false + } + (false, None) => { + // The tarball doesn't exist. + false + } + }; + + // At this point the tarball does not exist (either because it didn't exist + // in the first place or because it was deleted above), so try to create it. + // + // Nextest will execute this function just once via the `crdb-seed` binary, + // but it is possible for a user to start up multiple nextest processes to + // be running at the same time. 
So we should consider it possible for + // another caller to create this seed tarball before we finish setting it up + // ourselves. + test_setup_database_seed(log, &desired_seed_tar) + .await + .context("failed to setup seed tarball")?; + + let status = if invalidated { + SeedTarballStatus::Invalidated + } else { + SeedTarballStatus::Created + }; + Ok((desired_seed_tar, status)) +} + +/// Creates a seed file for a Cockroach database at the output tarball. +/// +/// This is intended to optimize subsequent calls to +/// [`test_setup_database`](super::test_setup_database) by reducing the latency +/// of populating the storage directory. +pub async fn test_setup_database_seed( + log: &Logger, + output_tar: &Utf8Path, +) -> Result<()> { + let base_seed_dir = output_tar.parent().unwrap(); + let tmp_seed_dir = camino_tempfile::Utf8TempDir::new_in(base_seed_dir) + .context("failed to create temporary seed directory")?; + + let mut db = super::setup_database( + log, + super::StorageSource::PopulateLatest { + output_dir: tmp_seed_dir.path().to_owned(), + }, + ) + .await + .context("failed to setup database")?; + db.cleanup().await.context("failed to cleanup database")?; + + // See https://github.com/cockroachdb/cockroach/issues/74231 for context on + // this. We use this assertion to check that our seed directory won't point + // back to itself, even if it is copied elsewhere. + let dirs_record_path = tmp_seed_dir.path().join("temp-dirs-record.txt"); + let dirs_record_len = dirs_record_path + .metadata() + .with_context(|| { + format!("cannot access metadata for {dirs_record_path}") + })? + .len(); + ensure!( + dirs_record_len == 0, + "Temporary directory record should be empty (was {dirs_record_len}) \ + after graceful shutdown", + ); + + let output_tar = output_tar.to_owned(); + + tokio::task::spawn_blocking(move || { + // Tar up the directory -- this prevents issues where some but not all of + // the files get cleaned up by /tmp cleaners. See + // https://github.com/oxidecomputer/omicron/issues/4193. + let atomic_file = atomicwrites::AtomicFile::new( + &output_tar, + // We don't expect this to exist, but if it does, we want to overwrite + // it. That is because there's a remote possibility that multiple + // instances of test_setup_database_seed are running simultaneously. + atomicwrites::OverwriteBehavior::AllowOverwrite, + ); + let res = atomic_file.write(|f| { + // Tar up the directory here. + let writer = BufWriter::new(f); + let mut tar = tar::Builder::new(writer); + tar.follow_symlinks(false); + tar.append_dir_all(".", tmp_seed_dir.path()).with_context( + || { + format!( + "failed to append directory `{}` to tarball", + tmp_seed_dir.path(), + ) + }, + )?; + + let mut writer = + tar.into_inner().context("failed to finish writing tarball")?; + writer.flush().context("failed to flush tarball")?; + + Ok::<_, anyhow::Error>(()) + }); + match res { + Ok(()) => Ok(()), + Err(atomicwrites::Error::Internal(error)) => Err(error) + .with_context(|| { + format!("failed to write seed tarball: `{}`", output_tar) + }), + Err(atomicwrites::Error::User(error)) => Err(error), + } + }) + .await + .context("error in task to tar up contents")? 
+} diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 8854ef27bc..106da92f62 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -215,49 +215,56 @@ bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-f hyper-rustls = { version = "0.24.1" } mio = { version = "0.8.8", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } -rustix = { version = "0.38.9", features = ["fs", "termios"] } +rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.9", features = ["fs", "termios"] } +rustix-d736d0ac4424f0f1 = { package = "rustix", version = "0.37.23", features = ["fs", "termios"] } [target.x86_64-unknown-linux-gnu.build-dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } hyper-rustls = { version = "0.24.1" } mio = { version = "0.8.8", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } -rustix = { version = "0.38.9", features = ["fs", "termios"] } +rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.9", features = ["fs", "termios"] } +rustix-d736d0ac4424f0f1 = { package = "rustix", version = "0.37.23", features = ["fs", "termios"] } [target.x86_64-apple-darwin.dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } hyper-rustls = { version = "0.24.1" } mio = { version = "0.8.8", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } -rustix = { version = "0.38.9", features = ["fs", "termios"] } +rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.9", features = ["fs", "termios"] } +rustix-d736d0ac4424f0f1 = { package = "rustix", version = "0.37.23", features = ["fs", "termios"] } [target.x86_64-apple-darwin.build-dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } hyper-rustls = { version = "0.24.1" } mio = { version = "0.8.8", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } -rustix = { version = "0.38.9", features = ["fs", "termios"] } +rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.9", features = ["fs", "termios"] } +rustix-d736d0ac4424f0f1 = { package = "rustix", version = "0.37.23", features = ["fs", "termios"] } [target.aarch64-apple-darwin.dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } hyper-rustls = { version = "0.24.1" } mio = { version = "0.8.8", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } -rustix = { version = "0.38.9", features = ["fs", "termios"] } +rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.9", features = ["fs", "termios"] } +rustix-d736d0ac4424f0f1 = { package = "rustix", version = "0.37.23", features = ["fs", "termios"] } [target.aarch64-apple-darwin.build-dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } hyper-rustls = { version = "0.24.1" } mio = { version = "0.8.8", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } -rustix = { version = "0.38.9", features = ["fs", "termios"] } +rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.9", features = ["fs", "termios"] } +rustix-d736d0ac4424f0f1 = { package = 
"rustix", version = "0.37.23", features = ["fs", "termios"] } [target.x86_64-unknown-illumos.dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } hyper-rustls = { version = "0.24.1" } mio = { version = "0.8.8", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } -rustix = { version = "0.38.9", features = ["fs", "termios"] } +rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.9", features = ["fs", "termios"] } +rustix-d736d0ac4424f0f1 = { package = "rustix", version = "0.37.23", features = ["fs", "termios"] } toml_datetime = { version = "0.6.3", default-features = false, features = ["serde"] } toml_edit = { version = "0.19.15", features = ["serde"] } @@ -266,7 +273,8 @@ bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-f hyper-rustls = { version = "0.24.1" } mio = { version = "0.8.8", features = ["net", "os-ext"] } once_cell = { version = "1.18.0", features = ["unstable"] } -rustix = { version = "0.38.9", features = ["fs", "termios"] } +rustix-d585fab2519d2d1 = { package = "rustix", version = "0.38.9", features = ["fs", "termios"] } +rustix-d736d0ac4424f0f1 = { package = "rustix", version = "0.37.23", features = ["fs", "termios"] } toml_datetime = { version = "0.6.3", default-features = false, features = ["serde"] } toml_edit = { version = "0.19.15", features = ["serde"] } From d624bce9af25e5bc1141de4dcdb50a15223f106d Mon Sep 17 00:00:00 2001 From: Andy Fiddaman Date: Mon, 9 Oct 2023 20:02:08 +0100 Subject: [PATCH 03/13] Back /var/fm/fmd with a dataset from the boot M.2 (#4212) `/var/fm/fmd` is where the illumos fault management system records data. We want to preserve this data across system reboots and in real time rather than via periodic data copying, so that the information is available should the system panic shortly thereafter. Fixes: https://github.com/oxidecomputer/omicron/issues/4211 --- illumos-utils/src/zfs.rs | 41 ++++-- sled-agent/src/backing_fs.rs | 178 +++++++++++++++++++++++++ sled-agent/src/bootstrap/pre_server.rs | 1 + sled-agent/src/lib.rs | 1 + sled-agent/src/sled_agent.rs | 22 ++- sled-agent/src/storage_manager.rs | 1 + sled-agent/src/swap_device.rs | 3 - sled-hardware/src/disk.rs | 15 ++- 8 files changed, 242 insertions(+), 20 deletions(-) create mode 100644 sled-agent/src/backing_fs.rs diff --git a/illumos-utils/src/zfs.rs b/illumos-utils/src/zfs.rs index ba8cd8c84a..9118a9a3cd 100644 --- a/illumos-utils/src/zfs.rs +++ b/illumos-utils/src/zfs.rs @@ -61,6 +61,9 @@ enum EnsureFilesystemErrorRaw { #[error("Failed to mount encrypted filesystem: {0}")] MountEncryptedFsFailed(crate::ExecutionError), + + #[error("Failed to mount overlay filesystem: {0}")] + MountOverlayFsFailed(crate::ExecutionError), } /// Error returned by [`Zfs::ensure_filesystem`]. @@ -202,6 +205,7 @@ impl Zfs { /// Creates a new ZFS filesystem named `name`, unless one already exists. /// /// Applies an optional quota, provided _in bytes_. 
+ #[allow(clippy::too_many_arguments)] pub fn ensure_filesystem( name: &str, mountpoint: Mountpoint, @@ -209,6 +213,7 @@ impl Zfs { do_format: bool, encryption_details: Option, size_details: Option, + additional_options: Option>, ) -> Result<(), EnsureFilesystemError> { let (exists, mounted) = Self::dataset_exists(name, &mountpoint)?; if exists { @@ -261,7 +266,14 @@ impl Zfs { ]); } + if let Some(opts) = additional_options { + for o in &opts { + cmd.args(&["-o", &o]); + } + } + cmd.args(&["-o", &format!("mountpoint={}", mountpoint), name]); + execute(cmd).map_err(|err| EnsureFilesystemError { name: name.to_string(), mountpoint: mountpoint.clone(), @@ -322,6 +334,20 @@ impl Zfs { Ok(()) } + pub fn mount_overlay_dataset( + name: &str, + mountpoint: &Mountpoint, + ) -> Result<(), EnsureFilesystemError> { + let mut command = std::process::Command::new(PFEXEC); + let cmd = command.args(&[ZFS, "mount", "-O", name]); + execute(cmd).map_err(|err| EnsureFilesystemError { + name: name.to_string(), + mountpoint: mountpoint.clone(), + err: EnsureFilesystemErrorRaw::MountOverlayFsFailed(err), + })?; + Ok(()) + } + // Return (true, mounted) if the dataset exists, (false, false) otherwise, // where mounted is if the dataset is mounted. fn dataset_exists( @@ -385,7 +411,7 @@ impl Zfs { Zfs::get_value(filesystem_name, &format!("oxide:{}", name)) } - fn get_value( + pub fn get_value( filesystem_name: &str, name: &str, ) -> Result { @@ -422,13 +448,12 @@ pub fn get_all_omicron_datasets_for_delete() -> anyhow::Result> { let internal = pool.kind() == crate::zpool::ZpoolKind::Internal; let pool = pool.to_string(); for dataset in &Zfs::list_datasets(&pool)? { - // Avoid erasing crashdump datasets on internal pools - if dataset == "crash" && internal { - continue; - } - - // The swap device might be in use, so don't assert that it can be deleted. - if dataset == "swap" && internal { + // Avoid erasing crashdump, backing data and swap datasets on + // internal pools. The swap device may be in use. + if internal + && (["crash", "backing", "swap"].contains(&dataset.as_str()) + || dataset.starts_with("backing/")) + { continue; } diff --git a/sled-agent/src/backing_fs.rs b/sled-agent/src/backing_fs.rs new file mode 100644 index 0000000000..5014ac5999 --- /dev/null +++ b/sled-agent/src/backing_fs.rs @@ -0,0 +1,178 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Operations for dealing with persistent backing mounts for OS data + +// On Oxide hardware, the root filesystem is backed by a ramdisk and +// non-persistent. However, there are several things within the root filesystem +// which are useful to preserve across reboots, and these are backed persistent +// datasets on the boot disk. +// +// Each boot disk contains a dataset sled_hardware::disk::M2_BACKING_DATASET +// and for each backing mount, a child dataset is created under there that +// is configured with the desired mountpoint in the root filesystem. Since +// there are multiple disks which can be used to boot, these datasets are also +// marked with the "canmount=noauto" attribute so that they do not all try to +// mount automatically and race -- only one could ever succeed. 
This allows us +// to come along later and specifically mount the one that we want (the one from +// the current boot disk) and also perform an overlay mount so that it succeeds +// even if there is content from the ramdisk image or early boot services +// present underneath. The overlay mount action is optionally bracketed with a +// service stop/start. + +use camino::Utf8PathBuf; +use illumos_utils::zfs::{ + EnsureFilesystemError, GetValueError, Mountpoint, SizeDetails, Zfs, +}; + +#[derive(Debug, thiserror::Error)] +pub enum BackingFsError { + #[error("Error administering service: {0}")] + Adm(#[from] smf::AdmError), + + #[error("Error retrieving dataset property: {0}")] + DatasetProperty(#[from] GetValueError), + + #[error("Error initializing dataset: {0}")] + Mount(#[from] EnsureFilesystemError), +} + +struct BackingFs { + // Dataset name + name: &'static str, + // Mountpoint + mountpoint: &'static str, + // Optional quota, in _bytes_ + quota: Option, + // Optional compression mode + compression: Option<&'static str>, + // Linked service + service: Option<&'static str>, +} + +impl BackingFs { + const fn new(name: &'static str) -> Self { + Self { + name, + mountpoint: "legacy", + quota: None, + compression: None, + service: None, + } + } + + const fn mountpoint(mut self, mountpoint: &'static str) -> Self { + self.mountpoint = mountpoint; + self + } + + const fn quota(mut self, quota: usize) -> Self { + self.quota = Some(quota); + self + } + + const fn compression(mut self, compression: &'static str) -> Self { + self.compression = Some(compression); + self + } + + const fn service(mut self, service: &'static str) -> Self { + self.service = Some(service); + self + } +} + +const BACKING_FMD_DATASET: &'static str = "fmd"; +const BACKING_FMD_MOUNTPOINT: &'static str = "/var/fm/fmd"; +const BACKING_FMD_SERVICE: &'static str = "svc:/system/fmd:default"; +const BACKING_FMD_QUOTA: usize = 500 * (1 << 20); // 500 MiB + +const BACKING_COMPRESSION: &'static str = "on"; + +const BACKINGFS_COUNT: usize = 1; +static BACKINGFS: [BackingFs; BACKINGFS_COUNT] = + [BackingFs::new(BACKING_FMD_DATASET) + .mountpoint(BACKING_FMD_MOUNTPOINT) + .quota(BACKING_FMD_QUOTA) + .compression(BACKING_COMPRESSION) + .service(BACKING_FMD_SERVICE)]; + +/// Ensure that the backing filesystems are mounted. +/// If the underlying dataset for a backing fs does not exist on the specified +/// boot disk then it will be created. +pub(crate) fn ensure_backing_fs( + log: &slog::Logger, + boot_zpool_name: &illumos_utils::zpool::ZpoolName, +) -> Result<(), BackingFsError> { + let log = log.new(o!( + "component" => "BackingFs", + )); + for bfs in BACKINGFS.iter() { + info!(log, "Processing {}", bfs.name); + + let dataset = format!( + "{}/{}/{}", + boot_zpool_name, + sled_hardware::disk::M2_BACKING_DATASET, + bfs.name + ); + let mountpoint = Mountpoint::Path(Utf8PathBuf::from(bfs.mountpoint)); + + info!(log, "Ensuring dataset {}", dataset); + + let size_details = Some(SizeDetails { + quota: bfs.quota, + compression: bfs.compression, + }); + + Zfs::ensure_filesystem( + &dataset, + mountpoint.clone(), + false, // zoned + true, // do_format + None, // encryption_details, + size_details, + Some(vec!["canmount=noauto".to_string()]), // options + )?; + + // Check if a ZFS filesystem is already mounted on bfs.mountpoint by + // retrieving the ZFS `mountpoint` property and comparing it. 
This + // might seem counter-intuitive but if there is a filesystem mounted + // there, its mountpoint will match, and if not then we will retrieve + // the mountpoint of a higher level filesystem, such as '/'. If we + // can't retrieve the property at all, then there is definitely no ZFS + // filesystem mounted there - most likely we are running with a non-ZFS + // root, such as when net booted during CI. + if Zfs::get_value(&bfs.mountpoint, "mountpoint") + .unwrap_or("not-zfs".to_string()) + == bfs.mountpoint + { + info!(log, "{} is already mounted", bfs.mountpoint); + return Ok(()); + } + + if let Some(service) = bfs.service { + info!(log, "Stopping service {}", service); + smf::Adm::new() + .disable() + .temporary() + .synchronous() + .run(smf::AdmSelection::ByPattern(&[service]))?; + } + + info!(log, "Mounting {} on {}", dataset, mountpoint); + + Zfs::mount_overlay_dataset(&dataset, &mountpoint)?; + + if let Some(service) = bfs.service { + info!(log, "Starting service {}", service); + smf::Adm::new() + .enable() + .synchronous() + .run(smf::AdmSelection::ByPattern(&[service]))?; + } + } + + Ok(()) +} diff --git a/sled-agent/src/bootstrap/pre_server.rs b/sled-agent/src/bootstrap/pre_server.rs index 0899bdd82f..71325fef3d 100644 --- a/sled-agent/src/bootstrap/pre_server.rs +++ b/sled-agent/src/bootstrap/pre_server.rs @@ -381,6 +381,7 @@ fn ensure_zfs_ramdisk_dataset() -> Result<(), StartError> { do_format, encryption_details, quota, + None, ) .map_err(StartError::EnsureZfsRamdiskDataset) } diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index 5c4dbd8310..4e7921c605 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -17,6 +17,7 @@ pub mod sim; pub mod common; // Modules for the non-simulated sled agent. +mod backing_fs; pub mod bootstrap; pub mod config; mod http_entrypoints; diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 7e62f6a8a7..5574edca55 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -59,9 +59,15 @@ use illumos_utils::{dladm::MockDladm as Dladm, zone::MockZones as Zones}; #[derive(thiserror::Error, Debug)] pub enum Error { + #[error("Could not find boot disk")] + BootDiskNotFound, + #[error("Configuration error: {0}")] Config(#[from] crate::config::ConfigError), + #[error("Error setting up backing filesystems: {0}")] + BackingFs(#[from] crate::backing_fs::BackingFsError), + #[error("Error setting up swap device: {0}")] SwapDevice(#[from] crate::swap_device::SwapDeviceError), @@ -268,14 +274,17 @@ impl SledAgent { )); info!(&log, "SledAgent::new(..) starting"); - // Configure a swap device of the configured size before other system setup. + let boot_disk = storage + .resources() + .boot_disk() + .await + .ok_or_else(|| Error::BootDiskNotFound)?; + + // Configure a swap device of the configured size before other system + // setup. match config.swap_device_size_gb { Some(sz) if sz > 0 => { info!(log, "Requested swap device of size {} GiB", sz); - let boot_disk = - storage.resources().boot_disk().await.ok_or_else(|| { - crate::swap_device::SwapDeviceError::BootDiskNotFound - })?; crate::swap_device::ensure_swap_device( &parent_log, &boot_disk.1, @@ -290,6 +299,9 @@ impl SledAgent { } } + info!(log, "Mounting backing filesystems"); + crate::backing_fs::ensure_backing_fs(&parent_log, &boot_disk.1)?; + // Ensure we have a thread that automatically reaps process contracts // when they become empty. See the comments in // illumos-utils/src/running_zone.rs for more detail. 
diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index bd71371396..c31a4dc0bc 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -417,6 +417,7 @@ impl StorageWorker { do_format, encryption_details, size_details, + None, )?; // Ensure the dataset has a usable UUID. if let Ok(id_str) = Zfs::get_oxide_value(&fs_name, "uuid") { diff --git a/sled-agent/src/swap_device.rs b/sled-agent/src/swap_device.rs index 5a8f40adbd..6a00b42672 100644 --- a/sled-agent/src/swap_device.rs +++ b/sled-agent/src/swap_device.rs @@ -9,9 +9,6 @@ use zeroize::Zeroize; #[derive(Debug, thiserror::Error)] pub enum SwapDeviceError { - #[error("Could not find boot disk")] - BootDiskNotFound, - #[error("Error running ZFS command: {0}")] Zfs(illumos_utils::ExecutionError), diff --git a/sled-hardware/src/disk.rs b/sled-hardware/src/disk.rs index aec99ae3f8..e3078cbeea 100644 --- a/sled-hardware/src/disk.rs +++ b/sled-hardware/src/disk.rs @@ -256,6 +256,7 @@ pub const CRASH_DATASET: &'static str = "crash"; pub const CLUSTER_DATASET: &'static str = "cluster"; pub const CONFIG_DATASET: &'static str = "config"; pub const M2_DEBUG_DATASET: &'static str = "debug"; +pub const M2_BACKING_DATASET: &'static str = "backing"; // TODO-correctness: This value of 100GiB is a pretty wild guess, and should be // tuned as needed. pub const DEBUG_DATASET_QUOTA: usize = 100 * (1 << 30); @@ -282,7 +283,7 @@ static U2_EXPECTED_DATASETS: [ExpectedDataset; U2_EXPECTED_DATASET_COUNT] = [ .compression(DUMP_DATASET_COMPRESSION), ]; -const M2_EXPECTED_DATASET_COUNT: usize = 5; +const M2_EXPECTED_DATASET_COUNT: usize = 6; static M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [ // Stores software images. // @@ -290,7 +291,11 @@ static M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [ ExpectedDataset::new(INSTALL_DATASET), // Stores crash dumps. ExpectedDataset::new(CRASH_DATASET), - // Stores cluter configuration information. + // Backing store for OS data that should be persisted across reboots. + // Its children are selectively overlay mounted onto parts of the ramdisk + // root. + ExpectedDataset::new(M2_BACKING_DATASET), + // Stores cluster configuration information. // // Should be duplicated to both M.2s. ExpectedDataset::new(CLUSTER_DATASET), @@ -524,6 +529,7 @@ impl Disk { do_format, Some(encryption_details), None, + None, ); keyfile.zero_and_unlink().await.map_err(|error| { @@ -562,8 +568,8 @@ impl Disk { "Automatically destroying dataset: {}", name ); Zfs::destroy_dataset(name).or_else(|err| { - // If we can't find the dataset, that's fine -- it might - // not have been formatted yet. + // If we can't find the dataset, that's fine -- it + // might not have been formatted yet. if let DestroyDatasetErrorVariant::NotFound = err.err { @@ -588,6 +594,7 @@ impl Disk { do_format, encryption_details, size_details, + None, )?; if dataset.wipe { From 47a6b42c986c65292ee61b0c79090fa57dec5fe9 Mon Sep 17 00:00:00 2001 From: David Crespo Date: Mon, 9 Oct 2023 16:35:01 -0500 Subject: [PATCH 04/13] [nexus] Add `/v1/ping` endpoint (#3925) Closes #3923 Adds `/v1/ping` that always returns `{ "status": "ok" }` if it returns anything at all. I went with `ping` over the initial `/v1/system/health` because the latter is vague about its meaning, whereas everyone know ping means a trivial request and response. I also thought it was weird to put an endpoint with no auth check under `/v1/system`, where ~all the other endpoints require fleet-level perms. 
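[Editor's note] For anyone wanting to try it, the call is as simple as it gets; the hostname below is a placeholder, and the response shape comes from the `Ping` view added in this change.

```sh
# Hypothetical silo hostname -- /v1/ping requires no authentication.
curl -s https://oxide.example.com/v1/ping
# -> {"status":"ok"}
```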
This doesn't add too much over hitting an existing endpoint, but I think it's worth it because * It doesn't hit the DB * It has no auth check * It gives a very simple answer to "what endpoint should I use to ping the API?" (a question we have gotten at least once) * It's easy (I already did it) Questions that occurred to me while working through this: - Should we actually attempt to do something in the handler that would tell us, e.g., whether the DB is up? - No, that would be more than a ping - Raises DoS questions if not auth gated - Could add a db status endpoint or or you could use any endpoint that returns data - What tag should this be under? - Initially added a `system` tag because a) this doesn't fit under existing `system/blah` tags and b) it really does feel miscellaneous - Changed to `system/status`, with the idea that if we add other kinds of checks, they would be new endpoints under this tag. --- nexus/src/external_api/http_entrypoints.rs | 16 ++++++ nexus/src/external_api/tag-config.json | 6 ++ nexus/tests/integration_tests/basic.rs | 13 ++++- nexus/tests/integration_tests/endpoints.rs | 1 - nexus/tests/output/nexus_tags.txt | 4 ++ .../output/uncovered-authz-endpoints.txt | 1 + nexus/types/src/external_api/views.rs | 15 +++++ openapi/nexus.json | 57 +++++++++++++++++++ 8 files changed, 111 insertions(+), 2 deletions(-) diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 6e614d5644..ac5cf76775 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -98,6 +98,8 @@ type NexusApiDescription = ApiDescription>; /// Returns a description of the external nexus API pub(crate) fn external_api() -> NexusApiDescription { fn register_endpoints(api: &mut NexusApiDescription) -> Result<(), String> { + api.register(ping)?; + api.register(system_policy_view)?; api.register(system_policy_update)?; @@ -364,6 +366,20 @@ pub(crate) fn external_api() -> NexusApiDescription { // clients. Client generators use operationId to name API methods, so changing // a function name is a breaking change from a client perspective. +/// Ping API +/// +/// Always responds with Ok if it responds at all. +#[endpoint { + method = GET, + path = "/v1/ping", + tags = ["system/status"], +}] +async fn ping( + _rqctx: RequestContext>, +) -> Result, HttpError> { + Ok(HttpResponseOk(views::Ping { status: views::PingStatus::Ok })) +} + /// Fetch the top-level IAM policy #[endpoint { method = GET, diff --git a/nexus/src/external_api/tag-config.json b/nexus/src/external_api/tag-config.json index e985ea7db4..07eb198016 100644 --- a/nexus/src/external_api/tag-config.json +++ b/nexus/src/external_api/tag-config.json @@ -80,6 +80,12 @@ "url": "http://docs.oxide.computer/api/vpcs" } }, + "system/status": { + "description": "Endpoints related to system health", + "external_docs": { + "url": "http://docs.oxide.computer/api/system-status" + } + }, "system/hardware": { "description": "These operations pertain to hardware inventory and management. Racks are the unit of expansion of an Oxide deployment. 
Racks are in turn composed of sleds, switches, power supplies, and a cabled backplane.", "external_docs": { diff --git a/nexus/tests/integration_tests/basic.rs b/nexus/tests/integration_tests/basic.rs index ab54c97197..282ec0cd96 100644 --- a/nexus/tests/integration_tests/basic.rs +++ b/nexus/tests/integration_tests/basic.rs @@ -10,7 +10,8 @@ use dropshot::HttpErrorResponseBody; use http::method::Method; use http::StatusCode; -use nexus_types::external_api::{params, views::Project}; +use nexus_types::external_api::params; +use nexus_types::external_api::views::{self, Project}; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::IdentityMetadataUpdateParams; use omicron_common::api::external::Name; @@ -546,3 +547,13 @@ async fn test_projects_list(cptestctx: &ControlPlaneTestContext) { .collect::>() ); } + +#[nexus_test] +async fn test_ping(cptestctx: &ControlPlaneTestContext) { + let client = &cptestctx.external_client; + + let health = NexusRequest::object_get(client, "/v1/ping") + .execute_and_parse_unwrap::() + .await; + assert_eq!(health.status, views::PingStatus::Ok); +} diff --git a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index e04d26cc45..e9ae11c21f 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -1876,6 +1876,5 @@ lazy_static! { AllowedMethod::GetNonexistent ], }, - ]; } diff --git a/nexus/tests/output/nexus_tags.txt b/nexus/tests/output/nexus_tags.txt index ca2f737cb0..1d7f5556c2 100644 --- a/nexus/tests/output/nexus_tags.txt +++ b/nexus/tests/output/nexus_tags.txt @@ -172,6 +172,10 @@ silo_view GET /v1/system/silos/{silo} user_builtin_list GET /v1/system/users-builtin user_builtin_view GET /v1/system/users-builtin/{user} +API operations found with tag "system/status" +OPERATION ID METHOD URL PATH +ping GET /v1/ping + API operations found with tag "vpcs" OPERATION ID METHOD URL PATH vpc_create POST /v1/vpcs diff --git a/nexus/tests/output/uncovered-authz-endpoints.txt b/nexus/tests/output/uncovered-authz-endpoints.txt index 0e53222a8a..d76d9c5495 100644 --- a/nexus/tests/output/uncovered-authz-endpoints.txt +++ b/nexus/tests/output/uncovered-authz-endpoints.txt @@ -1,4 +1,5 @@ API endpoints with no coverage in authz tests: +ping (get "/v1/ping") device_auth_request (post "/device/auth") device_auth_confirm (post "/device/confirm") device_access_token (post "/device/token") diff --git a/nexus/types/src/external_api/views.rs b/nexus/types/src/external_api/views.rs index 4b30b0be1c..ef3835c618 100644 --- a/nexus/types/src/external_api/views.rs +++ b/nexus/types/src/external_api/views.rs @@ -522,3 +522,18 @@ pub struct UpdateDeployment { pub version: SemverVersion, pub status: UpdateStatus, } + +// SYSTEM HEALTH + +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +pub enum PingStatus { + Ok, +} + +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, JsonSchema)] +pub struct Ping { + /// Whether the external API is reachable. Will always be Ok if the endpoint + /// returns anything at all. 
+ pub status: PingStatus, +} diff --git a/openapi/nexus.json b/openapi/nexus.json index 9330b0ef47..9dda94f283 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -2816,6 +2816,34 @@ } } }, + "/v1/ping": { + "get": { + "tags": [ + "system/status" + ], + "summary": "Ping API", + "description": "Always responds with Ok if it responds at all.", + "operationId": "ping", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Ping" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/v1/policy": { "get": { "tags": [ @@ -12031,6 +12059,28 @@ "items" ] }, + "Ping": { + "type": "object", + "properties": { + "status": { + "description": "Whether the external API is reachable. Will always be Ok if the endpoint returns anything at all.", + "allOf": [ + { + "$ref": "#/components/schemas/PingStatus" + } + ] + } + }, + "required": [ + "status" + ] + }, + "PingStatus": { + "type": "string", + "enum": [ + "ok" + ] + }, "Project": { "description": "View of a Project", "type": "object", @@ -15277,6 +15327,13 @@ "url": "http://docs.oxide.computer/api/system-silos" } }, + { + "name": "system/status", + "description": "Endpoints related to system health", + "externalDocs": { + "url": "http://docs.oxide.computer/api/system-status" + } + }, { "name": "system/update" }, From d9d39531991cc8843ef38c4d0afc03afe1a58722 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 10 Oct 2023 12:19:23 -0400 Subject: [PATCH 05/13] Do not double count region snapshots records! (#4095) `decrease_crucible_resource_count_and_soft_delete_volume` does not disambiguate cases where the snapshot_addr of a region_snapshot is duplicated with another one, which can occur due to the Crucible Agent reclaiming ports from destroyed daemons (see also #4049, which makes the simulated Crucible agent do this). Several invocations of the snapshot create and snapshot delete sagas could race in such a way that one of these ports would be reclaimed, and then be used in a different snapshot, and the lifetime of both of these would overlap! This would confuse our reference counting, which was written with a naive assumption that this port reuse **wouldn't** occur with these overlapping lifetimes. Spoiler alert, it can: root@[fd00:1122:3344:101::3]:32221/omicron> select * from region_snapshot where snapshot_addr = '[fd00:1122:3344:102::7]:19016'; dataset_id | region_id | snapshot_id | snapshot_addr | volume_references ---------------------------------------+--------------------------------------+--------------------------------------+-------------------------------+-------------------- 80790bfd-4b81-4381-9262-20912e3826cc | 0387bbb7-1d54-4683-943c-6c17d6804de9 | 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 | 0 80790bfd-4b81-4381-9262-20912e3826cc | ff20e066-8815-4eb6-ac84-fab9b9103462 | bdd9614e-f089-4a94-ae46-e10b96b79ba3 | [fd00:1122:3344:102::7]:19016 | 0 (2 rows) One way to solve this would be to create a UNIQUE INDEX on `snapshot_addr` here, but then in these cases the snapshot creation would return a 500 error to the user. This commit adds a sixth column: `deleting`, a boolean that is true when the region snapshot is part of a volume's `resources_to_clean_up`, and false otherwise. 
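[Editor's note] As a loose sketch of the decrement-and-mark step (the authoritative version is the Diesel transaction in `volume.rs` below, which runs all of this inside one transaction), the behavior is roughly the following SQL, shown here through the same `cockroach sql` shell quoted above; the connection flags and the target address are illustrative.

```sh
# Illustrative only -- approximates what the decrement transaction does.
cockroach sql --insecure --host '[fd00:1122:3344:101::3]:32221' -d omicron -e "
  -- 1. decrement refs for live (not-yet-deleting) snapshots at this target
  UPDATE region_snapshot SET volume_references = volume_references - 1
   WHERE snapshot_addr = '[fd00:1122:3344:102::7]:19016'
     AND volume_references > 0 AND deleting = false;
  -- 2. the rows that just hit zero are the ones to clean up ...
  SELECT dataset_id, region_id, snapshot_id FROM region_snapshot
   WHERE snapshot_addr = '[fd00:1122:3344:102::7]:19016'
     AND volume_references = 0 AND deleting = false;
  -- 3. ... and they are marked so a later, racing delete skips them
  UPDATE region_snapshot SET deleting = true
   WHERE snapshot_addr = '[fd00:1122:3344:102::7]:19016'
     AND volume_references = 0 AND deleting = false;
"
```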
This is used to select (as part of the transaction for `decrease_crucible_resource_count_and_soft_delete_volume`) only the region_snapshot records that were decremented as part of that transaction, and skip re-deleting them otherwise. This works because the overlapping lifetime of the records in the DB is **not** the overlapping lifetime of the actual read-only downstairs daemon: for the port to be reclaimed, the original daemon has to be DELETEd, which happens after the decrement transaction has already computed which resources to clean up: 1) a snapshot record is created: ``` snapshot_id | snapshot_addr | volume_references | deleted | -------------------------------------+-------------------------------+-------------------+---------- 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 | 0 | false | ``` 2) it is incremented as part of `volume_create`: ``` snapshot_id | snapshot_addr | volume_references | deleted | -------------------------------------+-------------------------------+-------------------+---------- 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 | 1 | false | ``` 3) when the volume is deleted, then the decrement transaction will: a) decrease `volume_references` by 1 ``` snapshot_id | snapshot_addr | volume_references | deleted | -------------------------------------+-------------------------------+-------------------+---------- 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 | 0 | false | ``` b) note any `region_snapshot` records whose `volume_references` went to 0 and have `deleted` = false, and return those in the list of resources to clean up: [ 1a800928-8f93-4cd3-9df1-4129582ffc20 ] c) set deleted = true for any region_snapshot records whose `volume_references` went to 0 and have deleted = false ``` snapshot_id | snapshot_addr | volume_references | deleted | -------------------------------------+-------------------------------+-------------------+---------- 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 | 0 | true | ``` 4) That read-only snapshot daemon is DELETEd, freeing up the port. 
Another snapshot creation occurs, using that reclaimed port: ``` snapshot_id | snapshot_addr | volume_references | deleted | -------------------------------------+-------------------------------+-------------------+---------- 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 | 0 | true | bdd9614e-f089-4a94-ae46-e10b96b79ba3 | [fd00:1122:3344:102::7]:19016 | 0 | false | ``` 5) That new snapshot is incremented as part of `volume_create`: ``` snapshot_id | snapshot_addr | volume_references | deleted | -------------------------------------+-------------------------------+-------------------+---------- 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 | 0 | true | bdd9614e-f089-4a94-ae46-e10b96b79ba3 | [fd00:1122:3344:102::7]:19016 | 1 | false | ``` 6) It is later deleted, and the decrement transaction will: a) decrease `volume_references` by 1: ``` j snapshot_id | snapshot_addr | volume_references | deleted | -------------------------------------+-------------------------------+-------------------+---------- 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 | 0 | true | bdd9614e-f089-4a94-ae46-e10b96b79ba3 | [fd00:1122:3344:102::7]:19016 | 0 | false | ``` b) note any `region_snapshot` records whose `volume_references` went to 0 and have `deleted` = false, and return those in the list of resources to clean up: [ bdd9614e-f089-4a94-ae46-e10b96b79ba3 ] c) set deleted = true for any region_snapshot records whose `volume_references` went to 0 and have deleted = false ``` snapshot_id | snapshot_addr | volume_references | deleted | -------------------------------------+-------------------------------+-------------------+---------- 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 | 0 | true | bdd9614e-f089-4a94-ae46-e10b96b79ba3 | [fd00:1122:3344:102::7]:19016 | 0 | true | ``` --- dev-tools/omdb/tests/env.out | 6 +- dev-tools/omdb/tests/successes.out | 12 +- nexus/db-model/src/region_snapshot.rs | 3 + nexus/db-model/src/schema.rs | 3 +- nexus/db-queries/src/db/datastore/dataset.rs | 16 + .../src/db/datastore/region_snapshot.rs | 23 ++ nexus/db-queries/src/db/datastore/volume.rs | 100 +++--- nexus/src/app/sagas/snapshot_create.rs | 1 + nexus/src/app/sagas/volume_delete.rs | 177 ++++++---- nexus/tests/integration_tests/snapshots.rs | 36 +- .../integration_tests/volume_management.rs | 308 ++++++++++++++++++ schema/crdb/6.0.0/up1.sql | 1 + schema/crdb/6.0.0/up2.sql | 1 + schema/crdb/dbinit.sql | 5 +- 14 files changed, 563 insertions(+), 129 deletions(-) create mode 100644 schema/crdb/6.0.0/up1.sql create mode 100644 schema/crdb/6.0.0/up2.sql diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index eb4cd0d32d..07a6d3fae5 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -7,7 +7,7 @@ sim-b6d65341 [::1]:REDACTED_PORT - REDACTED_UUID_REDACTED_UUID_REDACTED --------------------------------------------- stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected (5.0.0) +note: database schema version matches expected (6.0.0) ============================================= EXECUTING COMMAND: omdb ["db", "--db-url", "junk", "sleds"] termination: Exited(2) @@ -172,7 +172,7 @@ stderr: note: database URL not specified. Will search DNS. 
note: (override with --db-url or OMDB_DB_URL) note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected (5.0.0) +note: database schema version matches expected (6.0.0) ============================================= EXECUTING COMMAND: omdb ["--dns-server", "[::1]:REDACTED_PORT", "db", "sleds"] termination: Exited(0) @@ -185,5 +185,5 @@ stderr: note: database URL not specified. Will search DNS. note: (override with --db-url or OMDB_DB_URL) note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected (5.0.0) +note: database schema version matches expected (6.0.0) ============================================= diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index eb075a84ea..038f365e8e 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -8,7 +8,7 @@ external oxide-dev.test 2 create silo: "tes --------------------------------------------- stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected (5.0.0) +note: database schema version matches expected (6.0.0) ============================================= EXECUTING COMMAND: omdb ["db", "dns", "diff", "external", "2"] termination: Exited(0) @@ -24,7 +24,7 @@ changes: names added: 1, names removed: 0 --------------------------------------------- stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected (5.0.0) +note: database schema version matches expected (6.0.0) ============================================= EXECUTING COMMAND: omdb ["db", "dns", "names", "external", "2"] termination: Exited(0) @@ -36,7 +36,7 @@ External zone: oxide-dev.test --------------------------------------------- stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected (5.0.0) +note: database schema version matches expected (6.0.0) ============================================= EXECUTING COMMAND: omdb ["db", "services", "list-instances"] termination: Exited(0) @@ -52,7 +52,7 @@ Nexus REDACTED_UUID_REDACTED_UUID_REDACTED [::ffff:127.0.0.1]:REDACTED_ --------------------------------------------- stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected (5.0.0) +note: database schema version matches expected (6.0.0) ============================================= EXECUTING COMMAND: omdb ["db", "services", "list-by-sled"] termination: Exited(0) @@ -71,7 +71,7 @@ sled: sim-b6d65341 (id REDACTED_UUID_REDACTED_UUID_REDACTED) --------------------------------------------- stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected (5.0.0) +note: database schema version matches expected (6.0.0) ============================================= EXECUTING COMMAND: omdb ["db", "sleds"] termination: Exited(0) @@ -82,7 +82,7 @@ sim-b6d65341 [::1]:REDACTED_PORT - REDACTED_UUID_REDACTED_UUID_REDACTED --------------------------------------------- stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected (5.0.0) +note: database schema version matches expected (6.0.0) 
============================================= EXECUTING COMMAND: omdb ["mgs", "inventory"] termination: Exited(0) diff --git a/nexus/db-model/src/region_snapshot.rs b/nexus/db-model/src/region_snapshot.rs index 9addeb83e3..af1cf8b2b3 100644 --- a/nexus/db-model/src/region_snapshot.rs +++ b/nexus/db-model/src/region_snapshot.rs @@ -32,4 +32,7 @@ pub struct RegionSnapshot { // how many volumes reference this? pub volume_references: i64, + + // true if part of a volume's `resources_to_clean_up` already + pub deleting: bool, } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 94a770e2ca..0165ab1568 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -856,6 +856,7 @@ table! { snapshot_id -> Uuid, snapshot_addr -> Text, volume_references -> Int8, + deleting -> Bool, } } @@ -1130,7 +1131,7 @@ table! { /// /// This should be updated whenever the schema is changed. For more details, /// refer to: schema/crdb/README.adoc -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(5, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(6, 0, 0); allow_tables_to_appear_in_same_query!( system_update, diff --git a/nexus/db-queries/src/db/datastore/dataset.rs b/nexus/db-queries/src/db/datastore/dataset.rs index 99972459c8..0b26789e8f 100644 --- a/nexus/db-queries/src/db/datastore/dataset.rs +++ b/nexus/db-queries/src/db/datastore/dataset.rs @@ -13,15 +13,31 @@ use crate::db::error::ErrorHandler; use crate::db::identity::Asset; use crate::db::model::Dataset; use crate::db::model::Zpool; +use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; use diesel::upsert::excluded; use omicron_common::api::external::CreateResult; use omicron_common::api::external::Error; +use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; +use uuid::Uuid; impl DataStore { + pub async fn dataset_get(&self, dataset_id: Uuid) -> LookupResult { + use db::schema::dataset::dsl; + + dsl::dataset + .filter(dsl::id.eq(dataset_id)) + .select(Dataset::as_select()) + .first_async::( + &*self.pool_connection_unauthorized().await?, + ) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + /// Stores a new dataset in the database. 
pub async fn dataset_upsert( &self, diff --git a/nexus/db-queries/src/db/datastore/region_snapshot.rs b/nexus/db-queries/src/db/datastore/region_snapshot.rs index 0a707e4504..148cfe4812 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot.rs @@ -10,9 +10,11 @@ use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::model::RegionSnapshot; use async_bb8_diesel::AsyncRunQueryDsl; +use async_bb8_diesel::OptionalExtension; use diesel::prelude::*; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DeleteResult; +use omicron_common::api::external::LookupResult; use uuid::Uuid; impl DataStore { @@ -31,6 +33,27 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + pub async fn region_snapshot_get( + &self, + dataset_id: Uuid, + region_id: Uuid, + snapshot_id: Uuid, + ) -> LookupResult> { + use db::schema::region_snapshot::dsl; + + dsl::region_snapshot + .filter(dsl::dataset_id.eq(dataset_id)) + .filter(dsl::region_id.eq(region_id)) + .filter(dsl::snapshot_id.eq(snapshot_id)) + .select(RegionSnapshot::as_select()) + .first_async::( + &*self.pool_connection_unauthorized().await?, + ) + .await + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + pub async fn region_snapshot_remove( &self, dataset_id: Uuid, diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index b3e82886de..b97b8451cf 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -119,6 +119,7 @@ impl DataStore { .filter( rs_dsl::snapshot_addr.eq(read_only_target.clone()), ) + .filter(rs_dsl::deleting.eq(false)) .set( rs_dsl::volume_references .eq(rs_dsl::volume_references + 1), @@ -573,9 +574,7 @@ impl DataStore { // multiple times, and that is done by soft-deleting the volume during // the transaction, and returning the previously serialized list of // resources to clean up if a soft-delete has already occurred. - // - // TODO it would be nice to make this transaction_async, but I couldn't - // get the async optional extension to work. + self.pool_connection_unauthorized() .await? .transaction_async(|conn| async move { @@ -639,7 +638,9 @@ impl DataStore { } }; - // Decrease the number of uses for each referenced region snapshot. + // Decrease the number of uses for each non-deleted referenced + // region snapshot. + use db::schema::region_snapshot::dsl; diesel::update(dsl::region_snapshot) @@ -647,12 +648,40 @@ impl DataStore { dsl::snapshot_addr .eq_any(crucible_targets.read_only_targets.clone()), ) + .filter(dsl::volume_references.gt(0)) + .filter(dsl::deleting.eq(false)) .set(dsl::volume_references.eq(dsl::volume_references - 1)) .execute_async(&conn) .await?; + // Then, note anything that was set to zero from the above + // UPDATE, and then mark all those as deleted. 
+ let snapshots_to_delete: Vec = + dsl::region_snapshot + .filter( + dsl::snapshot_addr.eq_any( + crucible_targets.read_only_targets.clone(), + ), + ) + .filter(dsl::volume_references.eq(0)) + .filter(dsl::deleting.eq(false)) + .select(RegionSnapshot::as_select()) + .load_async(&conn) + .await?; + + diesel::update(dsl::region_snapshot) + .filter( + dsl::snapshot_addr + .eq_any(crucible_targets.read_only_targets.clone()), + ) + .filter(dsl::volume_references.eq(0)) + .filter(dsl::deleting.eq(false)) + .set(dsl::deleting.eq(true)) + .execute_async(&conn) + .await?; + // Return what results can be cleaned up - let result = CrucibleResources::V1(CrucibleResourcesV1 { + let result = CrucibleResources::V2(CrucibleResourcesV2 { // The only use of a read-write region will be at the top level of a // Volume. These are not shared, but if any snapshots are taken this // will prevent deletion of the region. Filter out any regions that @@ -681,6 +710,7 @@ impl DataStore { .eq(0) // Despite the SQL specifying that this column is NOT NULL, // this null check is required for this function to work! + // The left join of region_snapshot might cause a null here. .or(dsl::volume_references.is_null()), ) .select((Dataset::as_select(), Region::as_select())) @@ -688,46 +718,17 @@ impl DataStore { .await? }, - // A volume (for a disk or snapshot) may reference another nested - // volume as a read-only parent, and this may be arbitrarily deep. - // After decrementing volume_references above, get the region - // snapshot records for these read_only_targets where the - // volume_references has gone to 0. Consumers of this struct will - // be responsible for deleting the read-only downstairs running - // for the snapshot and the snapshot itself. - datasets_and_snapshots: { - use db::schema::dataset::dsl as dataset_dsl; - - dsl::region_snapshot - // Only return region_snapshot records related to - // this volume that have zero references. This will - // only happen one time, on the last decrease of a - // volume containing these read-only targets. - // - // It's important to not return *every* region - // snapshot with zero references: multiple volume - // delete sub-sagas will then be issues duplicate - // DELETE calls to Crucible agents, and a request to - // delete a read-only downstairs running for a - // snapshot that doesn't exist will return a 404, - // causing the saga to error and unwind. - .filter(dsl::snapshot_addr.eq_any( - crucible_targets.read_only_targets.clone(), - )) - .filter(dsl::volume_references.eq(0)) - .inner_join( - dataset_dsl::dataset - .on(dsl::dataset_id.eq(dataset_dsl::id)), - ) - .select(( - Dataset::as_select(), - RegionSnapshot::as_select(), - )) - .get_results_async::<(Dataset, RegionSnapshot)>( - &conn, - ) - .await? - }, + // Consumers of this struct will be responsible for deleting + // the read-only downstairs running for the snapshot and the + // snapshot itself. + // + // It's important to not return *every* region snapshot with + // zero references: multiple volume delete sub-sagas will + // then be issues duplicate DELETE calls to Crucible agents, + // and a request to delete a read-only downstairs running + // for a snapshot that doesn't exist will return a 404, + // causing the saga to error and unwind. 
+ snapshots_to_delete, }); // Soft delete this volume, and serialize the resources that are to @@ -967,7 +968,7 @@ impl DataStore { #[derive(Default, Debug, Serialize, Deserialize)] pub struct CrucibleTargets { - read_only_targets: Vec, + pub read_only_targets: Vec, } // Serialize this enum into the `resources_to_clean_up` column to handle @@ -975,6 +976,7 @@ pub struct CrucibleTargets { #[derive(Debug, Serialize, Deserialize)] pub enum CrucibleResources { V1(CrucibleResourcesV1), + V2(CrucibleResourcesV2), } #[derive(Debug, Default, Serialize, Deserialize)] @@ -983,6 +985,12 @@ pub struct CrucibleResourcesV1 { pub datasets_and_snapshots: Vec<(Dataset, RegionSnapshot)>, } +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct CrucibleResourcesV2 { + pub datasets_and_regions: Vec<(Dataset, Region)>, + pub snapshots_to_delete: Vec, +} + /// Return the targets from a VolumeConstructionRequest. /// /// The targets of a volume construction request map to resources. diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index eeabf64894..9c8a33fb17 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -1280,6 +1280,7 @@ async fn ssc_start_running_snapshot( snapshot_id, snapshot_addr, volume_references: 0, // to be filled later + deleting: false, }) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/src/app/sagas/volume_delete.rs b/nexus/src/app/sagas/volume_delete.rs index 4cd633f575..d6358d5435 100644 --- a/nexus/src/app/sagas/volume_delete.rs +++ b/nexus/src/app/sagas/volume_delete.rs @@ -155,39 +155,39 @@ async fn svd_delete_crucible_regions( sagactx.lookup::("crucible_resources_to_delete")?; // Send DELETE calls to the corresponding Crucible agents - match crucible_resources_to_delete { + let datasets_and_regions = match crucible_resources_to_delete { CrucibleResources::V1(crucible_resources_to_delete) => { - delete_crucible_regions( - log, - crucible_resources_to_delete.datasets_and_regions.clone(), - ) - .await - .map_err(|e| { - ActionError::action_failed(format!( - "failed to delete_crucible_regions: {:?}", - e, - )) - })?; + crucible_resources_to_delete.datasets_and_regions + } - // Remove DB records - let region_ids_to_delete = crucible_resources_to_delete - .datasets_and_regions - .iter() - .map(|(_, r)| r.id()) - .collect(); - - osagactx - .datastore() - .regions_hard_delete(log, region_ids_to_delete) - .await - .map_err(|e| { - ActionError::action_failed(format!( - "failed to regions_hard_delete: {:?}", - e, - )) - })?; + CrucibleResources::V2(crucible_resources_to_delete) => { + crucible_resources_to_delete.datasets_and_regions } - } + }; + + delete_crucible_regions(log, datasets_and_regions.clone()).await.map_err( + |e| { + ActionError::action_failed(format!( + "failed to delete_crucible_regions: {:?}", + e, + )) + }, + )?; + + // Remove DB records + let region_ids_to_delete = + datasets_and_regions.iter().map(|(_, r)| r.id()).collect(); + + osagactx + .datastore() + .regions_hard_delete(log, region_ids_to_delete) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "failed to regions_hard_delete: {:?}", + e, + )) + })?; Ok(()) } @@ -202,26 +202,46 @@ async fn svd_delete_crucible_running_snapshots( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let log = sagactx.user_data().log(); + let osagactx = sagactx.user_data(); let crucible_resources_to_delete = sagactx.lookup::("crucible_resources_to_delete")?; // Send DELETE calls to the corresponding 
Crucible agents - match crucible_resources_to_delete { + let datasets_and_snapshots = match crucible_resources_to_delete { CrucibleResources::V1(crucible_resources_to_delete) => { - delete_crucible_running_snapshots( - log, - crucible_resources_to_delete.datasets_and_snapshots.clone(), - ) - .await - .map_err(|e| { - ActionError::action_failed(format!( - "failed to delete_crucible_running_snapshots: {:?}", - e, - )) - })?; + crucible_resources_to_delete.datasets_and_snapshots } - } + + CrucibleResources::V2(crucible_resources_to_delete) => { + let mut datasets_and_snapshots: Vec<_> = Vec::with_capacity( + crucible_resources_to_delete.snapshots_to_delete.len(), + ); + + for region_snapshot in + crucible_resources_to_delete.snapshots_to_delete + { + let dataset = osagactx + .datastore() + .dataset_get(region_snapshot.dataset_id) + .await + .map_err(ActionError::action_failed)?; + + datasets_and_snapshots.push((dataset, region_snapshot)); + } + + datasets_and_snapshots + } + }; + + delete_crucible_running_snapshots(log, datasets_and_snapshots.clone()) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "failed to delete_crucible_running_snapshots: {:?}", + e, + )) + })?; Ok(()) } @@ -235,26 +255,46 @@ async fn svd_delete_crucible_snapshots( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let log = sagactx.user_data().log(); + let osagactx = sagactx.user_data(); let crucible_resources_to_delete = sagactx.lookup::("crucible_resources_to_delete")?; // Send DELETE calls to the corresponding Crucible agents - match crucible_resources_to_delete { + let datasets_and_snapshots = match crucible_resources_to_delete { CrucibleResources::V1(crucible_resources_to_delete) => { - delete_crucible_snapshots( - log, - crucible_resources_to_delete.datasets_and_snapshots.clone(), - ) - .await - .map_err(|e| { - ActionError::action_failed(format!( - "failed to delete_crucible_snapshots: {:?}", - e, - )) - })?; + crucible_resources_to_delete.datasets_and_snapshots } - } + + CrucibleResources::V2(crucible_resources_to_delete) => { + let mut datasets_and_snapshots: Vec<_> = Vec::with_capacity( + crucible_resources_to_delete.snapshots_to_delete.len(), + ); + + for region_snapshot in + crucible_resources_to_delete.snapshots_to_delete + { + let dataset = osagactx + .datastore() + .dataset_get(region_snapshot.dataset_id) + .await + .map_err(ActionError::action_failed)?; + + datasets_and_snapshots.push((dataset, region_snapshot)); + } + + datasets_and_snapshots + } + }; + + delete_crucible_snapshots(log, datasets_and_snapshots.clone()) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "failed to delete_crucible_snapshots: {:?}", + e, + )) + })?; Ok(()) } @@ -293,6 +333,31 @@ async fn svd_delete_crucible_snapshot_records( })?; } } + + CrucibleResources::V2(crucible_resources_to_delete) => { + // Remove DB records + for region_snapshot in + &crucible_resources_to_delete.snapshots_to_delete + { + osagactx + .datastore() + .region_snapshot_remove( + region_snapshot.dataset_id, + region_snapshot.region_id, + region_snapshot.snapshot_id, + ) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "failed to region_snapshot_remove {} {} {}: {:?}", + region_snapshot.dataset_id, + region_snapshot.region_id, + region_snapshot.snapshot_id, + e, + )) + })?; + } + } } Ok(()) diff --git a/nexus/tests/integration_tests/snapshots.rs b/nexus/tests/integration_tests/snapshots.rs index d212175415..68f4cdadd2 100644 --- a/nexus/tests/integration_tests/snapshots.rs +++ 
b/nexus/tests/integration_tests/snapshots.rs @@ -1094,6 +1094,7 @@ async fn test_region_snapshot_create_idempotent( snapshot_addr: "[::]:12345".to_string(), volume_references: 1, + deleting: false, }; datastore.region_snapshot_create(region_snapshot.clone()).await.unwrap(); @@ -1287,13 +1288,16 @@ async fn test_multiple_deletes_not_sent(cptestctx: &ControlPlaneTestContext) { .unwrap(); let resources_1 = match resources_1 { - db::datastore::CrucibleResources::V1(resources_1) => resources_1, + db::datastore::CrucibleResources::V1(_) => panic!("using old style!"), + db::datastore::CrucibleResources::V2(resources_1) => resources_1, }; let resources_2 = match resources_2 { - db::datastore::CrucibleResources::V1(resources_2) => resources_2, + db::datastore::CrucibleResources::V1(_) => panic!("using old style!"), + db::datastore::CrucibleResources::V2(resources_2) => resources_2, }; let resources_3 = match resources_3 { - db::datastore::CrucibleResources::V1(resources_3) => resources_3, + db::datastore::CrucibleResources::V1(_) => panic!("using old style!"), + db::datastore::CrucibleResources::V2(resources_3) => resources_3, }; // No region deletions yet, these are just snapshot deletes @@ -1304,24 +1308,24 @@ async fn test_multiple_deletes_not_sent(cptestctx: &ControlPlaneTestContext) { // But there are snapshots to delete - assert!(!resources_1.datasets_and_snapshots.is_empty()); - assert!(!resources_2.datasets_and_snapshots.is_empty()); - assert!(!resources_3.datasets_and_snapshots.is_empty()); + assert!(!resources_1.snapshots_to_delete.is_empty()); + assert!(!resources_2.snapshots_to_delete.is_empty()); + assert!(!resources_3.snapshots_to_delete.is_empty()); - // Assert there are no overlaps in the datasets_and_snapshots to delete. + // Assert there are no overlaps in the snapshots_to_delete to delete. 
- for tuple in &resources_1.datasets_and_snapshots { - assert!(!resources_2.datasets_and_snapshots.contains(tuple)); - assert!(!resources_3.datasets_and_snapshots.contains(tuple)); + for tuple in &resources_1.snapshots_to_delete { + assert!(!resources_2.snapshots_to_delete.contains(tuple)); + assert!(!resources_3.snapshots_to_delete.contains(tuple)); } - for tuple in &resources_2.datasets_and_snapshots { - assert!(!resources_1.datasets_and_snapshots.contains(tuple)); - assert!(!resources_3.datasets_and_snapshots.contains(tuple)); + for tuple in &resources_2.snapshots_to_delete { + assert!(!resources_1.snapshots_to_delete.contains(tuple)); + assert!(!resources_3.snapshots_to_delete.contains(tuple)); } - for tuple in &resources_3.datasets_and_snapshots { - assert!(!resources_1.datasets_and_snapshots.contains(tuple)); - assert!(!resources_2.datasets_and_snapshots.contains(tuple)); + for tuple in &resources_3.snapshots_to_delete { + assert!(!resources_1.snapshots_to_delete.contains(tuple)); + assert!(!resources_2.snapshots_to_delete.contains(tuple)); } } diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 70d34fb778..e263593def 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -19,6 +19,7 @@ use nexus_test_utils::resource_helpers::DiskTest; use nexus_test_utils_macros::nexus_test; use nexus_types::external_api::params; use nexus_types::external_api::views; +use nexus_types::identity::Asset; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Disk; use omicron_common::api::external::IdentityMetadataCreateParams; @@ -1813,6 +1814,313 @@ async fn test_volume_checkout_updates_sparse_mid_multiple_gen( volume_match_gen(new_vol, vec![Some(8), None, Some(10)]); } +/// Test that the Crucible agent's port reuse does not confuse +/// `decrease_crucible_resource_count_and_soft_delete_volume`, due to the +/// `[ipv6]:port` targets being reused. +#[nexus_test] +async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.apictx().nexus; + let datastore = nexus.datastore(); + + // Four zpools, one dataset each + let mut disk_test = DiskTest::new(&cptestctx).await; + disk_test + .add_zpool_with_dataset(&cptestctx, DiskTest::DEFAULT_ZPOOL_SIZE_GIB) + .await; + + // This bug occurs when region_snapshot records share a snapshot_addr, so + // insert those here manually. + + // (dataset_id, region_id, snapshot_id, snapshot_addr) + let region_snapshots = vec![ + // first snapshot-create + ( + disk_test.zpools[0].datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:101:7]:19016"), + ), + ( + disk_test.zpools[1].datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:102:7]:19016"), + ), + ( + disk_test.zpools[2].datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:103:7]:19016"), + ), + // second snapshot-create + ( + disk_test.zpools[0].datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:101:7]:19016"), // duplicate! 
+ ), + ( + disk_test.zpools[3].datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:104:7]:19016"), + ), + ( + disk_test.zpools[2].datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:103:7]:19017"), + ), + ]; + + // First, three `region_snapshot` records created in the snapshot-create + // saga, which are then used to make snapshot's volume construction request + + for i in 0..3 { + let (dataset_id, region_id, snapshot_id, snapshot_addr) = + ®ion_snapshots[i]; + datastore + .region_snapshot_create(nexus_db_model::RegionSnapshot { + dataset_id: *dataset_id, + region_id: *region_id, + snapshot_id: *snapshot_id, + snapshot_addr: snapshot_addr.clone(), + volume_references: 0, + deleting: false, + }) + .await + .unwrap(); + } + + let volume_id = Uuid::new_v4(); + let volume = datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + region_snapshots[0].3.clone(), + region_snapshots[1].3.clone(), + region_snapshots[2].3.clone(), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + + // Sanity check + + assert_eq!(volume.id(), volume_id); + + // Make sure the volume has only three read-only targets: + + let crucible_targets = datastore + .read_only_resources_associated_with_volume(volume_id) + .await + .unwrap(); + assert_eq!(crucible_targets.read_only_targets.len(), 3); + + // Also validate the volume's region_snapshots got incremented by + // volume_create + + for i in 0..3 { + let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; + let region_snapshot = datastore + .region_snapshot_get(dataset_id, region_id, snapshot_id) + .await + .unwrap() + .unwrap(); + + assert_eq!(region_snapshot.volume_references, 1); + assert_eq!(region_snapshot.deleting, false); + } + + // Soft delete the volume, and validate that only three region_snapshot + // records are returned. + + let cr = datastore + .decrease_crucible_resource_count_and_soft_delete_volume(volume_id) + .await + .unwrap(); + + for i in 0..3 { + let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; + let region_snapshot = datastore + .region_snapshot_get(dataset_id, region_id, snapshot_id) + .await + .unwrap() + .unwrap(); + + assert_eq!(region_snapshot.volume_references, 0); + assert_eq!(region_snapshot.deleting, true); + } + + match cr { + nexus_db_queries::db::datastore::CrucibleResources::V1(cr) => { + assert!(cr.datasets_and_regions.is_empty()); + assert_eq!(cr.datasets_and_snapshots.len(), 3); + } + + nexus_db_queries::db::datastore::CrucibleResources::V2(cr) => { + assert!(cr.datasets_and_regions.is_empty()); + assert_eq!(cr.snapshots_to_delete.len(), 3); + } + } + + // Now, let's say we're at a spot where the running snapshots have been + // deleted, but before volume_hard_delete or region_snapshot_remove are + // called. Pretend another snapshot-create and snapshot-delete snuck in + // here, and the second snapshot hits a agent that reuses the first target. 
+ + for i in 3..6 { + let (dataset_id, region_id, snapshot_id, snapshot_addr) = + ®ion_snapshots[i]; + datastore + .region_snapshot_create(nexus_db_model::RegionSnapshot { + dataset_id: *dataset_id, + region_id: *region_id, + snapshot_id: *snapshot_id, + snapshot_addr: snapshot_addr.clone(), + volume_references: 0, + deleting: false, + }) + .await + .unwrap(); + } + + let volume_id = Uuid::new_v4(); + let volume = datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + region_snapshots[3].3.clone(), + region_snapshots[4].3.clone(), + region_snapshots[5].3.clone(), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + + // Sanity check + + assert_eq!(volume.id(), volume_id); + + // Make sure the volume has only three read-only targets: + + let crucible_targets = datastore + .read_only_resources_associated_with_volume(volume_id) + .await + .unwrap(); + assert_eq!(crucible_targets.read_only_targets.len(), 3); + + // Also validate only the volume's region_snapshots got incremented by + // volume_create. + + for i in 0..3 { + let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; + let region_snapshot = datastore + .region_snapshot_get(dataset_id, region_id, snapshot_id) + .await + .unwrap() + .unwrap(); + + assert_eq!(region_snapshot.volume_references, 0); + assert_eq!(region_snapshot.deleting, true); + } + for i in 3..6 { + let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; + let region_snapshot = datastore + .region_snapshot_get(dataset_id, region_id, snapshot_id) + .await + .unwrap() + .unwrap(); + + assert_eq!(region_snapshot.volume_references, 1); + assert_eq!(region_snapshot.deleting, false); + } + + // Soft delete the volume, and validate that only three region_snapshot + // records are returned. 
+ + let cr = datastore + .decrease_crucible_resource_count_and_soft_delete_volume(volume_id) + .await + .unwrap(); + + // Make sure every region_snapshot is now 0, and deleting + + for i in 0..6 { + let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; + let region_snapshot = datastore + .region_snapshot_get(dataset_id, region_id, snapshot_id) + .await + .unwrap() + .unwrap(); + + assert_eq!(region_snapshot.volume_references, 0); + assert_eq!(region_snapshot.deleting, true); + } + + match cr { + nexus_db_queries::db::datastore::CrucibleResources::V1(cr) => { + assert!(cr.datasets_and_regions.is_empty()); + assert_eq!(cr.datasets_and_snapshots.len(), 3); + } + + nexus_db_queries::db::datastore::CrucibleResources::V2(cr) => { + assert!(cr.datasets_and_regions.is_empty()); + assert_eq!(cr.snapshots_to_delete.len(), 3); + } + } +} + #[nexus_test] async fn test_disk_create_saga_unwinds_correctly( cptestctx: &ControlPlaneTestContext, diff --git a/schema/crdb/6.0.0/up1.sql b/schema/crdb/6.0.0/up1.sql new file mode 100644 index 0000000000..4a3cdc302e --- /dev/null +++ b/schema/crdb/6.0.0/up1.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.region_snapshot ADD COLUMN IF NOT EXISTS deleting BOOL NOT NULL DEFAULT false; diff --git a/schema/crdb/6.0.0/up2.sql b/schema/crdb/6.0.0/up2.sql new file mode 100644 index 0000000000..77c136a3bf --- /dev/null +++ b/schema/crdb/6.0.0/up2.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.region_snapshot ALTER COLUMN deleting DROP DEFAULT; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index ad09092f8f..a62cbae5ea 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -505,6 +505,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.region_snapshot ( /* How many volumes reference this? */ volume_references INT8 NOT NULL, + /* Is this currently part of some resources_to_delete? */ + deleting BOOL NOT NULL, + PRIMARY KEY (dataset_id, region_id, snapshot_id) ); @@ -2574,7 +2577,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '5.0.0', NULL) + ( TRUE, NOW(), NOW(), '6.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From 3e9f46c057b223ad390c742f882ef05e09366b77 Mon Sep 17 00:00:00 2001 From: Ryan Goodfellow Date: Tue, 10 Oct 2023 12:57:14 -0700 Subject: [PATCH 06/13] update softnpu version (#4227) This pulls in a new version of the `npuzone` tool from the softnpu repo that automatically pulls the latest sidecar-lite code. 
--- tools/ci_download_softnpu_machinery | 2 +- tools/create_virtual_hardware.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_download_softnpu_machinery b/tools/ci_download_softnpu_machinery index d37d428476..7975a310f0 100755 --- a/tools/ci_download_softnpu_machinery +++ b/tools/ci_download_softnpu_machinery @@ -15,7 +15,7 @@ OUT_DIR="out/npuzone" # Pinned commit for softnpu ASIC simulator SOFTNPU_REPO="softnpu" -SOFTNPU_COMMIT="41b3a67b3d44f51528816ff8e539b4001df48305" +SOFTNPU_COMMIT="eb27e6a00f1082c9faac7cf997e57d0609f7a309" # This is the softnpu ASIC simulator echo "fetching npuzone" diff --git a/tools/create_virtual_hardware.sh b/tools/create_virtual_hardware.sh index dd6d9af9dd..95c2aa63df 100755 --- a/tools/create_virtual_hardware.sh +++ b/tools/create_virtual_hardware.sh @@ -37,7 +37,7 @@ function ensure_simulated_links { dladm create-simnet -t "net$I" dladm create-simnet -t "sc${I}_0" dladm modify-simnet -t -p "net$I" "sc${I}_0" - dladm set-linkprop -p mtu=1600 "sc${I}_0" # encap headroom + dladm set-linkprop -p mtu=9000 "sc${I}_0" # match emulated devices fi success "Simnet net$I/sc${I}_0 exists" done From 97ddc7da3a5cdbded9097827f90151980755c1e4 Mon Sep 17 00:00:00 2001 From: Rain Date: Tue, 10 Oct 2023 13:12:23 -0700 Subject: [PATCH 07/13] [dependencies] add Renovate config (#4236) * Add configuration for automatically creating dependencies, and for pinning GitHub Actions digests * Add a post-upgrade script that runs cargo-hakari. Depends on https://github.com/oxidecomputer/renovate-config/pull/5. See [RFD 434](https://rfd.shared.oxide.computer/rfd/0434) and #4166. --- .github/renovate.json | 9 ++++++++ tools/renovate-post-upgrade.sh | 42 ++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 .github/renovate.json create mode 100755 tools/renovate-post-upgrade.sh diff --git a/.github/renovate.json b/.github/renovate.json new file mode 100644 index 0000000000..405a3e282b --- /dev/null +++ b/.github/renovate.json @@ -0,0 +1,9 @@ +{ + "$schema": "https://docs.renovatebot.com/renovate-schema.json", + "extends": [ + "local>oxidecomputer/renovate-config", + "local>oxidecomputer/renovate-config//rust/autocreate", + "local>oxidecomputer/renovate-config:post-upgrade", + "helpers:pinGitHubActionDigests" + ] +} diff --git a/tools/renovate-post-upgrade.sh b/tools/renovate-post-upgrade.sh new file mode 100755 index 0000000000..c21832e0a9 --- /dev/null +++ b/tools/renovate-post-upgrade.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# This script is run after Renovate upgrades dependencies or lock files. + +set -euo pipefail + +# Function to retry a command up to 3 times. +function retry_command { + local retries=3 + local delay=5 + local count=0 + until "$@"; do + exit_code=$? + count=$((count+1)) + if [ $count -lt $retries ]; then + echo "Command failed with exit code $exit_code. Retrying in $delay seconds..." + sleep $delay + else + echo "Command failed with exit code $exit_code after $count attempts." + return $exit_code + fi + done +} + +# Download and install cargo-hakari if it is not already installed. +if ! command -v cargo-hakari &> /dev/null; then + # Need cargo-binstall to install cargo-hakari. + if ! command -v cargo-binstall &> /dev/null; then + # Fetch cargo binstall. + echo "Installing cargo-binstall..." 
+ curl --retry 3 -L --proto '=https' --tlsv1.2 -sSfO https://raw.githubusercontent.com/cargo-bins/cargo-binstall/main/install-from-binstall-release.sh + retry_command bash install-from-binstall-release.sh + fi + + # Install cargo-hakari. + echo "Installing cargo-hakari..." + retry_command cargo binstall cargo-hakari --no-confirm +fi + +# Run cargo hakari to regenerate the workspace-hack file. +echo "Running cargo-hakari..." +cargo hakari generate From 72a0429debfaf4feeec2f952fefe3ffbffeb06f6 Mon Sep 17 00:00:00 2001 From: Rain Date: Tue, 10 Oct 2023 17:50:06 -0700 Subject: [PATCH 08/13] [update-engine] fix buffer tests (#4163) Apparently I'd made a couple of mistakes while writing tests: * I was adding all events a second time by accident, which was hiding the fact that... * A couple not signs were flipped, whoops. --- update-engine/src/buffer.rs | 46 ++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/update-engine/src/buffer.rs b/update-engine/src/buffer.rs index 3de0e45f24..1779ef7da6 100644 --- a/update-engine/src/buffer.rs +++ b/update-engine/src/buffer.rs @@ -1389,7 +1389,10 @@ mod tests { test_cx .run_filtered_test( "all events passed in", - |buffer, event| buffer.add_event(event.clone()), + |buffer, event| { + buffer.add_event(event.clone()); + true + }, WithDeltas::No, ) .unwrap(); @@ -1397,10 +1400,12 @@ mod tests { test_cx .run_filtered_test( "progress events skipped", - |buffer, event| { - if let Event::Step(event) = event { + |buffer, event| match event { + Event::Step(event) => { buffer.add_step_event(event.clone()); + true } + Event::Progress(_) => false, }, WithDeltas::Both, ) @@ -1410,13 +1415,16 @@ mod tests { .run_filtered_test( "low-priority events skipped", |buffer, event| match event { - Event::Step(event) => { - if event.kind.priority() == StepEventPriority::Low { + Event::Step(event) => match event.kind.priority() { + StepEventPriority::High => { buffer.add_step_event(event.clone()); + true } - } + StepEventPriority::Low => false, + }, Event::Progress(event) => { buffer.add_progress_event(event.clone()); + true } }, WithDeltas::Both, @@ -1427,13 +1435,16 @@ mod tests { .run_filtered_test( "low-priority and progress events skipped", |buffer, event| match event { - Event::Step(event) => { - if event.kind.priority() == StepEventPriority::Low { + Event::Step(event) => match event.kind.priority() { + StepEventPriority::High => { buffer.add_step_event(event.clone()); + true } - } + StepEventPriority::Low => false, + }, Event::Progress(_) => { - // Don't add progress events either. + // Don't add progress events. + false } }, WithDeltas::Both, @@ -1565,7 +1576,10 @@ mod tests { fn run_filtered_test( &self, event_fn_description: &str, - mut event_fn: impl FnMut(&mut EventBuffer, &Event), + mut event_fn: impl FnMut( + &mut EventBuffer, + &Event, + ) -> bool, with_deltas: WithDeltas, ) -> anyhow::Result<()> { match with_deltas { @@ -1590,7 +1604,10 @@ mod tests { fn run_filtered_test_inner( &self, - mut event_fn: impl FnMut(&mut EventBuffer, &Event), + mut event_fn: impl FnMut( + &mut EventBuffer, + &Event, + ) -> bool, with_deltas: bool, ) -> anyhow::Result<()> { let description = format!("with deltas = {with_deltas}"); @@ -1608,8 +1625,9 @@ mod tests { let mut last_seen_opt = with_deltas.then_some(None); for (i, event) in self.generated_events.iter().enumerate() { - (event_fn)(&mut buffer, event); - buffer.add_event(event.clone()); + // Going to use event_added in an upcoming commit. 
+ let _event_added = (event_fn)(&mut buffer, event); + let report = match last_seen_opt { Some(last_seen) => buffer.generate_report_since(last_seen), None => buffer.generate_report(), From 194889b956abbb3e01ce25b11b733c02598c3215 Mon Sep 17 00:00:00 2001 From: Rain Date: Tue, 10 Oct 2023 19:27:33 -0700 Subject: [PATCH 09/13] [buildomat] authorize PRs generated by oxide-renovate (#4244) Means that PRs like https://github.com/oxidecomputer/omicron/pull/4241 will be automatically authorized. Also skip cargo-hakari update if cargo isn't present. --- .github/buildomat/config.toml | 1 + tools/renovate-post-upgrade.sh | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/.github/buildomat/config.toml b/.github/buildomat/config.toml index 922de631f2..419173fa50 100644 --- a/.github/buildomat/config.toml +++ b/.github/buildomat/config.toml @@ -17,5 +17,6 @@ org_only = true allow_users = [ "dependabot[bot]", "oxide-reflector-bot[bot]", + "oxide-renovate[bot]", "renovate[bot]", ] diff --git a/tools/renovate-post-upgrade.sh b/tools/renovate-post-upgrade.sh index c21832e0a9..2699f9f6a0 100755 --- a/tools/renovate-post-upgrade.sh +++ b/tools/renovate-post-upgrade.sh @@ -22,6 +22,13 @@ function retry_command { done } +# If cargo isn't present, skip this -- it implies that a non-Rust dependency was +# updated. +if ! command -v cargo &> /dev/null; then + echo "Skipping cargo-hakari update because cargo is not present." + exit 0 +fi + # Download and install cargo-hakari if it is not already installed. if ! command -v cargo-hakari &> /dev/null; then # Need cargo-binstall to install cargo-hakari. From a972c80c407b68848c178aca236ae00067bc4d3b Mon Sep 17 00:00:00 2001 From: Andy Fiddaman Date: Wed, 11 Oct 2023 17:59:57 +0100 Subject: [PATCH 10/13] destroy_virtual_hardware.sh needs to unmount backing filesystems (#4255) The backing filesystems added in d624bce9af2 prevent the destroy_virtual_hardware.sh script from properly cleaning up all ZFS pools and cause the fmd service to go into maintenance which delays control plane startup. This updates the script to unwind the backing datasets as part of its work. --- tools/destroy_virtual_hardware.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/destroy_virtual_hardware.sh b/tools/destroy_virtual_hardware.sh index ae6fef0673..46c6f117c4 100755 --- a/tools/destroy_virtual_hardware.sh +++ b/tools/destroy_virtual_hardware.sh @@ -56,7 +56,23 @@ function remove_softnpu_zone { --ports sc0_1,tfportqsfp0_0 } +# Some services have their working data overlaid by backing mounts from the +# internal boot disk. Before we can destroy the ZFS pools, we need to unmount +# these. 
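+# We do that by temporarily disabling the affected services, unmounting any
+# mounted "backing" datasets under the oxi_ pools, and then re-enabling the services.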
+ +BACKED_SERVICES="svc:/system/fmd:default" + +function demount_backingfs { + svcadm disable -st $BACKED_SERVICES + zpool list -Hpo name | grep '^oxi_' \ + | xargs -i zfs list -Hpo name,canmount,mounted -r {}/backing \ + | awk '$3 == "yes" && $2 == "noauto" { print $1 }' \ + | xargs -l zfs umount + svcadm enable -st $BACKED_SERVICES +} + verify_omicron_uninstalled +demount_backingfs unload_xde_driver remove_softnpu_zone try_remove_vnics From 1a21fdd581d80d92287c9f29e095dbee11f65b28 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Wed, 11 Oct 2023 12:23:46 -0700 Subject: [PATCH 11/13] Update Rust crate proptest to 1.3.1 (#4243) Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- Cargo.lock | 10 +++++----- Cargo.toml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 306e953049..421bbd5e16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6737,19 +6737,19 @@ dependencies = [ [[package]] name = "proptest" -version = "1.2.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e35c06b98bf36aba164cc17cb25f7e232f5c4aeea73baa14b8a9f0d92dbfa65" +checksum = "7c003ac8c77cb07bb74f5f198bce836a689bcd5a42574612bf14d17bfd08c20e" dependencies = [ "bit-set", - "bitflags 1.3.2", - "byteorder", + "bit-vec", + "bitflags 2.4.0", "lazy_static", "num-traits", "rand 0.8.5", "rand_chacha 0.3.1", "rand_xorshift", - "regex-syntax 0.6.29", + "regex-syntax 0.7.5", "rusty-fork", "tempfile", "unarray", diff --git a/Cargo.toml b/Cargo.toml index da7b582fe3..fdd67c3b5c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -283,7 +283,7 @@ progenitor-client = { git = "https://github.com/oxidecomputer/progenitor", branc bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "901b710b6e5bd05a94a323693c2b971e7e7b240e" } propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "901b710b6e5bd05a94a323693c2b971e7e7b240e", features = [ "generated-migration" ] } propolis-server = { git = "https://github.com/oxidecomputer/propolis", rev = "901b710b6e5bd05a94a323693c2b971e7e7b240e", default-features = false, features = ["mock-only"] } -proptest = "1.2.0" +proptest = "1.3.1" quote = "1.0" rand = "0.8.5" ratatui = "0.23.0" From 02aef4bec751b47b6d19adbeef9e51c42c10204d Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Wed, 11 Oct 2023 12:34:34 -0700 Subject: [PATCH 12/13] Update Rust crate predicates to 3.0.4 (#4254) Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- Cargo.lock | 12 ++++++------ Cargo.toml | 2 +- workspace-hack/Cargo.toml | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 421bbd5e16..d58ba77133 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -283,7 +283,7 @@ dependencies = [ "anstyle", "bstr 1.6.0", "doc-comment", - "predicates 3.0.3", + "predicates 3.0.4", "predicates-core", "predicates-tree", "wait-timeout", @@ -5442,7 +5442,7 @@ dependencies = [ "phf_shared 0.11.2", "postgres-types", "ppv-lite86", - "predicates 3.0.3", + "predicates 3.0.4", "rand 0.8.5", "rand_chacha 0.3.1", "regex", @@ -6437,14 +6437,14 @@ dependencies = [ [[package]] name = "predicates" -version = "3.0.3" +version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09963355b9f467184c04017ced4a2ba2d75cbcb4e7462690d388233253d4b1a9" 
+checksum = "6dfc28575c2e3f19cb3c73b93af36460ae898d426eba6fc15b9bd2a5220758a0" dependencies = [ "anstyle", "difflib", "float-cmp", - "itertools 0.10.5", + "itertools 0.11.0", "normalize-line-endings", "predicates-core", "regex", @@ -9374,7 +9374,7 @@ dependencies = [ "omicron-common 0.1.0", "omicron-test-utils", "omicron-workspace-hack", - "predicates 3.0.3", + "predicates 3.0.4", "slog", "slog-async", "slog-envlogger", diff --git a/Cargo.toml b/Cargo.toml index fdd67c3b5c..832b8663e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -274,7 +274,7 @@ percent-encoding = "2.2.0" pem = "1.1" petgraph = "0.6.4" postgres-protocol = "0.6.6" -predicates = "3.0.3" +predicates = "3.0.4" pretty_assertions = "1.4.0" pretty-hex = "0.3.0" proc-macro2 = "1.0" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 106da92f62..a91477678b 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -73,7 +73,7 @@ petgraph = { version = "0.6.4", features = ["serde-1"] } phf_shared = { version = "0.11.2" } postgres-types = { version = "0.2.6", default-features = false, features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } ppv-lite86 = { version = "0.2.17", default-features = false, features = ["simd", "std"] } -predicates = { version = "3.0.3" } +predicates = { version = "3.0.4" } rand = { version = "0.8.5", features = ["min_const_gen", "small_rng"] } rand_chacha = { version = "0.3.1" } regex = { version = "1.9.5" } @@ -171,7 +171,7 @@ petgraph = { version = "0.6.4", features = ["serde-1"] } phf_shared = { version = "0.11.2" } postgres-types = { version = "0.2.6", default-features = false, features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } ppv-lite86 = { version = "0.2.17", default-features = false, features = ["simd", "std"] } -predicates = { version = "3.0.3" } +predicates = { version = "3.0.4" } rand = { version = "0.8.5", features = ["min_const_gen", "small_rng"] } rand_chacha = { version = "0.3.1" } regex = { version = "1.9.5" } From d12cb0ffceeb09c1cccdada29ca24c3829a2c9fa Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Wed, 11 Oct 2023 12:37:42 -0700 Subject: [PATCH 13/13] Pin GitHub Actions dependencies (#4240) --- .github/workflows/check-opte-ver.yml | 2 +- .github/workflows/check-workspace-deps.yml | 2 +- .github/workflows/hakari.yml | 10 +++++----- .github/workflows/rust.yml | 14 +++++++------- .github/workflows/update-dendrite.yml | 2 +- .github/workflows/update-maghemite.yml | 2 +- .github/workflows/validate-openapi-spec.yml | 4 ++-- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/check-opte-ver.yml b/.github/workflows/check-opte-ver.yml index 3b57f2795f..a8e18f080e 100644 --- a/.github/workflows/check-opte-ver.yml +++ b/.github/workflows/check-opte-ver.yml @@ -9,7 +9,7 @@ jobs: check-opte-ver: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3.5.0 + - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 - name: Install jq run: sudo apt-get install -y jq - name: Install toml-cli diff --git a/.github/workflows/check-workspace-deps.yml b/.github/workflows/check-workspace-deps.yml index 9611c4103c..521afa7359 100644 --- a/.github/workflows/check-workspace-deps.yml +++ b/.github/workflows/check-workspace-deps.yml @@ -10,6 +10,6 @@ jobs: check-workspace-deps: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3.5.0 + - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 - name: Check 
Workspace Dependencies run: cargo xtask check-workspace-deps diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index d79196d318..df4cbc9b59 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -17,21 +17,21 @@ jobs: env: RUSTFLAGS: -D warnings steps: - - uses: actions/checkout@v4 - - uses: actions-rs/toolchain@v1 + - uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608 # v4 + - uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1 with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@v2 + uses: taiki-e/install-action@e659bf85ee986e37e35cc1c53bfeebe044d8133e # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date - uses: actions-rs/cargo@v1 + uses: actions-rs/cargo@844f36862e911db73fe0815f00a4a2602c279505 # v1 with: command: hakari args: generate --diff - name: Check all crates depend on workspace-hack - uses: actions-rs/cargo@v1 + uses: actions-rs/cargo@844f36862e911db73fe0815f00a4a2602c279505 # v1 with: command: hakari args: manage-deps --dry-run diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index f5cf1dc885..873b316e16 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -9,7 +9,7 @@ jobs: check-style: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3.5.0 + - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 - name: Report cargo version run: cargo --version - name: Report rustfmt version @@ -27,8 +27,8 @@ jobs: # This repo is unstable and unnecessary: https://github.com/microsoft/linux-package-repositories/issues/34 - name: Disable packages.microsoft.com repo run: sudo rm -f /etc/apt/sources.list.d/microsoft-prod.list - - uses: actions/checkout@v3.5.0 - - uses: Swatinem/rust-cache@v2.2.1 + - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 + - uses: Swatinem/rust-cache@6fd3edff6979b79f87531400ad694fb7f2c84b1f # v2.2.1 if: ${{ github.ref != 'refs/heads/main' }} - name: Report cargo version run: cargo --version @@ -53,8 +53,8 @@ jobs: # This repo is unstable and unnecessary: https://github.com/microsoft/linux-package-repositories/issues/34 - name: Disable packages.microsoft.com repo run: sudo rm -f /etc/apt/sources.list.d/microsoft-prod.list - - uses: actions/checkout@v3.5.0 - - uses: Swatinem/rust-cache@v2.2.1 + - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 + - uses: Swatinem/rust-cache@6fd3edff6979b79f87531400ad694fb7f2c84b1f # v2.2.1 if: ${{ github.ref != 'refs/heads/main' }} - name: Report cargo version run: cargo --version @@ -79,8 +79,8 @@ jobs: # This repo is unstable and unnecessary: https://github.com/microsoft/linux-package-repositories/issues/34 - name: Disable packages.microsoft.com repo run: sudo rm -f /etc/apt/sources.list.d/microsoft-prod.list - - uses: actions/checkout@v3.5.0 - - uses: Swatinem/rust-cache@v2.2.1 + - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 + - uses: Swatinem/rust-cache@6fd3edff6979b79f87531400ad694fb7f2c84b1f # v2.2.1 if: ${{ github.ref != 'refs/heads/main' }} - name: Report cargo version run: cargo --version diff --git a/.github/workflows/update-dendrite.yml b/.github/workflows/update-dendrite.yml index 86049dcafc..10d8ef7618 100644 --- a/.github/workflows/update-dendrite.yml +++ b/.github/workflows/update-dendrite.yml @@ -29,7 +29,7 @@ jobs: steps: # Checkout both the target and integration branches - - uses: actions/checkout@v3.5.0 + - uses: 
actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 with: token: ${{ inputs.reflector_access_token }} fetch-depth: 0 diff --git a/.github/workflows/update-maghemite.yml b/.github/workflows/update-maghemite.yml index 07fe329af3..7aa2b8b6c8 100644 --- a/.github/workflows/update-maghemite.yml +++ b/.github/workflows/update-maghemite.yml @@ -29,7 +29,7 @@ jobs: steps: # Checkout both the target and integration branches - - uses: actions/checkout@v3.5.0 + - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 with: token: ${{ inputs.reflector_access_token }} fetch-depth: 0 diff --git a/.github/workflows/validate-openapi-spec.yml b/.github/workflows/validate-openapi-spec.yml index 06fc7526a8..1d6c152296 100644 --- a/.github/workflows/validate-openapi-spec.yml +++ b/.github/workflows/validate-openapi-spec.yml @@ -10,8 +10,8 @@ jobs: format: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3.5.0 - - uses: actions/setup-node@v3.6.0 + - uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 + - uses: actions/setup-node@64ed1c7eab4cce3362f8c340dee64e5eaeef8f7c # v3.6.0 with: node-version: '18' - name: Install our tools