diff --git a/.envrc b/.envrc index 036459a4a9..48df8e3c63 100644 --- a/.envrc +++ b/.envrc @@ -6,7 +6,7 @@ PATH_add out/clickhouse PATH_add out/dendrite-stub/bin PATH_add out/mgd/root/opt/oxide/mgd/bin -if nix flake show &> /dev/null +if [ "$OMICRON_USE_FLAKE" = 1 ] && nix flake show &> /dev/null then use flake; -fi \ No newline at end of file +fi diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index 92f7af36d5..68d816fc2d 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -24,7 +24,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@e17a4e247d4a36441181d7758c499d97e1e006bd # v2 + uses: taiki-e/install-action@717ed1cb83959ef327137c2f806e1d8597bfca9f # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index fa99017b0d..724f88e7a3 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -26,10 +26,11 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-22.04, macos-12 ] + os: [ ubuntu-22.04, macos-14 ] steps: # This repo is unstable and unnecessary: https://github.com/microsoft/linux-package-repositories/issues/34 - name: Disable packages.microsoft.com repo + if: ${{ startsWith(matrix.os, 'ubuntu') }} run: sudo rm -f /etc/apt/sources.list.d/microsoft-prod.list - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: diff --git a/.gitignore b/.gitignore index fc3cb4133a..fc5fd5f297 100644 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,6 @@ README.html TODO.html logs out -tools/clickhouse* -tools/cockroach* /clickhouse/ /cockroachdb/ smf/nexus/root.json diff --git a/Cargo.lock b/Cargo.lock index c00f0c17f6..5814dd101a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -332,7 +332,7 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc7b2dbe9169059af0f821e811180fddc971fc210c776c133c7819ccd6e478db" dependencies = [ - "rustix 0.38.30", + "rustix 0.38.31", "tempfile", "windows-sys 0.52.0", ] @@ -466,7 +466,7 @@ source = "git+https://github.com/oxidecomputer/propolis?rev=ff6c4df2e816eee6e7b2 dependencies = [ "bhyve_api_sys", "libc", - "strum", + "strum 0.25.0", ] [[package]] @@ -475,7 +475,7 @@ version = "0.0.0" source = "git+https://github.com/oxidecomputer/propolis?rev=ff6c4df2e816eee6e7b2b0488777d30ef35ee217#ff6c4df2e816eee6e7b2b0488777d30ef35ee217" dependencies = [ "libc", - "strum", + "strum 0.25.0", ] [[package]] @@ -830,6 +830,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "castaway" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a17ed5635fc8536268e5d4de1e22e81ac34419e5f052d4d51f4e01dcc263fcc" +dependencies = [ + "rustversion", +] + [[package]] name = "cbc" version = "0.1.2" @@ -1049,6 +1058,19 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "compact_str" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f86b9c4c00838774a6d902ef931eff7470720c51d90c2e32cfe15dc304737b3f" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "ryu", + "static_assertions", +] + [[package]] name = "console" version = "0.15.8" @@ -1397,6 +1419,27 @@ dependencies = [ "memchr", ] +[[package]] +name = "csv" +version = "1.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + [[package]] name = "ctr" version = "0.9.2" @@ -2020,9 +2063,9 @@ dependencies = [ [[package]] name = "dyn-clone" -version = "1.0.13" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbfc4744c1b8f2a09adc0e55242f60b1af195d88596bd8700be74418c056c555" +checksum = "545b22097d44f8a9581187cdf93de7a71e4722bf51200cfaba810865b49a495d" [[package]] name = "ecdsa" @@ -2224,9 +2267,9 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "2.0.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" [[package]] name = "fatfs" @@ -2247,7 +2290,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef033ed5e9bad94e55838ca0ca906db0e043f517adda0c8b79c7a8c66c93c1b5" dependencies = [ "cfg-if", - "rustix 0.38.30", + "rustix 0.38.31", "windows-sys 0.48.0", ] @@ -2258,7 +2301,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e5768da2206272c81ef0b5e951a41862938a6070da63bcea197899942d3b947" dependencies = [ "cfg-if", - "rustix 0.38.30", + "rustix 0.38.31", "windows-sys 0.52.0", ] @@ -3654,7 +3697,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi 0.3.2", - "rustix 0.38.30", + "rustix 0.38.31", "windows-sys 0.48.0", ] @@ -4180,6 +4223,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "multimap" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +dependencies = [ + "serde", +] + [[package]] name = "nanorand" version = "0.7.0" @@ -4313,7 +4365,7 @@ dependencies = [ "serde_json", "sled-agent-client", "steno", - "strum", + "strum 0.25.0", "thiserror", "uuid", ] @@ -4385,7 +4437,7 @@ dependencies = [ "slog", "static_assertions", "steno", - "strum", + "strum 0.25.0", "subprocess", "swrite", "term", @@ -4448,7 +4500,7 @@ dependencies = [ "serde_json", "sled-agent-client", "slog", - "strum", + "strum 0.25.0", "thiserror", "tokio", "uuid", @@ -4537,7 +4589,7 @@ dependencies = [ "serde_json", "sled-agent-client", "steno", - "strum", + "strum 0.25.0", "thiserror", "uuid", ] @@ -4854,7 +4906,7 @@ dependencies = [ "serde_urlencoded", "serde_with", "slog", - "strum", + "strum 0.25.0", "test-strategy", "thiserror", "tokio", @@ -5057,7 +5109,7 @@ dependencies = [ "slog-term", "sp-sim", "steno", - "strum", + "strum 0.25.0", "subprocess", "tempfile", "term", @@ -5080,9 +5132,12 @@ dependencies = [ "async-bb8-diesel", "chrono", "clap 4.4.3", + "crossterm", "crucible-agent-client", + "csv", "diesel", "dropshot", + "dyn-clone", "expectorate", "futures", "gateway-client", @@ -5091,6 +5146,7 @@ dependencies = [ "humantime", "internal-dns", "ipnetwork", + "multimap", "nexus-client", "nexus-db-model", 
"nexus-db-queries", @@ -5104,13 +5160,14 @@ dependencies = [ "omicron-workspace-hack", "oximeter-client", "pq-sys", + "ratatui", "regex", "serde", "serde_json", "sled-agent-client", "slog", "slog-error-chain", - "strum", + "strum 0.25.0", "subprocess", "tabled", "textwrap 0.16.0", @@ -5145,7 +5202,7 @@ dependencies = [ "slog-bunyan", "slog-term", "smf", - "strum", + "strum 0.25.0", "swrite", "tar", "thiserror", @@ -5383,7 +5440,7 @@ dependencies = [ "regex-syntax 0.8.2", "reqwest", "ring 0.17.7", - "rustix 0.38.30", + "rustix 0.38.31", "schemars", "semver 1.0.21", "serde", @@ -5669,7 +5726,7 @@ dependencies = [ "schemars", "serde", "serde_json", - "strum", + "strum 0.25.0", "thiserror", "trybuild", "uuid", @@ -5722,7 +5779,7 @@ dependencies = [ "slog-async", "slog-dtrace", "slog-term", - "strum", + "strum 0.25.0", "subprocess", "thiserror", "tokio", @@ -5764,7 +5821,7 @@ dependencies = [ "slog-term", "sqlformat", "sqlparser", - "strum", + "strum 0.25.0", "tabled", "tempfile", "thiserror", @@ -5932,27 +5989,26 @@ dependencies = [ [[package]] name = "parse-display" -version = "0.8.2" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6509d08722b53e8dafe97f2027b22ccbe3a5db83cb352931e9716b0aa44bc5c" +checksum = "06af5f9333eb47bd9ba8462d612e37a8328a5cb80b13f0af4de4c3b89f52dee5" dependencies = [ - "once_cell", "parse-display-derive", "regex", + "regex-syntax 0.8.2", ] [[package]] name = "parse-display-derive" -version = "0.8.2" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68517892c8daf78da08c0db777fcc17e07f2f63ef70041718f8a7630ad84f341" +checksum = "dc9252f259500ee570c75adcc4e317fa6f57a1e47747d622e0bf838002a7b790" dependencies = [ - "once_cell", "proc-macro2", "quote", "regex", - "regex-syntax 0.7.5", - "structmeta", + "regex-syntax 0.8.2", + "structmeta 0.3.0", "syn 2.0.48", ] @@ -6850,19 +6906,20 @@ dependencies = [ [[package]] name = "ratatui" -version = "0.25.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5659e52e4ba6e07b2dad9f1158f578ef84a73762625ddb51536019f34d180eb" +checksum = "154b85ef15a5d1719bcaa193c3c81fe645cd120c156874cd660fe49fd21d1373" dependencies = [ "bitflags 2.4.0", "cassowary", + "compact_str", "crossterm", "indoc 2.0.3", "itertools 0.12.1", "lru", "paste", "stability", - "strum", + "strum 0.26.1", "unicode-segmentation", "unicode-width", ] @@ -6959,7 +7016,7 @@ dependencies = [ "nu-ansi-term", "serde", "strip-ansi-escapes", - "strum", + "strum 0.25.0", "strum_macros 0.25.2", "thiserror", "unicode-segmentation", @@ -7027,12 +7084,6 @@ version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" -[[package]] -name = "regex-syntax" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" - [[package]] name = "regex-syntax" version = "0.8.2" @@ -7398,9 +7449,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.30" +version = "0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "322394588aaf33c24007e8bb3238ee3e4c5c09c084ab32bc73890b99ff326bca" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ "bitflags 2.4.0", "errno", @@ -7572,8 +7623,7 @@ dependencies = [ [[package]] name = "samael" version = "0.0.14" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b75583aad4a51c50fc0af69c230d18078c9d5a69a98d0f6013d01053acf744f4" +source = "git+https://github.com/oxidecomputer/samael?branch=oxide/omicron#9e609a8f6fa0dd84e3bb8f579f46bd780c8be62b" dependencies = [ "base64", "bindgen", @@ -8118,6 +8168,7 @@ dependencies = [ name = "sled-agent-client" version = "0.1.0" dependencies = [ + "anyhow", "async-trait", "chrono", "ipnetwork", @@ -8615,7 +8666,19 @@ checksum = "78ad9e09554f0456d67a69c1584c9798ba733a5b50349a6c0d0948710523922d" dependencies = [ "proc-macro2", "quote", - "structmeta-derive", + "structmeta-derive 0.2.0", + "syn 2.0.48", +] + +[[package]] +name = "structmeta" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e1575d8d40908d70f6fd05537266b90ae71b15dbbe7a8b7dffa2b759306d329" +dependencies = [ + "proc-macro2", + "quote", + "structmeta-derive 0.3.0", "syn 2.0.48", ] @@ -8630,6 +8693,17 @@ dependencies = [ "syn 2.0.48", ] +[[package]] +name = "structmeta-derive" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = "structopt" version = "0.3.26" @@ -8663,6 +8737,15 @@ dependencies = [ "strum_macros 0.25.2", ] +[[package]] +name = "strum" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "723b93e8addf9aa965ebe2d11da6d7540fa2283fcea14b3371ff055f7ba13f5f" +dependencies = [ + "strum_macros 0.26.1", +] + [[package]] name = "strum_macros" version = "0.24.3" @@ -8689,6 +8772,19 @@ dependencies = [ "syn 2.0.48", ] +[[package]] +name = "strum_macros" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a3417fc93d76740d974a01654a09777cb500428cc874ca9f45edfe0c4d4cd18" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.48", +] + [[package]] name = "subprocess" version = "0.2.9" @@ -8853,14 +8949,13 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.9.0" +version = "3.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +checksum = "a365e8cd18e44762ef95d87f284f4b5cd04107fec2ff3052bd6a3e6069669e67" dependencies = [ "cfg-if", "fastrand", - "redox_syscall 0.4.1", - "rustix 0.38.30", + "rustix 0.38.31", "windows-sys 0.52.0", ] @@ -8917,7 +9012,7 @@ checksum = "b8361c808554228ad09bfed70f5c823caf8a3450b6881cc3a38eb57e8c08c1d9" dependencies = [ "proc-macro2", "quote", - "structmeta", + "structmeta 0.2.0", "syn 2.0.48", ] @@ -9597,9 +9692,9 @@ dependencies = [ [[package]] name = "tui-tree-widget" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "136011b328c4f392499a02c4b5b78d509fb297bf9c10f2bda5d11d65cb946e4c" +checksum = "5c317bb061f42d943a2eb118b5de0ee98fc2443f0631e54b24a19de014a28810" dependencies = [ "ratatui", "unicode-width", diff --git a/Cargo.toml b/Cargo.toml index 6e4799d184..65197da650 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -183,6 +183,7 @@ crossterm = { version = "0.27.0", features = ["event-stream"] } crucible-agent-client = { git = "https://github.com/oxidecomputer/crucible", rev = "2d4bc11232d53f177c286383926fa5f8c1b2a938" } crucible-pantry-client = { git = "https://github.com/oxidecomputer/crucible", rev = 
"2d4bc11232d53f177c286383926fa5f8c1b2a938" } crucible-smf = { git = "https://github.com/oxidecomputer/crucible", rev = "2d4bc11232d53f177c286383926fa5f8c1b2a938" } +csv = "1.3.0" curve25519-dalek = "4" datatest-stable = "0.2.3" display-error-chain = "0.2.0" @@ -197,6 +198,7 @@ dns-server = { path = "dns-server" } dns-service-client = { path = "clients/dns-service-client" } dpd-client = { path = "clients/dpd-client" } dropshot = { git = "https://github.com/oxidecomputer/dropshot", branch = "main", features = [ "usdt-probes" ] } +dyn-clone = "1.0.16" either = "1.9.0" expectorate = "1.1.0" fatfs = "0.3.6" @@ -248,6 +250,7 @@ mime_guess = "2.0.4" mockall = "0.12" newtype_derive = "0.1.6" mg-admin-client = { path = "clients/mg-admin-client" } +multimap = "0.8.1" nexus-blueprint-execution = { path = "nexus/blueprint-execution" } nexus-client = { path = "clients/nexus-client" } nexus-db-model = { path = "nexus/db-model" } @@ -293,7 +296,7 @@ oximeter-instruments = { path = "oximeter/instruments" } oximeter-macro-impl = { path = "oximeter/oximeter-macro-impl" } oximeter-producer = { path = "oximeter/producer" } p256 = "0.13" -parse-display = "0.8.2" +parse-display = "0.9.0" partial-io = { version = "0.5.4", features = ["proptest1", "tokio1"] } parse-size = "1.0.0" paste = "1.0.14" @@ -314,7 +317,7 @@ propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev proptest = "1.4.0" quote = "1.0" rand = "0.8.5" -ratatui = "0.25.0" +ratatui = "0.26.0" rayon = "1.8" rcgen = "0.12.1" reedline = "0.28.0" @@ -378,7 +381,7 @@ syn = { version = "2.0" } tabled = "0.15.0" tar = "0.4" tempdir = "0.3" -tempfile = "3.9" +tempfile = "3.10" term = "0.7" termios = "0.3" textwrap = "0.16.0" @@ -400,7 +403,7 @@ trust-dns-server = "0.22" trybuild = "1.0.89" tufaceous = { path = "tufaceous" } tufaceous-lib = { path = "tufaceous-lib" } -tui-tree-widget = "0.16.0" +tui-tree-widget = "0.17.0" unicode-width = "0.1.11" update-common = { path = "update-common" } update-engine = { path = "update-engine" } @@ -611,3 +614,8 @@ branch = "oxide/omicron" # to it. [patch.crates-io.omicron-workspace-hack] path = "workspace-hack" + +# Pulls in https://github.com/njaremko/samael/pull/41 +[patch.crates-io.samael] +git = "https://github.com/oxidecomputer/samael" +branch = "oxide/omicron" diff --git a/clients/ddm-admin-client/build.rs b/clients/ddm-admin-client/build.rs index da74ee9962..c51ec05faa 100644 --- a/clients/ddm-admin-client/build.rs +++ b/clients/ddm-admin-client/build.rs @@ -33,7 +33,9 @@ fn main() -> Result<()> { // Report a relatively verbose error if we haven't downloaded the requisite // openapi spec. let local_path = - format!("../../out/downloads/ddm-admin-{commit}.json"); + env::var("DDM_OPENAPI_PATH").unwrap_or_else(|_| { + format!("../../out/downloads/ddm-admin-{commit}.json") + }); if !Path::new(&local_path).exists() { bail!("{local_path} doesn't exist; rerun `tools/ci_download_maghemite_openapi` (after updating `tools/maghemite_ddm_openapi_version` if the maghemite commit in package-manifest.toml has changed)"); } diff --git a/clients/dpd-client/build.rs b/clients/dpd-client/build.rs index 6a65ab9495..536869b4a2 100644 --- a/clients/dpd-client/build.rs +++ b/clients/dpd-client/build.rs @@ -38,7 +38,10 @@ fn main() -> Result<()> { PackageSource::Prebuilt { commit, .. } => { // Report a relatively verbose error if we haven't downloaded the // requisite openapi spec. 
- let local_path = format!("../../out/downloads/dpd-{commit}.json"); + let local_path = + env::var("DPD_OPENAPI_PATH").unwrap_or_else(|_| { + format!("../../out/downloads/dpd-{commit}.json") + }); if !Path::new(&local_path).exists() { bail!("{local_path} doesn't exist; rerun `tools/ci_download_dendrite_openapi` (after updating `tools/dendrite_openapi_version` if the dendrite commit in package-manifest.toml has changed)"); } diff --git a/clients/mg-admin-client/build.rs b/clients/mg-admin-client/build.rs index dcc7ae61cb..d9886d0ece 100644 --- a/clients/mg-admin-client/build.rs +++ b/clients/mg-admin-client/build.rs @@ -31,8 +31,9 @@ fn main() -> Result<()> { PackageSource::Prebuilt { commit, .. } => { // Report a relatively verbose error if we haven't downloaded the requisite // openapi spec. - let local_path = - format!("../../out/downloads/mg-admin-{commit}.json"); + let local_path = env::var("MG_OPENAPI_PATH").unwrap_or_else(|_| { + format!("../../out/downloads/mg-admin-{commit}.json") + }); if !Path::new(&local_path).exists() { bail!("{local_path} doesn't exist; rerun `tools/ci_download_maghemite_openapi` (after updating `tools/maghemite_mg_openapi_version` if the maghemite commit in package-manifest.toml has changed)"); } diff --git a/clients/sled-agent-client/Cargo.toml b/clients/sled-agent-client/Cargo.toml index 8630030b24..71b94441ed 100644 --- a/clients/sled-agent-client/Cargo.toml +++ b/clients/sled-agent-client/Cargo.toml @@ -5,6 +5,7 @@ edition = "2021" license = "MPL-2.0" [dependencies] +anyhow.workspace = true async-trait.workspace = true chrono.workspace = true omicron-common.workspace = true diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 39de64ec62..eb1e57b11f 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -4,8 +4,11 @@ //! Interface for making API requests to a Sled Agent +use anyhow::Context; use async_trait::async_trait; use std::convert::TryFrom; +use std::net::IpAddr; +use std::net::SocketAddr; use uuid::Uuid; progenitor::generate_api!( @@ -86,6 +89,74 @@ impl types::OmicronZoneType { | types::OmicronZoneType::Oximeter { .. } => false, } } + + /// Identifies whether this is a Nexus zone + pub fn is_nexus(&self) -> bool { + match self { + types::OmicronZoneType::Nexus { .. } => true, + + types::OmicronZoneType::BoundaryNtp { .. } + | types::OmicronZoneType::InternalNtp { .. } + | types::OmicronZoneType::Clickhouse { .. } + | types::OmicronZoneType::ClickhouseKeeper { .. } + | types::OmicronZoneType::CockroachDb { .. } + | types::OmicronZoneType::Crucible { .. } + | types::OmicronZoneType::CruciblePantry { .. } + | types::OmicronZoneType::ExternalDns { .. } + | types::OmicronZoneType::InternalDns { .. } + | types::OmicronZoneType::Oximeter { .. } => false, + } + } + + /// This zone's external IP + pub fn external_ip(&self) -> anyhow::Result<Option<IpAddr>> { + match self { + types::OmicronZoneType::Nexus { external_ip, .. } => { + Ok(Some(*external_ip)) + } + + types::OmicronZoneType::ExternalDns { dns_address, .. } => { + let dns_address = + dns_address.parse::<SocketAddr>().with_context(|| { + format!( + "failed to parse ExternalDns address {dns_address}" + ) + })?; + Ok(Some(dns_address.ip())) + } + + types::OmicronZoneType::BoundaryNtp { snat_cfg, .. } => { + Ok(Some(snat_cfg.ip)) + } + + types::OmicronZoneType::InternalNtp { .. } + | types::OmicronZoneType::Clickhouse { .. } + | types::OmicronZoneType::ClickhouseKeeper { .. } + | types::OmicronZoneType::CockroachDb { ..
} + | types::OmicronZoneType::Crucible { .. } + | types::OmicronZoneType::CruciblePantry { .. } + | types::OmicronZoneType::InternalDns { .. } + | types::OmicronZoneType::Oximeter { .. } => Ok(None), + } + } + + /// The service vNIC providing external connectivity to this zone + pub fn service_vnic(&self) -> Option<&types::NetworkInterface> { + match self { + types::OmicronZoneType::Nexus { nic, .. } + | types::OmicronZoneType::ExternalDns { nic, .. } + | types::OmicronZoneType::BoundaryNtp { nic, .. } => Some(nic), + + types::OmicronZoneType::InternalNtp { .. } + | types::OmicronZoneType::Clickhouse { .. } + | types::OmicronZoneType::ClickhouseKeeper { .. } + | types::OmicronZoneType::CockroachDb { .. } + | types::OmicronZoneType::Crucible { .. } + | types::OmicronZoneType::CruciblePantry { .. } + | types::OmicronZoneType::InternalDns { .. } + | types::OmicronZoneType::Oximeter { .. } => None, + } + } } impl omicron_common::api::external::ClientError for types::Error { @@ -351,7 +422,6 @@ impl From for types::Ipv6Net { impl From<std::net::IpAddr> for types::IpNet { fn from(s: std::net::IpAddr) -> Self { - use std::net::IpAddr; match s { IpAddr::V4(v4) => Self::V4(v4.into()), IpAddr::V6(v6) => Self::V6(v6.into()), diff --git a/clients/wicketd-client/src/lib.rs b/clients/wicketd-client/src/lib.rs index 01c3b04f87..09f9ca1418 100644 --- a/clients/wicketd-client/src/lib.rs +++ b/clients/wicketd-client/src/lib.rs @@ -51,6 +51,7 @@ progenitor::generate_api!( CurrentRssUserConfigInsensitive = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, CurrentRssUserConfigSensitive = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, CurrentRssUserConfig = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, + UserSpecifiedRackNetworkConfig = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, GetLocationResponse = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, }, replace = { diff --git a/common/src/address.rs b/common/src/address.rs index 65a6604daf..152fb9319e 100644 --- a/common/src/address.rs +++ b/common/src/address.rs @@ -24,6 +24,12 @@ pub const MAX_PORT: u16 = u16::MAX; /// minimum possible value for a tcp or udp port pub const MIN_PORT: u16 = u16::MIN; +/// The amount of redundancy for Nexus services. +/// +/// This is used by both RSS (to distribute the initial set of services) and the +/// Reconfigurator (to know whether to add new Nexus zones) +pub const NEXUS_REDUNDANCY: usize = 3; + /// The amount of redundancy for internal DNS servers. /// /// Must be less than or equal to MAX_DNS_REDUNDANCY. @@ -457,6 +463,18 @@ impl TryFrom<(Ipv6Addr, Ipv6Addr)> for IpRange { } } +impl From<Ipv4Range> for IpRange { + fn from(value: Ipv4Range) -> Self { + Self::V4(value) + } +} + +impl From<Ipv6Range> for IpRange { + fn from(value: Ipv6Range) -> Self { + Self::V6(value) + } +} + /// A non-decreasing IPv4 address range, inclusive of both ends. /// /// The first address must be less than or equal to the last address.
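// [Illustration, not part of the diff] A sketch of how two of the additions
// above compose: `is_nexus()` with the new `NEXUS_REDUNDANCY` constant to
// measure how many Nexus zones are missing, and `external_ip()` to collect
// every external address. The function names and the `zones` input slice are
// hypothetical; error handling follows the helpers' `anyhow::Result`
// convention.
fn nexus_deficit(zones: &[sled_agent_client::types::OmicronZoneType]) -> usize {
    let nexus_count = zones.iter().filter(|z| z.is_nexus()).count();
    omicron_common::address::NEXUS_REDUNDANCY.saturating_sub(nexus_count)
}

fn zone_external_ips(
    zones: &[sled_agent_client::types::OmicronZoneType],
) -> anyhow::Result<Vec<std::net::IpAddr>> {
    // external_ip() returns Ok(None) for zone kinds without external
    // connectivity, so flatten() keeps only Nexus, ExternalDns, and
    // BoundaryNtp addresses.
    let ips = zones
        .iter()
        .map(|zone| zone.external_ip())
        .collect::<anyhow::Result<Vec<_>>>()?;
    Ok(ips.into_iter().flatten().collect())
}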
diff --git a/common/src/nexus_config.rs b/common/src/nexus_config.rs index 24f4c34797..2545d4cb91 100644 --- a/common/src/nexus_config.rs +++ b/common/src/nexus_config.rs @@ -340,6 +340,8 @@ pub struct BackgroundTaskConfig { pub sync_service_zone_nat: SyncServiceZoneNatConfig, /// configuration for the bfd manager task pub bfd_manager: BfdManagerConfig, + /// configuration for region replacement task + pub region_replacement: RegionReplacementConfig, } #[serde_as] @@ -444,6 +446,14 @@ pub struct BlueprintTasksConfig { pub period_secs_execute: Duration, } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct RegionReplacementConfig { + /// period (in seconds) for periodic activations of this background task + #[serde_as(as = "DurationSeconds")] + pub period_secs: Duration, +} + /// Configuration for a nexus server #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct PackageConfig { @@ -548,8 +558,8 @@ mod test { ConfigDropshotWithTls, ConsoleConfig, Database, DeploymentConfig, DnsTasksConfig, DpdConfig, ExternalEndpointsConfig, InternalDns, InventoryConfig, LoadError, LoadErrorKind, MgdConfig, NatCleanupConfig, - PackageConfig, PhantomDiskConfig, SchemeName, TimeseriesDbConfig, - Tunables, UpdatesConfig, + PackageConfig, PhantomDiskConfig, RegionReplacementConfig, SchemeName, + TimeseriesDbConfig, Tunables, UpdatesConfig, }; use crate::address::{Ipv6Subnet, RACK_PREFIX}; use crate::api::internal::shared::SwitchLocation; @@ -706,6 +716,7 @@ mod test { blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 sync_service_zone_nat.period_secs = 30 + region_replacement.period_secs = 30 [default_region_allocation_strategy] type = "random" seed = 0 @@ -819,7 +830,10 @@ mod test { }, sync_service_zone_nat: SyncServiceZoneNatConfig { period_secs: Duration::from_secs(30) - } + }, + region_replacement: RegionReplacementConfig { + period_secs: Duration::from_secs(30), + }, }, default_region_allocation_strategy: crate::nexus_config::RegionAllocationStrategy::Random { @@ -882,6 +896,7 @@ mod test { blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 sync_service_zone_nat.period_secs = 30 + region_replacement.period_secs = 30 [default_region_allocation_strategy] type = "random" "##, diff --git a/dev-tools/omdb/Cargo.toml b/dev-tools/omdb/Cargo.toml index e08d5f9477..3f566f55ee 100644 --- a/dev-tools/omdb/Cargo.toml +++ b/dev-tools/omdb/Cargo.toml @@ -12,9 +12,12 @@ anyhow.workspace = true async-bb8-diesel.workspace = true chrono.workspace = true clap.workspace = true +crossterm.workspace = true crucible-agent-client.workspace = true +csv.workspace = true diesel.workspace = true dropshot.workspace = true +dyn-clone.workspace = true futures.workspace = true gateway-client.workspace = true gateway-messages.workspace = true @@ -29,6 +32,7 @@ omicron-common.workspace = true oximeter-client.workspace = true # See omicron-rpaths for more about the "pq-sys" dependency. 
pq-sys = "*" +ratatui.workspace = true serde.workspace = true serde_json.workspace = true sled-agent-client.workspace = true @@ -43,6 +47,7 @@ uuid.workspace = true ipnetwork.workspace = true omicron-workspace-hack.workspace = true nexus-test-utils.workspace = true +multimap.workspace = true [dev-dependencies] expectorate.workspace = true diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index c2a4250595..9c41c25cc0 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -160,6 +160,8 @@ pub struct DbFetchOptions { /// Subcommands that query or update the database #[derive(Debug, Subcommand)] enum DbCommands { + /// Print information about the rack + Rack(RackArgs), /// Print information about disks Disks(DiskArgs), /// Print information about internal and external DNS @@ -180,6 +182,18 @@ enum DbCommands { Validate(ValidateArgs), } +#[derive(Debug, Args)] +struct RackArgs { + #[command(subcommand)] + command: RackCommands, +} + +#[derive(Debug, Subcommand)] +enum RackCommands { + /// Summarize current racks + List, +} + #[derive(Debug, Args)] struct DiskArgs { #[command(subcommand)] @@ -399,13 +413,16 @@ impl DbArgs { // here. We will then check the schema version explicitly and warn the // user if it doesn't match. let datastore = Arc::new( - DataStore::new_unchecked(pool) + DataStore::new_unchecked(log.clone(), pool) .map_err(|e| anyhow!(e).context("creating datastore"))?, ); check_schema_version(&datastore).await; let opctx = OpContext::for_tests(log.clone(), datastore.clone()); match &self.command { + DbCommands::Rack(RackArgs { command: RackCommands::List }) => { + cmd_db_rack_list(&opctx, &datastore, &self.fetch_opts).await + } DbCommands::Disks(DiskArgs { command: DiskCommands::Info(uuid), }) => cmd_db_disk_info(&opctx, &datastore, uuid).await, @@ -619,6 +636,50 @@ async fn cmd_db_disk_list( Ok(()) } +/// Run `omdb db rack list`. +async fn cmd_db_rack_list( + opctx: &OpContext, + datastore: &DataStore, + fetch_opts: &DbFetchOptions, +) -> Result<(), anyhow::Error> { + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct RackRow { + id: String, + initialized: bool, + tuf_base_url: String, + rack_subnet: String, + } + + let ctx = || "listing racks".to_string(); + + let limit = fetch_opts.fetch_limit; + let rack_list = datastore + .rack_list(opctx, &first_page(limit)) + .await + .context("listing racks")?; + check_limit(&rack_list, limit, ctx); + + let rows = rack_list.into_iter().map(|rack| RackRow { + id: rack.id().to_string(), + initialized: rack.initialized, + tuf_base_url: rack.tuf_base_url.unwrap_or_else(|| "-".to_string()), + rack_subnet: rack + .rack_subnet + .map(|subnet| subnet.to_string()) + .unwrap_or_else(|| "-".to_string()), + }); + + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + + println!("{}", table); + + Ok(()) +} + /// Run `omdb db disk info <UUID>`.
async fn cmd_db_disk_info( opctx: &OpContext, diff --git a/dev-tools/omdb/src/bin/omdb/mgs.rs b/dev-tools/omdb/src/bin/omdb/mgs.rs index 770cba9f62..ece4c4f109 100644 --- a/dev-tools/omdb/src/bin/omdb/mgs.rs +++ b/dev-tools/omdb/src/bin/omdb/mgs.rs @@ -22,6 +22,12 @@ use gateway_client::types::SpState; use gateway_client::types::SpType; use tabled::Tabled; +mod dashboard; +mod sensors; + +use dashboard::DashboardArgs; +use sensors::SensorsArgs; + /// Arguments to the "omdb mgs" subcommand #[derive(Debug, Args)] pub struct MgsArgs { @@ -35,19 +41,25 @@ pub struct MgsArgs { #[derive(Debug, Subcommand)] enum MgsCommands { + /// Dashboard of SPs + Dashboard(DashboardArgs), + /// Show information about devices and components visible to MGS Inventory(InventoryArgs), + + /// Show information about sensors, as gleaned by MGS + Sensors(SensorsArgs), } #[derive(Debug, Args)] struct InventoryArgs {} impl MgsArgs { - pub(crate) async fn run_cmd( + async fn mgs_client( &self, omdb: &Omdb, log: &slog::Logger, - ) -> Result<(), anyhow::Error> { + ) -> Result<gateway_client::Client, anyhow::Error> { let mgs_url = match &self.mgs_url { Some(cli_or_env_url) => cli_or_env_url.clone(), None => { @@ -68,11 +80,24 @@ impl MgsArgs { } }; eprintln!("note: using MGS URL {}", &mgs_url); - let mgs_client = gateway_client::Client::new(&mgs_url, log.clone()); + Ok(gateway_client::Client::new(&mgs_url, log.clone())) + } + pub(crate) async fn run_cmd( + &self, + omdb: &Omdb, + log: &slog::Logger, + ) -> Result<(), anyhow::Error> { match &self.command { - MgsCommands::Inventory(inventory_args) => { - cmd_mgs_inventory(&mgs_client, inventory_args).await + MgsCommands::Dashboard(args) => { + dashboard::cmd_mgs_dashboard(omdb, log, self, args).await + } + MgsCommands::Inventory(args) => { + let mgs_client = self.mgs_client(omdb, log).await?; + cmd_mgs_inventory(&mgs_client, args).await + } + MgsCommands::Sensors(args) => { + sensors::cmd_mgs_sensors(omdb, log, self, args).await } } } @@ -156,6 +181,10 @@ fn sp_type_to_str(s: &SpType) -> &'static str { } } +fn sp_to_string(s: &SpIdentifier) -> String { + format!("{} {}", sp_type_to_str(&s.type_), s.slot) +} + fn show_sp_ids(sp_ids: &[SpIdentifier]) -> Result<(), anyhow::Error> { #[derive(Tabled)] #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] diff --git a/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs b/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs new file mode 100644 index 0000000000..153618b7c0 --- /dev/null +++ b/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs @@ -0,0 +1,1113 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//!
Code for the MGS dashboard subcommand + +use anyhow::{Context, Result}; +use chrono::{Local, Offset, TimeZone}; +use crossterm::{ + event::{ + self, DisableMouseCapture, EnableMouseCapture, Event, KeyCode, + KeyModifiers, + }, + execute, + terminal::{ + disable_raw_mode, enable_raw_mode, EnterAlternateScreen, + LeaveAlternateScreen, + }, +}; +use dyn_clone::DynClone; +use ratatui::{ + backend::{Backend, CrosstermBackend}, + layout::{Alignment, Constraint, Direction, Layout, Rect}, + style::{Color, Modifier, Style}, + symbols, + text::{Line, Span}, + widgets::{ + Axis, Block, Borders, Chart, Dataset, List, ListItem, ListState, + Paragraph, + }, + Frame, Terminal, +}; + +use crate::mgs::sensors::{ + sensor_data, sensor_metadata, SensorId, SensorInput, SensorMetadata, + SensorValues, SensorsArgs, +}; +use crate::mgs::sp_to_string; +use clap::Args; +use gateway_client::types::MeasurementKind; +use gateway_client::types::SpIdentifier; +use multimap::MultiMap; +use std::collections::HashMap; +use std::fs::File; +use std::io; +use std::time::{Duration, Instant, SystemTime}; + +#[derive(Debug, Args)] +pub(crate) struct DashboardArgs { + #[clap(flatten)] + sensors_args: SensorsArgs, + + /// simulate real-time with input + #[clap(long)] + simulate_realtime: bool, +} + +struct StatefulList { + state: ListState, + n: usize, +} + +impl StatefulList { + fn next(&mut self) { + self.state.select(match self.state.selected() { + Some(ndx) => Some((ndx + 1) % self.n), + None => Some(0), + }); + } + + fn previous(&mut self) { + self.state.select(match self.state.selected() { + Some(0) => Some(self.n - 1), + Some(ndx) => Some(ndx - 1), + None => Some(0), + }); + } + + fn unselect(&mut self) { + self.state.select(None); + } + + fn selected(&self) -> Option<usize> { + self.state.selected() + } +} + +struct Series { + name: String, + color: Color, + data: Vec<(f64, f64)>, + raw: Vec<Option<f32>>, +} + +trait Attributes: DynClone { + fn label(&self) -> String; + fn legend_label(&self) -> String; + fn x_axis_label(&self) -> String { + "Time".to_string() + } + fn y_axis_label(&self) -> String; + fn axis_value(&self, val: f64) -> String; + fn legend_value(&self, val: f64) -> String; + + fn increase(&mut self, _ndx: usize) -> Option { + None + } + + fn decrease(&mut self, _ndx: usize) -> Option { + None + } + + fn clear(&mut self) {} +} + +dyn_clone::clone_trait_object!(Attributes); + +#[derive(Clone)] +struct TempGraph; + +impl Attributes for TempGraph { + fn label(&self) -> String { + "Temperature".to_string() + } + fn legend_label(&self) -> String { + "Sensors".to_string() + } + + fn y_axis_label(&self) -> String { + "Degrees Celsius".to_string() + } + + fn axis_value(&self, val: f64) -> String { + format!("{:2.0}°", val) + } + + fn legend_value(&self, val: f64) -> String { + format!("{:4.2}°", val) + } +} + +#[derive(Clone)] +struct FanGraph; + +impl Attributes for FanGraph { + fn label(&self) -> String { + "Fan speed".to_string() + } + fn legend_label(&self) -> String { + "Fans".to_string() + } + + fn y_axis_label(&self) -> String { + "RPM".to_string() + } + + fn axis_value(&self, val: f64) -> String { + format!("{:3.1}K", val / 1000.0) + } + + fn legend_value(&self, val: f64) -> String { + format!("{:.0}", val) + } +} + +#[derive(Clone)] +struct CurrentGraph; + +impl Attributes for CurrentGraph { + fn label(&self) -> String { + "Output current".to_string() + } + + fn legend_label(&self) -> String { + "Regulators".to_string() + } + + fn y_axis_label(&self) -> String { + "Rails".to_string() + } + + fn axis_value(&self, val: f64) ->
String { + format!("{:2.2}A", val) + } + + fn legend_value(&self, val: f64) -> String { + format!("{:3.2}A", val) + } +} + +#[derive(Clone)] +struct VoltageGraph; + +impl Attributes for VoltageGraph { + fn label(&self) -> String { + "Voltage".to_string() + } + + fn legend_label(&self) -> String { + "Rails".to_string() + } + + fn y_axis_label(&self) -> String { + "Volts".to_string() + } + + fn axis_value(&self, val: f64) -> String { + format!("{:2.2}V", val) + } + + fn legend_value(&self, val: f64) -> String { + format!("{:3.2}V", val) + } +} + +#[derive(Clone)] +struct SensorGraph; + +impl Attributes for SensorGraph { + fn label(&self) -> String { + "Sensor output".to_string() + } + + fn legend_label(&self) -> String { + "Sensors".to_string() + } + + fn y_axis_label(&self) -> String { + "Units".to_string() + } + + fn axis_value(&self, val: f64) -> String { + format!("{:2.2}", val) + } + + fn legend_value(&self, val: f64) -> String { + format!("{:3.2}", val) + } +} + +struct Graph { + series: Vec<Series>, + legend: StatefulList, + time: usize, + width: usize, + offs: usize, + interpolate: usize, + bounds: [f64; 2], + attributes: Box<dyn Attributes>, +} + +impl Graph { + fn new(all: &[String], attr: Box<dyn Attributes>) -> Result<Graph> { + let mut series = vec![]; + + let colors = [ + Color::Yellow, + Color::Green, + Color::Magenta, + Color::White, + Color::Red, + Color::LightRed, + Color::Blue, + Color::LightMagenta, + Color::LightYellow, + Color::LightCyan, + Color::LightGreen, + Color::LightBlue, + Color::LightRed, + ]; + + for (ndx, s) in all.iter().enumerate() { + series.push(Series { + name: s.to_string(), + color: colors[ndx % colors.len()], + data: Vec::new(), + raw: Vec::new(), + }) + } + + Ok(Graph { + series, + legend: StatefulList { state: ListState::default(), n: all.len() }, + time: 0, + width: 600, + offs: 0, + interpolate: 0, + bounds: [20.0, 120.0], + attributes: attr, + }) + } + + fn flip(from: &[(&Self, String)], series_ndx: usize) -> Self { + let rep = from[0].0; + let mut series = vec![]; + + let colors = [ + Color::Yellow, + Color::Green, + Color::Magenta, + Color::White, + Color::Red, + Color::LightRed, + Color::Blue, + Color::LightMagenta, + Color::LightYellow, + Color::LightCyan, + Color::LightGreen, + Color::LightBlue, + Color::LightRed, + ]; + + for (ndx, (graph, name)) in from.iter().enumerate() { + series.push(Series { + name: name.clone(), + color: colors[ndx % colors.len()], + data: graph.series[series_ndx].data.clone(), + raw: graph.series[series_ndx].raw.clone(), + }); + } + + Graph { + series, + legend: StatefulList { state: ListState::default(), n: from.len() }, + time: rep.time, + width: rep.width, + offs: rep.offs, + interpolate: rep.interpolate, + bounds: rep.bounds, + attributes: rep.attributes.clone(), + } + } + + fn data(&mut self, data: &[Option<f32>]) { + for (ndx, s) in self.series.iter_mut().enumerate() { + s.raw.push(data[ndx]); + } + + self.time += 1; + + if self.offs > 0 { + self.offs += 1; + } + } + + fn update_data(&mut self) { + for s in &mut self.series { + s.data = Vec::new(); + } + + for i in 0..self.width { + if self.time < (self.width - i) + self.offs { + continue; + } + + let offs = self.time - (self.width - i) - self.offs; + + for (_ndx, s) in &mut self.series.iter_mut().enumerate() { + if let Some(datum) = s.raw[offs] { + let point = (i as f64, datum as f64); + + if self.interpolate != 0 { + if let Some(last) = s.data.last() { + let x_delta = point.0 - last.0; + let slope = (point.1 - last.1) / x_delta; + let x_inc = x_delta / self.interpolate as f64; + + for x in 0..self.interpolate
{ + s.data.push(( + point.0 + x as f64 * x_inc, + point.1 + (slope * x_inc), + )); + } + } + } + + s.data.push((i as f64, datum as f64)); + } + } + } + + self.update_bounds(); + } + + fn update_bounds(&mut self) { + let selected = self.legend.state.selected(); + let mut min = None; + let mut max = None; + + for (ndx, s) in self.series.iter().enumerate() { + if let Some(selected) = selected { + if ndx != selected { + continue; + } + } + + for (_, datum) in &s.data { + min = match min { + Some(min) if datum < min => Some(datum), + None => Some(datum), + _ => min, + }; + + max = match max { + Some(max) if datum > max => Some(datum), + None => Some(datum), + _ => max, + }; + } + } + + if let Some(min) = min { + self.bounds[0] = ((min * 0.85) / 2.0) * 2.0; + } + + if self.bounds[0] < 0.0 { + self.bounds[0] = 0.0; + } + + if let Some(max) = max { + self.bounds[1] = ((max * 1.15) / 2.0) * 2.0; + } + } + + fn previous(&mut self) { + self.legend.previous(); + } + + fn next(&mut self) { + self.legend.next(); + } + + fn unselect(&mut self) { + self.legend.unselect(); + } + + fn selected(&self) -> Option<usize> { + self.legend.selected() + } + + fn set_interpolate(&mut self) { + let interpolate = (1000.0 - self.width as f64) / self.width as f64; + + if interpolate >= 1.0 { + self.interpolate = interpolate as usize; + } else { + self.interpolate = 0; + } + } + + fn zoom_in(&mut self) { + self.width = (self.width as f64 * 0.8) as usize; + self.set_interpolate(); + } + + fn zoom_out(&mut self) { + self.width = (self.width as f64 * 1.25) as usize; + self.set_interpolate(); + } + + fn time_right(&mut self) { + let delta = (self.width as f64 * 0.25) as usize; + + if delta > self.offs { + self.offs = 0; + } else { + self.offs -= delta; + } + } + + fn time_left(&mut self) { + self.offs += (self.width as f64 * 0.25) as usize; + } +} + +struct Dashboard { + graphs: HashMap<(SpIdentifier, MeasurementKind), Graph>, + flipped: HashMap<MeasurementKind, Graph>, + sids: HashMap<(SpIdentifier, MeasurementKind), Vec<SensorId>>, + kinds: Vec<MeasurementKind>, + selected_kind: usize, + sps: Vec<SpIdentifier>, + selected_sp: usize, + status: String, + time: u64, +} + +impl Dashboard { + fn new(metadata: &SensorMetadata) -> Result<Dashboard> { + let mut sps = + metadata.sensors_by_sp.keys().copied().collect::<Vec<_>>(); + let mut graphs = HashMap::new(); + let mut sids = HashMap::new(); + sps.sort(); + + let kinds = vec![ + MeasurementKind::Temperature, + MeasurementKind::Speed, + MeasurementKind::Current, + ]; + + for &sp in sps.iter() { + let sensors = metadata.sensors_by_sp.get_vec(&sp).unwrap(); + let mut by_kind = MultiMap::new(); + + for sid in sensors { + let (_, s, _) = metadata.sensors_by_id.get(sid).unwrap(); + by_kind.insert(s.kind, (s.name.clone(), *sid)); + } + + let keys = by_kind.keys().copied().collect::<Vec<_>>(); + + for k in keys { + let mut v = by_kind.remove(&k).unwrap(); + v.sort(); + + let labels = + v.iter().map(|(n, _)| n.clone()).collect::<Vec<_>>(); + + graphs.insert( + (sp, k), + Graph::new( + labels.as_slice(), + match k { + MeasurementKind::Temperature => Box::new(TempGraph), + MeasurementKind::Current => Box::new(CurrentGraph), + MeasurementKind::Speed => Box::new(FanGraph), + MeasurementKind::Voltage => Box::new(VoltageGraph), + _ => Box::new(SensorGraph), + }, + )?, + ); + + sids.insert( + (sp, k), + v.iter().map(|(_, sid)| *sid).collect::<Vec<_>>(), + ); + } + } + + let status = sp_to_string(&sps[0]); + + Ok(Dashboard { + graphs, + flipped: HashMap::new(), + sids, + kinds, + selected_kind: 0, + sps, + selected_sp: 0, + status, + time: secs()?, + }) + } + + fn status(&self) -> Vec<(&str, &str)> {
vec![("Status", &self.status)] + } + + fn update_data(&mut self) { + for graph in self.graphs.values_mut() { + graph.update_data(); + } + + for graph in self.flipped.values_mut() { + graph.update_data(); + } + } + + fn up(&mut self) { + let selected_kind = self.kinds[self.selected_kind]; + let type_ = self.sps[self.selected_sp].type_; + + if let Some(flipped) = self.flipped.get_mut(&selected_kind) { + flipped.previous(); + return; + } + + for sp in self.sps.iter().filter(|&s| s.type_ == type_) { + self.graphs.get_mut(&(*sp, selected_kind)).unwrap().previous(); + } + } + + fn down(&mut self) { + let selected_kind = self.kinds[self.selected_kind]; + let type_ = self.sps[self.selected_sp].type_; + + if let Some(flipped) = self.flipped.get_mut(&selected_kind) { + flipped.next(); + return; + } + + for sp in self.sps.iter().filter(|&s| s.type_ == type_) { + self.graphs.get_mut(&(*sp, selected_kind)).unwrap().next(); + } + } + + fn esc(&mut self) { + let selected_kind = self.kinds[self.selected_kind]; + let type_ = self.sps[self.selected_sp].type_; + + if let Some(flipped) = self.flipped.get_mut(&selected_kind) { + flipped.unselect(); + return; + } + + for sp in self.sps.iter().filter(|&s| s.type_ == type_) { + self.graphs.get_mut(&(*sp, selected_kind)).unwrap().unselect(); + } + } + + fn left(&mut self) { + if self.selected_sp == 0 { + self.selected_sp = self.sps.len() - 1; + } else { + self.selected_sp -= 1; + } + + self.status = sp_to_string(&self.sps[self.selected_sp]); + } + + fn right(&mut self) { + self.selected_sp = (self.selected_sp + 1) % self.sps.len(); + self.status = sp_to_string(&self.sps[self.selected_sp]); + } + + fn time_left(&mut self) { + for graph in self.graphs.values_mut() { + graph.time_left(); + } + + for graph in self.flipped.values_mut() { + graph.time_left(); + } + } + + fn time_right(&mut self) { + for graph in self.graphs.values_mut() { + graph.time_right(); + } + + for graph in self.flipped.values_mut() { + graph.time_right(); + } + } + + fn flip(&mut self) { + let selected_kind = self.kinds[self.selected_kind]; + let type_ = self.sps[self.selected_sp].type_; + + if self.flipped.remove(&selected_kind).is_some() { + return; + } + + let sp = self.sps[self.selected_sp]; + + let graph = self.graphs.get(&(sp, selected_kind)).unwrap(); + + if let Some(ndx) = graph.selected() { + let mut from = vec![]; + + for sp in self.sps.iter().filter(|&s| s.type_ == type_) { + from.push(( + self.graphs.get(&(*sp, selected_kind)).unwrap(), + sp_to_string(sp), + )); + } + + self.flipped + .insert(selected_kind, Graph::flip(from.as_slice(), ndx)); + } + } + + fn tab(&mut self) { + self.selected_kind = (self.selected_kind + 1) % self.kinds.len(); + } + + fn zoom_in(&mut self) { + for graph in self.graphs.values_mut() { + graph.zoom_in(); + } + + for graph in self.flipped.values_mut() { + graph.zoom_in(); + } + } + + fn zoom_out(&mut self) { + for graph in self.graphs.values_mut() { + graph.zoom_out(); + } + + for graph in self.flipped.values_mut() { + graph.zoom_out(); + } + } + + fn gap(&mut self, length: u64) { + let mut gap: Vec<Option<f32>> = vec![]; + + for (graph, sids) in &self.sids { + while gap.len() < sids.len() { + gap.push(None); + } + + let graph = self.graphs.get_mut(graph).unwrap(); + + for _ in 0..length { + graph.data(&gap[0..sids.len()]); + } + } + } + + fn values(&mut self, values: &SensorValues) { + for (graph, sids) in &self.sids { + let mut data = vec![]; + + for sid in sids { + if let Some(value) = values.values.get(sid) { + data.push(*value); + } else { + data.push(None); + }
} + + let graph = self.graphs.get_mut(graph).unwrap(); + graph.data(data.as_slice()); + } + + self.time = values.time; + } +} + +fn run_dashboard<B: Backend>( + terminal: &mut Terminal<B>, + dashboard: &mut Dashboard, + force_update: bool, +) -> Result<bool> { + let update = if crossterm::event::poll(Duration::from_secs(0))? { + if let Event::Key(key) = event::read()? { + match key.code { + KeyCode::Char('q') => return Ok(true), + KeyCode::Char('+') => dashboard.zoom_in(), + KeyCode::Char('-') => dashboard.zoom_out(), + KeyCode::Char('<') => dashboard.time_left(), + KeyCode::Char('>') => dashboard.time_right(), + KeyCode::Char('!') => dashboard.flip(), + KeyCode::Char('l') => { + // + // ^L -- form feed -- is historically used to clear and + // redraw the screen. And, notably, it is what dtach(1) + // will send when attaching to a dashboard. If we + // see ^L, clear the terminal to force a total redraw. + // + if key.modifiers == KeyModifiers::CONTROL { + terminal.clear()?; + } + } + KeyCode::Up => dashboard.up(), + KeyCode::Down => dashboard.down(), + KeyCode::Right => dashboard.right(), + KeyCode::Left => dashboard.left(), + KeyCode::Esc => dashboard.esc(), + KeyCode::Tab => dashboard.tab(), + _ => {} + } + } + true + } else { + force_update + }; + + if update { + dashboard.update_data(); + terminal.draw(|f| draw(f, dashboard))?; + } + + Ok(false) +} + +fn secs() -> Result<u64> { + let now = SystemTime::now().duration_since(SystemTime::UNIX_EPOCH)?; + Ok(now.as_secs()) +} + +/// +/// Runs `omdb mgs dashboard` +/// +pub(crate) async fn cmd_mgs_dashboard( + omdb: &crate::Omdb, + log: &slog::Logger, + mgs_args: &crate::mgs::MgsArgs, + args: &DashboardArgs, +) -> Result<(), anyhow::Error> { + let mut input = if let Some(ref input) = args.sensors_args.input { + let file = File::open(input) + .with_context(|| format!("failed to open {input}"))?; + SensorInput::CsvReader( + csv::Reader::from_reader(file), + csv::Position::new(), + ) + } else { + SensorInput::MgsClient(mgs_args.mgs_client(omdb, log).await?)
+ }; + + let (metadata, values) = + sensor_metadata(&mut input, &args.sensors_args).await?; + + let mut dashboard = Dashboard::new(&metadata)?; + let mut last = values.time; + let mut force = true; + let mut update = true; + + dashboard.values(&values); + + if args.sensors_args.input.is_some() && !args.simulate_realtime { + loop { + let values = sensor_data(&mut input, &metadata).await?; + + if values.time == 0 { + break; + } + + if values.time != last + 1 { + dashboard.gap(values.time - last - 1); + } + + last = values.time; + dashboard.values(&values); + } + + update = false; + } + + // setup terminal + enable_raw_mode()?; + let mut stdout = io::stdout(); + execute!(stdout, EnterAlternateScreen, EnableMouseCapture)?; + let backend = CrosstermBackend::new(stdout); + let mut terminal = Terminal::new(backend)?; + + let res = 'outer: loop { + match run_dashboard(&mut terminal, &mut dashboard, force) { + Err(err) => break Err(err), + Ok(true) => break Ok(()), + _ => {} + } + + force = false; + + let now = match secs() { + Err(err) => break Err(err), + Ok(now) => now, + }; + + if update && now != last { + let kicked = Instant::now(); + let f = sensor_data(&mut input, &metadata); + last = now; + + while Instant::now().duration_since(kicked).as_millis() < 800 { + tokio::time::sleep(Duration::from_millis(10)).await; + + match run_dashboard(&mut terminal, &mut dashboard, force) { + Err(err) => break 'outer Err(err), + Ok(true) => break 'outer Ok(()), + _ => {} + } + } + + let values = match f.await { + Err(err) => break Err(err), + Ok(v) => v, + }; + + dashboard.values(&values); + force = true; + continue; + } + + tokio::time::sleep(Duration::from_millis(10)).await; + }; + + // restore terminal + disable_raw_mode()?; + execute!( + terminal.backend_mut(), + LeaveAlternateScreen, + DisableMouseCapture + )?; + terminal.show_cursor()?; + + if let Err(err) = res { + println!("{err:?}"); + } + + Ok(()) +} + +fn draw_graph(f: &mut Frame, parent: Rect, graph: &mut Graph, now: u64) { + // + // We want the right panel to be 31 characters wide (a left-justified 20 + // and a right justified 8 + margins), but we don't want it to consume + // more than 80%; calculate accordingly. + // + let r = std::cmp::min((31 * 100) / parent.width, 80); + + let chunks = Layout::default() + .direction(Direction::Horizontal) + .constraints( + [Constraint::Percentage(100 - r), Constraint::Percentage(r)] + .as_ref(), + ) + .split(parent); + + let latest = now as i64 - graph.offs as i64; + let earliest = Local.timestamp_opt(latest - graph.width as i64, 0).unwrap(); + let latest = Local.timestamp_opt(latest, 0).unwrap(); + + // + // We want a format that preserves horizontal real estate just a tad more + // than .to_rfc3339_opts()... 
+ // + let fmt = "%Y-%m-%d %H:%M:%S"; + + let tz_offset = earliest.offset().fix().local_minus_utc(); + let tz = if tz_offset != 0 { + let hours = tz_offset / 3600; + let minutes = (tz_offset % 3600) / 60; + + if minutes != 0 { + format!("Z{:+}:{:02}", hours, minutes.abs()) + } else { + format!("Z{:+}", hours) + } + } else { + "Z".to_string() + }; + + let x_labels = vec![ + Span::styled( + format!("{}{}", earliest.format(fmt), tz), + Style::default().add_modifier(Modifier::BOLD), + ), + Span::styled( + format!("{}{}", latest.format(fmt), tz), + Style::default().add_modifier(Modifier::BOLD), + ), + ]; + + let mut datasets = vec![]; + let selected = graph.legend.state.selected(); + + for (ndx, s) in graph.series.iter().enumerate() { + if let Some(selected) = selected { + if ndx != selected { + continue; + } + } + + datasets.push( + Dataset::default() + .name(&*s.name) + .marker(symbols::Marker::Braille) + .style(Style::default().fg(s.color)) + .data(&s.data), + ); + } + + let chart = Chart::new(datasets) + .block( + Block::default() + .title(Span::styled( + graph.attributes.label(), + Style::default() + .fg(Color::Cyan) + .add_modifier(Modifier::BOLD), + )) + .borders(Borders::ALL), + ) + .x_axis( + Axis::default() + .title(graph.attributes.x_axis_label()) + .style(Style::default().fg(Color::Gray)) + .labels(x_labels) + .bounds([0.0, graph.width as f64]) + .labels_alignment(Alignment::Right), + ) + .y_axis( + Axis::default() + .title(graph.attributes.y_axis_label()) + .style(Style::default().fg(Color::Gray)) + .labels(vec![ + Span::styled( + graph.attributes.axis_value(graph.bounds[0]), + Style::default().add_modifier(Modifier::BOLD), + ), + Span::styled( + graph.attributes.axis_value(graph.bounds[1]), + Style::default().add_modifier(Modifier::BOLD), + ), + ]) + .bounds(graph.bounds), + ); + + f.render_widget(chart, chunks[0]); + + let mut rows = vec![]; + + for s in &graph.series { + let val = match s.raw.last() { + None | Some(None) => "-".to_string(), + Some(Some(val)) => graph.attributes.legend_value((*val).into()), + }; + + rows.push(ListItem::new(Line::from(vec![ + Span::styled( + format!("{:<20}", s.name), + Style::default().fg(s.color), + ), + Span::styled(format!("{:>8}", val), Style::default().fg(s.color)), + ]))); + } + + let list = List::new(rows) + .block( + Block::default() + .borders(Borders::ALL) + .title(graph.attributes.legend_label()), + ) + .highlight_style( + Style::default() + .bg(Color::LightGreen) + .fg(Color::Black) + .add_modifier(Modifier::BOLD), + ); + + // We can now render the item list + f.render_stateful_widget(list, chunks[1], &mut graph.legend.state); +} + +fn draw_graphs(f: &mut Frame, parent: Rect, dashboard: &mut Dashboard) { + let screen = Layout::default() + .direction(Direction::Vertical) + .constraints( + [ + Constraint::Ratio(1, 2), + Constraint::Ratio(1, 4), + Constraint::Ratio(1, 4), + ] + .as_ref(), + ) + .split(parent); + + let sp = dashboard.sps[dashboard.selected_sp]; + + for (i, k) in dashboard.kinds.iter().enumerate() { + if let Some(graph) = dashboard.flipped.get_mut(k) { + draw_graph(f, screen[i], graph, dashboard.time); + } else { + draw_graph( + f, + screen[i], + dashboard.graphs.get_mut(&(sp, *k)).unwrap(), + dashboard.time, + ); + } + } +} + +fn draw_status(f: &mut Frame, parent: Rect, status: &[(&str, &str)]) { + let mut bar = vec![]; + + for i in 0..status.len() { + let s = &status[i]; + + bar.push(Span::styled( + s.0, + Style::default().add_modifier(Modifier::BOLD), + )); + + bar.push(Span::styled( + ": ", + 
Style::default().add_modifier(Modifier::BOLD), + )); + + bar.push(Span::raw(s.1)); + + if i < status.len() - 1 { + bar.push(Span::raw(" | ")); + } + } + + let text = vec![Line::from(bar)]; + + let para = Paragraph::new(text) + .alignment(Alignment::Right) + .style(Style::default().fg(Color::White).bg(Color::Black)); + + f.render_widget(para, parent); +} + +fn draw(f: &mut Frame, dashboard: &mut Dashboard) { + let size = f.size(); + + let screen = Layout::default() + .direction(Direction::Vertical) + .constraints([Constraint::Min(1), Constraint::Length(1)].as_ref()) + .split(size); + + draw_graphs(f, screen[0], dashboard); + draw_status(f, screen[1], &dashboard.status()); +} diff --git a/dev-tools/omdb/src/bin/omdb/mgs/sensors.rs b/dev-tools/omdb/src/bin/omdb/mgs/sensors.rs new file mode 100644 index 0000000000..d00bebd96c --- /dev/null +++ b/dev-tools/omdb/src/bin/omdb/mgs/sensors.rs @@ -0,0 +1,950 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Implementation of the "mgs sensors" subcommand + +use anyhow::{bail, Context, Result}; +use clap::Args; +use gateway_client::types::MeasurementErrorCode; +use gateway_client::types::MeasurementKind; +use gateway_client::types::SpComponentDetails; +use gateway_client::types::SpIdentifier; +use gateway_client::types::SpIgnition; +use gateway_client::types::SpType; +use multimap::MultiMap; +use std::collections::{HashMap, HashSet}; +use std::fs::File; +use std::sync::Arc; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +#[derive(Debug, Args)] +pub(crate) struct SensorsArgs { + /// verbose messages + #[clap(long, short)] + pub verbose: bool, + + /// restrict to specified sled(s) + #[clap(long, use_value_delimiter = true)] + pub sled: Vec<u32>, + + /// exclude sleds rather than include them + #[clap(long, short)] + pub exclude: bool, + + /// include switches + #[clap(long)] + pub switches: bool, + + /// include PSC + #[clap(long)] + pub psc: bool, + + /// print sensors every second + #[clap(long, short)] + pub sleep: bool, + + /// parseable output + #[clap(long, short)] + pub parseable: bool, + + /// show latencies + #[clap(long)] + pub show_latencies: bool, + + /// restrict sensors by type of sensor + #[clap( + long, + short, + value_name = "sensor type", + use_value_delimiter = true + )] + pub types: Option<Vec<String>>, + + /// restrict sensors by name + #[clap( + long, + short, + value_name = "sensor name", + use_value_delimiter = true + )] + pub named: Option<Vec<String>>, + + /// simulate using specified file as input + #[clap(long, short)] + pub input: Option<String>, + + /// start time, if using an input file + #[clap(long, value_name = "time", requires = "input")] + pub start: Option<u64>, + + /// end time, if using an input file + #[clap(long, value_name = "time", requires = "input")] + pub end: Option<u64>, + + /// duration, if using an input file + #[clap( + long, + value_name = "seconds", + requires = "input", + conflicts_with = "end" + )] + pub duration: Option<u64>, +} + +impl SensorsArgs { + fn matches_sp(&self, sp: &SpIdentifier) -> bool { + match sp.type_ { + SpType::Sled => { + let matched = if !self.sled.is_empty() { + self.sled.contains(&sp.slot) + } else { + true + }; + + matched != self.exclude + } + SpType::Switch => self.switches, + SpType::Power => self.psc, + } + } +} + +#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub(crate) struct Sensor { + pub name: String, + pub kind: MeasurementKind, +} +
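// [Illustration, not part of the diff] The include/exclude logic in
// `SensorsArgs::matches_sp` above is terse; spelled out for sleds:
//   --sled 8,9             match only slots 8 and 9
//   --sled 8,9 --exclude   match every slot except 8 and 9
//   (no --sled)            match all slots; adding --exclude then matches none
// A sketch of the same predicate over bare slot numbers (the name
// `slot_matches` is hypothetical):
fn slot_matches(selected: &[u32], exclude: bool, slot: u32) -> bool {
    let matched =
        if !selected.is_empty() { selected.contains(&slot) } else { true };
    // Comparing against the exclude flag inverts the selection.
    matched != exclude
}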
+impl Sensor {
+    fn units(&self) -> &str {
+        match self.kind {
+            MeasurementKind::Temperature => "°C",
+            MeasurementKind::Current | MeasurementKind::InputCurrent => "A",
+            MeasurementKind::Voltage | MeasurementKind::InputVoltage => "V",
+            MeasurementKind::Speed => "RPM",
+            MeasurementKind::Power => "W",
+        }
+    }
+
+    fn format(&self, value: f32, parseable: bool) -> String {
+        if parseable {
+            format!("{value}")
+        } else {
+            match self.kind {
+                MeasurementKind::Speed => {
+                    //
+                    // This space is deliberate: other units (°C, V, A) look
+                    // more natural when directly attached to their value --
+                    // but RPM looks decidedly unnatural without a space.
+                    //
+                    format!("{value:0} RPM")
+                }
+                _ => {
+                    format!("{value:.2}{}", self.units())
+                }
+            }
+        }
+    }
+
+    fn to_kind_string(&self) -> &str {
+        match self.kind {
+            MeasurementKind::Temperature => "temp",
+            MeasurementKind::Power => "power",
+            MeasurementKind::Current => "current",
+            MeasurementKind::Voltage => "voltage",
+            MeasurementKind::InputCurrent => "input-current",
+            MeasurementKind::InputVoltage => "input-voltage",
+            MeasurementKind::Speed => "speed",
+        }
+    }
+
+    fn from_string(name: &str, kind: &str) -> Option<Self> {
+        let k = match kind {
+            "temp" | "temperature" => Some(MeasurementKind::Temperature),
+            "power" => Some(MeasurementKind::Power),
+            "current" => Some(MeasurementKind::Current),
+            "voltage" => Some(MeasurementKind::Voltage),
+            "input-current" => Some(MeasurementKind::InputCurrent),
+            "input-voltage" => Some(MeasurementKind::InputVoltage),
+            "speed" => Some(MeasurementKind::Speed),
+            _ => None,
+        };
+
+        k.map(|kind| Sensor { name: name.to_string(), kind })
+    }
+}
+
+pub(crate) enum SensorInput {
+    MgsClient(gateway_client::Client),
+    CsvReader(csv::Reader<File>, csv::Position),
+}
+
+#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+pub struct SensorId(u32);
+
+#[derive(Debug)]
+pub(crate) struct SensorMetadata {
+    pub sensors_by_sensor: MultiMap<Sensor, SensorId>,
+    pub sensors_by_sensor_and_sp:
+        HashMap<Sensor, HashMap<SpIdentifier, SensorId>>,
+    pub sensors_by_id:
+        HashMap<SensorId, (SpIdentifier, Sensor, DeviceIdentifier)>,
+    pub sensors_by_sp: MultiMap<SpIdentifier, SensorId>,
+    pub work_by_sp:
+        HashMap<SpIdentifier, Vec<(DeviceIdentifier, Vec<SensorId>)>>,
+    #[allow(dead_code)]
+    pub start_time: Option<u64>,
+    pub end_time: Option<u64>,
+}
+
+struct SensorSpInfo {
+    info: Vec<(SpIdentifier, SpInfo)>,
+    time: u64,
+    latencies: Option<HashMap<SpIdentifier, Duration>>,
+}
+
+pub(crate) struct SensorValues {
+    pub values: HashMap<SensorId, Option<f32>>,
+    pub latencies: Option<HashMap<SpIdentifier, Duration>>,
+    pub time: u64,
+}
+
+///
+/// We identify a device as either a physical device (i.e., when connecting
+/// to MGS), or as a field in the CSV header (i.e., when processing data
+/// postmortem). It's handy to have this as an enum to allow most of the code
+/// to be agnostic to the underlying source, but callers of [`device`] and
+/// [`field`] are expected to know which of these they're dealing with.
+///
+#[derive(Clone, Debug, Hash, PartialEq, Eq)]
+pub(crate) enum DeviceIdentifier {
+    Field(usize),
+    Device(String),
+}
+
+impl DeviceIdentifier {
+    fn device(&self) -> &String {
+        match self {
+            Self::Device(ref device) => device,
+            _ => panic!(),
+        }
+    }
+
+    fn field(&self) -> usize {
+        match self {
+            Self::Field(field) => *field,
+            _ => panic!(),
+        }
+    }
+}
+
+struct SpInfo {
+    devices: MultiMap<DeviceIdentifier, (Sensor, Option<f32>)>,
+    timestamps: Vec<std::time::Instant>,
+}
+
+async fn sp_info(
+    mgs_client: gateway_client::Client,
+    type_: SpType,
+    slot: u32,
+) -> Result<SpInfo, anyhow::Error> {
+    let mut devices = MultiMap::new();
+    let mut timestamps = vec![];
+
+    timestamps.push(std::time::Instant::now());
+
+    //
+    // First, get a component list.
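+    // (The list itself carries no measurements; a second pass below asks
+    // each component for its details, which is where the readings live.)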
+    //
+    let components = mgs_client.sp_component_list(type_, slot).await?;
+    timestamps.push(std::time::Instant::now());
+
+    //
+    // Now, for every component, we're going to get its details: for those
+    // that are sensors (and contain measurements), we will store the name
+    // of the sensor as well as the retrieved value.
+    //
+    for c in &components.components {
+        for s in mgs_client
+            .sp_component_get(type_, slot, &c.component)
+            .await?
+            .iter()
+            .filter_map(|detail| match detail {
+                SpComponentDetails::Measurement { kind, name, value } => Some(
+                    (Sensor { name: name.clone(), kind: *kind }, Some(*value)),
+                ),
+                SpComponentDetails::MeasurementError { kind, name, error } => {
+                    match error {
+                        MeasurementErrorCode::NoReading
+                        | MeasurementErrorCode::NotPresent => None,
+                        _ => Some((
+                            Sensor { name: name.clone(), kind: *kind },
+                            None,
+                        )),
+                    }
+                }
+                _ => None,
+            })
+        {
+            devices.insert(DeviceIdentifier::Device(c.component.clone()), s);
+        }
+    }
+
+    timestamps.push(std::time::Instant::now());
+
+    Ok(SpInfo { devices, timestamps })
+}
+
+async fn sp_info_mgs(
+    mgs_client: &gateway_client::Client,
+    args: &SensorsArgs,
+) -> Result<SensorSpInfo, anyhow::Error> {
+    let mut rval = vec![];
+    let mut latencies = HashMap::new();
+
+    //
+    // First, get all of the SPs that we can see via Ignition
+    //
+    let all_sp_list =
+        mgs_client.ignition_list().await.context("listing ignition")?;
+
+    let mut sp_list = all_sp_list
+        .iter()
+        .filter_map(|ignition| {
+            if matches!(ignition.details, SpIgnition::Yes { .. })
+                && ignition.id.type_ == SpType::Sled
+            {
+                if args.matches_sp(&ignition.id) {
+                    return Some(ignition.id);
+                }
+            }
+            None
+        })
+        .collect::<Vec<_>>();
+
+    if args.switches {
+        sp_list.push(SpIdentifier { type_: SpType::Switch, slot: 0 });
+        sp_list.push(SpIdentifier { type_: SpType::Switch, slot: 1 });
+    }
+
+    if args.psc {
+        sp_list.push(SpIdentifier { type_: SpType::Power, slot: 0 });
+    }
+
+    sp_list.sort();
+
+    let now = std::time::Instant::now();
+
+    let mut handles = vec![];
+    for sp_id in sp_list {
+        let handle =
+            tokio::spawn(sp_info(mgs_client.clone(), sp_id.type_, sp_id.slot));
+
+        handles.push((sp_id, handle));
+    }
+
+    for (sp_id, handle) in handles {
+        match handle.await.unwrap() {
+            Ok(info) => {
+                let l0 = info.timestamps[1].duration_since(info.timestamps[0]);
+                let l1 = info.timestamps[2].duration_since(info.timestamps[1]);
+
+                if args.verbose {
+                    eprintln!(
+                        "mgs: latencies for {sp_id:?}: {l1:.1?} {l0:.1?}",
+                    );
+                }
+
+                latencies.insert(sp_id, l0 + l1);
+                rval.push((sp_id, info));
+            }
+
+            Err(err) => {
+                eprintln!("failed to read devices for {:?}: {:?}", sp_id, err);
+            }
+        }
+    }
+
+    if args.verbose {
+        eprintln!("total discovery time {:?}", now.elapsed());
+    }
+
+    Ok(SensorSpInfo {
+        info: rval,
+        time: SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs(),
+        latencies: Some(latencies),
+    })
+}
+
+fn sp_info_csv(
+    reader: &mut csv::Reader<File>,
+    position: &mut csv::Position,
+    args: &SensorsArgs,
+) -> Result<SensorSpInfo, anyhow::Error> {
+    let mut sps = vec![];
+    let headers = reader.headers()?;
+
+    let expected = ["TIME", "SENSOR", "KIND"];
+    let len = expected.len();
+    let hlen = headers.len();
+
+    if hlen < len {
+        bail!("expected at least {len} fields (found {headers:?})");
+    }
+
+    for ndx in 0..len {
+        if &headers[ndx] != expected[ndx] {
+            bail!(
+                "malformed headers: expected {}, found {} ({headers:?})",
+                &expected[ndx],
+                &headers[ndx]
+            );
+        }
+    }
+
+    for ndx in len..hlen {
+        let field = &headers[ndx];
+        let parts: Vec<&str> = field.splitn(2, '-').collect();
+
+        if parts.len() != 2 {
+            bail!("malformed field \"{field}\"");
+        }
+
+        let type_ = match parts[0] {
+            "SLED" => SpType::Sled,
+            "SWITCH" => SpType::Switch,
+            "POWER" => SpType::Power,
+            _ => {
+                bail!("unknown type {}", parts[0]);
+            }
+        };
+
+        let slot = parts[1].parse::<u32>().or_else(|_| {
+            bail!("invalid slot in \"{field}\"");
+        })?;
+
+        let sp = SpIdentifier { type_, slot };
+
+        if args.matches_sp(&sp) {
+            sps.push(Some(sp));
+        } else {
+            sps.push(None);
+        }
+    }
+
+    let mut iter = reader.records();
+    let mut sensors = HashSet::new();
+    let mut by_sp = MultiMap::new();
+    let mut time = None;
+
+    loop {
+        *position = iter.reader().position().clone();
+
+        if let Some(record) = iter.next() {
+            let record = record?;
+
+            if record.len() != hlen {
+                bail!("bad record length at line {}", position.line());
+            }
+
+            if time.is_none() {
+                let t = record[0].parse::<u64>().or_else(|_| {
+                    bail!("bad time at line {}", position.line());
+                })?;
+
+                if let Some(start) = args.start {
+                    if t < start {
+                        continue;
+                    }
+                }
+
+                if let Some(end) = args.end {
+                    if let Some(start) = args.start {
+                        if start > end {
+                            bail!(
+                                "specified start time is later than end time"
+                            );
+                        }
+                    }
+
+                    if t > end {
+                        bail!(
+                            "specified end time ({end}) is earlier \
+                            than time of earliest record ({t})"
+                        );
+                    }
+                }
+
+                time = Some(t);
+            }
+
+            if let Some(sensor) = Sensor::from_string(&record[1], &record[2]) {
+                if sensors.get(&sensor).is_some() {
+                    break;
+                }
+
+                sensors.insert(sensor.clone());
+
+                for (ndx, sp) in sps.iter().enumerate() {
+                    if let Some(sp) = sp {
+                        let value = match record[ndx + len].parse::<f32>() {
+                            Ok(value) => Some(value),
+                            _ => {
+                                //
+                                // We want to distinguish between the device
+                                // having an error ("X") and it being absent
+                                // ("-"); if it's absent, we don't want to add
+                                // it at all.
+                                //
+                                match &record[ndx + len] {
+                                    "X" => {}
+                                    "-" => continue,
+                                    _ => {
+                                        bail!(
+                                            "line {}: unrecognized value \
+                                            \"{}\" in field {}",
+                                            position.line(),
+                                            record[ndx + len].to_string(),
+                                            ndx + len
+                                        );
+                                    }
+                                }
+
+                                None
+                            }
+                        };
+
+                        by_sp.insert(sp, (sensor.clone(), value));
+                    }
+                }
+            }
+        } else {
+            break;
+        }
+    }
+
+    if time.is_none() {
+        bail!("no data found");
+    }
+
+    let mut rval = vec![];
+
+    for (field, sp) in sps.iter().enumerate() {
+        let mut devices = MultiMap::new();
+
+        if let Some(sp) = sp {
+            if let Some(v) = by_sp.remove(sp) {
+                devices.insert_many(DeviceIdentifier::Field(field + len), v);
+            }
+
+            rval.push((*sp, SpInfo { devices, timestamps: vec![] }));
+        }
+    }
+
+    Ok(SensorSpInfo { info: rval, time: time.unwrap(), latencies: None })
+}
+
+pub(crate) async fn sensor_metadata(
+    input: &mut SensorInput,
+    args: &SensorsArgs,
+) -> Result<(Arc<SensorMetadata>, SensorValues), anyhow::Error> {
+    let by_kind = if let Some(types) = &args.types {
+        let mut h = HashSet::new();
+
+        for t in types {
+            h.insert(match Sensor::from_string("", t) {
+                None => bail!("invalid sensor kind {t}"),
+                Some(s) => s.kind,
+            });
+        }
+
+        Some(h)
+    } else {
+        None
+    };
+
+    let by_name = args
+        .named
+        .as_ref()
+        .map(|named| named.into_iter().collect::<HashSet<_>>());
+
+    let info = match input {
+        SensorInput::MgsClient(ref mgs_client) => {
+            sp_info_mgs(mgs_client, args).await?
+        }
+        SensorInput::CsvReader(reader, position) => {
+            sp_info_csv(reader, position, args)?
+        }
+    };
+
+    let mut sensors_by_sensor = MultiMap::new();
+    let mut sensors_by_sensor_and_sp = HashMap::new();
+    let mut sensors_by_id = HashMap::new();
+    let mut sensors_by_sp = MultiMap::new();
+    let mut values = HashMap::new();
+    let mut work_by_sp = HashMap::new();
+
+    let mut current = 0;
+    let time = info.time;
+
+    for (sp_id, info) in info.info {
+        let mut sp_work = vec![];
+
+        for (device, sensors) in info.devices {
+            let mut device_work = vec![];
+
+            for (sensor, value) in sensors {
+                if let Some(ref by_kind) = by_kind {
+                    if by_kind.get(&sensor.kind).is_none() {
+                        continue;
+                    }
+                }
+
+                if let Some(ref by_name) = by_name {
+                    if by_name.get(&sensor.name).is_none() {
+                        continue;
+                    }
+                }
+
+                let id = SensorId(current);
+                current += 1;
+
+                sensors_by_id
+                    .insert(id, (sp_id, sensor.clone(), device.clone()));
+
+                if value.is_none() && args.verbose {
+                    eprintln!(
+                        "mgs: error for {sp_id:?} on {sensor:?} ({device:?})"
+                    );
+                }
+
+                sensors_by_sensor.insert(sensor.clone(), id);
+
+                let by_sp = sensors_by_sensor_and_sp
+                    .entry(sensor)
+                    .or_insert_with(|| HashMap::new());
+                by_sp.insert(sp_id, id);
+                sensors_by_sp.insert(sp_id, id);
+                values.insert(id, value);
+
+                device_work.push(id);
+            }
+
+            sp_work.push((device, device_work));
+        }
+
+        work_by_sp.insert(sp_id, sp_work);
+    }
+
+    Ok((
+        Arc::new(SensorMetadata {
+            sensors_by_sensor,
+            sensors_by_sensor_and_sp,
+            sensors_by_id,
+            sensors_by_sp,
+            work_by_sp,
+            start_time: args.start,
+            end_time: match args.end {
+                Some(end) => Some(end),
+                None => args.duration.map(|duration| time + duration),
+            },
+        }),
+        SensorValues { values, time, latencies: info.latencies },
+    ))
+}
+
+async fn sp_read_sensors(
+    mgs_client: &gateway_client::Client,
+    id: &SpIdentifier,
+    metadata: &SensorMetadata,
+) -> Result<(Vec<(SensorId, Option<f32>)>, Duration), anyhow::Error> {
+    let work = metadata.work_by_sp.get(id).unwrap();
+    let mut rval = vec![];
+
+    let start = std::time::Instant::now();
+
+    for (component, ids) in work.iter() {
+        for (value, id) in mgs_client
+            .sp_component_get(id.type_, id.slot, component.device())
+            .await?
+            .iter()
+            .filter_map(|detail| match detail {
+                SpComponentDetails::Measurement { kind: _, name: _, value } => {
+                    Some(Some(*value))
+                }
+                SpComponentDetails::MeasurementError { error, .. } => {
+                    match error {
+                        MeasurementErrorCode::NoReading
+                        | MeasurementErrorCode::NotPresent => None,
+                        _ => Some(None),
+                    }
+                }
+                _ => None,
+            })
+            .zip(ids.iter())
+        {
+            rval.push((*id, value));
+        }
+    }
+
+    Ok((rval, start.elapsed()))
+}
+
+async fn sp_data_mgs(
+    mgs_client: &gateway_client::Client,
+    metadata: &Arc<SensorMetadata>,
+) -> Result<SensorValues, anyhow::Error> {
+    let mut values = HashMap::new();
+    let mut latencies = HashMap::new();
+    let mut handles = vec![];
+
+    for sp_id in metadata.sensors_by_sp.keys() {
+        let mgs_client = mgs_client.clone();
+        let id = *sp_id;
+        let metadata = Arc::clone(&metadata);
+
+        let handle = tokio::spawn(async move {
+            sp_read_sensors(&mgs_client, &id, &metadata).await
+        });
+
+        handles.push((id, handle));
+    }
+
+    for (id, handle) in handles {
+        let (rval, latency) = handle.await.unwrap()?;
+
+        latencies.insert(id, latency);
+
+        for (id, value) in rval {
+            values.insert(id, value);
+        }
+    }
+
+    Ok(SensorValues {
+        values,
+        latencies: Some(latencies),
+        time: SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs(),
+    })
+}
+
+fn sp_data_csv(
+    reader: &mut csv::Reader<File>,
+    position: &mut csv::Position,
+    metadata: &SensorMetadata,
+) -> Result<SensorValues, anyhow::Error> {
+    let headers = reader.headers()?;
+    let hlen = headers.len();
+    let mut values = HashMap::new();
+
+    reader.seek(position.clone())?;
+    let mut iter = reader.records();
+
+    let mut time = None;
+
+    loop {
+        *position = iter.reader().position().clone();
+
+        if let Some(record) = iter.next() {
+            let record = record?;
+
+            if record.len() != hlen {
+                bail!("bad record length at line {}", position.line());
+            }
+
+            let now = record[0].parse::<u64>().or_else(|_| {
+                bail!("bad time at line {}", position.line());
+            })?;
+
+            if let Some(time) = time {
+                if now != time {
+                    break;
+                }
+            } else {
+                if let Some(end) = metadata.end_time {
+                    if now > end {
+                        time = Some(0);
+                        break;
+                    }
+                }
+
+                time = Some(now);
+            }
+
+            if let Some(sensor) = Sensor::from_string(&record[1], &record[2]) {
+                if let Some(ids) = metadata.sensors_by_sensor.get_vec(&sensor) {
+                    for id in ids {
+                        let (_, _, d) = metadata.sensors_by_id.get(id).unwrap();
+                        let value = match record[d.field()].parse::<f32>() {
+                            Ok(value) => Some(value),
+                            _ => None,
+                        };
+
+                        values.insert(*id, value);
+                    }
+                }
+            } else {
+                bail!("bad sensor at line {}", position.line());
+            }
+        } else {
+            time = Some(0);
+            break;
+        }
+    }
+
+    Ok(SensorValues { values, latencies: None, time: time.unwrap() })
+}
+
+pub(crate) async fn sensor_data(
+    input: &mut SensorInput,
+    metadata: &Arc<SensorMetadata>,
+) -> Result<SensorValues, anyhow::Error> {
+    match input {
+        SensorInput::MgsClient(ref mgs_client) => {
+            sp_data_mgs(mgs_client, metadata).await
+        }
+        SensorInput::CsvReader(reader, position) => {
+            sp_data_csv(reader, position, &metadata)
+        }
+    }
+}
+
+///
+/// Runs `omdb mgs sensors`
+///
+pub(crate) async fn cmd_mgs_sensors(
+    omdb: &crate::Omdb,
+    log: &slog::Logger,
+    mgs_args: &crate::mgs::MgsArgs,
+    args: &SensorsArgs,
+) -> Result<(), anyhow::Error> {
+    let mut input = if let Some(ref input) = args.input {
+        let file = File::open(input)
+            .with_context(|| format!("failed to open {input}"))?;
+        SensorInput::CsvReader(
+            csv::Reader::from_reader(file),
+            csv::Position::new(),
+        )
+    } else {
+        SensorInput::MgsClient(mgs_args.mgs_client(omdb, log).await?)
+    };
+
+    let (metadata, mut values) = sensor_metadata(&mut input, args).await?;
+
+    let mut sensors = metadata.sensors_by_sensor.keys().collect::<Vec<_>>();
+    sensors.sort();
+
+    let mut sps = metadata.sensors_by_sp.keys().collect::<Vec<_>>();
+    sps.sort();
+
+    let print_value = |v| {
+        if args.parseable {
+            print!(",{v}");
+        } else {
+            print!(" {v:>8}");
+        }
+    };
+
+    let print_header = || {
+        if !args.parseable {
+            print!("{:20} ", "NAME");
+        } else {
+            print!("TIME,SENSOR,KIND");
+        }
+
+        for sp in &sps {
+            print_value(format!(
+                "{}-{}",
+                crate::mgs::sp_type_to_str(&sp.type_).to_uppercase(),
+                sp.slot
+            ));
+        }
+
+        println!();
+    };
+
+    let print_name = |sensor: &Sensor, now: u64| {
+        if !args.parseable {
+            print!("{:20} ", sensor.name);
+        } else {
+            print!("{now},{},{}", sensor.name, sensor.to_kind_string());
+        }
+    };
+
+    let print_latency = |now: u64| {
+        if !args.parseable {
+            print!("{:20} ", "LATENCY");
+        } else {
+            print!("{now},{},{}", "LATENCY", "latency");
+        }
+    };
+
+    let mut wakeup =
+        tokio::time::Instant::now() + tokio::time::Duration::from_millis(1000);
+
+    print_header();
+
+    loop {
+        for sensor in &sensors {
+            print_name(sensor, values.time);
+
+            let by_sp = metadata.sensors_by_sensor_and_sp.get(sensor).unwrap();
+
+            for sp in &sps {
+                print_value(if let Some(id) = by_sp.get(sp) {
+                    if let Some(value) = values.values.get(id) {
+                        match value {
+                            Some(value) => {
+                                sensor.format(*value, args.parseable)
+                            }
+                            None => "X".to_string(),
+                        }
+                    } else {
+                        "?".to_string()
+                    }
+                } else {
+                    "-".to_string()
+                });
+            }
+
+            println!();
+        }
+
+        if args.show_latencies {
+            if let Some(latencies) = values.latencies {
+                print_latency(values.time);
+
+                for sp in &sps {
+                    print_value(if let Some(latency) = latencies.get(sp) {
+                        format!("{}ms", latency.as_millis())
+                    } else {
+                        "?".to_string()
+                    });
+                }
+            }
+
+            println!();
+        }
+
+        if !args.sleep {
+            if args.input.is_none() {
+                break;
+            }
+        } else {
+            tokio::time::sleep_until(wakeup).await;
+            wakeup += tokio::time::Duration::from_millis(1000);
+        }
+
+        values = sensor_data(&mut input, &metadata).await?;
+
+        if args.input.is_some() && values.time == 0 {
+            break;
+        }
+
+        if !args.parseable {
+            print_header();
+        }
+    }
+
+    Ok(())
+}
diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs
index f00c05f1ec..9904263067 100644
--- a/dev-tools/omdb/src/bin/omdb/nexus.rs
+++ b/dev-tools/omdb/src/bin/omdb/nexus.rs
@@ -639,6 +639,32 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
             );
         }
     };
+    } else if name == "region_replacement" {
+        #[derive(Deserialize)]
+        struct TaskSuccess {
+            /// how many region replacements were started ok
+            region_replacement_started_ok: usize,
+
+            /// how many region replacements could not be started
+            region_replacement_started_err: usize,
+        }
+
+        match serde_json::from_value::<TaskSuccess>(details.clone()) {
+            Err(error) => eprintln!(
+                "warning: failed to interpret task details: {:?}: {:?}",
+                error, details
+            ),
+            Ok(success) => {
+                println!(
+                    "    number of region replacements started ok: {}",
+                    success.region_replacement_started_ok
+                );
+                println!(
+                    "    number of region replacement start errors: {}",
+                    success.region_replacement_started_err
+                );
+            }
+        };
     } else {
         println!(
             "warning: unknown background task: {:?} \
diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out
index 72e9d2e8fc..0600945194 100644
--- a/dev-tools/omdb/tests/env.out
+++ b/dev-tools/omdb/tests/env.out
@@ -83,6 +83,10 @@ task: "phantom_disks"
     detects and un-deletes phantom disks
 
 
+task: "region_replacement"
+    detects if a region requires replacing and begins the process
+
+
 task: "service_zone_nat_tracker"
     ensures service zone nat records are recorded in NAT RPW table
 
@@ -169,6 +173,10 @@ task: "phantom_disks"
     detects and un-deletes phantom disks
 
 
+task: "region_replacement"
+    detects if a region requires replacing and begins the process
+
+
 task: "service_zone_nat_tracker"
     ensures service zone nat records are recorded in NAT RPW table
 
@@ -242,6 +250,10 @@ task: "phantom_disks"
     detects and un-deletes phantom disks
 
 
+task: "region_replacement"
+    detects if a region requires replacing and begins the process
+
+
 task: "service_zone_nat_tracker"
     ensures service zone nat records are recorded in NAT RPW table
 
diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out
index 416b669068..1cd85262f6 100644
--- a/dev-tools/omdb/tests/successes.out
+++ b/dev-tools/omdb/tests/successes.out
@@ -277,6 +277,10 @@ task: "phantom_disks"
     detects and un-deletes phantom disks
 
 
+task: "region_replacement"
+    detects if a region requires replacing and begins the process
+
+
 task: "service_zone_nat_tracker"
     ensures service zone nat records are recorded in NAT RPW table
 
@@ -407,6 +411,14 @@ task: "phantom_disks"
     number of phantom disks deleted: 0
     number of phantom disk delete errors: 0
 
+task: "region_replacement"
+  configured period: every 30s
+  currently executing: no
+  last completed activation: iter 2, triggered by an explicit signal
+    started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
+    number of region replacements started ok: 0
+    number of region replacement start errors: 0
+
 task: "service_zone_nat_tracker"
   configured period: every 30s
   currently executing: no
diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out
index 2790b0ef83..6f9b539371 100644
--- a/dev-tools/omdb/tests/usage_errors.out
+++ b/dev-tools/omdb/tests/usage_errors.out
@@ -90,6 +90,7 @@ Query the control plane database (CockroachDB)
 Usage: omdb db [OPTIONS] <COMMAND>
 
 Commands:
+  rack       Print information about the rack
   disks      Print information about disks
   dns        Print information about internal and external DNS
   inventory  Print information about collected hardware/software inventory
@@ -118,6 +119,7 @@ Query the control plane database (CockroachDB)
 Usage: omdb db [OPTIONS] <COMMAND>
 
 Commands:
+  rack       Print information about the rack
  disks      Print information about disks
   dns        Print information about internal and external DNS
   inventory  Print information about collected hardware/software inventory
@@ -270,7 +272,9 @@ Debug a specific Management Gateway Service instance
 Usage: omdb mgs [OPTIONS] <COMMAND>
 
 Commands:
+  dashboard  Dashboard of SPs
   inventory  Show information about devices and components visible to MGS
+  sensors    Show information about sensors, as gleaned by MGS
   help       Print this message or the help of the given subcommand(s)
 
 Options:
diff --git a/docs/clickhouse-debugging.adoc b/docs/clickhouse-debugging.adoc
new file mode 100644
index 0000000000..a906d1841f
--- /dev/null
+++ b/docs/clickhouse-debugging.adoc
@@ -0,0 +1,199 @@
+:showtitle:
+:numbered:
+:toc: left
+
+= Omicron Clickhouse Debugging Guide
+
+This is a guide for debugging Clickhouse on a variety of environments.
+
+If you have advice that is not covered here, consider adding it!
+
+== Debugging on a Live System
+
+The following provides instructions for connecting to a Clickhouse shell on a running system.
+
+. **Find the zone running Clickhouse**. This can be accomplished by running `zoneadm list -cv`, and finding the zone with a prefix of `oxz_clickhouse`. If you're running on a multi-machine system (e.g., dogfood, colo, etc.) and you have access to the `pilot` binary, you can ask all sleds at once for the location of Clickhouse with:
+// '+' for list continuation to insert code blocks while keeping the list order
++
+[source,bash]
+----
+# Run from the switch zone.
+$ pilot host exec -c "zoneadm list -c | grep clickhouse" 0-31
+----
+. **Log into that zone**. This can be done using:
++
+[source,bash]
+----
+# Run from the switch zone
+$ pilot host login <cubby>
+
+# Run from the machine with the Clickhouse zone
+$ pfexec zlogin oxz_clickhouse_<UUID>
+----
+
+. **Identify the IP address of Clickhouse**. This is possible using `ipadm`:
++
+[source,bash]
+----
+# Run from within the Clickhouse zone
+$ ipadm
+ADDROBJ                    TYPE     STATE        ADDR
+lo0/v4                     static   ok           127.0.0.1/8
+lo0/v6                     static   ok           ::1/128
+oxControlService8/ll       addrconf ok           fe80::8:20ff:fe35:6b0a%oxControlService8/10
+oxControlService8/omicron6 static   ok           fd00:1122:3344:107::4/64    <-- It's this one!
+----
+. **Log into Clickhouse using the CLI**
++
+[source,bash]
+----
+# Run from within the Clickhouse zone
+$ /opt/oxide/clickhouse/clickhouse client --host fd00:1122:3344:107::4
+ClickHouse client version 22.8.9.1.
+Connecting to fd00:1122:3344:107::4:9000 as user default.
+Connected to ClickHouse server version 22.8.9 revision 54460.
+
+oxz_clickhouse_aa646c82-c6d7-4d0c-8401-150130927759.local :)
+----
+. **Inspect the database**. At this point, you've successfully accessed the Clickhouse shell.
+The `oximeter` database is likely the most useful one for inspection:
++
+[source,bash]
+----
+oxz_clickhouse_aa646c82-c6d7-4d0c-8401-150130927759.local :) USE oximeter;
+oxz_clickhouse_aa646c82-c6d7-4d0c-8401-150130927759.local :) SHOW TABLES
+
+SHOW TABLES
+
+Query id: a8c82507-6179-40ee-8e51-4801ca5ff6f8
+
+┌─name───────────────────────┐
+│ fields_bool                │
+│ fields_i16                 │
+│ fields_i32                 │
+│ fields_i64                 │
+│ fields_i8                  │
+│ fields_ipaddr              │
+│ fields_string              │
+│ fields_u16                 │
+│ fields_u32                 │
+│ fields_u64                 │
+│ fields_u8                  │
+│ fields_uuid                │
+│ measurements_bool          │
+│ measurements_bytes         │
+│ measurements_cumulativef32 │
+│ measurements_cumulativef64 │
+│ measurements_cumulativei64 │
+│ measurements_cumulativeu64 │
+│ measurements_f32           │
+│ measurements_f64           │
+│ measurements_histogramf32  │
+│ measurements_histogramf64  │
+│ measurements_histogrami16  │
+│ measurements_histogrami32  │
+│ measurements_histogrami64  │
+│ measurements_histogrami8   │
+│ measurements_histogramu16  │
+│ measurements_histogramu32  │
+│ measurements_histogramu64  │
+│ measurements_histogramu8   │
+│ measurements_i16           │
+│ measurements_i32           │
+│ measurements_i64           │
+│ measurements_i8            │
+│ measurements_string        │
+│ measurements_u16           │
+│ measurements_u32           │
+│ measurements_u64           │
+│ measurements_u8            │
+│ timeseries_schema          │
+│ version                    │
+└────────────────────────────┘
+41 rows in set. Elapsed: 0.002 sec.
+----
+. **Query for your schema**. The `timeseries_schema` table can provide some additional context for your particular
+measurement. The rest of this document will contain an example looking for a very specific "transaction retry"
+timeseries, but you can substitute these values with your own.
If we know even part of the timeseries name (like the word "transaction") we can search for it with the following: ++ +[source,bash] +---- +oxz_clickhouse_aa646c82-c6d7-4d0c-8401-150130927759.local :) SELECT timeseries_name,fields.type,fields.source,datum_type FROM timeseries_schema WHERE timeseries_name LIKE '%transaction%' + +SELECT + timeseries_name, + fields.type, + fields.source, + datum_type +FROM timeseries_schema +WHERE timeseries_name LIKE '%transaction%' + +Query id: 09e6086f-fc5d-4905-abed-013be55d6706 + +┌─timeseries_name─────────────────┬─fields.type──────┬─fields.source───────┬─datum_type─┐ +│ database_transaction:retry_data │ ['U32','String'] │ ['Metric','Target'] │ F64 │ +└─────────────────────────────────┴──────────────────┴─────────────────────┴────────────┘ + +1 row in set. Elapsed: 0.003 sec. +---- +This tells us the following: first, our timeseries has fields (see: `fields.type`) from `fields_u32` and `fields_string`. Next, it also emits measurements (see: `datum_type`) into `measurements_f64`. + +. **Query for your data**. This next step is extremely specific to your particular timeseries. +However, for this "database_transaction:retry_data" example, we need to query for data related +to this timeseries from `fields_u32`, `fields_string`, and `measurements_f64`. This information +should be inferable from the query to the `timeseries_schema` table. + ++ +[source,bash] +---- +oxz_clickhouse_aa646c82-c6d7-4d0c-8401-150130927759.local :) SELECT + fields_string.field_value as transaction_name, + fields_u32.field_value as attempt, + measurements_f64.datum as attempt_duration, + measurements_f64.timestamp +FROM measurements_f64 +INNER JOIN fields_string ON fields_string.timeseries_key = measurements_f64.timeseries_key +INNER JOIN fields_u32 ON fields_u32.timeseries_key = measurements_f64.timeseries_key +WHERE measurements_f64.timeseries_name = 'database_transaction:retry_data' +ORDER BY measurements_f64.timestamp ASC + +Query id: 813c994e-058c-4af2-9d3a-11cf9f222cbf + +┌─transaction_name─────────┬─attempt─┬─attempt_duration─┬────measurements_f64.timestamp─┐ +│ sled_reservation_create │ 1 │ 0.014977911 │ 2024-01-11 22:41:13.667101491 │ +│ sled_reservation_create │ 1 │ 0.01660099 │ 2024-01-11 22:41:13.667610290 │ +│ sled_reservation_create │ 1 │ 0.014088819 │ 2024-01-11 22:41:13.672007505 │ +│ sled_reservation_create │ 1 │ 0.01501511 │ 2024-01-11 22:41:13.673713738 │ +│ sled_reservation_create │ 2 │ 0.156134143 │ 2024-01-11 22:41:13.843218486 │ +│ sled_reservation_create │ 2 │ 0.150804944 │ 2024-01-11 22:41:13.855771487 │ +│ sled_reservation_create │ 2 │ 0.17012195 │ 2024-01-11 22:41:13.855798649 │ +│ sled_reservation_create │ 1 │ 0.205570224 │ 2024-01-11 22:41:13.872957153 │ +│ sled_reservation_create │ 3 │ 0.006690087 │ 2024-01-11 22:41:13.891856215 │ +│ sled_reservation_create │ 4 │ 0.012846307 │ 2024-01-11 22:41:13.955465361 │ +│ sled_reservation_create │ 1 │ 0.020482506 │ 2024-01-18 23:22:48.146559108 │ +│ sled_reservation_create │ 1 │ 0.008722631 │ 2024-01-19 05:26:07.397242186 │ +│ sled_reservation_create │ 1 │ 0.007484627 │ 2024-01-19 05:26:07.590876948 │ +│ sled_reservation_create │ 1 │ 0.008384388 │ 2024-01-19 05:27:42.833060701 │ +│ sled_reservation_create │ 1 │ 0.009016489 │ 2024-01-19 05:28:15.860577501 │ +│ sled_reservation_create │ 1 │ 0.017649607 │ 2024-01-29 08:21:59.599608552 │ +│ sled_reservation_create │ 1 │ 0.017026628 │ 2024-01-29 08:23:30.278820785 │ +│ volume_create │ 1 │ 0.025257548 │ 2024-01-29 13:03:44.799614376 │ +│ volume_checkout │ 1 │ 0.009869392 
│ 2024-01-29 13:03:49.827578682 │ +│ sled_reservation_create │ 1 │ 0.018168935 │ 2024-01-29 13:03:56.876826535 │ +│ volume_checkout │ 1 │ 0.007425083 │ 2024-01-29 13:27:17.949365703 │ +│ sled_reservation_create │ 1 │ 0.017133937 │ 2024-01-29 13:27:39.534955222 │ +│ sled_reservation_create │ 1 │ 0.028159647 │ 2024-01-29 13:27:39.593375890 │ +│ sled_reservation_create │ 1 │ 0.053410541 │ 2024-01-29 13:27:39.593709195 │ +│ sled_reservation_create │ 2 │ 0.080795694 │ 2024-01-29 13:27:39.717689230 │ +│ sled_reservation_create │ 1 │ 0.071597836 │ 2024-01-29 13:27:39.722071303 │ +│ regions_hard_delete │ 1 │ 0.019350474 │ 2024-01-31 13:51:58.056808199 │ +│ sled_reservation_create │ 1 │ 0.032482692 │ 2024-02-01 06:41:51.647937599 │ +│ volume_checkout │ 1 │ 0.009380859 │ 2024-02-01 07:03:04.971258393 │ +│ sled_reservation_create │ 1 │ 0.018020138 │ 2024-02-01 07:04:17.110928203 │ +│ regions_hard_delete │ 1 │ 0.011993838 │ 2024-02-01 08:32:56.113587884 │ +│ volume_checkout │ 1 │ 0.223425122 │ 2024-02-01 15:47:31.240008185 │ +│ volume_checkout │ 1 │ 0.454675525 │ 2024-02-01 15:47:31.480408091 │ +│ volume_checkout │ 1 │ 0.445790132 │ 2024-02-01 15:47:31.480943824 │ +│ volume_checkout │ 2 │ 0.206526747 │ 2024-02-01 15:47:31.481037611 │ +└──────────────────────────┴─────────┴──────────────────┴───────────────────────────────┘ +---- diff --git a/docs/how-to-run.adoc b/docs/how-to-run.adoc index 6a0b8f79d5..e286fe3730 100644 --- a/docs/how-to-run.adoc +++ b/docs/how-to-run.adoc @@ -277,7 +277,7 @@ The below example demonstrates a single static gateway route; in-depth explanati [rack_network_config] # An internal-only IPv6 address block which contains AZ-wide services. # This does not need to be changed. -rack_subnet = "fd00:1122:3344:01::/56" +rack_subnet = "fd00:1122:3344:0100::/56" # A range of IP addresses used by Boundary Services on the network. In a real # system, these would be addresses of the uplink ports on the Sidecar. With # softnpu, only one address is used. diff --git a/flake.lock b/flake.lock index 2c24a13714..f2dfc1b532 100644 --- a/flake.lock +++ b/flake.lock @@ -36,16 +36,13 @@ }, "root": { "inputs": { - "flake-utils": "flake-utils", "nixpkgs": "nixpkgs", "rust-overlay": "rust-overlay" } }, "rust-overlay": { "inputs": { - "flake-utils": [ - "flake-utils" - ], + "flake-utils": "flake-utils", "nixpkgs": [ "nixpkgs" ] diff --git a/flake.nix b/flake.nix index 65329cbbf7..408dff5706 100644 --- a/flake.nix +++ b/flake.nix @@ -3,62 +3,431 @@ inputs = { nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; - flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { url = "github:oxalica/rust-overlay"; - inputs = { - nixpkgs.follows = "nixpkgs"; - flake-utils.follows = "flake-utils"; - }; + inputs.nixpkgs.follows = "nixpkgs"; }; }; - outputs = { self, nixpkgs, flake-utils, rust-overlay }: - flake-utils.lib.eachDefaultSystem - (system: + outputs = { self, nixpkgs, rust-overlay, ... }: + let + overlays = [ (import rust-overlay) ]; + pkgs = import nixpkgs { + inherit overlays; + system = "x86_64-linux"; + }; + # use the Rust toolchain defined in the `rust-toolchain.toml` file. 
+ rustToolchain = pkgs.pkgsBuildHost.rust-bin.fromRustupToolchainFile ./rust-toolchain.toml; + + buildInputs = with pkgs; [ + # libs + openssl + postgresql + xmlsec + sqlite + libclang + libxml2 + libtool + ]; + + nativeBuildInputs = with pkgs; [ + rustToolchain + cmake + stdenv + pkg-config + ]; + + openAPIVersion = with pkgs.lib; path: + let + file = strings.fileContents path; + parts = strings.splitString "\n" file; + extractHash = prefix: (line: trivial.pipe line [ + (elemAt parts) + (strings.removeSuffix "\"") + (strings.removePrefix "${prefix}=\"") + ]); + in + { + commit = extractHash "COMMIT" 0; + sha = extractHash "SHA2" 1; + }; + + downloadBuildomat = + let baseURL = "https://buildomat.eng.oxide.computer/public/file/oxidecomputer"; + in { kind, repo, file, commit, sha }: + builtins.fetchurl { + url = "${baseURL}/${repo}/${kind}/${commit}/${file}"; + sha256 = sha; + }; + + downloadOpenAPI = { repo, file, version }: + downloadBuildomat + { + inherit repo file; + kind = "openapi"; + commit = pkgs.lib.debug.traceValFn + (v: "${file}: commit=${v}") + version.commit; + sha = version.sha; + }; + + dendriteVersion = openAPIVersion + ./tools/dendrite_openapi_version; + mgVersion = openAPIVersion + ./tools/maghemite_mg_openapi_version; + + + dendriteOpenAPI = downloadOpenAPI + { + repo = "dendrite"; + file = "dpd.json"; + version = dendriteVersion; + }; + + ddmOpenAPI = downloadOpenAPI + { + repo = "maghemite"; + file = "ddm-admin.json"; + version = openAPIVersion ./tools/maghemite_ddm_openapi_version; + }; + + mgOpenAPI = downloadOpenAPI + { + repo = "maghemite"; + file = "mg-admin.json"; + version = mgVersion; + }; + + # given a list of strings of the form `PREFIX="SHA256"`, finds the string + # starting with the provided `name` and returns the hash for that prefix. + findSha = with pkgs.lib; + shas: (name: + let + upperName = strings.toUpper name; + prefix = "${upperName}=\""; + in + trivial.pipe shas [ + (lists.findFirst (strings.hasPrefix prefix) "") + (strings.removePrefix prefix) + (strings.removeSuffix "\"") + ]); + + dendrite-stub = with pkgs.lib; let - overlays = [ (import rust-overlay) ]; - pkgs = import nixpkgs { - inherit system overlays; + commit = dendriteVersion.commit; + repo = "dendrite"; + stubShas = + let + file = builtins.readFile + ./tools/dendrite_stub_checksums; + in + strings.splitString "\n" file; + findStubSha = name: findSha stubShas "CIDL_SHA256_${name}"; + fetchLinuxBin = file: + downloadBuildomat { + inherit commit file repo; + sha = findStubSha "linux_${file}"; + kind = "linux-bin"; + }; + + # get stuff + tarball = downloadBuildomat + { + inherit commit repo; + sha = findStubSha "illumos"; + kind = "image"; + file = "dendrite-stub.tar.gz"; + }; + swadm = fetchLinuxBin + "swadm"; + dpd = fetchLinuxBin + "dpd"; + in + with pkgs; stdenv.mkDerivation + { + name = "dendrite-stub"; + version = commit; + src = tarball; + nativeBuildInputs = [ + # patch the binary to use the right dynamic library paths. + autoPatchelfHook + ]; + + buildInputs = [ + glibc + gcc-unwrapped + openssl + ]; + + installPhase = + let + binPath = "root/opt/oxide/dendrite/bin"; + in + '' + mkdir -p $out/${binPath} + cp -r . $out/root + cp ${swadm} $out/${binPath}/swadm + chmod +x $out/${binPath}/swadm + cp ${dpd} $out/${binPath}/dpd + chmod +x $out/${binPath}/dpd + + mkdir -p $out/bin + ln -s $out/${binPath}/swadm $out/bin/swadm + ln -s $out/${binPath}/dpd $out/bin/dpd + ''; }; - # use the Rust toolchain defined in the `rust-toolchain.toml` file. 
- rustToolchain = pkgs.pkgsBuildHost.rust-bin.fromRustupToolchainFile ./rust-toolchain.toml; - nativeBuildInputs = with pkgs; [ - rustToolchain - cmake - stdenv - pkg-config - ]; - buildInputs = with pkgs; [ - # libs - openssl - postgresql - xmlsec - sqlite - libclang - libxml2 - ]; + + mgd = with pkgs.lib; + let + commit = mgVersion.commit; + repo = "maghemite"; + shas = + let + file = builtins.readFile + ./tools/maghemite_mgd_checksums; + in + strings.splitString + "\n" + file; + # get stuff + tarball = downloadBuildomat + { + inherit commit repo; + sha = findSha shas "CIDL_SHA256"; + kind = "image"; + file = "mgd.tar.gz"; + }; + linuxBin = + downloadBuildomat + { + inherit commit repo; + sha = findSha shas "MGD_LINUX_SHA256"; + kind = "linux"; + file = "mgd"; + }; in with pkgs; - { - devShells.default = mkShell.override + stdenv.mkDerivation + { + name = "mgd"; + src = tarball; + version = commit; + nativeBuildInputs = [ + # patch the binary to use the right dynamic library paths. + autoPatchelfHook + ]; + + buildInputs = [ + glibc + gcc-unwrapped + ]; + + installPhase = + let + binPath = "root/opt/oxide/mgd/bin"; + in + '' + mkdir -p $out/${binPath} + cp -r . $out/root + cp ${linuxBin} $out/${binPath}/mgd + chmod +x $out/${binPath}/mgd + + mkdir -p $out/bin + ln -s $out/${binPath}/mgd $out/bin/mgd + ''; + }; + + # reads the version for Clickhouse or Cockroachdb from the + # `tools/clickhouse_version` and `tools/cockroachdb_version` files. + readVersionFile = with pkgs.lib; file: trivial.pipe ./tools/${file} [ + (builtins.readFile) + (strings.removeSuffix "\n") + (strings.removePrefix "v") + (debug.traceValFn (v: "${file}: ${v}")) + ]; + + clickhouse = with pkgs; + let + name = "clickhouse"; + version = readVersionFile "${name}_version"; + # N.B. that unlike maghemite and dendrite, the Clickhouse hashes + # in `tools/clickhouse_checksums` are MD5 rather than SHA256, so we + # can't give Nix those hashes and must instead determine it ourselves. + # this means that we will have to update this SHA if the clickhouse + # version changes. + sha256 = "1lgxwh67apgl386ilpg0iy5xkyz12q4lgnz08zswjbxv88ra0qxj"; + src = builtins.fetchurl { - # use Clang as the C compiler for all C libraries - stdenv = clangStdenv; - } + inherit sha256; + url = "https://oxide-clickhouse-build.s3.us-west-2.amazonaws.com/${name}-v${version}.linux.tar.gz"; + }; + in + stdenv.mkDerivation + { + inherit src name version; + sourceRoot = "."; + nativeBuildInputs = [ + # patch the binary to use the right dynamic library paths. + autoPatchelfHook + ]; + + buildInputs = [ + glibc + gcc-unwrapped + ]; + installPhase = '' + mkdir -p $out/bin + mkdir -p $out/etc + cp ./${name} $out/bin/${name} + cp ./._config.xml $out/bin/config.xml + ''; + }; + + cockroachdb = with pkgs; + let + name = "cockroachdb"; + binName = "cockroach"; + version = readVersionFile "${name}_version"; + sha256 = + let + shaFile = builtins.readFile ./tools/${name}_checksums; + shas = lib.strings.splitString "\n" shaFile; + in + findSha shas "CIDL_SHA256_LINUX"; + src = builtins.fetchurl { - inherit buildInputs nativeBuildInputs; + inherit sha256; + url = "https://binaries.cockroachdb.com/${binName}-v${version}.linux-amd64.tgz"; + }; + in + stdenv.mkDerivation + { + inherit name src version; + nativeBuildInputs = [ + # patch the binary to use the right dynamic library paths. 
+ autoPatchelfHook + ]; - name = "omicron"; - DEP_PQ_LIBDIRS = " ${postgresql.lib}/lib"; - LIBCLANG_PATH = "${libclang.lib}/lib"; - OPENSSL_DIR = "${openssl.dev}"; - OPENSSL_LIB_DIR = "${openssl.out}/lib"; + buildInputs = [ + glibc + # gcc-unwrapped + ]; + installPhase = '' + mkdir -p $out/bin + cp ./${binName} $out/bin/${binName} + ''; + }; + in + { + packages.x86_64-linux = { + inherit dendrite-stub mgd cockroachdb clickhouse; + }; - # Needed by rustfmt-wrapper, see: - # https://github.com/oxidecomputer/rustfmt-wrapper/blob/main/src/lib.rs - RUSTFMT = "${rustToolchain}/bin/rustfmt"; + checks.x86_64-linux = with pkgs; + let + # produces a check derivation that ensures a package's executable has + # the expected version. + mkVersionCheck = { pkg, cmd }: runCommand "check-${pkg.name}-version" + { + PATH = "${pkg.out}"; + } '' + actualVersion=$(${pkg.out}/bin/${cmd}) + if [ "$actualVersion" != "${pkg.version}" ]; then + echo "expected ${pkg.name} version \"${pkg.version}\", got \"$actualVersion\"" + exit 1 + fi + + # the check derivation must have an output. + touch $out + ''; + # produces a check derivation that ensures a package's executable + # runs. + mkExecCheck = { pkg, cmd }: runCommand "check-${pkg.name}-${cmd}-exec" + { } '' + ${pkg.out}/bin/${cmd} && touch $out + ''; + in + { + clickhouseVersion = mkVersionCheck + { + pkg = clickhouse; + cmd = "clickhouse server --version | cut -d ' ' -f 4"; }; - } - ); + + cockroachdbVersion = mkVersionCheck + { + pkg = cockroachdb; + cmd = "cockroach version --build-tag | tr -d 'v'"; + }; + + mgdCanExec = mkExecCheck { + pkg = mgd; + cmd = "mgd help"; + }; + + dpdCanExec = mkExecCheck { + pkg = dendrite-stub; + cmd = "dpd help"; + }; + + swadmCanExec = mkExecCheck { + pkg = dendrite-stub; + cmd = "swadm help"; + }; + }; + + devShells.x86_64-linux.default = + pkgs.mkShell.override + { + # use Clang as the C compiler for all C libraries + stdenv = pkgs.clangStdenv; + } + { + inherit buildInputs; + nativeBuildInputs = nativeBuildInputs ++ [ + # Dendrite and maghemite, for running tests. 
+            dendrite-stub
+            mgd
+            clickhouse
+            cockroachdb
+          ];
+
+          name = "omicron";
+          DEP_PQ_LIBDIRS = "${pkgs.postgresql.lib}/lib";
+          LIBCLANG_PATH = "${pkgs.libclang.lib}/lib";
+          OPENSSL_DIR = "${pkgs.openssl.dev}";
+          OPENSSL_LIB_DIR = "${pkgs.openssl.out}/lib";
+
+          MG_OPENAPI_PATH = mgOpenAPI;
+          DDM_OPENAPI_PATH = ddmOpenAPI;
+          DPD_OPENAPI_PATH = dendriteOpenAPI;
+
+          # Needed by rustfmt-wrapper, see:
+          # https://github.com/oxidecomputer/rustfmt-wrapper/blob/main/src/lib.rs
+          RUSTFMT = "${rustToolchain}/bin/rustfmt";
+
+          shellHook = ''
+            rm out/mgd
+            rm out/dendrite-stub
+            rm -r out/clickhouse
+            rm -r out/cockroachdb
+
+            mkdir -p out/clickhouse
+            mkdir -p out/cockroachdb/
+
+            ln -s ${mgd.out} -T out/mgd
+            ln -s ${dendrite-stub.out} -T out/dendrite-stub
+            ln -s ${clickhouse.out}/bin/clickhouse out/clickhouse/clickhouse
+            ln -s ${clickhouse.out}/etc/config.xml out/clickhouse
+            ln -s ${cockroachdb.out}/bin out/cockroachdb/bin
+          '';
+        };
+    };
 }
+
+
+
+
+
+
+
diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs
index 72adb1d3df..cde6e7e8c6 100644
--- a/nexus/db-queries/src/db/datastore/deployment.rs
+++ b/nexus/db-queries/src/db/datastore/deployment.rs
@@ -1053,6 +1053,7 @@ mod tests {
     use nexus_test_utils::db::test_setup_database;
     use nexus_types::deployment::Policy;
     use nexus_types::deployment::SledResources;
+    use nexus_types::external_api::views::SledProvisionState;
     use nexus_types::inventory::Collection;
     use omicron_common::address::Ipv6Subnet;
     use omicron_test_utils::dev;
@@ -1061,7 +1062,11 @@ mod tests {
     use std::mem;
     use std::net::Ipv6Addr;
 
-    static EMPTY_POLICY: Policy = Policy { sleds: BTreeMap::new() };
+    static EMPTY_POLICY: Policy = Policy {
+        sleds: BTreeMap::new(),
+        service_ip_pool_ranges: Vec::new(),
+        target_nexus_zone_count: 0,
+    };
 
     // This is a not-super-future-maintainer-friendly helper to check that all
     // the subtables related to blueprints have been pruned of a specific
@@ -1111,7 +1116,11 @@ mod tests {
             })
             .collect();
         let ip = ip.unwrap_or_else(|| thread_rng().gen::<u128>().into());
-        SledResources { zpools, subnet: Ipv6Subnet::new(ip) }
+        SledResources {
+            provision_state: SledProvisionState::Provisionable,
+            zpools,
+            subnet: Ipv6Subnet::new(ip),
+        }
     }
 
     // Create a `Policy` that contains all the sleds found in `collection`
@@ -1131,6 +1140,11 @@ mod tests {
                     )
                 })
                 .collect(),
+            service_ip_pool_ranges: Vec::new(),
+            target_nexus_zone_count: collection
+                .all_omicron_zones()
+                .filter(|z| z.zone_type.is_nexus())
+                .count(),
         }
     }
 
@@ -1320,7 +1334,8 @@ mod tests {
         // Create a builder for a child blueprint.
         let mut builder =
-            BlueprintBuilder::new_based_on(&blueprint1, &policy, "test");
+            BlueprintBuilder::new_based_on(&blueprint1, &policy, "test")
+                .expect("failed to create builder");
 
         // Add zones to our new sled.
         assert_eq!(
@@ -1465,9 +1480,11 @@ mod tests {
             .unwrap();
         let blueprint2 =
             BlueprintBuilder::new_based_on(&blueprint1, &EMPTY_POLICY, "test2")
+                .expect("failed to create builder")
                 .build();
         let blueprint3 =
             BlueprintBuilder::new_based_on(&blueprint1, &EMPTY_POLICY, "test3")
+                .expect("failed to create builder")
                 .build();
         assert_eq!(blueprint1.parent_blueprint_id, None);
         assert_eq!(blueprint2.parent_blueprint_id, Some(blueprint1.id));
@@ -1559,6 +1576,7 @@ mod tests {
         // with enabled=false, that status is serialized.
         let blueprint4 =
             BlueprintBuilder::new_based_on(&blueprint3, &EMPTY_POLICY, "test3")
+                .expect("failed to create builder")
                 .build();
         assert_eq!(blueprint4.parent_blueprint_id, Some(blueprint3.id));
         datastore.blueprint_insert(&opctx, &blueprint4).await.unwrap();
diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs
index b9ad2ea610..f9e0be81c1 100644
--- a/nexus/db-queries/src/db/datastore/mod.rs
+++ b/nexus/db-queries/src/db/datastore/mod.rs
@@ -150,6 +150,7 @@ pub type DataStoreConnection<'a> =
     bb8::PooledConnection<'a, ConnectionManager<DbConnection>>;
 
 pub struct DataStore {
+    log: Logger,
     pool: Arc<Pool>,
     virtual_provisioning_collection_producer: crate::provisioning::Producer,
     transaction_retry_producer: crate::transaction_retry::Producer,
@@ -164,8 +165,9 @@ impl DataStore {
     /// Ignores the underlying DB version. Should be used with caution, as usage
     /// of this method can construct a Datastore which does not understand
     /// the underlying CockroachDB schema. Data corruption could result.
-    pub fn new_unchecked(pool: Arc<Pool>) -> Result<Self, String> {
+    pub fn new_unchecked(log: Logger, pool: Arc<Pool>) -> Result<Self, String> {
         let datastore = DataStore {
+            log,
             pool,
             virtual_provisioning_collection_producer:
                 crate::provisioning::Producer::new(),
@@ -184,7 +186,8 @@ impl DataStore {
         pool: Arc<Pool>,
         config: Option<&SchemaConfig>,
     ) -> Result<Self, String> {
-        let datastore = Self::new_unchecked(pool)?;
+        let datastore =
+            Self::new_unchecked(log.new(o!("component" => "datastore")), pool)?;
 
         // Keep looping until we find that the schema matches our expectation.
         const EXPECTED_VERSION: SemverVersion =
@@ -230,6 +233,7 @@ impl DataStore {
         name: &'static str,
     ) -> crate::transaction_retry::RetryHelper {
         crate::transaction_retry::RetryHelper::new(
+            &self.log,
             &self.transaction_retry_producer,
             name,
         )
diff --git a/nexus/db-queries/src/transaction_retry.rs b/nexus/db-queries/src/transaction_retry.rs
index c474b729f8..6b5098158b 100644
--- a/nexus/db-queries/src/transaction_retry.rs
+++ b/nexus/db-queries/src/transaction_retry.rs
@@ -9,6 +9,7 @@ use chrono::Utc;
 use diesel::result::Error as DieselError;
 use oximeter::{types::Sample, Metric, MetricsError, Target};
 use rand::{thread_rng, Rng};
+use slog::{info, warn, Logger};
 use std::sync::{Arc, Mutex};
 use std::time::Duration;
 
@@ -60,6 +61,10 @@ impl RetryHelperInner {
         Self { start: Utc::now(), attempts: 1 }
     }
 
+    fn has_retried(&self) -> bool {
+        self.attempts > 1
+    }
+
     fn tick(&mut self) -> Self {
         let start = self.start;
         let attempts = self.attempts;
@@ -74,6 +79,7 @@ impl RetryHelperInner {
 /// Helper utility for tracking retry attempts and latency.
 /// Intended to be used from within "transaction_async_with_retry".
 pub struct RetryHelper {
+    log: Logger,
     producer: Producer,
     name: &'static str,
     inner: Mutex<RetryHelperInner>,
@@ -86,8 +92,13 @@ const MAX_RETRY_ATTEMPTS: u32 = 10;
 impl RetryHelper {
     /// Creates a new RetryHelper, and starts a timer tracking the transaction
     /// duration.
- pub(crate) fn new(producer: &Producer, name: &'static str) -> Self { + pub(crate) fn new( + log: &Logger, + producer: &Producer, + name: &'static str, + ) -> Self { Self { + log: log.new(o!("transaction" => name)), producer: producer.clone(), name, inner: Mutex::new(RetryHelperInner::new()), @@ -107,7 +118,21 @@ impl RetryHelper { + Send + Sync, { - conn.transaction_async_with_retry(f, self.as_callback()).await + let slef = Arc::new(self); + let result = conn + .transaction_async_with_retry(f, slef.clone().as_callback()) + .await; + + let retry_info = slef.inner.lock().unwrap(); + if retry_info.has_retried() { + info!( + slef.log, + "transaction completed"; + "attempts" => retry_info.attempts, + ); + } + + result } // Called upon retryable transaction failure. @@ -143,6 +168,12 @@ impl RetryHelper { let mut rng = thread_rng(); rng.gen_range(MIN_RETRY_BACKOFF..MAX_RETRY_BACKOFF) }; + + warn!( + self.log, + "Retryable transaction failure"; + "retry_after (ms)" => duration.as_millis(), + ); tokio::time::sleep(duration).await; // Now that we've finished sleeping, reset the timer and bump the number @@ -151,14 +182,13 @@ impl RetryHelper { return inner.attempts < MAX_RETRY_ATTEMPTS; } - /// Converts this function to a retryable callback that can be used from - /// "transaction_async_with_retry". - pub(crate) fn as_callback( - self, + // Converts this function to a retryable callback that can be used from + // "transaction_async_with_retry". + fn as_callback( + self: Arc, ) -> impl Fn() -> futures::future::BoxFuture<'static, bool> { - let r = Arc::new(self); move || { - let r = r.clone(); + let r = self.clone(); Box::pin(async move { r.retry_callback().await }) } } diff --git a/nexus/deployment/src/blueprint_builder.rs b/nexus/deployment/src/blueprint_builder.rs index ac2fe70e6b..1bf46d34b2 100644 --- a/nexus/deployment/src/blueprint_builder.rs +++ b/nexus/deployment/src/blueprint_builder.rs @@ -6,11 +6,14 @@ use crate::ip_allocator::IpAllocator; use anyhow::anyhow; +use anyhow::bail; use internal_dns::config::Host; use internal_dns::config::ZoneVariant; use ipnet::IpAdd; use nexus_inventory::now_db_precision; use nexus_types::deployment::Blueprint; +use nexus_types::deployment::NetworkInterface; +use nexus_types::deployment::NetworkInterfaceKind; use nexus_types::deployment::OmicronZoneConfig; use nexus_types::deployment::OmicronZoneDataset; use nexus_types::deployment::OmicronZoneType; @@ -23,11 +26,20 @@ use omicron_common::address::get_internal_dns_server_addresses; use omicron_common::address::get_sled_address; use omicron_common::address::get_switch_zone_address; use omicron_common::address::CP_SERVICES_RESERVED_ADDRESSES; +use omicron_common::address::NEXUS_OPTE_IPV4_SUBNET; +use omicron_common::address::NEXUS_OPTE_IPV6_SUBNET; use omicron_common::address::NTP_PORT; use omicron_common::address::SLED_RESERVED_ADDRESSES; use omicron_common::api::external::Generation; +use omicron_common::api::external::IpNet; +use omicron_common::api::external::MacAddr; +use omicron_common::api::external::Vni; +use omicron_common::nexus_config::NUM_INITIAL_RESERVED_IP_ADDRESSES; use std::collections::BTreeMap; use std::collections::BTreeSet; +use std::collections::HashSet; +use std::net::IpAddr; +use std::net::Ipv4Addr; use std::net::Ipv6Addr; use std::net::SocketAddrV6; use thiserror::Error; @@ -38,6 +50,14 @@ use uuid::Uuid; pub enum Error { #[error("sled {sled_id}: ran out of available addresses for sled")] OutOfAddresses { sled_id: Uuid }, + #[error("no Nexus zones exist in parent blueprint")] + 
NoNexusZonesInParentBlueprint, + #[error("no external service IP addresses are available")] + NoExternalServiceIpAvailable, + #[error("no system MAC addresses are available")] + NoSystemMacAddressAvailable, + #[error("exhausted available Nexus IP addresses")] + ExhaustedNexusIps, #[error("programming error in planner")] Planner(#[from] anyhow::Error), } @@ -52,6 +72,16 @@ pub enum Ensure { NotNeeded, } +/// Describes whether an idempotent "ensure" operation resulted in multiple +/// actions taken or no action was necessary +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub enum EnsureMultiple { + /// action was taken, and multiple items were added + Added(usize), + /// no action was necessary + NotNeeded, +} + /// Helper for assembling a blueprint /// /// There are two basic ways to assemble a new blueprint: @@ -77,10 +107,20 @@ pub struct BlueprintBuilder<'a> { // These fields will become part of the final blueprint. See the // corresponding fields in `Blueprint`. - omicron_zones: BTreeMap, + zones: BlueprintZones<'a>, zones_in_service: BTreeSet, creator: String, comments: Vec, + + // These fields mirror how RSS chooses addresses for zone NICs. + nexus_v4_ips: Box + Send>, + nexus_v6_ips: Box + Send>, + + // Iterator of available external IPs for service zones + available_external_ips: Box + Send + 'a>, + + // Iterator of available MAC addresses in the system address range + available_system_macs: Box>, } impl<'a> BlueprintBuilder<'a> { @@ -146,54 +186,122 @@ impl<'a> BlueprintBuilder<'a> { parent_blueprint: &'a Blueprint, policy: &'a Policy, creator: &str, - ) -> BlueprintBuilder<'a> { - BlueprintBuilder { + ) -> anyhow::Result> { + // Scan through the parent blueprint and build several sets of "used + // resources". When adding new control plane zones to a sled, we may + // need to allocate new resources to that zone. However, allocation at + // this point is entirely optimistic and theoretical: our caller may + // discard the blueprint we create without ever making it the new + // target, or it might be an arbitrarily long time before it becomes the + // target. We need to be able to make allocation decisions that we + // expect the blueprint executor to be able to realize successfully if + // and when we become the target, but we cannot _actually_ perform + // resource allocation. + // + // To do this, we look at our parent blueprint's used resources, and + // then choose new resources that aren't already in use (if possible; if + // we need to allocate a new resource and the parent blueprint appears + // to be using all the resources of that kind, our blueprint generation + // will fail). + // + // For example, RSS assigns Nexus NIC IPs by stepping through a list of + // addresses based on `NEXUS_OPTE_IPVx_SUBNET` (as in the iterators + // below). We use the same list of addresses, but additionally need to + // filter out the existing IPs for any Nexus instances that already + // exist. + // + // Note that by building these iterators up front based on + // `parent_blueprint`, we cannot reuse resources in a case where we + // remove a zone that used a resource and then add another zone that + // wants the same kind of resource. 
We don't support zone removal yet, + // but expect this to be okay: we don't anticipate removal and addition + // to frequently be combined into the exact same blueprint, particularly + // in a way that expects the addition to reuse resources from the + // removal; we won't want to attempt to reuse resources from a zone + // until we know it's been fully removed. + let mut existing_nexus_v4_ips: HashSet = HashSet::new(); + let mut existing_nexus_v6_ips: HashSet = HashSet::new(); + let mut used_external_ips: HashSet = HashSet::new(); + let mut used_macs: HashSet = HashSet::new(); + + for (_, z) in parent_blueprint.all_omicron_zones() { + if let OmicronZoneType::Nexus { nic, .. } = &z.zone_type { + match nic.ip { + IpAddr::V4(ip) => { + if !existing_nexus_v4_ips.insert(ip) { + bail!("duplicate Nexus NIC IP: {ip}"); + } + } + IpAddr::V6(ip) => { + if !existing_nexus_v6_ips.insert(ip) { + bail!("duplicate Nexus NIC IP: {ip}"); + } + } + } + } + if let Some(external_ip) = z.zone_type.external_ip()? { + if !used_external_ips.insert(external_ip) { + bail!("duplicate external IP: {external_ip}"); + } + } + if let Some(nic) = z.zone_type.service_vnic() { + if !used_macs.insert(nic.mac) { + bail!("duplicate service vNIC MAC: {}", nic.mac); + } + } + } + + // TODO-performance Building these iterators as "walk through the list + // and skip anything we've used already" is fine as long as we're + // talking about a small number of resources (e.g., single-digit number + // of Nexus instances), but wouldn't be ideal if we have many resources + // we need to skip. We could do something smarter here based on the sets + // of used resources we built above if needed. + let nexus_v4_ips = Box::new( + NEXUS_OPTE_IPV4_SUBNET + .0 + .iter() + .skip(NUM_INITIAL_RESERVED_IP_ADDRESSES) + .filter(move |ip| !existing_nexus_v4_ips.contains(ip)), + ); + let nexus_v6_ips = Box::new( + NEXUS_OPTE_IPV6_SUBNET + .0 + .iter() + .skip(NUM_INITIAL_RESERVED_IP_ADDRESSES) + .filter(move |ip| !existing_nexus_v6_ips.contains(ip)), + ); + let available_external_ips = Box::new( + policy + .service_ip_pool_ranges + .iter() + .flat_map(|r| r.iter()) + .filter(move |ip| !used_external_ips.contains(ip)), + ); + let available_system_macs = Box::new( + MacAddr::iter_system().filter(move |mac| !used_macs.contains(mac)), + ); + + Ok(BlueprintBuilder { parent_blueprint, policy, sled_ip_allocators: BTreeMap::new(), - omicron_zones: BTreeMap::new(), + zones: BlueprintZones::new(parent_blueprint), zones_in_service: parent_blueprint.zones_in_service.clone(), creator: creator.to_owned(), comments: Vec::new(), - } + nexus_v4_ips, + nexus_v6_ips, + available_external_ips, + available_system_macs, + }) } /// Assemble a final [`Blueprint`] based on the contents of the builder - pub fn build(mut self) -> Blueprint { + pub fn build(self) -> Blueprint { // Collect the Omicron zones config for each in-service sled. - let omicron_zones = self - .policy - .sleds - .keys() - .map(|sled_id| { - // Start with self.omicron_zones, which contains entries for any - // sled whose zones config is changing in this blueprint. - let mut zones = self - .omicron_zones - .remove(sled_id) - // If it's not there, use the config from the parent - // blueprint. - .or_else(|| { - self.parent_blueprint - .omicron_zones - .get(sled_id) - .cloned() - }) - // If it's not there either, then this must be a new sled - // and we haven't added any zones to it yet. Use the - // standard initial config. 
-                    .unwrap_or_else(|| OmicronZonesConfig {
-                        generation: Generation::new(),
-                        zones: vec![],
-                    });
-
-                // This is not strictly necessary. But for testing, it's
-                // helpful for things to be in sorted order.
-                zones.zones.sort_by_key(|zone| zone.id);
-
-                (*sled_id, zones)
-            })
-            .collect();
+        let omicron_zones =
+            self.zones.into_omicron_zones(self.policy.sleds.keys().copied());
         Blueprint {
             id: Uuid::new_v4(),
             omicron_zones,
@@ -222,13 +330,9 @@ impl<'a> BlueprintBuilder<'a> {
     ) -> Result<Ensure, Error> {
         // If there's already an NTP zone on this sled, do nothing.
         let has_ntp = self
-            .parent_blueprint
-            .omicron_zones
-            .get(&sled_id)
-            .map(|found_zones| {
-                found_zones.zones.iter().any(|z| z.zone_type.is_ntp())
-            })
-            .unwrap_or(false);
+            .zones
+            .current_sled_zones(sled_id)
+            .any(|z| z.zone_type.is_ntp());
         if has_ntp {
             return Ok(Ensure::NotNeeded);
         }
@@ -286,20 +390,14 @@ impl<'a> BlueprintBuilder<'a> {
         pool_name: ZpoolName,
     ) -> Result<Ensure, Error> {
         // If this sled already has a Crucible zone on this pool, do nothing.
-        let has_crucible_on_this_pool = self
-            .parent_blueprint
-            .omicron_zones
-            .get(&sled_id)
-            .map(|found_zones| {
-                found_zones.zones.iter().any(|z| {
-                    matches!(
-                        &z.zone_type,
-                        OmicronZoneType::Crucible { dataset, .. }
-                            if dataset.pool_name == pool_name
-                    )
-                })
-            })
-            .unwrap_or(false);
+        let has_crucible_on_this_pool =
+            self.zones.current_sled_zones(sled_id).any(|z| {
+                matches!(
+                    &z.zone_type,
+                    OmicronZoneType::Crucible { dataset, .. }
+                        if dataset.pool_name == pool_name
+                )
+            });
         if has_crucible_on_this_pool {
             return Ok(Ensure::NotNeeded);
         }
@@ -329,6 +427,127 @@ impl<'a> BlueprintBuilder<'a> {
         Ok(Ensure::Added)
     }

+    /// Return the number of Nexus zones that would be configured to run on
+    /// the given sled if this builder generated a blueprint
+    ///
+    /// This value may change before a blueprint is actually generated if
+    /// further changes are made to the builder.
+    pub fn sled_num_nexus_zones(&self, sled_id: Uuid) -> usize {
+        self.zones
+            .current_sled_zones(sled_id)
+            .filter(|z| z.zone_type.is_nexus())
+            .count()
+    }
+
+    pub fn sled_ensure_zone_multiple_nexus(
+        &mut self,
+        sled_id: Uuid,
+        desired_zone_count: usize,
+    ) -> Result<EnsureMultiple, Error> {
+        // How many Nexus zones do we need to add?
+        let nexus_count = self.sled_num_nexus_zones(sled_id);
+        let num_nexus_to_add = match desired_zone_count.checked_sub(nexus_count)
+        {
+            Some(0) => return Ok(EnsureMultiple::NotNeeded),
+            Some(n) => n,
+            None => {
+                return Err(Error::Planner(anyhow!(
+                    "removing a Nexus zone not yet supported \
+                     (sled {sled_id} has {nexus_count}; \
+                     planner wants {desired_zone_count})"
+                )));
+            }
+        };
+
+        // Whether Nexus should use TLS and what the external DNS servers it
+        // should use are currently provided at rack-setup time, and should be
+        // consistent across all Nexus instances. We'll assume we can copy
+        // them from any other Nexus zone in our parent blueprint.
+        //
+        // TODO-correctness Once these properties can be changed by a rack
+        // operator, this will need more work. At a minimum, if such a change
+        // goes through the blueprint system (which seems likely), we'll need
+        // to check that, if this builder is being used to make such a change,
+        // the change is also reflected here in any new zone. Perhaps these
+        // settings should be part of `Policy` instead?
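The `checked_sub` match above is the whole add-versus-remove decision: `None` means the desired count is below the current count, which the builder treats as an unsupported removal, while `Some(0)` maps to `EnsureMultiple::NotNeeded`. A compact illustration with a hypothetical `num_to_add` helper (not part of the builder):

```rust
/// `desired.checked_sub(current)` is `None` exactly when `desired < current`.
fn num_to_add(current: usize, desired: usize) -> Result<usize, String> {
    match desired.checked_sub(current) {
        Some(n) => Ok(n), // n == 0 means "nothing to do"
        None => Err(format!(
            "removing a Nexus zone not yet supported \
             (have {current}, want {desired})"
        )),
    }
}

fn main() {
    assert_eq!(num_to_add(1, 3), Ok(2)); // add two zones
    assert_eq!(num_to_add(3, 3), Ok(0)); // already satisfied -> NotNeeded
    assert!(num_to_add(3, 1).is_err()); // removal is rejected
}
```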
+ let (external_tls, external_dns_servers) = self + .parent_blueprint + .omicron_zones + .values() + .find_map(|sled_zones| { + sled_zones.zones.iter().find_map(|z| match &z.zone_type { + OmicronZoneType::Nexus { + external_tls, + external_dns_servers, + .. + } => Some((*external_tls, external_dns_servers.clone())), + _ => None, + }) + }) + .ok_or(Error::NoNexusZonesInParentBlueprint)?; + + for _ in 0..num_nexus_to_add { + let nexus_id = Uuid::new_v4(); + let external_ip = self + .available_external_ips + .next() + .ok_or(Error::NoExternalServiceIpAvailable)?; + + let nic = { + let (ip, subnet) = match external_ip { + IpAddr::V4(_) => ( + self.nexus_v4_ips + .next() + .ok_or(Error::ExhaustedNexusIps)? + .into(), + IpNet::from(*NEXUS_OPTE_IPV4_SUBNET).into(), + ), + IpAddr::V6(_) => ( + self.nexus_v6_ips + .next() + .ok_or(Error::ExhaustedNexusIps)? + .into(), + IpNet::from(*NEXUS_OPTE_IPV6_SUBNET).into(), + ), + }; + let mac = self + .available_system_macs + .next() + .ok_or(Error::NoSystemMacAddressAvailable)?; + NetworkInterface { + id: Uuid::new_v4(), + kind: NetworkInterfaceKind::Service(nexus_id), + name: format!("nexus-{nexus_id}").parse().unwrap(), + ip, + mac, + subnet, + vni: Vni::SERVICES_VNI, + primary: true, + slot: 0, + } + }; + + let ip = self.sled_alloc_ip(sled_id)?; + let port = omicron_common::address::NEXUS_INTERNAL_PORT; + let internal_address = + SocketAddrV6::new(ip, port, 0, 0).to_string(); + let zone = OmicronZoneConfig { + id: nexus_id, + underlay_address: ip, + zone_type: OmicronZoneType::Nexus { + internal_address, + external_ip, + nic, + external_tls, + external_dns_servers: external_dns_servers.clone(), + }, + }; + self.sled_add_zone(sled_id, zone)?; + } + + Ok(EnsureMultiple::Added(num_nexus_to_add)) + } + fn sled_add_zone( &mut self, sled_id: Uuid, @@ -344,27 +563,7 @@ impl<'a> BlueprintBuilder<'a> { ))); } - let sled_zones = - self.omicron_zones.entry(sled_id).or_insert_with(|| { - if let Some(old_sled_zones) = - self.parent_blueprint.omicron_zones.get(&sled_id) - { - OmicronZonesConfig { - generation: old_sled_zones.generation.next(), - zones: old_sled_zones.zones.clone(), - } - } else { - // The first generation is reserved to mean the one - // containing no zones. See - // OMICRON_ZONES_CONFIG_INITIAL_GENERATION. So we start - // with the next one. - OmicronZonesConfig { - generation: Generation::new().next(), - zones: vec![], - } - } - }); - + let sled_zones = self.zones.change_sled_zones(sled_id); sled_zones.zones.push(zone); Ok(()) } @@ -398,16 +597,14 @@ impl<'a> BlueprintBuilder<'a> { // Record each of the sled's zones' underlay addresses as // allocated. - if let Some(sled_zones) = self.omicron_zones.get(&sled_id) { - for z in &sled_zones.zones { - allocator.reserve(z.underlay_address); - } + for z in self.zones.current_sled_zones(sled_id) { + allocator.reserve(z.underlay_address); } allocator }); - allocator.alloc().ok_or_else(|| Error::OutOfAddresses { sled_id }) + allocator.alloc().ok_or(Error::OutOfAddresses { sled_id }) } fn sled_resources(&self, sled_id: Uuid) -> Result<&SledResources, Error> { @@ -420,28 +617,118 @@ impl<'a> BlueprintBuilder<'a> { } } +/// Helper for working with sets of zones on each sled +/// +/// Tracking the set of zones is slightly non-trivial because we need to bump +/// the per-sled generation number iff the zones are changed. So we need to +/// keep track of whether we've changed the zones relative to the parent +/// blueprint. 
+/// We do this by keeping a copy of any `OmicronZonesConfig` that we've
+/// changed and a _reference_ to the parent blueprint's zones. This struct
+/// makes it easy for callers to iterate over the right set of zones.
+struct BlueprintZones<'a> {
+    changed_zones: BTreeMap<Uuid, OmicronZonesConfig>,
+    parent_zones: &'a BTreeMap<Uuid, OmicronZonesConfig>,
+}
+
+impl<'a> BlueprintZones<'a> {
+    pub fn new(parent_blueprint: &'a Blueprint) -> BlueprintZones {
+        BlueprintZones {
+            changed_zones: BTreeMap::new(),
+            parent_zones: &parent_blueprint.omicron_zones,
+        }
+    }
+
+    /// Returns a mutable reference to a sled's Omicron zones *because* we're
+    /// going to change them. It's essential that the caller _does_ change
+    /// them, because we will have bumped the generation number and we don't
+    /// want to do that if no changes are being made.
+    pub fn change_sled_zones(
+        &mut self,
+        sled_id: Uuid,
+    ) -> &mut OmicronZonesConfig {
+        self.changed_zones.entry(sled_id).or_insert_with(|| {
+            if let Some(old_sled_zones) = self.parent_zones.get(&sled_id) {
+                OmicronZonesConfig {
+                    generation: old_sled_zones.generation.next(),
+                    zones: old_sled_zones.zones.clone(),
+                }
+            } else {
+                // The first generation is reserved to mean the one
+                // containing no zones. See
+                // OMICRON_ZONES_CONFIG_INITIAL_GENERATION. So we start
+                // with the next one.
+                OmicronZonesConfig {
+                    generation: Generation::new().next(),
+                    zones: vec![],
+                }
+            }
+        })
+    }
+
+    /// Iterates over the list of Omicron zones currently configured for this
+    /// sled in the blueprint that's being built
+    pub fn current_sled_zones(
+        &self,
+        sled_id: Uuid,
+    ) -> Box<dyn Iterator<Item = &OmicronZoneConfig> + '_> {
+        if let Some(sled_zones) = self
+            .changed_zones
+            .get(&sled_id)
+            .or_else(|| self.parent_zones.get(&sled_id))
+        {
+            Box::new(sled_zones.zones.iter())
+        } else {
+            Box::new(std::iter::empty())
+        }
+    }
+
+    /// Produces an owned map of zones for the requested sleds
+    pub fn into_omicron_zones(
+        mut self,
+        sled_ids: impl Iterator<Item = Uuid>,
+    ) -> BTreeMap<Uuid, OmicronZonesConfig> {
+        sled_ids
+            .map(|sled_id| {
+                // Start with self.changed_zones, which contains entries for
+                // any sled whose zones config is changing in this blueprint.
+                let mut zones = self
+                    .changed_zones
+                    .remove(&sled_id)
+                    // If it's not there, use the config from the parent
+                    // blueprint.
+                    .or_else(|| self.parent_zones.get(&sled_id).cloned())
+                    // If it's not there either, then this must be a new sled
+                    // and we haven't added any zones to it yet. Use the
+                    // standard initial config.
+                    .unwrap_or_else(|| OmicronZonesConfig {
+                        generation: Generation::new(),
+                        zones: vec![],
+                    });
+
+                // This is not strictly necessary. But for testing, it's
+                // helpful for things to be in sorted order.
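A minimal sketch of the copy-on-write idiom `change_sled_zones` implements above: clone the parent's config and bump the generation only the first time a sled is touched, so the generation advances at most once per blueprint. Types here are simplified stand-ins (`u64` generations, `u32` sled IDs); the builder's sorting step continues just below.

```rust
use std::collections::BTreeMap;

#[derive(Clone, Debug, PartialEq)]
struct ZonesConfig {
    generation: u64,
    zones: Vec<&'static str>,
}

struct Zones<'a> {
    changed: BTreeMap<u32, ZonesConfig>,
    parent: &'a BTreeMap<u32, ZonesConfig>,
}

impl<'a> Zones<'a> {
    /// Clone from the parent and bump the generation, but only on the first
    /// call for a given sled; later calls reuse the same entry. (Generation 1
    /// is treated as reserved for "no zones", so new sleds start at 2.)
    fn change(&mut self, sled: u32) -> &mut ZonesConfig {
        let parent = self.parent.get(&sled);
        self.changed.entry(sled).or_insert_with(|| ZonesConfig {
            generation: parent.map_or(1, |z| z.generation) + 1,
            zones: parent.map_or_else(Vec::new, |z| z.zones.clone()),
        })
    }
}

fn main() {
    let mut parent = BTreeMap::new();
    parent.insert(7, ZonesConfig { generation: 3, zones: vec!["ntp"] });
    let mut zones = Zones { changed: BTreeMap::new(), parent: &parent };

    zones.change(7).zones.push("nexus");
    zones.change(7).zones.push("crucible"); // no second generation bump
    assert_eq!(zones.changed[&7].generation, 4);
    assert_eq!(zones.changed[&7].zones, vec!["ntp", "nexus", "crucible"]);
}
```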
+ zones.zones.sort_by_key(|zone| zone.id); + + (sled_id, zones) + }) + .collect() + } +} + #[cfg(test)] pub mod test { - use super::BlueprintBuilder; - use ipnet::IpAdd; - use nexus_types::deployment::Policy; - use nexus_types::deployment::SledResources; - use nexus_types::deployment::ZpoolName; - use nexus_types::inventory::Collection; + use super::*; + use nexus_types::external_api::views::SledProvisionState; + use omicron_common::address::IpRange; + use omicron_common::address::Ipv4Range; use omicron_common::address::Ipv6Subnet; use omicron_common::address::SLED_PREFIX; use omicron_common::api::external::ByteCount; - use omicron_common::api::external::Generation; use sled_agent_client::types::{ Baseboard, Inventory, OmicronZoneConfig, OmicronZoneDataset, OmicronZoneType, OmicronZonesConfig, SledRole, }; - use std::collections::BTreeMap; - use std::collections::BTreeSet; - use std::net::Ipv6Addr; - use std::net::SocketAddrV6; use std::str::FromStr; - use uuid::Uuid; /// Returns a collection and policy describing a pretty simple system pub fn example() -> (Collection, Policy) { @@ -452,7 +739,32 @@ pub mod test { "a5f3db3a-61aa-4f90-ad3e-02833c253bf5", "0d168386-2551-44e8-98dd-ae7a7570f8a0", ]; - let mut policy = Policy { sleds: BTreeMap::new() }; + let mut policy = Policy { + sleds: BTreeMap::new(), + // IPs from TEST-NET-1 (RFC 5737) + service_ip_pool_ranges: vec![Ipv4Range::new( + "192.0.2.2".parse().unwrap(), + "192.0.2.20".parse().unwrap(), + ) + .unwrap() + .into()], + target_nexus_zone_count: 3, + }; + let mut service_ip_pool_range = policy.service_ip_pool_ranges[0].iter(); + let mut nexus_nic_ips = NEXUS_OPTE_IPV4_SUBNET + .iter() + .skip(NUM_INITIAL_RESERVED_IP_ADDRESSES); + let mut nexus_nic_macs = { + let mut used = HashSet::new(); + std::iter::from_fn(move || { + let mut mac = MacAddr::random_system(); + while !used.insert(mac) { + mac = MacAddr::random_system(); + } + Some(mac) + }) + }; + for sled_id_str in sled_ids.iter() { let sled_id: Uuid = sled_id_str.parse().unwrap(); let sled_ip = policy_add_sled(&mut policy, sled_id); @@ -480,19 +792,58 @@ pub mod test { .unwrap(); let zpools = &policy.sleds.get(&sled_id).unwrap().zpools; - let ip1 = sled_ip.saturating_add(1); - let zones: Vec<_> = std::iter::once(OmicronZoneConfig { - id: Uuid::new_v4(), - underlay_address: sled_ip.saturating_add(1), - zone_type: OmicronZoneType::InternalNtp { - address: SocketAddrV6::new(ip1, 12345, 0, 0).to_string(), - dns_servers: vec![], - domain: None, - ntp_servers: vec![], - }, + let mut sled_ips = + std::iter::successors(Some(sled_ip.saturating_add(1)), |ip| { + println!("sled_ips iterator: currently {ip:?}"); + Some(ip.saturating_add(1)) + }); + let zones: Vec<_> = std::iter::once({ + let ip = sled_ips.next().unwrap(); + OmicronZoneConfig { + id: Uuid::new_v4(), + underlay_address: ip, + zone_type: OmicronZoneType::InternalNtp { + address: SocketAddrV6::new(ip, 12345, 0, 0).to_string(), + dns_servers: vec![], + domain: None, + ntp_servers: vec![], + }, + } }) - .chain(zpools.iter().enumerate().map(|(i, zpool_name)| { - let ip = sled_ip.saturating_add(u128::try_from(i + 2).unwrap()); + .chain(std::iter::once({ + let id = Uuid::new_v4(); + let ip = sled_ips.next().unwrap(); + let external_ip = + service_ip_pool_range.next().expect("no service IPs left"); + let nic_ip = + nexus_nic_ips.next().expect("no nexus nic IPs left"); + OmicronZoneConfig { + id, + underlay_address: ip, + zone_type: OmicronZoneType::Nexus { + internal_address: SocketAddrV6::new(ip, 12346, 0, 0) + .to_string(), + 
external_ip, + nic: NetworkInterface { + id: Uuid::new_v4(), + kind: NetworkInterfaceKind::Service(id), + name: format!("nexus-{id}").parse().unwrap(), + ip: nic_ip.into(), + mac: nexus_nic_macs + .next() + .expect("no nexus nic MACs left"), + subnet: IpNet::from(*NEXUS_OPTE_IPV4_SUBNET).into(), + vni: Vni::SERVICES_VNI, + primary: true, + slot: 0, + }, + external_tls: false, + external_dns_servers: Vec::new(), + }, + } + })) + .chain(zpools.iter().map(|zpool_name| { + let ip = sled_ips.next().unwrap(); OmicronZoneConfig { id: Uuid::new_v4(), underlay_address: ip, @@ -540,10 +891,36 @@ pub mod test { .collect(); let subnet = Ipv6Subnet::::new(sled_ip); - policy.sleds.insert(sled_id, SledResources { zpools, subnet }); + policy.sleds.insert( + sled_id, + SledResources { + provision_state: SledProvisionState::Provisionable, + zpools, + subnet, + }, + ); sled_ip } + /// Checks various conditions that should be true for all blueprints + pub fn verify_blueprint(blueprint: &Blueprint) { + let mut underlay_ips: BTreeMap = + BTreeMap::new(); + for sled_zones in blueprint.omicron_zones.values() { + for zone in &sled_zones.zones { + if let Some(previous) = + underlay_ips.insert(zone.underlay_address, zone) + { + panic!( + "found duplicate underlay IP {} in zones {} and \ + {}\n\nblueprint: {:#?}", + zone.underlay_address, zone.id, previous.id, blueprint + ); + } + } + } + } + #[test] fn test_initial() { // Test creating a blueprint from a collection and verifying that it @@ -556,6 +933,7 @@ pub mod test { "the_test", ) .expect("failed to create initial blueprint"); + verify_blueprint(&blueprint_initial); // Since collections don't include what was in service, we have to // provide that ourselves. For our purposes though we don't care. @@ -575,8 +953,10 @@ pub mod test { &blueprint_initial, &policy, "test_basic", - ); + ) + .expect("failed to create builder"); let blueprint = builder.build(); + verify_blueprint(&blueprint); let diff = blueprint_initial.diff(&blueprint); println!( "initial blueprint -> next blueprint (expected no changes):\n{}", @@ -596,9 +976,11 @@ pub mod test { "the_test", ) .expect("failed to create initial blueprint"); + verify_blueprint(&blueprint1); let mut builder = - BlueprintBuilder::new_based_on(&blueprint1, &policy, "test_basic"); + BlueprintBuilder::new_based_on(&blueprint1, &policy, "test_basic") + .expect("failed to create builder"); // The initial blueprint should have internal NTP zones on all the // existing sleds, plus Crucible zones on all pools. 
So if we ensure @@ -613,6 +995,7 @@ pub mod test { } let blueprint2 = builder.build(); + verify_blueprint(&blueprint2); let diff = blueprint1.diff(&blueprint2); println!( "initial blueprint -> next blueprint (expected no changes):\n{}", @@ -626,7 +1009,8 @@ pub mod test { let new_sled_id = Uuid::new_v4(); let _ = policy_add_sled(&mut policy, new_sled_id); let mut builder = - BlueprintBuilder::new_based_on(&blueprint2, &policy, "test_basic"); + BlueprintBuilder::new_based_on(&blueprint2, &policy, "test_basic") + .expect("failed to create builder"); builder.sled_ensure_zone_ntp(new_sled_id).unwrap(); let new_sled_resources = policy.sleds.get(&new_sled_id).unwrap(); for pool_name in &new_sled_resources.zpools { @@ -636,6 +1020,7 @@ pub mod test { } let blueprint3 = builder.build(); + verify_blueprint(&blueprint3); let diff = blueprint2.diff(&blueprint3); println!("expecting new NTP and Crucible zones:\n{}", diff); @@ -691,4 +1076,274 @@ pub mod test { .collect::>(); assert_eq!(crucible_pool_names, new_sled_resources.zpools); } + + #[test] + fn test_add_nexus_with_no_existing_nexus_zones() { + let (mut collection, policy) = example(); + + // Adding a new Nexus zone currently requires copying settings from an + // existing Nexus zone. If we remove all Nexus zones from the + // collection, create a blueprint, then try to add a Nexus zone, it + // should fail. + for zones in collection.omicron_zones.values_mut() { + zones.zones.zones.retain(|z| { + !matches!(z.zone_type, OmicronZoneType::Nexus { .. }) + }); + } + + let parent = BlueprintBuilder::build_initial_from_collection( + &collection, + &policy, + "test", + ) + .expect("failed to create initial blueprint"); + + let mut builder = + BlueprintBuilder::new_based_on(&parent, &policy, "test") + .expect("failed to create builder"); + + let err = builder + .sled_ensure_zone_multiple_nexus( + collection + .omicron_zones + .keys() + .next() + .copied() + .expect("no sleds present"), + 1, + ) + .unwrap_err(); + + assert!( + matches!(err, Error::NoNexusZonesInParentBlueprint), + "unexpected error {err}" + ); + } + + #[test] + fn test_add_nexus_error_cases() { + let (mut collection, policy) = example(); + + // Remove the Nexus zone from one of the sleds so that + // `sled_ensure_zone_nexus` can attempt to add a Nexus zone to + // `sled_id`. + let sled_id = { + let mut selected_sled_id = None; + for (sled_id, zones) in &mut collection.omicron_zones { + let nzones_before_retain = zones.zones.zones.len(); + zones.zones.zones.retain(|z| { + !matches!(z.zone_type, OmicronZoneType::Nexus { .. }) + }); + if zones.zones.zones.len() < nzones_before_retain { + selected_sled_id = Some(*sled_id); + break; + } + } + selected_sled_id.expect("found no sleds with Nexus zone") + }; + + let parent = BlueprintBuilder::build_initial_from_collection( + &collection, + &policy, + "test", + ) + .expect("failed to create initial blueprint"); + + { + // Attempting to add Nexus to the sled we removed it from (with no + // other changes to the environment) should succeed. + let mut builder = + BlueprintBuilder::new_based_on(&parent, &policy, "test") + .expect("failed to create builder"); + let added = builder + .sled_ensure_zone_multiple_nexus(sled_id, 1) + .expect("failed to ensure nexus zone"); + + assert_eq!(added, EnsureMultiple::Added(1)); + } + + { + // Attempting to add multiple Nexus zones to the sled we removed it + // from (with no other changes to the environment) should also + // succeed. 
+        let mut builder =
+            BlueprintBuilder::new_based_on(&parent, &policy, "test")
+                .expect("failed to create builder");
+        let added = builder
+            .sled_ensure_zone_multiple_nexus(sled_id, 3)
+            .expect("failed to ensure nexus zone");
+
+        assert_eq!(added, EnsureMultiple::Added(3));
+    }
+
+    {
+        // Replace the policy's external service IP pool ranges with ranges
+        // that are already in use by existing zones. Attempting to add a
+        // Nexus with no remaining external IPs should fail.
+        let mut policy = policy.clone();
+        let mut used_ip_ranges = Vec::new();
+        for (_, z) in parent.all_omicron_zones() {
+            if let Some(ip) = z
+                .zone_type
+                .external_ip()
+                .expect("failed to check for external IP")
+            {
+                used_ip_ranges.push(IpRange::from(ip));
+            }
+        }
+        assert!(!used_ip_ranges.is_empty());
+        policy.service_ip_pool_ranges = used_ip_ranges;
+
+        let mut builder =
+            BlueprintBuilder::new_based_on(&parent, &policy, "test")
+                .expect("failed to create builder");
+        let err = builder
+            .sled_ensure_zone_multiple_nexus(sled_id, 1)
+            .unwrap_err();
+
+        assert!(
+            matches!(err, Error::NoExternalServiceIpAvailable),
+            "unexpected error {err}"
+        );
+    }
+
+    // We're not testing the `ExhaustedNexusIps` error case (where we've run
+    // out of Nexus OPTE addresses), because it's fairly difficult to induce
+    // that from outside: we would need to start from a parent blueprint
+    // that contained a Nexus instance for every IP in the
+    // `NEXUS_OPTE_*_SUBNET`. We could hack around that by creating the
+    // `BlueprintBuilder` and mucking with its internals, but that doesn't
+    // seem like a particularly useful test either.
+    }
+
+    #[test]
+    fn test_invalid_parent_blueprint_two_zones_with_same_external_ip() {
+        let (mut collection, policy) = example();
+
+        // We should fail if the parent blueprint claims to contain two
+        // zones with the same external IP. Skim through the zones, copy the
+        // external IP from one Nexus zone, then assign it to a later Nexus
+        // zone.
+        let mut found_second_nexus_zone = false;
+        let mut nexus_external_ip = None;
+
+        'outer: for zones in collection.omicron_zones.values_mut() {
+            for z in zones.zones.zones.iter_mut() {
+                if let OmicronZoneType::Nexus { external_ip, .. } =
+                    &mut z.zone_type
+                {
+                    if let Some(ip) = nexus_external_ip {
+                        *external_ip = ip;
+                        found_second_nexus_zone = true;
+                        break 'outer;
+                    } else {
+                        nexus_external_ip = Some(*external_ip);
+                        continue 'outer;
+                    }
+                }
+            }
+        }
+        assert!(found_second_nexus_zone, "only one Nexus zone present?");
+
+        let parent = BlueprintBuilder::build_initial_from_collection(
+            &collection,
+            &policy,
+            "test",
+        )
+        .unwrap();
+
+        match BlueprintBuilder::new_based_on(&parent, &policy, "test") {
+            Ok(_) => panic!("unexpected success"),
+            Err(err) => assert!(
+                err.to_string().contains("duplicate external IP"),
+                "unexpected error: {err:#}"
+            ),
+        };
+    }
+
+    #[test]
+    fn test_invalid_parent_blueprint_two_nexus_zones_with_same_nic_ip() {
+        let (mut collection, policy) = example();
+
+        // We should fail if the parent blueprint claims to contain two
+        // Nexus zones with the same NIC IP. Skim through the zones, copy
+        // the NIC IP from one Nexus zone, then assign it to a later
+        // Nexus zone.
+        let mut found_second_nexus_zone = false;
+        let mut nexus_nic_ip = None;
+
+        'outer: for zones in collection.omicron_zones.values_mut() {
+            for z in zones.zones.zones.iter_mut() {
+                if let OmicronZoneType::Nexus { nic, ..
} = &mut z.zone_type { + if let Some(ip) = nexus_nic_ip { + nic.ip = ip; + found_second_nexus_zone = true; + break 'outer; + } else { + nexus_nic_ip = Some(nic.ip); + continue 'outer; + } + } + } + } + assert!(found_second_nexus_zone, "only one Nexus zone present?"); + + let parent = BlueprintBuilder::build_initial_from_collection( + &collection, + &policy, + "test", + ) + .unwrap(); + + match BlueprintBuilder::new_based_on(&parent, &policy, "test") { + Ok(_) => panic!("unexpected success"), + Err(err) => assert!( + err.to_string().contains("duplicate Nexus NIC IP"), + "unexpected error: {err:#}" + ), + }; + } + + #[test] + fn test_invalid_parent_blueprint_two_zones_with_same_vnic_mac() { + let (mut collection, policy) = example(); + + // We should fail if the parent blueprint claims to contain two + // zones with the same service vNIC MAC address. Skim through the + // zones, copy the NIC MAC from one Nexus zone, then assign it to a + // later Nexus zone. + let mut found_second_nexus_zone = false; + let mut nexus_nic_mac = None; + + 'outer: for zones in collection.omicron_zones.values_mut() { + for z in zones.zones.zones.iter_mut() { + if let OmicronZoneType::Nexus { nic, .. } = &mut z.zone_type { + if let Some(mac) = nexus_nic_mac { + nic.mac = mac; + found_second_nexus_zone = true; + break 'outer; + } else { + nexus_nic_mac = Some(nic.mac); + continue 'outer; + } + } + } + } + assert!(found_second_nexus_zone, "only one Nexus zone present?"); + + let parent = BlueprintBuilder::build_initial_from_collection( + &collection, + &policy, + "test", + ) + .unwrap(); + + match BlueprintBuilder::new_based_on(&parent, &policy, "test") { + Ok(_) => panic!("unexpected success"), + Err(err) => assert!( + err.to_string().contains("duplicate service vNIC MAC"), + "unexpected error: {err:#}" + ), + }; + } } diff --git a/nexus/deployment/src/lib.rs b/nexus/deployment/src/lib.rs index fd182ae613..546f2c1dc1 100644 --- a/nexus/deployment/src/lib.rs +++ b/nexus/deployment/src/lib.rs @@ -57,7 +57,7 @@ //! The Planner //! //! fleet policy (latest inventory) (latest blueprint) -//! \ | / +//! \ | / //! \ | / //! +----------+ | +----------/ //! | | | @@ -85,7 +85,7 @@ //! The Executor (better name?) //! //! latest committed blueprint latest inventory -//! | | +//! | | //! | | //! +----+ +----+ //! | | diff --git a/nexus/deployment/src/planner.rs b/nexus/deployment/src/planner.rs index 0a8e1f0b81..cbdcfd80c0 100644 --- a/nexus/deployment/src/planner.rs +++ b/nexus/deployment/src/planner.rs @@ -8,11 +8,17 @@ use crate::blueprint_builder::BlueprintBuilder; use crate::blueprint_builder::Ensure; +use crate::blueprint_builder::EnsureMultiple; use crate::blueprint_builder::Error; use nexus_types::deployment::Blueprint; use nexus_types::deployment::Policy; +use nexus_types::external_api::views::SledProvisionState; use nexus_types::inventory::Collection; +use slog::warn; use slog::{info, Logger}; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use uuid::Uuid; pub struct Planner<'a> { log: Logger, @@ -39,10 +45,10 @@ impl<'a> Planner<'a> { // NOTE: Right now, we just assume that this is the latest inventory // collection. See the comment on the corresponding field in `Planner`. 
         inventory: &'a Collection,
-    ) -> Planner<'a> {
+    ) -> anyhow::Result<Planner<'a>> {
         let blueprint =
-            BlueprintBuilder::new_based_on(parent_blueprint, policy, creator);
-        Planner { log, policy, blueprint, inventory }
+            BlueprintBuilder::new_based_on(parent_blueprint, policy, creator)?;
+        Ok(Planner { log, policy, blueprint, inventory })
     }

     pub fn plan(mut self) -> Result<Blueprint, Error> {
@@ -61,6 +67,17 @@ impl<'a> Planner<'a> {
         // added and where they should go. And the blueprint builder will need
         // to grow the ability to provision one.

+        // After we make our initial pass through the sleds below to check for
+        // zones every sled should have (NTP, Crucible), we'll start making
+        // decisions about placing other service zones. We need to _exclude_
+        // any sleds for which we just added an NTP zone, as we won't be able
+        // to add additional services to them until that NTP zone has been
+        // brought up.
+        //
+        // We will not mark sleds getting Crucible zones as ineligible; other
+        // control plane service zones starting concurrently with Crucible
+        // zones is fine.
+        let mut sleds_ineligible_for_services = BTreeSet::new();
+
         for (sled_id, sled_info) in &self.policy.sleds {
             // Check for an NTP zone. Every sled should have one. If it's not
             // there, all we can do is provision that one zone. We have to wait
@@ -70,13 +87,14 @@ impl<'a> Planner<'a> {
                 info!(
                     &self.log,
                     "found sled missing NTP zone (will add one)";
-                    "sled_id" => ?sled_id
+                    "sled_id" => %sled_id
                 );
                 self.blueprint
                     .comment(&format!("sled {}: add NTP zone", sled_id));
                 // Don't make any other changes to this sled. However, this
                 // change is compatible with any other changes to other sleds,
                 // so we can "continue" here rather than "break".
+                sleds_ineligible_for_services.insert(*sled_id);
                 continue;
             }
@@ -100,7 +118,7 @@ impl<'a> Planner<'a> {
             let has_ntp_inventory = self
                 .inventory
                 .omicron_zones
-                .get(&sled_id)
+                .get(sled_id)
                 .map(|sled_zones| {
                     sled_zones.zones.zones.iter().any(|z| z.zone_type.is_ntp())
                 })
@@ -110,7 +128,7 @@ impl<'a> Planner<'a> {
                     &self.log,
                     "parent blueprint contains NTP zone, but it's not in \
                     inventory yet";
-                    "sled_id" => ?sled_id,
+                    "sled_id" => %sled_id,
                 );
                 continue;
             }
@@ -145,6 +163,139 @@ impl<'a> Planner<'a> {
             }
         }

+        // We've now placed all the services that should always exist on all
+        // sleds. Before moving on to make decisions about placing services
+        // that are _not_ present on all sleds, check the provision state of
+        // all our sleds so we can avoid any non-provisionable sleds under the
+        // assumption that there is something amiss with them.
+        sleds_ineligible_for_services.extend(
+            self.policy.sleds.iter().filter_map(|(sled_id, sled_info)| {
+                match sled_info.provision_state {
+                    SledProvisionState::Provisionable => None,
+                    SledProvisionState::NonProvisionable => Some(*sled_id),
+                }
+            }),
+        );
+
+        self.ensure_correct_number_of_nexus_zones(
+            &sleds_ineligible_for_services,
+        )?;
+
+        Ok(())
+    }
+
+    fn ensure_correct_number_of_nexus_zones(
+        &mut self,
+        sleds_ineligible_for_services: &BTreeSet<Uuid>,
+    ) -> Result<(), Error> {
+        // Bin every sled by the number of Nexus zones it currently has while
+        // counting the total number of Nexus zones.
+        let mut num_total_nexus = 0;
+        let mut sleds_by_num_nexus: BTreeMap<usize, Vec<Uuid>> =
+            BTreeMap::new();
+        for &sled_id in self.policy.sleds.keys() {
+            let num_nexus = self.blueprint.sled_num_nexus_zones(sled_id);
+            num_total_nexus += num_nexus;
+
+            // Only bin this sled if we're allowed to use it.
+            // If we have a sled we're not allowed to use that's already
+            // running a Nexus (seems fishy!), we counted its Nexus above but
+            // will ignore it here.
+            if !sleds_ineligible_for_services.contains(&sled_id) {
+                sleds_by_num_nexus.entry(num_nexus).or_default().push(sled_id);
+            }
+        }
+
+        // TODO-correctness What should we do if we have _too many_ Nexus
+        // instances? For now, just log the number of zones any time we have
+        // at least the minimum number.
+        let nexus_to_add =
+            self.policy.target_nexus_zone_count.saturating_sub(num_total_nexus);
+        if nexus_to_add == 0 {
+            info!(
+                self.log, "sufficient Nexus zones exist in plan";
+                "desired_count" => self.policy.target_nexus_zone_count,
+                "current_count" => num_total_nexus,
+            );
+            return Ok(());
+        }
+
+        // Ensure we have at least one sled on which we can add Nexus zones.
+        // If we don't, we have nothing else to do. This isn't a hard error,
+        // because we might be waiting for NTP on all eligible sleds (although
+        // it would be weird, since we're presumably running from within Nexus
+        // on some sled).
+        if sleds_by_num_nexus.is_empty() {
+            warn!(self.log, "want to add Nexus zones, but no eligible sleds");
+            return Ok(());
+        }
+
+        // Build a map of sled -> new nexus zone count.
+        let mut sleds_to_change: BTreeMap<Uuid, usize> = BTreeMap::new();
+
+        'outer: for _ in 0..nexus_to_add {
+            // `sleds_by_num_nexus` is sorted by key already, and we want to
+            // pick from the lowest-numbered bin. We can just loop over its
+            // keys, expecting to stop on the first iteration, with the only
+            // exception being when we've removed all the sleds from a bin.
+            // (We iterate over a snapshot of the keys so that we can put the
+            // chosen sled back into a higher bin inside the loop without
+            // holding a borrow of the map.)
+            let bins: Vec<usize> =
+                sleds_by_num_nexus.keys().copied().collect();
+            for num_nexus in bins {
+                // The bin contains all sleds with the minimum number of Nexus
+                // zones. Pick one arbitrarily but deterministically.
+                let Some(sled_id) = sleds_by_num_nexus
+                    .get_mut(&num_nexus)
+                    .and_then(|sleds| sleds.pop())
+                else {
+                    // We already drained this bin; move on.
+                    continue;
+                };
+
+                // This insert might overwrite an old value for this sled
+                // (e.g., in the "we have 1 sled and need to add many Nexus
+                // instances to it" case). That's fine.
+                sleds_to_change.insert(sled_id, num_nexus + 1);
+
+                // Put this sled back in our map, but now with one more Nexus.
+                sleds_by_num_nexus
+                    .entry(num_nexus + 1)
+                    .or_default()
+                    .push(sled_id);
+
+                continue 'outer;
+            }
+
+            // This should be unreachable: it's only possible if we fail to
+            // find a nonempty vec in `sleds_by_num_nexus`, and we checked
+            // above that `sleds_by_num_nexus` is not empty.
+            unreachable!("logic error finding sleds for Nexus");
+        }
+
+        // For each sled we need to change, actually do so.
+        let mut total_added = 0;
+        for (sled_id, new_nexus_count) in sleds_to_change {
+            match self
+                .blueprint
+                .sled_ensure_zone_multiple_nexus(sled_id, new_nexus_count)?
+            {
+                EnsureMultiple::Added(n) => {
+                    info!(
+                        self.log, "will add {n} Nexus zone(s) to sled";
+                        "sled_id" => %sled_id,
+                    );
+                    total_added += n;
+                }
+                // This is only possible if we asked the sled to ensure the
+                // same number of zones it already has, but that's impossible
+                // based on the way we built up `sleds_to_change`.
+                EnsureMultiple::NotNeeded => unreachable!(
+                    "sled on which we added Nexus zones did not add any"
+                ),
+            }
+        }
+
+        // Double check that we didn't make any arithmetic mistakes. If we've
+        // arrived here, we think we've added the number of Nexus zones we
+        // needed to.
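A standalone sketch of the binning strategy above: key sleds by their current Nexus count in a `BTreeMap` (so bins stay sorted), always pop from the lowest nonempty bin, and re-file the chosen sled one bin higher. Sled IDs are plain integers here and `plan` is a hypothetical helper, not the planner's API:

```rust
use std::collections::BTreeMap;

/// Spread `to_add` new zones across sleds, always picking a sled with the
/// fewest zones. Returns each sled's final target count.
fn plan(counts: &[(u32, usize)], to_add: usize) -> BTreeMap<u32, usize> {
    // Bin sleds by current zone count; BTreeMap keeps bins sorted by key.
    let mut bins: BTreeMap<usize, Vec<u32>> = BTreeMap::new();
    for &(sled, n) in counts {
        bins.entry(n).or_default().push(sled);
    }
    let mut targets: BTreeMap<u32, usize> = counts.iter().copied().collect();
    for _ in 0..to_add {
        // The lowest nonempty bin holds the least-loaded sleds.
        let (&n, _) = bins.iter().find(|(_, v)| !v.is_empty()).unwrap();
        let sled = bins.get_mut(&n).unwrap().pop().unwrap();
        targets.insert(sled, n + 1);
        bins.entry(n + 1).or_default().push(sled); // re-file one bin up
    }
    targets
}

fn main() {
    // Three sleds with one Nexus each, eleven to add: new zones land 4+4+3,
    // matching the spread the planner tests below expect.
    let targets = plan(&[(1, 1), (2, 1), (3, 1)], 11);
    let mut sizes: Vec<_> = targets.values().copied().collect();
    sizes.sort_unstable();
    assert_eq!(sizes, vec![4, 5, 5]); // final counts; 3, 4, and 4 were added
}
```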
+ assert_eq!( + total_added, nexus_to_add, + "internal error counting Nexus zones" + ); + Ok(()) } } @@ -154,8 +305,10 @@ mod test { use super::Planner; use crate::blueprint_builder::test::example; use crate::blueprint_builder::test::policy_add_sled; + use crate::blueprint_builder::test::verify_blueprint; use crate::blueprint_builder::BlueprintBuilder; use nexus_inventory::now_db_precision; + use nexus_types::external_api::views::SledProvisionState; use nexus_types::inventory::OmicronZoneType; use nexus_types::inventory::OmicronZonesFound; use omicron_common::api::external::Generation; @@ -177,6 +330,7 @@ mod test { "the_test", ) .expect("failed to create initial blueprint"); + verify_blueprint(&blueprint1); // Now run the planner. It should do nothing because our initial // system didn't have any issues that the planner currently knows how to @@ -188,6 +342,7 @@ mod test { "no-op?", &collection, ) + .expect("failed to create planner") .plan() .expect("failed to plan"); @@ -196,6 +351,7 @@ mod test { assert_eq!(diff.sleds_added().count(), 0); assert_eq!(diff.sleds_removed().count(), 0); assert_eq!(diff.sleds_changed().count(), 0); + verify_blueprint(&blueprint2); // Now add a new sled. let new_sled_id = @@ -210,6 +366,7 @@ mod test { "test: add NTP?", &collection, ) + .expect("failed to create planner") .plan() .expect("failed to plan"); @@ -229,6 +386,7 @@ mod test { )); assert_eq!(diff.sleds_removed().count(), 0); assert_eq!(diff.sleds_changed().count(), 0); + verify_blueprint(&blueprint3); // Check that with no change in inventory, the planner makes no changes. // It needs to wait for inventory to reflect the new NTP zone before @@ -240,6 +398,7 @@ mod test { "test: add nothing more", &collection, ) + .expect("failed to create planner") .plan() .expect("failed to plan"); let diff = blueprint3.diff(&blueprint4); @@ -247,6 +406,7 @@ mod test { assert_eq!(diff.sleds_added().count(), 0); assert_eq!(diff.sleds_removed().count(), 0); assert_eq!(diff.sleds_changed().count(), 0); + verify_blueprint(&blueprint4); // Now update the inventory to have the requested NTP zone. assert!(collection @@ -274,6 +434,7 @@ mod test { "test: add Crucible zones?", &collection, ) + .expect("failed to create planner") .plan() .expect("failed to plan"); @@ -295,11 +456,12 @@ mod test { assert_eq!(zones.len(), 3); for zone in &zones { let OmicronZoneType::Crucible { .. } = zone.zone_type else { - panic!("unexpectedly added a non-Crucible zone"); + panic!("unexpectedly added a non-Crucible zone: {zone:?}"); }; } + verify_blueprint(&blueprint5); - // Check that there are no more steps + // Check that there are no more steps. let blueprint6 = Planner::new_based_on( logctx.log.clone(), &blueprint5, @@ -307,6 +469,7 @@ mod test { "test: no-op?", &collection, ) + .expect("failed to create planner") .plan() .expect("failed to plan"); @@ -315,6 +478,257 @@ mod test { assert_eq!(diff.sleds_added().count(), 0); assert_eq!(diff.sleds_removed().count(), 0); assert_eq!(diff.sleds_changed().count(), 0); + verify_blueprint(&blueprint6); + + logctx.cleanup_successful(); + } + + /// Check that the planner will add more Nexus zones to a single sled, if + /// needed + #[test] + fn test_add_multiple_nexus_to_one_sled() { + let logctx = test_setup_log("planner_add_multiple_nexus_to_one_sled"); + + // Use our example inventory collection as a starting point, but strip + // it down to just one sled. 
+ let (sled_id, collection, mut policy) = { + let (mut collection, mut policy) = example(); + + // Pick one sled ID to keep and remove the rest. + let keep_sled_id = + policy.sleds.keys().next().copied().expect("no sleds"); + policy.sleds.retain(|&k, _v| keep_sled_id == k); + collection.sled_agents.retain(|&k, _v| keep_sled_id == k); + collection.omicron_zones.retain(|&k, _v| keep_sled_id == k); + + assert_eq!(collection.sled_agents.len(), 1); + assert_eq!(collection.omicron_zones.len(), 1); + + (keep_sled_id, collection, policy) + }; + + // Build the initial blueprint. + let blueprint1 = BlueprintBuilder::build_initial_from_collection( + &collection, + &policy, + "the_test", + ) + .expect("failed to create initial blueprint"); + + // This blueprint should only have 1 Nexus instance on the one sled we + // kept. + assert_eq!(blueprint1.omicron_zones.len(), 1); + assert_eq!( + blueprint1 + .omicron_zones + .get(&sled_id) + .expect("missing kept sled") + .zones + .iter() + .filter(|z| z.zone_type.is_nexus()) + .count(), + 1 + ); + + // Now run the planner. It should add additional Nexus instances to the + // one sled we have. + policy.target_nexus_zone_count = 5; + let blueprint2 = Planner::new_based_on( + logctx.log.clone(), + &blueprint1, + &policy, + "add more Nexus", + &collection, + ) + .expect("failed to create planner") + .plan() + .expect("failed to plan"); + + let diff = blueprint1.diff(&blueprint2); + println!("1 -> 2 (added additional Nexus zones):\n{}", diff); + assert_eq!(diff.sleds_added().count(), 0); + assert_eq!(diff.sleds_removed().count(), 0); + let mut sleds = diff.sleds_changed().collect::>(); + assert_eq!(sleds.len(), 1); + let (changed_sled_id, sled_changes) = sleds.pop().unwrap(); + assert_eq!(changed_sled_id, sled_id); + assert_eq!(sled_changes.zones_removed().count(), 0); + assert_eq!(sled_changes.zones_changed().count(), 0); + let zones = sled_changes.zones_added().collect::>(); + assert_eq!(zones.len(), policy.target_nexus_zone_count - 1); + for zone in &zones { + let OmicronZoneType::Nexus { .. } = zone.zone_type else { + panic!("unexpectedly added a non-Nexus zone: {zone:?}"); + }; + } + + logctx.cleanup_successful(); + } + + /// Check that the planner will spread additional Nexus zones out across + /// sleds as it adds them + #[test] + fn test_spread_additional_nexus_zones_across_sleds() { + let logctx = test_setup_log( + "planner_spread_additional_nexus_zones_across_sleds", + ); + + // Use our example inventory collection as a starting point. + let (collection, mut policy) = example(); + + // Build the initial blueprint. + let blueprint1 = BlueprintBuilder::build_initial_from_collection( + &collection, + &policy, + "the_test", + ) + .expect("failed to create initial blueprint"); + + // This blueprint should only have 3 Nexus zones: one on each sled. + assert_eq!(blueprint1.omicron_zones.len(), 3); + for sled_config in blueprint1.omicron_zones.values() { + assert_eq!( + sled_config + .zones + .iter() + .filter(|z| z.zone_type.is_nexus()) + .count(), + 1 + ); + } + + // Now run the planner with a high number of target Nexus zones. 
+        policy.target_nexus_zone_count = 14;
+        let blueprint2 = Planner::new_based_on(
+            logctx.log.clone(),
+            &blueprint1,
+            &policy,
+            "add more Nexus",
+            &collection,
+        )
+        .expect("failed to create planner")
+        .plan()
+        .expect("failed to plan");
+
+        let diff = blueprint1.diff(&blueprint2);
+        println!("1 -> 2 (added additional Nexus zones):\n{}", diff);
+        assert_eq!(diff.sleds_added().count(), 0);
+        assert_eq!(diff.sleds_removed().count(), 0);
+        let sleds = diff.sleds_changed().collect::<Vec<_>>();
+
+        // All 3 sleds should get additional Nexus zones. We expect a total of
+        // 11 new Nexus zones, which should be spread evenly across the three
+        // sleds (two should get 4 and one should get 3).
+        assert_eq!(sleds.len(), 3);
+        let mut total_new_nexus_zones = 0;
+        for (sled_id, sled_changes) in sleds {
+            assert_eq!(sled_changes.zones_removed().count(), 0);
+            assert_eq!(sled_changes.zones_changed().count(), 0);
+            let zones = sled_changes.zones_added().collect::<Vec<_>>();
+            match zones.len() {
+                n @ (3 | 4) => {
+                    total_new_nexus_zones += n;
+                }
+                n => {
+                    panic!("unexpected number of zones added to {sled_id}: {n}")
+                }
+            }
+            for zone in &zones {
+                let OmicronZoneType::Nexus { .. } = zone.zone_type else {
+                    panic!("unexpectedly added a non-Nexus zone: {zone:?}");
+                };
+            }
+        }
+        assert_eq!(total_new_nexus_zones, 11);
+
+        logctx.cleanup_successful();
+    }
+
+    /// Check that the planner will skip non-provisionable sleds when
+    /// allocating extra Nexus zones
+    #[test]
+    fn test_nexus_allocation_skips_nonprovisionable_sleds() {
+        let logctx = test_setup_log(
+            "planner_nexus_allocation_skips_nonprovisionable_sleds",
+        );
+
+        // Use our example inventory collection as a starting point.
+        let (collection, mut policy) = example();
+
+        // Build the initial blueprint.
+        let blueprint1 = BlueprintBuilder::build_initial_from_collection(
+            &collection,
+            &policy,
+            "the_test",
+        )
+        .expect("failed to create initial blueprint");
+
+        // This blueprint should only have 3 Nexus zones: one on each sled.
+        assert_eq!(blueprint1.omicron_zones.len(), 3);
+        for sled_config in blueprint1.omicron_zones.values() {
+            assert_eq!(
+                sled_config
+                    .zones
+                    .iter()
+                    .filter(|z| z.zone_type.is_nexus())
+                    .count(),
+                1
+            );
+        }
+
+        // Arbitrarily choose one of the sleds and mark it non-provisionable.
+        let nonprovisionable_sled_id = {
+            let (sled_id, resources) =
+                policy.sleds.iter_mut().next().expect("no sleds");
+            resources.provision_state = SledProvisionState::NonProvisionable;
+            *sled_id
+        };
+
+        // Now run the planner with a high number of target Nexus zones.
+        policy.target_nexus_zone_count = 14;
+        let blueprint2 = Planner::new_based_on(
+            logctx.log.clone(),
+            &blueprint1,
+            &policy,
+            "add more Nexus",
+            &collection,
+        )
+        .expect("failed to create planner")
+        .plan()
+        .expect("failed to plan");
+
+        let diff = blueprint1.diff(&blueprint2);
+        println!("1 -> 2 (added additional Nexus zones):\n{}", diff);
+        assert_eq!(diff.sleds_added().count(), 0);
+        assert_eq!(diff.sleds_removed().count(), 0);
+        let sleds = diff.sleds_changed().collect::<Vec<_>>();
+
+        // Only 2 of the 3 sleds should get additional Nexus zones. We expect
+        // a total of 11 new Nexus zones, which should be spread evenly across
+        // the two sleds (one gets 6 and the other gets 5), while the
+        // non-provisionable sled should be unchanged.
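The 6-and-5 split asserted below follows from the same lowest-bin-first placement once the non-provisionable sled is excluded. A quick check of that arithmetic with a greedy loop over two eligible sleds (plain integers; not the planner's code):

```rust
fn main() {
    // Two eligible sleds, one Nexus each; eleven new zones to place.
    let mut counts = [1usize, 1];
    for _ in 0..11 {
        // Greedy: always add to the least-loaded eligible sled.
        let min = counts
            .iter()
            .copied()
            .enumerate()
            .min_by_key(|&(_, n)| n)
            .unwrap()
            .0;
        counts[min] += 1;
    }
    counts.sort_unstable();
    // Final counts of 6 and 7 mean 5 and 6 zones were added: 11 in total.
    assert_eq!(counts, [6, 7]);
    assert_eq!((counts[0] - 1) + (counts[1] - 1), 11);
}
```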
+        assert_eq!(sleds.len(), 2);
+        let mut total_new_nexus_zones = 0;
+        for (sled_id, sled_changes) in sleds {
+            assert!(sled_id != nonprovisionable_sled_id);
+            assert_eq!(sled_changes.zones_removed().count(), 0);
+            assert_eq!(sled_changes.zones_changed().count(), 0);
+            let zones = sled_changes.zones_added().collect::<Vec<_>>();
+            match zones.len() {
+                n @ (5 | 6) => {
+                    total_new_nexus_zones += n;
+                }
+                n => {
+                    panic!("unexpected number of zones added to {sled_id}: {n}")
+                }
+            }
+            for zone in &zones {
+                let OmicronZoneType::Nexus { .. } = zone.zone_type else {
+                    panic!("unexpectedly added a non-Nexus zone: {zone:?}");
+                };
+            }
+        }
+        assert_eq!(total_new_nexus_zones, 11);

         logctx.cleanup_successful();
     }
diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml
index 4263c34f3d..ac9d894050 100644
--- a/nexus/examples/config.toml
+++ b/nexus/examples/config.toml
@@ -109,6 +109,7 @@ phantom_disks.period_secs = 30
 blueprints.period_secs_load = 10
 blueprints.period_secs_execute = 60
 sync_service_zone_nat.period_secs = 30
+region_replacement.period_secs = 30

 [default_region_allocation_strategy]
 # allocate region on 3 random distinct zpools, on 3 random distinct sleds.
diff --git a/nexus/src/app/background/common.rs b/nexus/src/app/background/common.rs
index f954a35639..e0d8f32316 100644
--- a/nexus/src/app/background/common.rs
+++ b/nexus/src/app/background/common.rs
@@ -467,6 +467,7 @@ mod test {
     use super::BackgroundTask;
     use super::Driver;
     use crate::app::background::common::ActivationReason;
+    use crate::app::sagas::SagaRequest;
     use assert_matches::assert_matches;
     use chrono::Utc;
     use futures::future::BoxFuture;
@@ -477,6 +478,7 @@ mod test {
     use std::time::Instant;
     use tokio::sync::mpsc;
     use tokio::sync::mpsc::error::TryRecvError;
+    use tokio::sync::mpsc::Sender;
     use tokio::sync::watch;

     type ControlPlaneTestContext =
@@ -814,4 +816,82 @@ mod test {
         // such a task that would allow us to reliably distinguish between these
         // two without also spending a lot of wall-clock time on this test.
     }
+
+    /// Simple BackgroundTask impl that sends a test-only SagaRequest
+    struct SagaRequestTask {
+        saga_request: Sender<SagaRequest>,
+    }
+
+    impl SagaRequestTask {
+        fn new(saga_request: Sender<SagaRequest>) -> SagaRequestTask {
+            SagaRequestTask { saga_request }
+        }
+    }
+
+    impl BackgroundTask for SagaRequestTask {
+        fn activate<'a>(
+            &'a mut self,
+            _: &'a OpContext,
+        ) -> BoxFuture<'a, serde_json::Value> {
+            async {
+                let _ = self.saga_request.send(SagaRequest::TestOnly).await;
+                serde_json::Value::Null
+            }
+            .boxed()
+        }
+    }
+
+    #[nexus_test(server = crate::Server)]
+    async fn test_saga_request_flow(cptestctx: &ControlPlaneTestContext) {
+        let nexus = &cptestctx.server.apictx().nexus;
+        let datastore = nexus.datastore();
+        let opctx = OpContext::for_tests(
+            cptestctx.logctx.log.clone(),
+            datastore.clone(),
+        );
+
+        let (saga_request, mut saga_request_recv) = SagaRequest::channel();
+        let t1 = SagaRequestTask::new(saga_request);
+
+        let mut driver = Driver::new();
+        let (_dep_tx1, dep_rx1) = watch::channel(0);
+
+        let h1 = driver.register(
+            "t1".to_string(),
+            "test saga request flow task".to_string(),
+            Duration::from_secs(300), // should not fire in this test
+            Box::new(t1),
+            opctx.child(std::collections::BTreeMap::new()),
+            vec![Box::new(dep_rx1.clone())],
+        );
+
+        assert!(matches!(
+            saga_request_recv.try_recv(),
+            Err(mpsc::error::TryRecvError::Empty),
+        ));
+
+        driver.activate(&h1);
+
+        // wait 1 second for the saga request to arrive
+        tokio::select!
{ + _ = tokio::time::sleep(tokio::time::Duration::from_secs(1)) => { + assert!(false); + } + + saga_request = saga_request_recv.recv() => { + match saga_request { + None => { + assert!(false); + } + + Some(saga_request) => { + assert!(matches!( + saga_request, + SagaRequest::TestOnly, + )); + } + } + } + } + } } diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 95fe5c933e..9d078f10d0 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -15,7 +15,9 @@ use super::external_endpoints; use super::inventory_collection; use super::nat_cleanup; use super::phantom_disks; +use super::region_replacement; use super::sync_service_zone_nat::ServiceZoneNatTracker; +use crate::app::sagas::SagaRequest; use nexus_db_model::DnsGroup; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; @@ -25,6 +27,7 @@ use omicron_common::nexus_config::DnsTasksConfig; use std::collections::BTreeMap; use std::collections::HashMap; use std::sync::Arc; +use tokio::sync::mpsc::Sender; use uuid::Uuid; /// Describes ongoing background tasks and provides interfaces for working with @@ -72,10 +75,15 @@ pub struct BackgroundTasks { /// task handle for the service zone nat tracker pub task_service_zone_nat_tracker: common::TaskHandle, + + /// task handle for the task that detects if regions need replacement and + /// begins the process + pub task_region_replacement: common::TaskHandle, } impl BackgroundTasks { /// Kick off all background tasks + #[allow(clippy::too_many_arguments)] pub fn start( opctx: &OpContext, datastore: Arc, @@ -84,6 +92,7 @@ impl BackgroundTasks { mgd_clients: &HashMap>, nexus_id: Uuid, resolver: internal_dns::resolver::Resolver, + saga_request: Sender, ) -> BackgroundTasks { let mut driver = common::Driver::new(); @@ -243,6 +252,26 @@ impl BackgroundTasks { ) }; + // Background task: detect if a region needs replacement and begin the + // process + let task_region_replacement = { + let detector = region_replacement::RegionReplacementDetector::new( + datastore, + saga_request.clone(), + ); + + let task = driver.register( + String::from("region_replacement"), + String::from("detects if a region requires replacing and begins the process"), + config.region_replacement.period_secs, + Box::new(detector), + opctx.child(BTreeMap::new()), + vec![], + ); + + task + }; + BackgroundTasks { driver, task_internal_dns_config, @@ -258,6 +287,7 @@ impl BackgroundTasks { task_blueprint_loader, task_blueprint_executor, task_service_zone_nat_tracker, + task_region_replacement, } } diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs index 2c5fa0ab3c..27cdddfe15 100644 --- a/nexus/src/app/background/mod.rs +++ b/nexus/src/app/background/mod.rs @@ -16,6 +16,7 @@ mod init; mod inventory_collection; mod nat_cleanup; mod phantom_disks; +mod region_replacement; mod status; mod sync_service_zone_nat; diff --git a/nexus/src/app/background/region_replacement.rs b/nexus/src/app/background/region_replacement.rs new file mode 100644 index 0000000000..fc92f888b9 --- /dev/null +++ b/nexus/src/app/background/region_replacement.rs @@ -0,0 +1,57 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background task for detecting regions that need replacing and beginning that +//! process +//! +//! 
TODO this is currently a placeholder for a future PR + +use super::common::BackgroundTask; +use crate::app::sagas::SagaRequest; +use futures::future::BoxFuture; +use futures::FutureExt; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use serde_json::json; +use std::sync::Arc; +use tokio::sync::mpsc::Sender; + +pub struct RegionReplacementDetector { + _datastore: Arc, + _saga_request: Sender, +} + +impl RegionReplacementDetector { + pub fn new( + datastore: Arc, + saga_request: Sender, + ) -> Self { + RegionReplacementDetector { + _datastore: datastore, + _saga_request: saga_request, + } + } +} + +impl BackgroundTask for RegionReplacementDetector { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async { + let log = &opctx.log; + warn!(&log, "region replacement task started"); + + // TODO + + warn!(&log, "region replacement task done"); + + json!({ + "region_replacement_started_ok": 0, + "region_replacement_started_err": 0, + }) + } + .boxed() + } +} diff --git a/nexus/src/app/deployment.rs b/nexus/src/app/deployment.rs index 65f8f4d028..b8cb6deabf 100644 --- a/nexus/src/app/deployment.rs +++ b/nexus/src/app/deployment.rs @@ -18,7 +18,9 @@ use nexus_types::deployment::SledResources; use nexus_types::deployment::ZpoolName; use nexus_types::identity::Asset; use nexus_types::inventory::Collection; +use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; +use omicron_common::address::NEXUS_REDUNDANCY; use omicron_common::address::SLED_PREFIX; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; @@ -169,11 +171,39 @@ impl super::Nexus { let zpools = zpools_by_sled_id .remove(&sled_id) .unwrap_or_else(BTreeSet::new); - let sled_info = SledResources { subnet, zpools }; + let sled_info = SledResources { + provision_state: sled_row.provision_state().into(), + subnet, + zpools, + }; (sled_id, sled_info) }) .collect(); + let service_ip_pool_ranges = { + let (authz_service_ip_pool, _) = + datastore.ip_pools_service_lookup(opctx).await?; + + let mut ip_ranges = Vec::new(); + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + while let Some(p) = paginator.next() { + let batch = datastore + .ip_pool_list_ranges( + opctx, + &authz_service_ip_pool, + &p.current_pagparams(), + ) + .await?; + // The use of `last_address` here assumes `paginator` is sorting + // in Ascending order (which it does - see the implementation of + // `current_pagparams()`). + paginator = p.found_batch(&batch, &|r| r.last_address); + ip_ranges.extend(batch.iter().map(IpRange::from)); + } + + ip_ranges + }; + // The choice of which inventory collection to use here is not // necessarily trivial. Inventory collections may be incomplete due to // transient (or even persistent) errors. 
It's not yet clear what @@ -192,7 +222,15 @@ impl super::Nexus { "fetching latest inventory collection for blueprint planner", )?; - Ok(PlanningContext { creator, policy: Policy { sleds }, inventory }) + Ok(PlanningContext { + creator, + policy: Policy { + sleds, + service_ip_pool_ranges, + target_nexus_zone_count: NEXUS_REDUNDANCY, + }, + inventory, + }) } async fn blueprint_add( @@ -252,7 +290,12 @@ impl super::Nexus { &planning_context.policy, &planning_context.creator, &inventory, - ); + ) + .map_err(|error| { + Error::internal_error(&format!( + "error creating blueprint planner: {error:#}", + )) + })?; let blueprint = planner.plan().map_err(|error| { Error::internal_error(&format!( "error generating blueprint: {}", diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index c9ca4db73e..7a9a26b05f 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -6,6 +6,7 @@ use self::external_endpoints::NexusCertResolver; use crate::app::oximeter::LazyTimeseriesClient; +use crate::app::sagas::SagaRequest; use crate::config; use crate::populate::populate_start; use crate::populate::PopulateArgs; @@ -362,6 +363,8 @@ impl Nexus { Arc::clone(&db_datastore), ); + let (saga_request, mut saga_request_recv) = SagaRequest::channel(); + let background_tasks = background::BackgroundTasks::start( &background_ctx, Arc::clone(&db_datastore), @@ -370,6 +373,7 @@ impl Nexus { &mg_clients, config.deployment.id, resolver.clone(), + saga_request, ); let external_resolver = { @@ -484,6 +488,29 @@ impl Nexus { } }); + // Spawn a task to receive SagaRequests from RPWs, and execute them + { + let nexus = nexus.clone(); + tokio::spawn(async move { + loop { + match saga_request_recv.recv().await { + None => { + // If this channel is closed, then RPWs will not be + // able to request that sagas be run. This will + // likely only occur when Nexus itself is shutting + // down, so emit an error and exit the task. + error!(&nexus.log, "saga request channel closed!"); + break; + } + + Some(saga_request) => { + nexus.handle_saga_request(saga_request).await; + } + } + } + }); + } + Ok(nexus) } @@ -828,6 +855,17 @@ impl Nexus { pub(crate) async fn resolver(&self) -> internal_dns::resolver::Resolver { self.internal_resolver.clone() } + + /// Reliable persistent workflows can request that sagas be executed by + /// sending a SagaRequest to a supplied channel. Execute those here. 
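The saga-request plumbing added in this change is the standard tokio mpsc shape: background tasks hold a `Sender`, and one dedicated task drains the `Receiver` and dispatches. A minimal self-contained sketch, with a hypothetical `Request` enum and channel size (the real code obtains its channel from `SagaRequest::channel`):

```rust
use tokio::sync::mpsc;

#[derive(Debug)]
enum Request {
    TestOnly,
}

#[tokio::main]
async fn main() {
    // Bounded channel: senders apply backpressure rather than queueing
    // unboundedly if the dispatcher falls behind.
    let (tx, mut rx) = mpsc::channel::<Request>(8);

    // Dispatcher task: runs until every Sender is dropped, at which point
    // `recv` returns None and the loop exits (mirroring Nexus shutdown).
    let dispatcher = tokio::spawn(async move {
        while let Some(req) = rx.recv().await {
            match req {
                Request::TestOnly => println!("would run a saga here"),
            }
        }
    });

    // A background task requesting work, as RegionReplacementDetector would.
    tx.send(Request::TestOnly).await.expect("dispatcher alive");
    drop(tx); // close the channel so the dispatcher exits
    dispatcher.await.expect("dispatcher panicked");
}
```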
+ pub(crate) async fn handle_saga_request(&self, saga_request: SagaRequest) { + match saga_request { + #[cfg(test)] + SagaRequest::TestOnly => { + unimplemented!(); + } + } + } } /// For unimplemented endpoints, indicates whether the resource identified diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 569153f23e..a4d559f823 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -52,7 +52,6 @@ use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::Name; use omicron_common::api::external::NameOrId; -use omicron_common::api::external::ResourceType; use omicron_common::api::internal::shared::ExternalPortDiscovery; use sled_agent_client::types::AddSledRequest; use sled_agent_client::types::EarlyNetworkConfigBody; @@ -213,11 +212,7 @@ impl super::Nexus { mapped_fleet_roles, }; - let rack_network_config = request.rack_network_config.as_ref().ok_or( - Error::invalid_request( - "cannot initialize a rack without a network config", - ), - )?; + let rack_network_config = &request.rack_network_config; self.db_datastore .rack_set_initialized( @@ -337,289 +332,278 @@ impl super::Nexus { // Currently calling some of the apis directly, but should we be using sagas // going forward via self.run_saga()? Note that self.create_runnable_saga and // self.execute_saga are currently not available within this scope. - info!(self.log, "Checking for Rack Network Configuration"); - if let Some(rack_network_config) = &request.rack_network_config { - info!(self.log, "Recording Rack Network Configuration"); - let address_lot_name = - Name::from_str("initial-infra").map_err(|e| { - Error::internal_error(&format!( - "unable to use `initial-infra` as `Name`: {e}" - )) - })?; - let identity = IdentityMetadataCreateParams { - name: address_lot_name.clone(), - description: "initial infrastructure ip address lot" - .to_string(), - }; + info!(self.log, "Recording Rack Network Configuration"); + let address_lot_name = + Name::from_str("initial-infra").map_err(|e| { + Error::internal_error(&format!( + "unable to use `initial-infra` as `Name`: {e}" + )) + })?; + let identity = IdentityMetadataCreateParams { + name: address_lot_name.clone(), + description: "initial infrastructure ip address lot".to_string(), + }; - let kind = AddressLotKind::Infra; + let kind = AddressLotKind::Infra; - let first_address = IpAddr::V4(rack_network_config.infra_ip_first); - let last_address = IpAddr::V4(rack_network_config.infra_ip_last); - let ipv4_block = - AddressLotBlockCreate { first_address, last_address }; + let first_address = IpAddr::V4(rack_network_config.infra_ip_first); + let last_address = IpAddr::V4(rack_network_config.infra_ip_last); + let ipv4_block = AddressLotBlockCreate { first_address, last_address }; - let blocks = vec![ipv4_block]; + let blocks = vec![ipv4_block]; - let address_lot_params = - AddressLotCreate { identity, kind, blocks }; + let address_lot_params = AddressLotCreate { identity, kind, blocks }; - match self - .db_datastore - .address_lot_create(opctx, &address_lot_params) - .await - { - Ok(_) => Ok(()), - Err(e) => match e { - Error::ObjectAlreadyExists { - type_name: _, - object_name: _, - } => Ok(()), - _ => Err(e), - }, - }?; + match self + .db_datastore + .address_lot_create(opctx, &address_lot_params) + .await + { + Ok(_) => Ok(()), + Err(e) => match e { + Error::ObjectAlreadyExists { type_name: _, object_name: _ } => { + Ok(()) + } + _ => Err(e), + }, + }?; - let mut bgp_configs = HashMap::new(); + let mut 
bgp_configs = HashMap::new(); - for bgp_config in &rack_network_config.bgp { - bgp_configs.insert(bgp_config.asn, bgp_config.clone()); + for bgp_config in &rack_network_config.bgp { + bgp_configs.insert(bgp_config.asn, bgp_config.clone()); - let bgp_config_name: Name = - format!("as{}", bgp_config.asn).parse().unwrap(); + let bgp_config_name: Name = + format!("as{}", bgp_config.asn).parse().unwrap(); - let announce_set_name: Name = - format!("as{}-announce", bgp_config.asn).parse().unwrap(); + let announce_set_name: Name = + format!("as{}-announce", bgp_config.asn).parse().unwrap(); - let address_lot_name: Name = - format!("as{}-lot", bgp_config.asn).parse().unwrap(); + let address_lot_name: Name = + format!("as{}-lot", bgp_config.asn).parse().unwrap(); - self.db_datastore - .address_lot_create( - &opctx, - &AddressLotCreate { - identity: IdentityMetadataCreateParams { - name: address_lot_name, - description: format!( - "Address lot for announce set in as {}", - bgp_config.asn - ), - }, - kind: AddressLotKind::Infra, - blocks: bgp_config - .originate - .iter() - .map(|o| AddressLotBlockCreate { - first_address: o.network().into(), - last_address: o.broadcast().into(), - }) - .collect(), + self.db_datastore + .address_lot_create( + &opctx, + &AddressLotCreate { + identity: IdentityMetadataCreateParams { + name: address_lot_name, + description: format!( + "Address lot for announce set in as {}", + bgp_config.asn + ), }, - ) - .await - .map_err(|e| { - Error::internal_error(&format!( - "unable to create address lot for BGP as {}: {}", - bgp_config.asn, e - )) - })?; - - self.db_datastore - .bgp_create_announce_set( - &opctx, - &BgpAnnounceSetCreate { - identity: IdentityMetadataCreateParams { - name: announce_set_name.clone(), - description: format!( - "Announce set for AS {}", - bgp_config.asn - ), - }, - announcement: bgp_config - .originate - .iter() - .map(|x| BgpAnnouncementCreate { - address_lot_block: NameOrId::Name( - format!("as{}", bgp_config.asn) - .parse() - .unwrap(), - ), - network: IpNetwork::from(*x).into(), - }) - .collect(), + kind: AddressLotKind::Infra, + blocks: bgp_config + .originate + .iter() + .map(|o| AddressLotBlockCreate { + first_address: o.network().into(), + last_address: o.broadcast().into(), + }) + .collect(), + }, + ) + .await + .map_err(|e| { + Error::internal_error(&format!( + "unable to create address lot for BGP as {}: {}", + bgp_config.asn, e + )) + })?; + + self.db_datastore + .bgp_create_announce_set( + &opctx, + &BgpAnnounceSetCreate { + identity: IdentityMetadataCreateParams { + name: announce_set_name.clone(), + description: format!( + "Announce set for AS {}", + bgp_config.asn + ), }, - ) - .await - .map_err(|e| { - Error::internal_error(&format!( - "unable to create bgp announce set for as {}: {}", - bgp_config.asn, e - )) - })?; - - self.db_datastore - .bgp_config_set( - &opctx, - &BgpConfigCreate { - identity: IdentityMetadataCreateParams { - name: bgp_config_name, - description: format!( - "BGP config for AS {}", - bgp_config.asn + announcement: bgp_config + .originate + .iter() + .map(|x| BgpAnnouncementCreate { + address_lot_block: NameOrId::Name( + format!("as{}", bgp_config.asn) + .parse() + .unwrap(), ), - }, - asn: bgp_config.asn, - bgp_announce_set_id: announce_set_name.into(), - vrf: None, - }, - ) - .await - .map_err(|e| { - Error::internal_error(&format!( - "unable to set bgp config for as {}: {}", - bgp_config.asn, e - )) - })?; - } + network: IpNetwork::from(*x).into(), + }) + .collect(), + }, + ) + .await + .map_err(|e| { + 
Error::internal_error(&format!( + "unable to create bgp announce set for as {}: {}", + bgp_config.asn, e + )) + })?; - for (idx, uplink_config) in - rack_network_config.ports.iter().enumerate() - { - let switch = uplink_config.switch.to_string(); - let switch_location = Name::from_str(&switch).map_err(|e| { + self.db_datastore + .bgp_config_set( + &opctx, + &BgpConfigCreate { + identity: IdentityMetadataCreateParams { + name: bgp_config_name, + description: format!( + "BGP config for AS {}", + bgp_config.asn + ), + }, + asn: bgp_config.asn, + bgp_announce_set_id: announce_set_name.into(), + vrf: None, + }, + ) + .await + .map_err(|e| { Error::internal_error(&format!( - "unable to use {switch} as Name: {e}" + "unable to set bgp config for as {}: {}", + bgp_config.asn, e )) })?; + } - let uplink_name = format!("default-uplink{idx}"); - let name = Name::from_str(&uplink_name).unwrap(); + for (idx, uplink_config) in rack_network_config.ports.iter().enumerate() + { + let switch = uplink_config.switch.to_string(); + let switch_location = Name::from_str(&switch).map_err(|e| { + Error::internal_error(&format!( + "unable to use {switch} as Name: {e}" + )) + })?; - let identity = IdentityMetadataCreateParams { - name: name.clone(), - description: "initial uplink configuration".to_string(), - }; + let uplink_name = format!("default-uplink{idx}"); + let name = Name::from_str(&uplink_name).unwrap(); - let port_config = SwitchPortConfigCreate { - geometry: nexus_types::external_api::params::SwitchPortGeometry::Qsfp28x1, - }; + let identity = IdentityMetadataCreateParams { + name: name.clone(), + description: "initial uplink configuration".to_string(), + }; - let mut port_settings_params = SwitchPortSettingsCreate { - identity, - port_config, - groups: vec![], - links: HashMap::new(), - interfaces: HashMap::new(), - routes: HashMap::new(), - bgp_peers: HashMap::new(), - addresses: HashMap::new(), + let port_config = SwitchPortConfigCreate { + geometry: nexus_types::external_api::params::SwitchPortGeometry::Qsfp28x1, }; - let addresses: Vec
<Address> = uplink_config - .addresses - .iter() - .map(|a| Address { - address_lot: NameOrId::Name(address_lot_name.clone()), - address: (*a).into(), - }) - .collect(); - - port_settings_params - .addresses - .insert("phy0".to_string(), AddressConfig { addresses }); - - let routes: Vec<Route> = uplink_config - .routes - .iter() - .map(|r| Route { - dst: r.destination.into(), - gw: r.nexthop, - vid: None, - }) - .collect(); - - port_settings_params - .routes - .insert("phy0".to_string(), RouteConfig { routes }); - - let peers: Vec<BgpPeer> = uplink_config - .bgp_peers - .iter() - .map(|r| BgpPeer { - bgp_announce_set: NameOrId::Name( - format!("as{}-announce", r.asn).parse().unwrap(), - ), - bgp_config: NameOrId::Name( - format!("as{}", r.asn).parse().unwrap(), - ), - interface_name: "phy0".into(), - addr: r.addr.into(), - hold_time: r.hold_time.unwrap_or(6) as u32, - idle_hold_time: r.idle_hold_time.unwrap_or(3) as u32, - delay_open: r.delay_open.unwrap_or(0) as u32, - connect_retry: r.connect_retry.unwrap_or(3) as u32, - keepalive: r.keepalive.unwrap_or(2) as u32, - }) - .collect(); + let mut port_settings_params = SwitchPortSettingsCreate { + identity, + port_config, + groups: vec![], + links: HashMap::new(), + interfaces: HashMap::new(), + routes: HashMap::new(), + bgp_peers: HashMap::new(), + addresses: HashMap::new(), + }; - port_settings_params - .bgp_peers - .insert("phy0".to_string(), BgpPeerConfig { peers }); + let addresses: Vec<Address>
= uplink_config + .addresses + .iter() + .map(|a| Address { + address_lot: NameOrId::Name(address_lot_name.clone()), + address: (*a).into(), + }) + .collect(); + + port_settings_params + .addresses + .insert("phy0".to_string(), AddressConfig { addresses }); + + let routes: Vec<Route> = uplink_config + .routes + .iter() + .map(|r| Route { + dst: r.destination.into(), + gw: r.nexthop, + vid: None, + }) + .collect(); + + port_settings_params + .routes + .insert("phy0".to_string(), RouteConfig { routes }); + + let peers: Vec<BgpPeer> = uplink_config + .bgp_peers + .iter() + .map(|r| BgpPeer { + bgp_announce_set: NameOrId::Name( + format!("as{}-announce", r.asn).parse().unwrap(), + ), + bgp_config: NameOrId::Name( + format!("as{}", r.asn).parse().unwrap(), + ), + interface_name: "phy0".into(), + addr: r.addr.into(), + hold_time: r.hold_time.unwrap_or(6) as u32, + idle_hold_time: r.idle_hold_time.unwrap_or(3) as u32, + delay_open: r.delay_open.unwrap_or(0) as u32, + connect_retry: r.connect_retry.unwrap_or(3) as u32, + keepalive: r.keepalive.unwrap_or(2) as u32, + }) + .collect(); + + port_settings_params + .bgp_peers + .insert("phy0".to_string(), BgpPeerConfig { peers }); + + let link = LinkConfigCreate { + mtu: 1500, //TODO https://github.com/oxidecomputer/omicron/issues/2274 + lldp: LldpServiceConfigCreate { + enabled: false, + lldp_config: None, + }, + fec: uplink_config.uplink_port_fec.into(), + speed: uplink_config.uplink_port_speed.into(), + autoneg: uplink_config.autoneg, + }; - let link = LinkConfigCreate { - mtu: 1500, //TODO https://github.com/oxidecomputer/omicron/issues/2274 - lldp: LldpServiceConfigCreate { - enabled: false, - lldp_config: None, - }, - fec: uplink_config.uplink_port_fec.into(), - speed: uplink_config.uplink_port_speed.into(), - autoneg: uplink_config.autoneg, - }; + port_settings_params.links.insert("phy".to_string(), link); - port_settings_params.links.insert("phy".to_string(), link); + match self + .db_datastore + .switch_port_settings_create(opctx, &port_settings_params, None) + .await + { + Ok(_) | Err(Error::ObjectAlreadyExists { .. }) => Ok(()), + Err(e) => Err(e), + }?; - match self - .db_datastore - .switch_port_settings_create( - opctx, - &port_settings_params, - None, - ) - .await - { - Ok(_) | Err(Error::ObjectAlreadyExists { ..
}) => Ok(()), - Err(e) => Err(e), - }?; - - let port_settings_id = self - .db_datastore - .switch_port_settings_get_id( - opctx, - nexus_db_model::Name(name.clone()), - ) - .await?; + let port_settings_id = self + .db_datastore + .switch_port_settings_get_id( + opctx, + nexus_db_model::Name(name.clone()), + ) + .await?; - let switch_port_id = self - .db_datastore - .switch_port_get_id( - opctx, - rack_id, - switch_location.into(), - Name::from_str(&uplink_config.port).unwrap().into(), - ) - .await?; + let switch_port_id = self + .db_datastore + .switch_port_get_id( + opctx, + rack_id, + switch_location.into(), + Name::from_str(&uplink_config.port).unwrap().into(), + ) + .await?; + + self.db_datastore + .switch_port_set_settings_id( + opctx, + switch_port_id, + Some(port_settings_id), + db::datastore::UpdatePrecondition::Null, + ) + .await?; + } // TODO - https://github.com/oxidecomputer/omicron/issues/3277 + // record port speed - self.db_datastore - .switch_port_set_settings_id( - opctx, - switch_port_id, - Some(port_settings_id), - db::datastore::UpdatePrecondition::Null, - ) - .await?; - } // TODO - https://github.com/oxidecomputer/omicron/issues/3277 - // record port speed - }; self.initial_bootstore_sync(&opctx).await?; Ok(()) @@ -871,36 +855,15 @@ impl super::Nexus { ) .await?; - // Grab the SPs from the last collection - let collection = - self.db_datastore.inventory_get_latest_collection(opctx).await?; - - // If there isn't a collection, we don't know about the sled - let Some(collection) = collection else { - return Err(Error::unavail("no inventory data available")); - }; - - // Find the revision - let Some(sp) = collection.sps.get(&baseboard_id) else { - return Err(Error::ObjectNotFound { - type_name: ResourceType::Sled, - lookup_type: - omicron_common::api::external::LookupType::ByCompositeId( - format!("{sled:?}"), - ), - }); - }; - - // Convert the baseboard as necessary - let baseboard = sled_agent_client::types::Baseboard::Gimlet { - identifier: sled.serial.clone(), - model: sled.part.clone(), - revision: sp.baseboard_revision.into(), + // Convert `UninitializedSledId` to the sled-agent type + let baseboard_id = sled_agent_client::types::BaseboardId { + serial_number: sled.serial.clone(), + part_number: sled.part.clone(), }; // Make the call to sled-agent let req = AddSledRequest { - sled_id: baseboard, + sled_id: baseboard_id, start_request: StartSledAgentRequest { generation: 0, schema_version: 1, diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index 1bd85ecf32..e9f800c61b 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -17,6 +17,7 @@ use steno::ActionContext; use steno::ActionError; use steno::SagaType; use thiserror::Error; +use tokio::sync::mpsc; use uuid::Uuid; pub mod disk_create; @@ -408,3 +409,23 @@ where ) .await } + +/// Reliable persistent workflows can request that sagas be run as part of their +/// activation by sending a SagaRequest through a supplied channel to Nexus. +pub enum SagaRequest { + #[cfg(test)] + TestOnly, +} + +impl SagaRequest { + pub fn channel() -> (mpsc::Sender<SagaRequest>, mpsc::Receiver<SagaRequest>) + { + // Limit the maximum number of saga requests that background tasks can + // queue for Nexus to run. + // + // Note this value was chosen arbitrarily!
+ const MAX_QUEUED_SAGA_REQUESTS: usize = 128; + + mpsc::channel(MAX_QUEUED_SAGA_REQUESTS) + } +} diff --git a/nexus/src/bin/schema-updater.rs b/nexus/src/bin/schema-updater.rs index db179dc7f6..d016bd0421 100644 --- a/nexus/src/bin/schema-updater.rs +++ b/nexus/src/bin/schema-updater.rs @@ -76,7 +76,8 @@ async fn main() -> anyhow::Result<()> { // We use the unchecked constructor of the datastore because we // don't want to block on someone else applying an upgrade. - let datastore = DataStore::new_unchecked(pool).map_err(|e| anyhow!(e))?; + let datastore = + DataStore::new_unchecked(log.clone(), pool).map_err(|e| anyhow!(e))?; match args.cmd { Cmd::List => { diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index ccd8cebad6..3a9e957328 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -411,7 +411,7 @@ async fn ping( Ok(HttpResponseOk(views::Ping { status: views::PingStatus::Ok })) } -/// Fetch the top-level IAM policy +/// Fetch top-level IAM policy #[endpoint { method = GET, path = "/v1/system/policy", @@ -430,7 +430,7 @@ async fn system_policy_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update the top-level IAM policy +/// Update top-level IAM policy #[endpoint { method = PUT, path = "/v1/system/policy", @@ -454,7 +454,7 @@ async fn system_policy_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch the current silo's IAM policy +/// Fetch current silo's IAM policy #[endpoint { method = GET, path = "/v1/policy", @@ -481,7 +481,7 @@ pub(crate) async fn policy_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update the current silo's IAM policy +/// Update current silo's IAM policy #[endpoint { method = PUT, path = "/v1/policy", @@ -513,7 +513,7 @@ async fn policy_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// View the resource utilization of the user's current silo +/// Fetch resource utilization for user's current silo #[endpoint { method = GET, path = "/v1/utilization", @@ -535,7 +535,7 @@ async fn utilization_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// View the current utilization of a given silo +/// Fetch current utilization for given silo #[endpoint { method = GET, path = "/v1/system/utilization/silos/{silo}", @@ -628,7 +628,7 @@ async fn system_quotas_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// View the resource quotas of a given silo +/// Fetch resource quotas for silo #[endpoint { method = GET, path = "/v1/system/silos/{silo}/quotas", @@ -651,7 +651,7 @@ async fn silo_quotas_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update the resource quotas of a given silo +/// Update resource quotas for silo /// /// If a quota value is not specified, it will remain unchanged. #[endpoint { @@ -735,9 +735,9 @@ async fn silo_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a silo +/// Fetch silo /// -/// Fetch a silo by name or ID. +/// Fetch silo by name or ID. 
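Stepping back to the saga queue above: `MAX_QUEUED_SAGA_REQUESTS` bounds the channel, so background tasks cannot queue unbounded work for Nexus. A toy illustration of what a bounded tokio channel does when it fills (capacity shrunk to 2 to make the effect visible; not code from this PR):

```rust
use tokio::sync::mpsc::{self, error::TrySendError};

#[tokio::main]
async fn main() {
    // Tiny capacity for demonstration; the code above uses 128.
    let (tx, mut rx) = mpsc::channel::<u32>(2);

    tx.try_send(1).unwrap();
    tx.try_send(2).unwrap();

    // Queue is full: a non-blocking send fails instead of growing memory.
    assert!(matches!(tx.try_send(3), Err(TrySendError::Full(3))));

    // An async `send(...).await` would instead wait until the receiver
    // catches up, which is the backpressure RPW senders experience.
    assert_eq!(rx.recv().await, Some(1));
    tx.try_send(3).unwrap();
}
```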
#[endpoint { method = GET, path = "/v1/system/silos/{silo}", @@ -759,7 +759,11 @@ async fn silo_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List IP pools available within silo +/// List IP pools linked to silo +/// +/// Linked IP pools are available to users in the specified silo. A silo can +/// have at most one default pool. IPs are allocated from the default pool when +/// users ask for one without specifying a pool. #[endpoint { method = GET, path = "/v1/system/silos/{silo}/ip-pools", @@ -803,7 +807,7 @@ async fn silo_ip_pool_list( /// Delete a silo /// -/// Delete a silo by name. +/// Delete a silo by name or ID. #[endpoint { method = DELETE, path = "/v1/system/silos/{silo}", @@ -825,7 +829,7 @@ async fn silo_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a silo's IAM policy +/// Fetch silo IAM policy #[endpoint { method = GET, path = "/v1/system/silos/{silo}/policy", @@ -847,7 +851,7 @@ async fn silo_policy_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a silo's IAM policy +/// Update silo IAM policy #[endpoint { method = PUT, path = "/v1/system/silos/{silo}/policy", @@ -877,7 +881,7 @@ async fn silo_policy_update( // Silo-specific user endpoints -/// List built-in (system) users in a silo +/// List built-in (system) users in silo #[endpoint { method = GET, path = "/v1/system/users", @@ -918,7 +922,7 @@ struct UserParam { user_id: Uuid, } -/// Fetch a built-in (system) user +/// Fetch built-in (system) user #[endpoint { method = GET, path = "/v1/system/users/{user_id}", @@ -982,7 +986,7 @@ async fn silo_identity_provider_list( // Silo SAML identity providers -/// Create a SAML IdP +/// Create SAML IdP #[endpoint { method = POST, path = "/v1/system/identity-providers/saml", @@ -1011,7 +1015,7 @@ async fn saml_identity_provider_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a SAML IdP +/// Fetch SAML IdP #[endpoint { method = GET, path = "/v1/system/identity-providers/saml/{provider}", @@ -1049,7 +1053,7 @@ async fn saml_identity_provider_view( // "Local" Identity Provider -/// Create a user +/// Create user /// /// Users can only be created in Silos with `provision_type` == `Fixed`. /// Otherwise, Silo users are just-in-time (JIT) provisioned when a user first @@ -1082,7 +1086,7 @@ async fn local_idp_user_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a user +/// Delete user #[endpoint { method = DELETE, path = "/v1/system/identity-providers/local/users/{user_id}", @@ -1106,7 +1110,7 @@ async fn local_idp_user_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Set or invalidate a user's password +/// Set or invalidate user's password /// /// Passwords can only be updated for users in Silos with identity mode /// `LocalOnly`. 
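A note on why these one-line summary edits are worth making: dropshot lifts the first line of an endpoint's doc comment into the OpenAPI `summary` and the remaining lines into the `description`, which is why every rewrite in this file reappears in `openapi/nexus.json` later in the patch. A minimal sketch of the convention, with a hypothetical endpoint and placeholder path:

```rust
use dropshot::{endpoint, HttpError, HttpResponseOk, RequestContext};

/// Fetch silo
///
/// Fetch silo by name or ID.
#[endpoint {
    method = GET,
    path = "/v1/demo/silo",
}]
async fn demo_silo_view(
    _rqctx: RequestContext<()>,
) -> Result<HttpResponseOk<String>, HttpError> {
    // The first doc line becomes the OpenAPI `summary`; the paragraph
    // after the blank line becomes the `description`.
    Ok(HttpResponseOk("demo".to_string()))
}
```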
@@ -1174,7 +1178,7 @@ async fn project_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a project +/// Create project #[endpoint { method = POST, path = "/v1/projects", @@ -1195,7 +1199,7 @@ async fn project_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a project +/// Fetch project #[endpoint { method = GET, path = "/v1/projects/{project}", @@ -1219,7 +1223,7 @@ async fn project_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a project +/// Delete project #[endpoint { method = DELETE, path = "/v1/projects/{project}", @@ -1276,7 +1280,7 @@ async fn project_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a project's IAM policy +/// Fetch project's IAM policy #[endpoint { method = GET, path = "/v1/projects/{project}/policy", @@ -1301,7 +1305,7 @@ async fn project_policy_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a project's IAM policy +/// Update project's IAM policy #[endpoint { method = PUT, path = "/v1/projects/{project}/policy", @@ -1367,7 +1371,7 @@ async fn project_ip_pool_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an IP pool +/// Fetch IP pool #[endpoint { method = GET, path = "/v1/ip-pools/{pool}", @@ -1430,7 +1434,7 @@ pub struct IpPoolPathParam { pub pool_name: Name, } -/// Create an IP pool +/// Create IP pool #[endpoint { method = POST, path = "/v1/system/ip-pools", @@ -1451,7 +1455,7 @@ async fn ip_pool_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an IP pool +/// Fetch IP pool #[endpoint { method = GET, path = "/v1/system/ip-pools/{pool}", @@ -1475,7 +1479,7 @@ async fn ip_pool_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete an IP pool +/// Delete IP pool #[endpoint { method = DELETE, path = "/v1/system/ip-pools/{pool}", @@ -1497,7 +1501,7 @@ async fn ip_pool_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update an IP pool +/// Update IP pool #[endpoint { method = PUT, path = "/v1/system/ip-pools/{pool}", @@ -1521,7 +1525,7 @@ async fn ip_pool_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List an IP pool's linked silos +/// List IP pool's linked silos #[endpoint { method = GET, path = "/v1/system/ip-pools/{pool}/silos", @@ -1569,7 +1573,11 @@ async fn ip_pool_silo_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Make an IP pool available within a silo +/// Link IP pool to silo +/// +/// Users in linked silos can allocate external IPs from this pool for their +/// instances. A silo can have at most one default pool. IPs are allocated from +/// the default pool when users ask for one without specifying a pool. #[endpoint { method = POST, path = "/v1/system/ip-pools/{pool}/silos", @@ -1595,7 +1603,7 @@ async fn ip_pool_silo_link( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Unlink an IP pool from a silo +/// Unlink IP pool from silo /// /// Will fail if there are any outstanding IPs allocated in the silo. 
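The default-pool rule described above reduces to a two-step fallback: use the pool the caller named, otherwise the silo default, otherwise fail. A hypothetical sketch of just that selection rule (the real code also checks that a named pool is actually linked to the silo, and does all of this through the datastore):

```rust
// PoolId is a stand-in identifier type for illustration.
#[derive(Debug, Clone, PartialEq)]
struct PoolId(u32);

fn resolve_ip_pool(
    requested: Option<PoolId>,
    silo_default: Option<PoolId>,
) -> Result<PoolId, String> {
    match (requested, silo_default) {
        // Caller named a pool: use it.
        (Some(pool), _) => Ok(pool),
        // No pool named: fall back to the silo's default, if configured.
        (None, Some(default)) => Ok(default),
        (None, None) => {
            Err("no pool specified and silo has no default pool".to_string())
        }
    }
}

fn main() {
    assert_eq!(resolve_ip_pool(None, Some(PoolId(1))), Ok(PoolId(1)));
    assert_eq!(resolve_ip_pool(Some(PoolId(2)), Some(PoolId(1))), Ok(PoolId(2)));
    assert!(resolve_ip_pool(None, None).is_err());
}
```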
#[endpoint { @@ -1620,10 +1628,12 @@ async fn ip_pool_silo_unlink( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Make an IP pool default or not-default for a silo +/// Make IP pool default for silo /// -/// When a pool is made default for a silo, any existing default will remain -/// linked to the silo, but will no longer be the default. +/// When a user asks for an IP (e.g., at instance create time) without +/// specifying a pool, the IP comes from the default pool if a default is +/// configured. When a pool is made the default for a silo, any existing default +/// will remain linked to the silo, but will no longer be the default. #[endpoint { method = PUT, path = "/v1/system/ip-pools/{pool}/silos/{silo}", @@ -1650,7 +1660,7 @@ async fn ip_pool_silo_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch the IP pool used for Oxide services +/// Fetch Oxide service IP pool #[endpoint { method = GET, path = "/v1/system/ip-pools-service", @@ -1671,9 +1681,9 @@ async fn ip_pool_service_view( type IpPoolRangePaginationParams = PaginationParams; -/// List ranges for an IP pool +/// List ranges for IP pool /// -/// List ranges for an IP pool. Ranges are ordered by their first address. +/// Ranges are ordered by their first address. #[endpoint { method = GET, path = "/v1/system/ip-pools/{pool}/ranges", @@ -1717,7 +1727,7 @@ async fn ip_pool_range_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Add a range to an IP pool +/// Add range to IP pool #[endpoint { method = POST, path = "/v1/system/ip-pools/{pool}/ranges/add", @@ -1741,7 +1751,7 @@ async fn ip_pool_range_add( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Remove a range from an IP pool +/// Remove range from IP pool #[endpoint { method = POST, path = "/v1/system/ip-pools/{pool}/ranges/remove", @@ -1765,10 +1775,9 @@ async fn ip_pool_range_remove( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List ranges for the IP pool used for Oxide services +/// List IP ranges for the Oxide service pool /// -/// List ranges for the IP pool used for Oxide services. Ranges are ordered by -/// their first address. +/// Ranges are ordered by their first address. 
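"Ranges are ordered by their first address" is the pagination order for both range-listing endpoints. In miniature, with an IPv4-only stand-in for the real range type (which also handles IPv6):

```rust
use std::net::Ipv4Addr;

// Simplified stand-in for an IP pool range.
#[derive(Debug)]
struct Range {
    first: Ipv4Addr,
    last: Ipv4Addr,
}

fn main() {
    let mut ranges = vec![
        Range { first: Ipv4Addr::new(10, 0, 2, 1), last: Ipv4Addr::new(10, 0, 2, 100) },
        Range { first: Ipv4Addr::new(10, 0, 1, 1), last: Ipv4Addr::new(10, 0, 1, 50) },
    ];
    // Pagination order used by the list endpoints above: by first address.
    ranges.sort_by_key(|r| r.first);
    assert_eq!(ranges[0].first, Ipv4Addr::new(10, 0, 1, 1));
    assert_eq!(ranges[0].last, Ipv4Addr::new(10, 0, 1, 50));
}
```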
#[endpoint { method = GET, path = "/v1/system/ip-pools-service/ranges", @@ -1809,7 +1818,7 @@ async fn ip_pool_service_range_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Add a range to an IP pool used for Oxide services +/// Add IP range to Oxide service pool #[endpoint { method = POST, path = "/v1/system/ip-pools-service/ranges/add", @@ -1830,7 +1839,7 @@ async fn ip_pool_service_range_add( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Remove a range from an IP pool used for Oxide services +/// Remove IP range from Oxide service pool #[endpoint { method = POST, path = "/v1/system/ip-pools-service/ranges/remove", @@ -1885,7 +1894,7 @@ async fn floating_ip_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a floating IP +/// Create floating IP #[endpoint { method = POST, path = "/v1/floating-ips", @@ -1911,7 +1920,7 @@ async fn floating_ip_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a floating IP +/// Delete floating IP #[endpoint { method = DELETE, path = "/v1/floating-ips/{floating_ip}", @@ -1941,7 +1950,7 @@ async fn floating_ip_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a floating IP +/// Fetch floating IP #[endpoint { method = GET, path = "/v1/floating-ips/{floating_ip}", @@ -1971,7 +1980,9 @@ async fn floating_ip_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Attach a floating IP to an instance or other resource +/// Attach floating IP +/// +/// Attach floating IP to an instance or other resource. #[endpoint { method = POST, path = "/v1/floating-ips/{floating_ip}/attach", @@ -2005,7 +2016,9 @@ async fn floating_ip_attach( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Detach a floating IP from an instance or other resource +/// Detach floating IP +/// +// Detach floating IP from instance or other resource. 
#[endpoint { method = POST, path = "/v1/floating-ips/{floating_ip}/detach", @@ -2097,7 +2110,7 @@ async fn disk_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a disk +/// Fetch disk #[endpoint { method = GET, path = "/v1/disks/{disk}", @@ -2123,7 +2136,7 @@ async fn disk_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a disk +/// Delete disk #[endpoint { method = DELETE, path = "/v1/disks/{disk}", @@ -2211,7 +2224,7 @@ async fn disk_metrics_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Start importing blocks into a disk +/// Start importing blocks into disk /// /// Start the process of importing blocks into a disk #[endpoint { @@ -2242,7 +2255,7 @@ async fn disk_bulk_write_import_start( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Import blocks into a disk +/// Import blocks into disk #[endpoint { method = POST, path = "/v1/disks/{disk}/bulk-write", @@ -2273,7 +2286,7 @@ async fn disk_bulk_write_import( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Stop importing blocks into a disk +/// Stop importing blocks into disk /// /// Stop the process of importing blocks into a disk #[endpoint { @@ -2371,7 +2384,7 @@ async fn instance_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create an instance +/// Create instance #[endpoint { method = POST, path = "/v1/instances", @@ -2401,7 +2414,7 @@ async fn instance_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an instance +/// Fetch instance #[endpoint { method = GET, path = "/v1/instances/{instance}", @@ -2435,7 +2448,7 @@ async fn instance_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete an instance +/// Delete instance #[endpoint { method = DELETE, path = "/v1/instances/{instance}", @@ -2531,7 +2544,7 @@ async fn instance_reboot( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Boot an instance +/// Boot instance #[endpoint { method = POST, path = "/v1/instances/{instance}/start", @@ -2560,7 +2573,7 @@ async fn instance_start( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Stop an instance +/// Stop instance #[endpoint { method = POST, path = "/v1/instances/{instance}/stop", @@ -2589,7 +2602,7 @@ async fn instance_stop( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an instance's serial console +/// Fetch instance serial console #[endpoint { method = GET, path = "/v1/instances/{instance}/serial-console", @@ -2620,7 +2633,7 @@ async fn instance_serial_console( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Stream an instance's serial console +/// Stream instance serial console #[channel { protocol = WEBSOCKETS, path = "/v1/instances/{instance}/serial-console/stream", @@ -2672,9 +2685,10 @@ async fn instance_serial_console_stream( } } -/// List the SSH public keys added to the instance via cloud-init during instance creation +/// List SSH public keys for instance /// -/// Note that this list is a snapshot in time and will not reflect updates made after +/// List SSH public keys injected via cloud-init during instance creation. Note +/// that this list is a snapshot in time and will not reflect updates made after /// the instance is created. 
#[endpoint { method = GET, @@ -2716,7 +2730,7 @@ async fn instance_ssh_public_key_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List an instance's disks +/// List disks for instance #[endpoint { method = GET, path = "/v1/instances/{instance}/disks", @@ -2757,7 +2771,7 @@ async fn instance_disk_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Attach a disk to an instance +/// Attach disk to instance #[endpoint { method = POST, path = "/v1/instances/{instance}/disks/attach", @@ -2789,7 +2803,7 @@ async fn instance_disk_attach( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Detach a disk from an instance +/// Detach disk from instance #[endpoint { method = POST, path = "/v1/instances/{instance}/disks/detach", @@ -2860,7 +2874,7 @@ async fn certificate_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a new system-wide x.509 certificate +/// Create new system-wide x.509 certificate /// /// This certificate is automatically used by the Oxide Control plane to serve /// external connections. @@ -2890,7 +2904,7 @@ struct CertificatePathParam { certificate: NameOrId, } -/// Fetch a certificate +/// Fetch certificate /// /// Returns the details of a specific certificate #[endpoint { @@ -2914,7 +2928,7 @@ async fn certificate_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a certificate +/// Delete certificate /// /// Permanently delete a certificate. This operation cannot be undone. #[endpoint { @@ -2942,7 +2956,7 @@ async fn certificate_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create an address lot +/// Create address lot #[endpoint { method = POST, path = "/v1/system/networking/address-lot", @@ -2968,7 +2982,7 @@ async fn networking_address_lot_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete an address lot +/// Delete address lot #[endpoint { method = DELETE, path = "/v1/system/networking/address-lot/{address_lot}", @@ -3025,7 +3039,7 @@ async fn networking_address_lot_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List the blocks in an address lot +/// List blocks in address lot #[endpoint { method = GET, path = "/v1/system/networking/address-lot/{address_lot}/blocks", @@ -3061,7 +3075,7 @@ async fn networking_address_lot_block_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a loopback address +/// Create loopback address #[endpoint { method = POST, path = "/v1/system/networking/loopback-address", @@ -3102,7 +3116,7 @@ pub struct LoopbackAddressPath { pub subnet_mask: u8, } -/// Delete a loopback address +/// Delete loopback address #[endpoint { method = DELETE, path = "/v1/system/networking/loopback-address/{rack_id}/{switch_location}/{address}/{subnet_mask}", @@ -3249,7 +3263,7 @@ async fn networking_switch_port_settings_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Get information about a switch port +/// Get information about switch port #[endpoint { method = GET, path = "/v1/system/networking/switch-port-settings/{port}", @@ -3352,7 +3366,7 @@ async fn networking_switch_port_clear_settings( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a new BGP configuration +/// Create new BGP configuration 
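The BGP objects these endpoints manage are the same ones rack.rs seeds above, where creation is deliberately idempotent: an `ObjectAlreadyExists` error is collapsed into success so rack initialization can be retried. A self-contained sketch of that pattern with a stand-in error type:

```rust
// Stand-in for the omicron external error type matched above.
#[derive(Debug)]
#[allow(dead_code)]
enum Error {
    ObjectAlreadyExists { object_name: String },
    Internal(String),
}

fn create_address_lot(name: &str) -> Result<(), Error> {
    // Pretend the object already exists, as it would on a rack-init retry.
    Err(Error::ObjectAlreadyExists { object_name: name.to_string() })
}

fn ensure_address_lot(name: &str) -> Result<(), Error> {
    // The idempotent-create pattern from rack.rs: an existing object is
    // success, every other error still propagates.
    match create_address_lot(name) {
        Ok(()) | Err(Error::ObjectAlreadyExists { .. }) => Ok(()),
        Err(e) => Err(e),
    }
}

fn main() {
    assert!(ensure_address_lot("initial-infra").is_ok());
}
```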
#[endpoint { method = POST, path = "/v1/system/networking/bgp", @@ -3449,7 +3463,7 @@ async fn networking_bgp_imported_routes_ipv4( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a BGP configuration +/// Delete BGP configuration #[endpoint { method = DELETE, path = "/v1/system/networking/bgp", @@ -3470,7 +3484,7 @@ async fn networking_bgp_config_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a new BGP announce set +/// Create new BGP announce set #[endpoint { method = POST, path = "/v1/system/networking/bgp-announce", @@ -3518,7 +3532,7 @@ async fn networking_bgp_announce_set_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a BGP announce set +/// Delete BGP announce set #[endpoint { method = DELETE, path = "/v1/system/networking/bgp-announce", @@ -3539,7 +3553,7 @@ async fn networking_bgp_announce_set_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Enable a BFD session. +/// Enable a BFD session #[endpoint { method = POST, path = "/v1/system/networking/bfd-enable", @@ -3560,7 +3574,7 @@ async fn networking_bfd_enable( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Disable a BFD session. +/// Disable a BFD session #[endpoint { method = POST, path = "/v1/system/networking/bfd-disable", @@ -3581,7 +3595,7 @@ async fn networking_bfd_disable( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Get BFD status. +/// Get BFD status #[endpoint { method = GET, path = "/v1/system/networking/bfd-status", @@ -3652,7 +3666,7 @@ async fn image_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create an image +/// Create image /// /// Create a new image in a project. #[endpoint { @@ -3690,7 +3704,7 @@ async fn image_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an image +/// Fetch image /// /// Fetch the details for a specific image in a project. #[endpoint { @@ -3733,7 +3747,7 @@ async fn image_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete an image +/// Delete image /// /// Permanently delete an image from a project. This operation cannot be undone. 
/// Any instances in the project using the image will continue to run, however @@ -3769,9 +3783,9 @@ async fn image_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Promote a project image +/// Promote project image /// -/// Promote a project image to be visible to all projects in the silo +/// Promote project image to be visible to all projects in the silo #[endpoint { method = POST, path = "/v1/images/{image}/promote", @@ -3803,9 +3817,9 @@ async fn image_promote( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Demote a silo image +/// Demote silo image /// -/// Demote a silo image to be visible only to a specified project +/// Demote silo image to be visible only to a specified project #[endpoint { method = POST, path = "/v1/images/{image}/demote", @@ -3877,7 +3891,7 @@ async fn instance_network_interface_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a network interface +/// Create network interface #[endpoint { method = POST, path = "/v1/network-interfaces", @@ -3906,7 +3920,7 @@ async fn instance_network_interface_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a network interface +/// Delete network interface /// /// Note that the primary interface for an instance cannot be deleted if there /// are any secondary interfaces. A new primary interface must be designated @@ -3943,7 +3957,7 @@ async fn instance_network_interface_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a network interface +/// Fetch network interface #[endpoint { method = GET, path = "/v1/network-interfaces/{interface}", @@ -3974,7 +3988,7 @@ async fn instance_network_interface_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a network interface +/// Update network interface #[endpoint { method = PUT, path = "/v1/network-interfaces/{interface}", @@ -4048,7 +4062,7 @@ async fn instance_external_ip_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Allocate and attach an ephemeral IP to an instance +/// Allocate and attach ephemeral IP to instance #[endpoint { method = POST, path = "/v1/instances/{instance}/external-ips/ephemeral", @@ -4086,7 +4100,7 @@ async fn instance_ephemeral_ip_attach( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Detach and deallocate an ephemeral IP from an instance +/// Detach and deallocate ephemeral IP from instance #[endpoint { method = DELETE, path = "/v1/instances/{instance}/external-ips/ephemeral", @@ -4158,7 +4172,7 @@ async fn snapshot_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a snapshot +/// Create snapshot /// /// Creates a point-in-time snapshot from a disk. 
#[endpoint { @@ -4186,7 +4200,7 @@ async fn snapshot_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a snapshot +/// Fetch snapshot #[endpoint { method = GET, path = "/v1/snapshots/{snapshot}", @@ -4214,7 +4228,7 @@ async fn snapshot_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a snapshot +/// Delete snapshot #[endpoint { method = DELETE, path = "/v1/snapshots/{snapshot}", @@ -4281,7 +4295,7 @@ async fn vpc_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a VPC +/// Create VPC #[endpoint { method = POST, path = "/v1/vpcs", @@ -4307,7 +4321,7 @@ async fn vpc_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a VPC +/// Fetch VPC #[endpoint { method = GET, path = "/v1/vpcs/{vpc}", @@ -4362,7 +4376,7 @@ async fn vpc_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a VPC +/// Delete VPC #[endpoint { method = DELETE, path = "/v1/vpcs/{vpc}", @@ -4423,7 +4437,7 @@ async fn vpc_subnet_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a subnet +/// Create subnet #[endpoint { method = POST, path = "/v1/vpc-subnets", @@ -4448,7 +4462,7 @@ async fn vpc_subnet_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a subnet +/// Fetch subnet #[endpoint { method = GET, path = "/v1/vpc-subnets/{subnet}", @@ -4477,7 +4491,7 @@ async fn vpc_subnet_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a subnet +/// Delete subnet #[endpoint { method = DELETE, path = "/v1/vpc-subnets/{subnet}", @@ -4506,7 +4520,7 @@ async fn vpc_subnet_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a subnet +/// Update subnet #[endpoint { method = PUT, path = "/v1/vpc-subnets/{subnet}", @@ -4686,7 +4700,7 @@ async fn vpc_router_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a router +/// Fetch router #[endpoint { method = GET, path = "/v1/vpc-routers/{router}", @@ -4716,7 +4730,7 @@ async fn vpc_router_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a VPC router +/// Create VPC router #[endpoint { method = POST, path = "/v1/vpc-routers", @@ -4748,7 +4762,7 @@ async fn vpc_router_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a router +/// Delete router #[endpoint { method = DELETE, path = "/v1/vpc-routers/{router}", @@ -4778,7 +4792,7 @@ async fn vpc_router_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a router +/// Update router #[endpoint { method = PUT, path = "/v1/vpc-routers/{router}", @@ -4852,7 +4866,7 @@ async fn vpc_router_route_list( // Vpc Router Routes -/// Fetch a route +/// Fetch route #[endpoint { method = GET, path = "/v1/vpc-router-routes/{route}", @@ -4885,7 +4899,7 @@ async fn vpc_router_route_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a router +/// Create router #[endpoint { method = POST, path = "/v1/vpc-router-routes", @@ -4917,7 +4931,7 @@ async fn vpc_router_route_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a route +/// Delete route #[endpoint { method = DELETE, path = 
"/v1/vpc-router-routes/{route}", @@ -4949,7 +4963,7 @@ async fn vpc_router_route_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a route +/// Update route #[endpoint { method = PUT, path = "/v1/vpc-router-routes/{route}", @@ -5024,7 +5038,7 @@ struct RackPathParam { rack_id: Uuid, } -/// Fetch a rack +/// Fetch rack #[endpoint { method = GET, path = "/v1/system/hardware/racks/{rack_id}", @@ -5045,7 +5059,7 @@ async fn rack_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List uninitialized sleds in a given rack +/// List uninitialized sleds #[endpoint { method = GET, path = "/v1/system/hardware/sleds-uninitialized", @@ -5072,7 +5086,7 @@ async fn sled_list_uninitialized( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Add a sled to an initialized rack +/// Add sled to initialized rack // // TODO: In the future this should really be a PUT request, once we resolve // https://github.com/oxidecomputer/omicron/issues/4494. It should also @@ -5129,7 +5143,7 @@ async fn sled_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a sled +/// Fetch sled #[endpoint { method = GET, path = "/v1/system/hardware/sleds/{sled_id}", @@ -5151,7 +5165,7 @@ async fn sled_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Set the sled's provision state +/// Set sled provision state #[endpoint { method = PUT, path = "/v1/system/hardware/sleds/{sled_id}/provision-state", @@ -5189,7 +5203,7 @@ async fn sled_set_provision_state( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List instances running on a given sled +/// List instances running on given sled #[endpoint { method = GET, path = "/v1/system/hardware/sleds/{sled_id}/instances", @@ -5290,7 +5304,7 @@ async fn switch_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a switch +/// Fetch switch #[endpoint { method = GET, path = "/v1/system/hardware/switches/{switch_id}", @@ -5473,7 +5487,7 @@ async fn silo_metric( // Updates -/// Upload a TUF repository +/// Upload TUF repository #[endpoint { method = PUT, path = "/v1/system/update/repository", @@ -5498,7 +5512,9 @@ async fn system_update_put_repository( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Get the description of a repository by system version. +/// Fetch TUF repository description +/// +/// Fetch description of TUF repository by system version. 
#[endpoint { method = GET, path = "/v1/system/update/repository/{system_version}", @@ -5653,7 +5669,7 @@ async fn user_builtin_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a built-in user +/// Fetch built-in user #[endpoint { method = GET, path = "/v1/system/users-builtin/{user}", @@ -5735,7 +5751,7 @@ async fn role_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a built-in role +/// Fetch built-in role #[endpoint { method = GET, path = "/v1/system/roles/{role_name}", @@ -5759,7 +5775,7 @@ async fn role_view( // Current user -/// Fetch the user associated with the current session +/// Fetch user for current session #[endpoint { method = GET, path = "/v1/me", @@ -5782,7 +5798,7 @@ pub(crate) async fn current_user_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch the silo groups the current user belongs to +/// Fetch current user's groups #[endpoint { method = GET, path = "/v1/me/groups", @@ -5856,7 +5872,7 @@ async fn current_user_ssh_key_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create an SSH public key +/// Create SSH public key /// /// Create an SSH public key for the currently authenticated user. #[endpoint { @@ -5884,9 +5900,9 @@ async fn current_user_ssh_key_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an SSH public key +/// Fetch SSH public key /// -/// Fetch an SSH public key associated with the currently authenticated user. +/// Fetch SSH public key associated with the currently authenticated user. #[endpoint { method = GET, path = "/v1/me/ssh-keys/{ssh_key}", @@ -5918,7 +5934,7 @@ async fn current_user_ssh_key_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete an SSH public key +/// Delete SSH public key /// /// Delete an SSH public key associated with the currently authenticated user. 
#[endpoint { diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index e1392440a1..cb08bfcdc0 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -288,13 +288,15 @@ impl nexus_test_interface::NexusServer for Server { vec!["qsfp0".parse().unwrap()], )]), ), - rack_network_config: Some(RackNetworkConfig { - rack_subnet: "fd00:1122:3344:01::/56".parse().unwrap(), + rack_network_config: RackNetworkConfig { + rack_subnet: "fd00:1122:3344:0100::/56" + .parse() + .unwrap(), infra_ip_first: Ipv4Addr::UNSPECIFIED, infra_ip_last: Ipv4Addr::UNSPECIFIED, ports: Vec::new(), bgp: Vec::new(), - }), + }, }, ) .await diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index da21602cb1..7baacf97ce 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -65,7 +65,7 @@ pub const RACK_UUID: &str = "c19a698f-c6f9-4a17-ae30-20d711b8f7dc"; pub const SWITCH_UUID: &str = "dae4e1f1-410e-4314-bff1-fec0504be07e"; pub const OXIMETER_UUID: &str = "39e6175b-4df2-4730-b11d-cbc1e60a2e78"; pub const PRODUCER_UUID: &str = "a6458b7d-87c3-4483-be96-854d814c20de"; -pub const RACK_SUBNET: &str = "fd00:1122:3344:01::/56"; +pub const RACK_SUBNET: &str = "fd00:1122:3344:0100::/56"; /// Password for the user created by the test suite /// diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 3571388747..8d37f9e3ef 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -103,6 +103,7 @@ phantom_disks.period_secs = 30 blueprints.period_secs_load = 100 blueprints.period_secs_execute = 600 sync_service_zone_nat.period_secs = 30 +region_replacement.period_secs = 30 [default_region_allocation_strategy] # we only have one sled in the test environment, so we need to use the diff --git a/nexus/tests/integration_tests/rack.rs b/nexus/tests/integration_tests/rack.rs index a6fc93e92a..a58871ee71 100644 --- a/nexus/tests/integration_tests/rack.rs +++ b/nexus/tests/integration_tests/rack.rs @@ -110,7 +110,7 @@ async fn test_sled_list_uninitialized(cptestctx: &ControlPlaneTestContext) { let baseboard = uninitialized_sleds.pop().unwrap().baseboard; let sled_uuid = Uuid::new_v4(); let sa = SledAgentStartupInfo { - sa_address: "[fd00:1122:3344:01::1]:8080".parse().unwrap(), + sa_address: "[fd00:1122:3344:0100::1]:8080".parse().unwrap(), role: SledRole::Gimlet, baseboard: Baseboard { serial_number: baseboard.serial, diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 3b4c3b3142..06427507d5 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -11,13 +11,17 @@ //! nexus/deployment does not currently know about nexus/db-model and it's //! convenient to separate these concerns.) +use crate::external_api::views::SledProvisionState; use crate::inventory::Collection; +pub use crate::inventory::NetworkInterface; +pub use crate::inventory::NetworkInterfaceKind; pub use crate::inventory::OmicronZoneConfig; pub use crate::inventory::OmicronZoneDataset; pub use crate::inventory::OmicronZoneType; pub use crate::inventory::OmicronZonesConfig; pub use crate::inventory::SourceNatConfig; pub use crate::inventory::ZpoolName; +use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; use omicron_common::address::SLED_PREFIX; use omicron_common::api::external::Generation; @@ -43,14 +47,26 @@ use uuid::Uuid; /// /// The current policy is pretty limited. It's aimed primarily at supporting /// the add/remove sled use case. 
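For orientation before the struct below: the planner's `Policy` now carries everything blueprint generation needs beyond inventory, including the service IP pool ranges and a Nexus redundancy target. A simplified, hypothetical construction (local stand-in types; the real ones live in nexus-types and omicron-common, and `NEXUS_REDUNDANCY` supplies the target count):

```rust
use std::collections::BTreeMap;
use std::net::IpAddr;

// Simplified stand-ins for the planner inputs defined below.
#[derive(Debug, Clone)]
struct SledResources {
    provisionable: bool,
    zpool_count: usize,
}

#[derive(Debug, Clone)]
struct Policy {
    sleds: BTreeMap<u32, SledResources>,
    service_ip_pool_ranges: Vec<(IpAddr, IpAddr)>,
    target_nexus_zone_count: usize,
}

fn main() {
    let mut sleds = BTreeMap::new();
    sleds.insert(1, SledResources { provisionable: true, zpool_count: 10 });

    // Mirrors how the planning context earlier in this patch fills Policy.
    let policy = Policy {
        sleds,
        service_ip_pool_ranges: vec![(
            "192.0.2.1".parse().unwrap(),
            "192.0.2.20".parse().unwrap(),
        )],
        // NEXUS_REDUNDANCY plays this role in the real code.
        target_nexus_zone_count: 3,
    };
    println!("{policy:?}");
}
```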
+#[derive(Debug, Clone)] pub struct Policy { /// set of sleds that are supposed to be part of the control plane, along /// with information about resources available to the planner pub sleds: BTreeMap<Uuid, SledResources>, + + /// ranges specified by the IP pool for externally-visible control plane + /// services (e.g., external DNS, Nexus, boundary NTP) + pub service_ip_pool_ranges: Vec<IpRange>, + + /// desired total number of deployed Nexus zones + pub target_nexus_zone_count: usize, } /// Describes the resources available on each sled for the planner +#[derive(Debug, Clone)] pub struct SledResources { + /// provision state of this sled + pub provision_state: SledProvisionState, + /// zpools on this sled /// /// (used to allocate storage for control plane zones with persistent @@ -466,10 +482,11 @@ impl<'a> OmicronZonesDiff<'a> { for z in &bbsledzones.zones { writeln!( f, - "{} zone {} type {} ({})", + "{} zone {} type {} underlay IP {} ({})", prefix, z.id, z.zone_type.label(), + z.underlay_address, label )?; } @@ -529,44 +546,65 @@ impl<'a> std::fmt::Display for OmicronZonesDiff<'a> { DiffZoneChangedHow::DetailsChanged => { writeln!( f, - "- zone {} type {} (changed)", - zone_id, zone_type, + "- zone {} type {} underlay IP {} \ + (changed)", + zone_id, + zone_type, + zone_changes.zone_before.underlay_address, )?; writeln!( f, - "+ zone {} type {} (changed)", - zone_id, zone2_type, + "+ zone {} type {} underlay IP {} \ + (changed)", + zone_id, + zone2_type, + zone_changes.zone_after.underlay_address, )?; } DiffZoneChangedHow::RemovedFromService => { writeln!( f, - "- zone {} type {} (in service)", - zone_id, zone_type, + "- zone {} type {} underlay IP {} \ + (in service)", + zone_id, + zone_type, + zone_changes.zone_before.underlay_address, )?; writeln!( f, - "+ zone {} type {} (removed from service)", - zone_id, zone2_type, + "+ zone {} type {} underlay IP {} \ + (removed from service)", + zone_id, + zone2_type, + zone_changes.zone_after.underlay_address, )?; } DiffZoneChangedHow::AddedToService => { writeln!( f, - "- zone {} type {} (not in service)", - zone_id, zone_type, + "- zone {} type {} underlay IP {} \ + (not in service)", + zone_id, + zone_type, + zone_changes.zone_before.underlay_address, )?; writeln!( f, - "+ zone {} type {} (added to service)", - zone_id, zone2_type, + "+ zone {} type {} underlay IP {} \ + (added to service)", + zone_id, + zone2_type, + zone_changes.zone_after.underlay_address, )?; } DiffZoneChangedHow::NoChanges => { writeln!( f, - " zone {} type {} (unchanged)", - zone_id, zone_type, + " zone {} type {} underlay IP {} \ + (unchanged)", + zone_id, + zone_type, + zone_changes.zone_before.underlay_address, )?; } } @@ -575,8 +613,9 @@ impl<'a> std::fmt::Display for OmicronZonesDiff<'a> { for zone in sled_changes.zones_added() { writeln!( f, - "+ zone {} type {} (added)", + "+ zone {} type {} underlay IP {} (added)", zone.id, zone.zone_type.label(), + zone.underlay_address, )?; } diff --git a/nexus/types/src/internal_api/params.rs b/nexus/types/src/internal_api/params.rs index bc25e8d4bd..ab15ec26b7 100644 --- a/nexus/types/src/internal_api/params.rs +++ b/nexus/types/src/internal_api/params.rs @@ -263,7 +263,7 @@ pub struct RackInitializationRequest { /// The external qsfp ports per sidecar pub external_port_count: ExternalPortDiscovery, /// Initial rack network configuration - pub rack_network_config: Option<RackNetworkConfig>, + pub rack_network_config: RackNetworkConfig, } pub type DnsConfigParams = dns_service_client::types::DnsConfigParams; diff --git a/nexus/types/src/inventory.rs
b/nexus/types/src/inventory.rs index c99e51af4f..50e8b380b3 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -48,7 +48,7 @@ use uuid::Uuid; /// database. /// /// See the documentation in the database schema for more background. -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug, Eq, PartialEq, Clone)] pub struct Collection { /// unique identifier for this collection pub id: Uuid, diff --git a/openapi/bootstrap-agent.json b/openapi/bootstrap-agent.json index 6fd83cef47..a55803eda9 100644 --- a/openapi/bootstrap-agent.json +++ b/openapi/bootstrap-agent.json @@ -651,7 +651,6 @@ } }, "rack_network_config": { - "nullable": true, "description": "Initial rack network configuration", "allOf": [ { @@ -659,10 +658,6 @@ } ] }, - "rack_subnet": { - "type": "string", - "format": "ipv6" - }, "recovery_silo": { "description": "Configuration of the Recovery Silo (the initial Silo)", "allOf": [ @@ -688,7 +683,7 @@ "external_dns_zone_name", "internal_services_ip_pool_ranges", "ntp_servers", - "rack_subnet", + "rack_network_config", "recovery_silo" ] }, diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index bc26736b37..4714b64c52 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -5618,7 +5618,6 @@ } }, "rack_network_config": { - "nullable": true, "description": "Initial rack network configuration", "allOf": [ { @@ -5649,6 +5648,7 @@ "external_port_count", "internal_dns_zone_config", "internal_services_ip_pool_ranges", + "rack_network_config", "recovery_silo", "services" ] diff --git a/openapi/nexus.json b/openapi/nexus.json index 7aedd1b523..d27261b179 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -223,7 +223,7 @@ "tags": [ "silos" ], - "summary": "Create a new system-wide x.509 certificate", + "summary": "Create new system-wide x.509 certificate", "description": "This certificate is automatically used by the Oxide Control plane to serve external connections.", "operationId": "certificate_create", "requestBody": { @@ -261,7 +261,7 @@ "tags": [ "silos" ], - "summary": "Fetch a certificate", + "summary": "Fetch certificate", "description": "Returns the details of a specific certificate", "operationId": "certificate_view", "parameters": [ @@ -297,7 +297,7 @@ "tags": [ "silos" ], - "summary": "Delete a certificate", + "summary": "Delete certificate", "description": "Permanently delete a certificate. 
This operation cannot be undone.", "operationId": "certificate_delete", "parameters": [ @@ -443,7 +443,7 @@ "tags": [ "disks" ], - "summary": "Fetch a disk", + "summary": "Fetch disk", "operationId": "disk_view", "parameters": [ { @@ -487,7 +487,7 @@ "tags": [ "disks" ], - "summary": "Delete a disk", + "summary": "Delete disk", "operationId": "disk_delete", "parameters": [ { @@ -526,7 +526,7 @@ "tags": [ "disks" ], - "summary": "Import blocks into a disk", + "summary": "Import blocks into disk", "operationId": "disk_bulk_write_import", "parameters": [ { @@ -575,7 +575,7 @@ "tags": [ "disks" ], - "summary": "Start importing blocks into a disk", + "summary": "Start importing blocks into disk", "description": "Start the process of importing blocks into a disk", "operationId": "disk_bulk_write_import_start", "parameters": [ @@ -615,7 +615,7 @@ "tags": [ "disks" ], - "summary": "Stop importing blocks into a disk", + "summary": "Stop importing blocks into disk", "description": "Stop the process of importing blocks into a disk", "operationId": "disk_bulk_write_import_stop", "parameters": [ @@ -876,7 +876,7 @@ "tags": [ "floating-ips" ], - "summary": "Create a floating IP", + "summary": "Create floating IP", "operationId": "floating_ip_create", "parameters": [ { @@ -924,7 +924,7 @@ "tags": [ "floating-ips" ], - "summary": "Fetch a floating IP", + "summary": "Fetch floating IP", "operationId": "floating_ip_view", "parameters": [ { @@ -968,7 +968,7 @@ "tags": [ "floating-ips" ], - "summary": "Delete a floating IP", + "summary": "Delete floating IP", "operationId": "floating_ip_delete", "parameters": [ { @@ -1007,7 +1007,8 @@ "tags": [ "floating-ips" ], - "summary": "Attach a floating IP to an instance or other resource", + "summary": "Attach floating IP", + "description": "Attach floating IP to an instance or other resource.", "operationId": "floating_ip_attach", "parameters": [ { @@ -1063,7 +1064,7 @@ "tags": [ "floating-ips" ], - "summary": "Detach a floating IP from an instance or other resource", + "summary": "Detach floating IP", "operationId": "floating_ip_detach", "parameters": [ { @@ -1273,7 +1274,7 @@ "tags": [ "images" ], - "summary": "Create an image", + "summary": "Create image", "description": "Create a new image in a project.", "operationId": "image_create", "parameters": [ @@ -1321,7 +1322,7 @@ "tags": [ "images" ], - "summary": "Fetch an image", + "summary": "Fetch image", "description": "Fetch the details for a specific image in a project.", "operationId": "image_view", "parameters": [ @@ -1366,7 +1367,7 @@ "tags": [ "images" ], - "summary": "Delete an image", + "summary": "Delete image", "description": "Permanently delete an image from a project. This operation cannot be undone. 
Any instances in the project using the image will continue to run, however new instances can not be created with this image.", "operationId": "image_delete", "parameters": [ @@ -1406,8 +1407,8 @@ "tags": [ "images" ], - "summary": "Demote a silo image", - "description": "Demote a silo image to be visible only to a specified project", + "summary": "Demote silo image", + "description": "Demote silo image to be visible only to a specified project", "operationId": "image_demote", "parameters": [ { @@ -1454,8 +1455,8 @@ "tags": [ "images" ], - "summary": "Promote a project image", - "description": "Promote a project image to be visible to all projects in the silo", + "summary": "Promote project image", + "description": "Promote project image to be visible to all projects in the silo", "operationId": "image_promote", "parameters": [ { @@ -1568,7 +1569,7 @@ "tags": [ "instances" ], - "summary": "Create an instance", + "summary": "Create instance", "operationId": "instance_create", "parameters": [ { @@ -1616,7 +1617,7 @@ "tags": [ "instances" ], - "summary": "Fetch an instance", + "summary": "Fetch instance", "operationId": "instance_view", "parameters": [ { @@ -1660,7 +1661,7 @@ "tags": [ "instances" ], - "summary": "Delete an instance", + "summary": "Delete instance", "operationId": "instance_delete", "parameters": [ { @@ -1699,7 +1700,7 @@ "tags": [ "instances" ], - "summary": "List an instance's disks", + "summary": "List disks for instance", "operationId": "instance_disk_list", "parameters": [ { @@ -1775,7 +1776,7 @@ "tags": [ "instances" ], - "summary": "Attach a disk to an instance", + "summary": "Attach disk to instance", "operationId": "instance_disk_attach", "parameters": [ { @@ -1831,7 +1832,7 @@ "tags": [ "instances" ], - "summary": "Detach a disk from an instance", + "summary": "Detach disk from instance", "operationId": "instance_disk_detach", "parameters": [ { @@ -1933,7 +1934,7 @@ "tags": [ "instances" ], - "summary": "Allocate and attach an ephemeral IP to an instance", + "summary": "Allocate and attach ephemeral IP to instance", "operationId": "instance_ephemeral_ip_attach", "parameters": [ { @@ -1987,7 +1988,7 @@ "tags": [ "instances" ], - "summary": "Detach and deallocate an ephemeral IP from an instance", + "summary": "Detach and deallocate ephemeral IP from instance", "operationId": "instance_ephemeral_ip_detach", "parameters": [ { @@ -2128,7 +2129,7 @@ "tags": [ "instances" ], - "summary": "Fetch an instance's serial console", + "summary": "Fetch instance serial console", "operationId": "instance_serial_console", "parameters": [ { @@ -2207,7 +2208,7 @@ "tags": [ "instances" ], - "summary": "Stream an instance's serial console", + "summary": "Stream instance serial console", "operationId": "instance_serial_console_stream", "parameters": [ { @@ -2257,8 +2258,8 @@ "tags": [ "instances" ], - "summary": "List the SSH public keys added to the instance via cloud-init during instance creation", - "description": "Note that this list is a snapshot in time and will not reflect updates made after the instance is created.", + "summary": "List SSH public keys for instance", + "description": "List SSH public keys injected via cloud-init during instance creation. 
Note that this list is a snapshot in time and will not reflect updates made after the instance is created.", "operationId": "instance_ssh_public_key_list", "parameters": [ { @@ -2334,7 +2335,7 @@ "tags": [ "instances" ], - "summary": "Boot an instance", + "summary": "Boot instance", "operationId": "instance_start", "parameters": [ { @@ -2380,7 +2381,7 @@ "tags": [ "instances" ], - "summary": "Stop an instance", + "summary": "Stop instance", "operationId": "instance_stop", "parameters": [ { @@ -2485,7 +2486,7 @@ "tags": [ "projects" ], - "summary": "Fetch an IP pool", + "summary": "Fetch IP pool", "operationId": "project_ip_pool_view", "parameters": [ { @@ -2583,7 +2584,7 @@ "tags": [ "session" ], - "summary": "Fetch the user associated with the current session", + "summary": "Fetch user for current session", "operationId": "current_user_view", "responses": { "200": { @@ -2610,7 +2611,7 @@ "tags": [ "session" ], - "summary": "Fetch the silo groups the current user belongs to", + "summary": "Fetch current user's groups", "operationId": "current_user_groups", "parameters": [ { @@ -2727,7 +2728,7 @@ "tags": [ "session" ], - "summary": "Create an SSH public key", + "summary": "Create SSH public key", "description": "Create an SSH public key for the currently authenticated user.", "operationId": "current_user_ssh_key_create", "requestBody": { @@ -2765,8 +2766,8 @@ "tags": [ "session" ], - "summary": "Fetch an SSH public key", - "description": "Fetch an SSH public key associated with the currently authenticated user.", + "summary": "Fetch SSH public key", + "description": "Fetch SSH public key associated with the currently authenticated user.", "operationId": "current_user_ssh_key_view", "parameters": [ { @@ -2802,7 +2803,7 @@ "tags": [ "session" ], - "summary": "Delete an SSH public key", + "summary": "Delete SSH public key", "description": "Delete an SSH public key associated with the currently authenticated user.", "operationId": "current_user_ssh_key_delete", "parameters": [ @@ -3006,7 +3007,7 @@ "tags": [ "instances" ], - "summary": "Create a network interface", + "summary": "Create network interface", "operationId": "instance_network_interface_create", "parameters": [ { @@ -3062,7 +3063,7 @@ "tags": [ "instances" ], - "summary": "Fetch a network interface", + "summary": "Fetch network interface", "operationId": "instance_network_interface_view", "parameters": [ { @@ -3114,7 +3115,7 @@ "tags": [ "instances" ], - "summary": "Update a network interface", + "summary": "Update network interface", "operationId": "instance_network_interface_update", "parameters": [ { @@ -3176,7 +3177,7 @@ "tags": [ "instances" ], - "summary": "Delete a network interface", + "summary": "Delete network interface", "description": "Note that the primary interface for an instance cannot be deleted if there are any secondary interfaces. A new primary interface must be designated first. 
The primary interface can be deleted if there are no secondary interfaces.", "operationId": "instance_network_interface_delete", "parameters": [ @@ -3252,7 +3253,7 @@ "tags": [ "silos" ], - "summary": "Fetch the current silo's IAM policy", + "summary": "Fetch current silo's IAM policy", "operationId": "policy_view", "responses": { "200": { @@ -3277,7 +3278,7 @@ "tags": [ "silos" ], - "summary": "Update the current silo's IAM policy", + "summary": "Update current silo's IAM policy", "operationId": "policy_update", "requestBody": { "content": { @@ -3371,7 +3372,7 @@ "tags": [ "projects" ], - "summary": "Create a project", + "summary": "Create project", "operationId": "project_create", "requestBody": { "content": { @@ -3408,7 +3409,7 @@ "tags": [ "projects" ], - "summary": "Fetch a project", + "summary": "Fetch project", "operationId": "project_view", "parameters": [ { @@ -3490,7 +3491,7 @@ "tags": [ "projects" ], - "summary": "Delete a project", + "summary": "Delete project", "operationId": "project_delete", "parameters": [ { @@ -3521,7 +3522,7 @@ "tags": [ "projects" ], - "summary": "Fetch a project's IAM policy", + "summary": "Fetch project's IAM policy", "operationId": "project_policy_view", "parameters": [ { @@ -3557,7 +3558,7 @@ "tags": [ "projects" ], - "summary": "Update a project's IAM policy", + "summary": "Update project's IAM policy", "operationId": "project_policy_update", "parameters": [ { @@ -3672,7 +3673,7 @@ "tags": [ "snapshots" ], - "summary": "Create a snapshot", + "summary": "Create snapshot", "description": "Creates a point-in-time snapshot from a disk.", "operationId": "snapshot_create", "parameters": [ @@ -3721,7 +3722,7 @@ "tags": [ "snapshots" ], - "summary": "Fetch a snapshot", + "summary": "Fetch snapshot", "operationId": "snapshot_view", "parameters": [ { @@ -3765,7 +3766,7 @@ "tags": [ "snapshots" ], - "summary": "Delete a snapshot", + "summary": "Delete snapshot", "operationId": "snapshot_delete", "parameters": [ { @@ -3922,7 +3923,7 @@ "tags": [ "system/hardware" ], - "summary": "Fetch a rack", + "summary": "Fetch rack", "operationId": "rack_view", "parameters": [ { @@ -4018,7 +4019,7 @@ "tags": [ "system/hardware" ], - "summary": "Add a sled to an initialized rack", + "summary": "Add sled to initialized rack", "operationId": "sled_add", "requestBody": { "content": { @@ -4048,7 +4049,7 @@ "tags": [ "system/hardware" ], - "summary": "Fetch a sled", + "summary": "Fetch sled", "operationId": "sled_view", "parameters": [ { @@ -4156,7 +4157,7 @@ "tags": [ "system/hardware" ], - "summary": "List instances running on a given sled", + "summary": "List instances running on given sled", "operationId": "sled_instance_list", "parameters": [ { @@ -4225,7 +4226,7 @@ "tags": [ "system/hardware" ], - "summary": "Set the sled's provision state", + "summary": "Set sled provision state", "operationId": "sled_set_provision_state", "parameters": [ { @@ -4274,7 +4275,7 @@ "tags": [ "system/hardware" ], - "summary": "List uninitialized sleds in a given rack", + "summary": "List uninitialized sleds", "operationId": "sled_list_uninitialized", "parameters": [ { @@ -4562,7 +4563,7 @@ "tags": [ "system/hardware" ], - "summary": "Fetch a switch", + "summary": "Fetch switch", "operationId": "switch_view", "parameters": [ { @@ -4670,7 +4671,7 @@ "tags": [ "system/silos" ], - "summary": "Create a user", + "summary": "Create user", "description": "Users can only be created in Silos with `provision_type` == `Fixed`. 
Otherwise, Silo users are just-in-time (JIT) provisioned when a user first logs in using an external Identity Provider.", "operationId": "local_idp_user_create", "parameters": [ @@ -4719,7 +4720,7 @@ "tags": [ "system/silos" ], - "summary": "Delete a user", + "summary": "Delete user", "operationId": "local_idp_user_delete", "parameters": [ { @@ -4760,7 +4761,7 @@ "tags": [ "system/silos" ], - "summary": "Set or invalidate a user's password", + "summary": "Set or invalidate user's password", "description": "Passwords can only be updated for users in Silos with identity mode `LocalOnly`.", "operationId": "local_idp_user_set_password", "parameters": [ @@ -4812,7 +4813,7 @@ "tags": [ "system/silos" ], - "summary": "Create a SAML IdP", + "summary": "Create SAML IdP", "operationId": "saml_identity_provider_create", "parameters": [ { @@ -4860,7 +4861,7 @@ "tags": [ "system/silos" ], - "summary": "Fetch a SAML IdP", + "summary": "Fetch SAML IdP", "operationId": "saml_identity_provider_view", "parameters": [ { @@ -4964,7 +4965,7 @@ "tags": [ "system/networking" ], - "summary": "Create an IP pool", + "summary": "Create IP pool", "operationId": "ip_pool_create", "requestBody": { "content": { @@ -5001,7 +5002,7 @@ "tags": [ "system/networking" ], - "summary": "Fetch an IP pool", + "summary": "Fetch IP pool", "operationId": "ip_pool_view", "parameters": [ { @@ -5037,7 +5038,7 @@ "tags": [ "system/networking" ], - "summary": "Update an IP pool", + "summary": "Update IP pool", "operationId": "ip_pool_update", "parameters": [ { @@ -5083,7 +5084,7 @@ "tags": [ "system/networking" ], - "summary": "Delete an IP pool", + "summary": "Delete IP pool", "operationId": "ip_pool_delete", "parameters": [ { @@ -5114,8 +5115,8 @@ "tags": [ "system/networking" ], - "summary": "List ranges for an IP pool", - "description": "List ranges for an IP pool. Ranges are ordered by their first address.", + "summary": "List ranges for IP pool", + "description": "Ranges are ordered by their first address.", "operationId": "ip_pool_range_list", "parameters": [ { @@ -5176,7 +5177,7 @@ "tags": [ "system/networking" ], - "summary": "Add a range to an IP pool", + "summary": "Add range to IP pool", "operationId": "ip_pool_range_add", "parameters": [ { @@ -5224,7 +5225,7 @@ "tags": [ "system/networking" ], - "summary": "Remove a range from an IP pool", + "summary": "Remove range from IP pool", "operationId": "ip_pool_range_remove", "parameters": [ { @@ -5265,7 +5266,7 @@ "tags": [ "system/networking" ], - "summary": "List an IP pool's linked silos", + "summary": "List IP pool's linked silos", "operationId": "ip_pool_silo_list", "parameters": [ { @@ -5331,7 +5332,8 @@ "tags": [ "system/networking" ], - "summary": "Make an IP pool available within a silo", + "summary": "Link IP pool to silo", + "description": "Users in linked silos can allocate external IPs from this pool for their instances. A silo can have at most one default pool. 
IPs are allocated from the default pool when users ask for one without specifying a pool.", "operationId": "ip_pool_silo_link", "parameters": [ { @@ -5379,8 +5381,8 @@ "tags": [ "system/networking" ], - "summary": "Make an IP pool default or not-default for a silo", - "description": "When a pool is made default for a silo, any existing default will remain linked to the silo, but will no longer be the default.", + "summary": "Make IP pool default for silo", + "description": "When a user asks for an IP (e.g., at instance create time) without specifying a pool, the IP comes from the default pool if a default is configured. When a pool is made the default for a silo, any existing default will remain linked to the silo, but will no longer be the default.", "operationId": "ip_pool_silo_update", "parameters": [ { @@ -5433,7 +5435,7 @@ "tags": [ "system/networking" ], - "summary": "Unlink an IP pool from a silo", + "summary": "Unlink IP pool from silo", "description": "Will fail if there are any outstanding IPs allocated in the silo.", "operationId": "ip_pool_silo_unlink", "parameters": [ @@ -5472,7 +5474,7 @@ "tags": [ "system/networking" ], - "summary": "Fetch the IP pool used for Oxide services", + "summary": "Fetch Oxide service IP pool", "operationId": "ip_pool_service_view", "responses": { "200": { @@ -5499,8 +5501,8 @@ "tags": [ "system/networking" ], - "summary": "List ranges for the IP pool used for Oxide services", - "description": "List ranges for the IP pool used for Oxide services. Ranges are ordered by their first address.", + "summary": "List IP ranges for the Oxide service pool", + "description": "Ranges are ordered by their first address.", "operationId": "ip_pool_service_range_list", "parameters": [ { @@ -5552,7 +5554,7 @@ "tags": [ "system/networking" ], - "summary": "Add a range to an IP pool used for Oxide services", + "summary": "Add IP range to Oxide service pool", "operationId": "ip_pool_service_range_add", "requestBody": { "content": { @@ -5589,7 +5591,7 @@ "tags": [ "system/networking" ], - "summary": "Remove a range from an IP pool used for Oxide services", + "summary": "Remove IP range from Oxide service pool", "operationId": "ip_pool_service_range_remove", "requestBody": { "content": { @@ -5773,7 +5775,7 @@ "tags": [ "system/networking" ], - "summary": "Create an address lot", + "summary": "Create address lot", "operationId": "networking_address_lot_create", "requestBody": { "content": { @@ -5810,7 +5812,7 @@ "tags": [ "system/networking" ], - "summary": "Delete an address lot", + "summary": "Delete address lot", "operationId": "networking_address_lot_delete", "parameters": [ { @@ -5841,7 +5843,7 @@ "tags": [ "system/networking" ], - "summary": "List the blocks in an address lot", + "summary": "List blocks in address lot", "operationId": "networking_address_lot_block_list", "parameters": [ { @@ -5909,7 +5911,7 @@ "tags": [ "system/networking" ], - "summary": "Disable a BFD session.", + "summary": "Disable a BFD session", "operationId": "networking_bfd_disable", "requestBody": { "content": { @@ -5939,7 +5941,7 @@ "tags": [ "system/networking" ], - "summary": "Enable a BFD session.", + "summary": "Enable a BFD session", "operationId": "networking_bfd_enable", "requestBody": { "content": { @@ -5969,7 +5971,7 @@ "tags": [ "system/networking" ], - "summary": "Get BFD status.", + "summary": "Get BFD status", "operationId": "networking_bfd_status", "responses": { "200": { @@ -6065,7 +6067,7 @@ "tags": [ "system/networking" ], - "summary": "Create a new BGP configuration", + 
"summary": "Create new BGP configuration", "operationId": "networking_bgp_config_create", "requestBody": { "content": { @@ -6100,7 +6102,7 @@ "tags": [ "system/networking" ], - "summary": "Delete a BGP configuration", + "summary": "Delete BGP configuration", "operationId": "networking_bgp_config_delete", "parameters": [ { @@ -6171,7 +6173,7 @@ "tags": [ "system/networking" ], - "summary": "Create a new BGP announce set", + "summary": "Create new BGP announce set", "operationId": "networking_bgp_announce_set_create", "requestBody": { "content": { @@ -6206,7 +6208,7 @@ "tags": [ "system/networking" ], - "summary": "Delete a BGP announce set", + "summary": "Delete BGP announce set", "operationId": "networking_bgp_announce_set_delete", "parameters": [ { @@ -6369,7 +6371,7 @@ "tags": [ "system/networking" ], - "summary": "Create a loopback address", + "summary": "Create loopback address", "operationId": "networking_loopback_address_create", "requestBody": { "content": { @@ -6406,7 +6408,7 @@ "tags": [ "system/networking" ], - "summary": "Delete a loopback address", + "summary": "Delete loopback address", "operationId": "networking_loopback_address_delete", "parameters": [ { @@ -6598,7 +6600,7 @@ "tags": [ "system/networking" ], - "summary": "Get information about a switch port", + "summary": "Get information about switch port", "operationId": "networking_switch_port_settings_view", "parameters": [ { @@ -6636,7 +6638,7 @@ "tags": [ "policy" ], - "summary": "Fetch the top-level IAM policy", + "summary": "Fetch top-level IAM policy", "operationId": "system_policy_view", "responses": { "200": { @@ -6661,7 +6663,7 @@ "tags": [ "policy" ], - "summary": "Update the top-level IAM policy", + "summary": "Update top-level IAM policy", "operationId": "system_policy_update", "requestBody": { "content": { @@ -6750,7 +6752,7 @@ "tags": [ "roles" ], - "summary": "Fetch a built-in role", + "summary": "Fetch built-in role", "operationId": "role_view", "parameters": [ { @@ -6942,8 +6944,8 @@ "tags": [ "system/silos" ], - "summary": "Fetch a silo", - "description": "Fetch a silo by name or ID.", + "summary": "Fetch silo", + "description": "Fetch silo by name or ID.", "operationId": "silo_view", "parameters": [ { @@ -6980,7 +6982,7 @@ "system/silos" ], "summary": "Delete a silo", - "description": "Delete a silo by name.", + "description": "Delete a silo by name or ID.", "operationId": "silo_delete", "parameters": [ { @@ -7011,7 +7013,8 @@ "tags": [ "system/silos" ], - "summary": "List IP pools available within silo", + "summary": "List IP pools linked to silo", + "description": "Linked IP pools are available to users in the specified silo. A silo can have at most one default pool. 
IPs are allocated from the default pool when users ask for one without specifying a pool.", "operationId": "silo_ip_pool_list", "parameters": [ { @@ -7079,7 +7082,7 @@ "tags": [ "system/silos" ], - "summary": "Fetch a silo's IAM policy", + "summary": "Fetch silo IAM policy", "operationId": "silo_policy_view", "parameters": [ { @@ -7115,7 +7118,7 @@ "tags": [ "system/silos" ], - "summary": "Update a silo's IAM policy", + "summary": "Update silo IAM policy", "operationId": "silo_policy_update", "parameters": [ { @@ -7163,7 +7166,7 @@ "tags": [ "system/silos" ], - "summary": "View the resource quotas of a given silo", + "summary": "Fetch resource quotas for silo", "operationId": "silo_quotas_view", "parameters": [ { @@ -7199,7 +7202,7 @@ "tags": [ "system/silos" ], - "summary": "Update the resource quotas of a given silo", + "summary": "Update resource quotas for silo", "description": "If a quota value is not specified, it will remain unchanged.", "operationId": "silo_quotas_update", "parameters": [ @@ -7248,7 +7251,7 @@ "tags": [ "system/silos" ], - "summary": "List built-in (system) users in a silo", + "summary": "List built-in (system) users in silo", "operationId": "silo_user_list", "parameters": [ { @@ -7317,7 +7320,7 @@ "tags": [ "system/silos" ], - "summary": "Fetch a built-in (system) user", + "summary": "Fetch built-in (system) user", "operationId": "silo_user_view", "parameters": [ { @@ -7424,7 +7427,7 @@ "tags": [ "system/silos" ], - "summary": "Fetch a built-in user", + "summary": "Fetch built-in user", "operationId": "user_builtin_view", "parameters": [ { @@ -7520,7 +7523,7 @@ "tags": [ "system/silos" ], - "summary": "View the current utilization of a given silo", + "summary": "Fetch current utilization for given silo", "operationId": "silo_utilization_view", "parameters": [ { @@ -7626,7 +7629,7 @@ "tags": [ "silos" ], - "summary": "View the resource utilization of the user's current silo", + "summary": "Fetch resource utilization for user's current silo", "operationId": "utilization_view", "responses": { "200": { @@ -7828,7 +7831,7 @@ "tags": [ "vpcs" ], - "summary": "Create a subnet", + "summary": "Create subnet", "operationId": "vpc_subnet_create", "parameters": [ { @@ -7884,7 +7887,7 @@ "tags": [ "vpcs" ], - "summary": "Fetch a subnet", + "summary": "Fetch subnet", "operationId": "vpc_subnet_view", "parameters": [ { @@ -7936,7 +7939,7 @@ "tags": [ "vpcs" ], - "summary": "Update a subnet", + "summary": "Update subnet", "operationId": "vpc_subnet_update", "parameters": [ { @@ -7998,7 +8001,7 @@ "tags": [ "vpcs" ], - "summary": "Delete a subnet", + "summary": "Delete subnet", "operationId": "vpc_subnet_delete", "parameters": [ { @@ -8196,7 +8199,7 @@ "tags": [ "vpcs" ], - "summary": "Create a VPC", + "summary": "Create VPC", "operationId": "vpc_create", "parameters": [ { @@ -8244,7 +8247,7 @@ "tags": [ "vpcs" ], - "summary": "Fetch a VPC", + "summary": "Fetch VPC", "operationId": "vpc_view", "parameters": [ { @@ -8342,7 +8345,7 @@ "tags": [ "vpcs" ], - "summary": "Delete a VPC", + "summary": "Delete VPC", "operationId": "vpc_delete", "parameters": [ { diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 4b53397ffb..395394defb 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -1240,7 +1240,7 @@ "type": "object", "properties": { "sled_id": { - "$ref": "#/components/schemas/Baseboard" + "$ref": "#/components/schemas/BaseboardId" }, "start_request": { "$ref": "#/components/schemas/StartSledAgentRequest" @@ -1319,6 +1319,24 @@ } ] }, + 
"BaseboardId": { + "description": "A representation of a Baseboard ID as used in the inventory subsystem This type is essentially the same as a `Baseboard` except it doesn't have a revision or HW type (Gimlet, PC, Unknown).", + "type": "object", + "properties": { + "part_number": { + "description": "Oxide Part Number", + "type": "string" + }, + "serial_number": { + "description": "Serial number (unique for a given part number)", + "type": "string" + } + }, + "required": [ + "part_number", + "serial_number" + ] + }, "BgpConfig": { "type": "object", "properties": { diff --git a/openapi/wicketd.json b/openapi/wicketd.json index 300e8412c3..b9645a174f 100644 --- a/openapi/wicketd.json +++ b/openapi/wicketd.json @@ -1132,7 +1132,7 @@ "nullable": true, "allOf": [ { - "$ref": "#/components/schemas/RackNetworkConfigV1" + "$ref": "#/components/schemas/UserSpecifiedRackNetworkConfig" } ] } @@ -2172,7 +2172,7 @@ } }, "rack_network_config": { - "$ref": "#/components/schemas/RackNetworkConfigV1" + "$ref": "#/components/schemas/UserSpecifiedRackNetworkConfig" } }, "required": [ @@ -2190,46 +2190,6 @@ "type": "string", "format": "uuid" }, - "RackNetworkConfigV1": { - "description": "Initial network configuration", - "type": "object", - "properties": { - "bgp": { - "description": "BGP configurations for connecting the rack to external networks", - "type": "array", - "items": { - "$ref": "#/components/schemas/BgpConfig" - } - }, - "infra_ip_first": { - "description": "First ip address to be used for configuring network infrastructure", - "type": "string", - "format": "ipv4" - }, - "infra_ip_last": { - "description": "Last ip address to be used for configuring network infrastructure", - "type": "string", - "format": "ipv4" - }, - "ports": { - "description": "Uplinks for connecting the rack to external networks", - "type": "array", - "items": { - "$ref": "#/components/schemas/PortConfigV1" - } - }, - "rack_subnet": { - "$ref": "#/components/schemas/Ipv6Network" - } - }, - "required": [ - "bgp", - "infra_ip_first", - "infra_ip_last", - "ports", - "rack_subnet" - ] - }, "RackOperationStatus": { "description": "Current status of any rack-level operation being performed by this bootstrap agent.\n\n
JSON schema\n\n```json { \"description\": \"Current status of any rack-level operation being performed by this bootstrap agent.\", \"oneOf\": [ { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackInitId\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initializing\" ] } } }, { \"description\": \"`id` will be none if the rack was already initialized on startup.\", \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"id\": { \"allOf\": [ { \"$ref\": \"#/components/schemas/RackInitId\" } ] }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialized\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"message\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackInitId\" }, \"message\": { \"type\": \"string\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialization_failed\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackInitId\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialization_panicked\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackResetId\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"resetting\" ] } } }, { \"description\": \"`reset_id` will be None if the rack is in an uninitialized-on-startup, or Some if it is in an uninitialized state due to a reset operation completing.\", \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"reset_id\": { \"allOf\": [ { \"$ref\": \"#/components/schemas/RackResetId\" } ] }, \"status\": { \"type\": \"string\", \"enum\": [ \"uninitialized\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"message\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackResetId\" }, \"message\": { \"type\": \"string\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"reset_failed\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackResetId\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"reset_panicked\" ] } } } ] } ```
", "oneOf": [ @@ -4698,6 +4658,38 @@ } ] }, + "UserSpecifiedRackNetworkConfig": { + "description": "User-specified parts of [`RackNetworkConfig`](omicron_common::api::internal::shared::RackNetworkConfig).", + "type": "object", + "properties": { + "bgp": { + "type": "array", + "items": { + "$ref": "#/components/schemas/BgpConfig" + } + }, + "infra_ip_first": { + "type": "string", + "format": "ipv4" + }, + "infra_ip_last": { + "type": "string", + "format": "ipv4" + }, + "ports": { + "type": "array", + "items": { + "$ref": "#/components/schemas/PortConfigV1" + } + } + }, + "required": [ + "bgp", + "infra_ip_first", + "infra_ip_last", + "ports" + ] + }, "IgnitionCommand": { "description": "Ignition command.\n\n
JSON schema\n\n```json { \"description\": \"Ignition command.\", \"type\": \"string\", \"enum\": [ \"power_on\", \"power_off\", \"power_reset\" ] } ```
", "type": "string", diff --git a/package-manifest.toml b/package-manifest.toml index 8944e59c37..1e88ddc760 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -446,10 +446,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "712b2487d9b141234af98b6578bc5f77420bdb03" +source.commit = "41a69a11db6cfa8fc0c8686dc2d725708e0586ce" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//maghemite.sha256.txt -source.sha256 = "36e976ae9b1517b358ec7eadd5fb03f5d40d54074ff830a79895f8fc3e643935" +source.sha256 = "19d5eaa744257c32ccdca52af79d718aeb88a0af188345d33a4564a69b259632" output.type = "tarball" [package.mg-ddm] @@ -462,10 +462,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "712b2487d9b141234af98b6578bc5f77420bdb03" +source.commit = "41a69a11db6cfa8fc0c8686dc2d725708e0586ce" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm.sha256.txt -source.sha256 = "bc3137751db24d2e44eca7118f6ca825ed3e9df736480fc210392802cd063dd8" +source.sha256 = "ffb647b3297ec616d3d9ea93396ad9edd16ed146048a660b34e9b86e85d466b7" output.type = "zone" output.intermediate_only = true @@ -477,10 +477,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). 
-source.commit = "712b2487d9b141234af98b6578bc5f77420bdb03" +source.commit = "41a69a11db6cfa8fc0c8686dc2d725708e0586ce" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm.sha256.txt -source.sha256 = "2c54146a133b5f12587d9fb89f85ef0a0ca6278efc8c6fe4859782e886e6c774" +source.sha256 = "26d34f61589f63be64eaa77a6e9e2db4c95d6675798386a1d61721c1ccc59d4d" output.type = "zone" output.intermediate_only = true diff --git a/schema/rss-sled-plan.json b/schema/rss-sled-plan.json index cbd73ed066..f5ac5bd0ff 100644 --- a/schema/rss-sled-plan.json +++ b/schema/rss-sled-plan.json @@ -466,7 +466,7 @@ "external_dns_zone_name", "internal_services_ip_pool_ranges", "ntp_servers", - "rack_subnet", + "rack_network_config", "recovery_silo" ], "properties": { @@ -521,19 +521,12 @@ }, "rack_network_config": { "description": "Initial rack network configuration", - "anyOf": [ + "allOf": [ { "$ref": "#/definitions/RackNetworkConfigV1" - }, - { - "type": "null" } ] }, - "rack_subnet": { - "type": "string", - "format": "ipv6" - }, "recovery_silo": { "description": "Configuration of the Recovery Silo (the initial Silo)", "allOf": [ diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index 79189e7f49..48444af8d4 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -14,7 +14,7 @@ use serde::{Deserialize, Serialize}; use sha3::{Digest, Sha3_256}; use sled_hardware::Baseboard; use std::borrow::Cow; -use std::collections::HashSet; +use std::collections::BTreeSet; use std::net::{IpAddr, Ipv6Addr, SocketAddrV6}; use uuid::Uuid; @@ -24,14 +24,13 @@ pub enum BootstrapAddressDiscovery { /// Ignore all bootstrap addresses except our own. OnlyOurs, /// Ignore all bootstrap addresses except the following. - OnlyThese { addrs: HashSet }, + OnlyThese { addrs: BTreeSet }, } // "Shadow" copy of `RackInitializeRequest` that does no validation on its // fields. #[derive(Clone, Deserialize)] struct UnvalidatedRackInitializeRequest { - rack_subnet: Ipv6Addr, trust_quorum_peers: Option>, bootstrap_discovery: BootstrapAddressDiscovery, ntp_servers: Vec, @@ -41,7 +40,7 @@ struct UnvalidatedRackInitializeRequest { external_dns_zone_name: String, external_certificates: Vec, recovery_silo: RecoverySiloConfig, - rack_network_config: Option, + rack_network_config: RackNetworkConfig, } /// Configuration for the "rack setup service". @@ -53,8 +52,6 @@ struct UnvalidatedRackInitializeRequest { #[derive(Clone, Deserialize, Serialize, PartialEq, JsonSchema)] #[serde(try_from = "UnvalidatedRackInitializeRequest")] pub struct RackInitializeRequest { - pub rack_subnet: Ipv6Addr, - /// The set of peer_ids required to initialize trust quorum /// /// The value is `None` if we are not using trust quorum @@ -89,7 +86,7 @@ pub struct RackInitializeRequest { pub recovery_silo: RecoverySiloConfig, /// Initial rack network configuration - pub rack_network_config: Option, + pub rack_network_config: RackNetworkConfig, } // This custom debug implementation hides the private keys. @@ -98,7 +95,6 @@ impl std::fmt::Debug for RackInitializeRequest { // If you find a compiler error here, and you just added a field to this // struct, be sure to add it to the Debug impl below! 
let RackInitializeRequest { - rack_subnet, trust_quorum_peers, bootstrap_discovery, ntp_servers, @@ -112,7 +108,6 @@ impl std::fmt::Debug for RackInitializeRequest { } = &self; f.debug_struct("RackInitializeRequest") - .field("rack_subnet", rack_subnet) .field("trust_quorum_peers", trust_quorum_peers) .field("bootstrap_discovery", bootstrap_discovery) .field("ntp_servers", ntp_servers) @@ -155,7 +150,6 @@ impl TryFrom<UnvalidatedRackInitializeRequest> for RackInitializeRequest { } Ok(RackInitializeRequest { - rack_subnet: value.rack_subnet, trust_quorum_peers: value.trust_quorum_peers, bootstrap_discovery: value.bootstrap_discovery, ntp_servers: value.ntp_servers, @@ -174,10 +168,21 @@ pub type Certificate = nexus_client::types::Certificate; pub type RecoverySiloConfig = nexus_client::types::RecoverySiloConfig; +/// A representation of a Baseboard ID as used in the inventory subsystem. +/// This type is essentially the same as a `Baseboard` except it doesn't have a +/// revision or HW type (Gimlet, PC, Unknown). +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] +pub struct BaseboardId { + /// Oxide Part Number + pub part_number: String, + /// Serial number (unique for a given part number) + pub serial_number: String, +} + /// A request to Add a given sled after rack initialization has occurred #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] pub struct AddSledRequest { - pub sled_id: Baseboard, + pub sled_id: BaseboardId, pub start_request: StartSledAgentRequest, } @@ -255,9 +260,6 @@ pub struct StartSledAgentRequestBody { /// true. pub is_lrtq_learner: bool, - // Note: The order of these fields is load bearing, because we serialize - // `SledAgentRequest`s as toml. `subnet` serializes as a TOML table, so it - // must come after non-table fields. /// Portion of the IP space to be managed by the Sled Agent. pub subnet: Ipv6Subnet<SLED_PREFIX>, } @@ -360,6 +362,7 @@ pub fn test_config() -> RackInitializeRequest { #[cfg(test)] mod tests { + use std::net::Ipv4Addr; use std::net::Ipv6Addr; use super::*; @@ -387,7 +390,6 @@ mod tests { #[test] fn parse_rack_initialization_weak_hash() { let config = r#" - rack_subnet = "fd00:1122:3344:0100::" bootstrap_discovery.type = "only_ours" ntp_servers = [ "ntp.eng.oxide.computer" ] dns_servers = [ "1.1.1.1", "9.9.9.9" ] @@ -472,7 +474,6 @@ mod tests { // Conjure up a config; we'll tweak the internal services pools and // external DNS IPs, but no other fields matter. let mut config = UnvalidatedRackInitializeRequest { - rack_subnet: Ipv6Addr::LOCALHOST, trust_quorum_peers: None, bootstrap_discovery: BootstrapAddressDiscovery::OnlyOurs, ntp_servers: Vec::new(), @@ -486,7 +487,13 @@ mod tests { user_name: "recovery".parse().unwrap(), user_password_hash: "$argon2id$v=19$m=98304,t=13,p=1$RUlWc0ZxaHo0WFdrN0N6ZQ$S8p52j85GPvMhR/ek3GL0el/oProgTwWpHJZ8lsQQoY".parse().unwrap(), }, - rack_network_config: None, + rack_network_config: RackNetworkConfig { + rack_subnet: Ipv6Addr::LOCALHOST.into(), + infra_ip_first: Ipv4Addr::LOCALHOST, + infra_ip_last: Ipv4Addr::LOCALHOST, + ports: Vec::new(), + bgp: Vec::new(), + }, }; // Valid configs: all external DNS IPs are contained in the IP pool diff --git a/sled-agent/src/params.rs index f14a13aa41..52bfb20e5d 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -664,6 +664,27 @@ impl OmicronZoneType { *address, )) } + + /// Does this zone require time synchronization before it is initialized?
+ /// + /// This function is somewhat conservative - the set of services + /// that can be launched before timesync has completed is intentionally kept + /// small, since it would be easy to add a service that expects time to be + /// reasonably synchronized. + pub fn requires_timesync(&self) -> bool { + match self { + // These zones can be initialized and started before time has been + // synchronized. For the NTP zones, this should be self-evident -- + // we need the NTP zone to actually perform time synchronization! + // + // The DNS zone is a bit of an exception here, since the NTP zone + // itself may rely on DNS lookups as a dependency. + OmicronZoneType::BoundaryNtp { .. } + | OmicronZoneType::InternalNtp { .. } + | OmicronZoneType::InternalDns { .. } => false, + _ => true, + } + } } impl crate::smf_helper::Service for OmicronZoneType { diff --git a/sled-agent/src/rack_setup/config.rs index 33de7121d4..52bea295a5 100644 --- a/sled-agent/src/rack_setup/config.rs +++ b/sled-agent/src/rack_setup/config.rs @@ -70,12 +70,14 @@ impl SetupServiceConfig { } pub fn az_subnet(&self) -> Ipv6Subnet<AZ_PREFIX> { - Ipv6Subnet::<AZ_PREFIX>::new(self.rack_subnet) + Ipv6Subnet::<AZ_PREFIX>::new(self.rack_network_config.rack_subnet.ip()) } /// Returns the subnet for our rack. pub fn rack_subnet(&self) -> Ipv6Subnet<RACK_PREFIX> { - Ipv6Subnet::<RACK_PREFIX>::new(self.rack_subnet) + Ipv6Subnet::<RACK_PREFIX>::new( + self.rack_network_config.rack_subnet.ip(), + ) } /// Returns the subnet for the `index`-th sled in the rack. @@ -92,12 +94,12 @@ mod test { use anyhow::Context; use camino::Utf8PathBuf; use omicron_common::address::IpRange; + use omicron_common::api::internal::shared::RackNetworkConfig; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; #[test] fn test_subnets() { let cfg = SetupServiceConfig { - rack_subnet: "fd00:1122:3344:0100::".parse().unwrap(), trust_quorum_peers: None, bootstrap_discovery: BootstrapAddressDiscovery::OnlyOurs, ntp_servers: vec![String::from("test.pool.example.com")], @@ -119,7 +121,13 @@ mod test { .parse() .unwrap(), }, - rack_network_config: None, + rack_network_config: RackNetworkConfig { + rack_subnet: "fd00:1122:3344:0100::".parse().unwrap(), + infra_ip_first: Ipv4Addr::LOCALHOST, + infra_ip_last: Ipv4Addr::LOCALHOST, + ports: Vec::new(), + bgp: Vec::new(), + }, }; assert_eq!( diff --git a/sled-agent/src/rack_setup/plan/service.rs index bed82a7a01..0b633c2057 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -15,8 +15,8 @@ use internal_dns::ServiceName; use omicron_common::address::{ get_sled_address, get_switch_zone_address, Ipv6Subnet, ReservedRackSubnet, DENDRITE_PORT, DNS_HTTP_PORT, DNS_PORT, DNS_REDUNDANCY, MAX_DNS_REDUNDANCY, - MGD_PORT, MGS_PORT, NTP_PORT, NUM_SOURCE_NAT_PORTS, RSS_RESERVED_ADDRESSES, - SLED_PREFIX, + MGD_PORT, MGS_PORT, NEXUS_REDUNDANCY, NTP_PORT, NUM_SOURCE_NAT_PORTS, + RSS_RESERVED_ADDRESSES, SLED_PREFIX, }; use omicron_common::api::external::{MacAddr, Vni}; use omicron_common::api::internal::shared::SwitchLocation; @@ -35,7 +35,7 @@ use sled_agent_client::{ use sled_storage::dataset::{DatasetKind, DatasetName, CONFIG_DATASET}; use sled_storage::manager::StorageHandle; use slog::Logger; -use std::collections::{BTreeSet, HashMap, HashSet}; +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV6}; use std::num::Wrapping; use thiserror::Error; @@ -44,9 +44,6 @@ use uuid::Uuid; // The number of boundary NTP
servers to create from RSS. const BOUNDARY_NTP_COUNT: usize = 2; -// The number of Nexus instances to create from RSS. -const NEXUS_COUNT: usize = 3; - // The number of CRDB instances to create from RSS. const CRDB_COUNT: usize = 5; @@ -458,7 +455,7 @@ impl Plan { } // Provision Nexus zones, continuing to stripe across sleds. - for _ in 0..NEXUS_COUNT { + for _ in 0..NEXUS_REDUNDANCY { let sled = { let which_sled = sled_allocator.next().ok_or(PlanError::NotEnoughSleds)?; @@ -708,7 +705,7 @@ impl Plan { log: &Logger, config: &Config, storage_manager: &StorageHandle, - sleds: &HashMap<SocketAddrV6, StartSledAgentRequest>, + sleds: &BTreeMap<SocketAddrV6, StartSledAgentRequest>, ) -> Result<Self, PlanError> { // Load the information we need about each Sled to be able to allocate // components on it. @@ -1078,6 +1075,7 @@ mod tests { use crate::bootstrap::params::BootstrapAddressDiscovery; use crate::bootstrap::params::RecoverySiloConfig; use omicron_common::address::IpRange; + use omicron_common::api::internal::shared::RackNetworkConfig; const EXPECTED_RESERVED_ADDRESSES: u16 = 2; const EXPECTED_USABLE_ADDRESSES: u16 = @@ -1149,7 +1147,6 @@ mod tests { "fd01::103", ]; let config = Config { - rack_subnet: Ipv6Addr::LOCALHOST, trust_quorum_peers: None, bootstrap_discovery: BootstrapAddressDiscovery::OnlyOurs, ntp_servers: Vec::new(), @@ -1173,7 +1170,13 @@ mod tests { user_name: "recovery".parse().unwrap(), user_password_hash: "$argon2id$v=19$m=98304,t=13,p=1$RUlWc0ZxaHo0WFdrN0N6ZQ$S8p52j85GPvMhR/ek3GL0el/oProgTwWpHJZ8lsQQoY".parse().unwrap(), }, - rack_network_config: None, + rack_network_config: RackNetworkConfig { + rack_subnet: Ipv6Addr::LOCALHOST.into(), + infra_ip_first: Ipv4Addr::LOCALHOST, + infra_ip_last: Ipv4Addr::LOCALHOST, + ports: Vec::new(), + bgp: Vec::new(), + }, }; let mut svp = ServicePortBuilder::new(&config); diff --git a/sled-agent/src/rack_setup/plan/sled.rs index 07f33893fc..efdd86d2f9 100644 --- a/sled-agent/src/rack_setup/plan/sled.rs +++ b/sled-agent/src/rack_setup/plan/sled.rs @@ -16,7 +16,7 @@ use serde::{Deserialize, Serialize}; use sled_storage::dataset::CONFIG_DATASET; use sled_storage::manager::StorageHandle; use slog::Logger; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, BTreeSet}; use std::net::{Ipv6Addr, SocketAddrV6}; use thiserror::Error; use uuid::Uuid; @@ -46,7 +46,7 @@ const RSS_SLED_PLAN_FILENAME: &str = "rss-sled-plan.json"; #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] pub struct Plan { pub rack_id: Uuid, - pub sleds: HashMap<SocketAddrV6, StartSledAgentRequest>, + pub sleds: BTreeMap<SocketAddrV6, StartSledAgentRequest>, // Store the provided RSS configuration as part of the sled plan; if it // changes after reboot, we need to know.
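Why `BTreeMap`/`BTreeSet` here rather than `HashMap`/`HashSet`: ordered collections iterate, and therefore serialize, in sorted key order, so a ledgered sled plan round-trips to stable JSON. That is the property the golden-file test below (`test_read_known_rss_sled_plans`) leans on. A minimal sketch of the idea, using illustrative string keys rather than the real `SocketAddrV6`-keyed plan types:

```rust
use std::collections::BTreeMap;

fn main() {
    // Same entries, inserted in different orders.
    let mut a = BTreeMap::new();
    a.insert("fdb0::2", "sled-b");
    a.insert("fdb0::1", "sled-a");

    let mut b = BTreeMap::new();
    b.insert("fdb0::1", "sled-a");
    b.insert("fdb0::2", "sled-b");

    // Iteration order is sorted key order for both maps, so anything
    // serialized by walking the map is stable across runs.
    assert!(a.iter().eq(b.iter()));
    assert_eq!(a.keys().collect::<Vec<_>>(), vec![&"fdb0::1", &"fdb0::2"]);
}
```

A `HashMap` gives no such guarantee (its iteration order is randomized per process), so two serializations of the same plan could differ, which is exactly what a stored-plan comparison like the `stored_peers != bootstrap_addrs` check later in this change must avoid.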
@@ -81,7 +81,7 @@ impl Plan { log: &Logger, config: &Config, storage_manager: &StorageHandle, - bootstrap_addrs: HashSet<Ipv6Addr>, + bootstrap_addrs: BTreeSet<Ipv6Addr>, use_trust_quorum: bool, ) -> Result<Self, PlanError> { let rack_id = Uuid::new_v4(); @@ -117,7 +117,7 @@ impl Plan { info!(log, "Serializing plan"); - let mut sleds = std::collections::HashMap::new(); + let mut sleds = BTreeMap::new(); for (addr, allocation) in allocations { sleds.insert(addr, allocation); } @@ -152,4 +152,24 @@ mod tests { &serde_json::to_string_pretty(&schema).unwrap(), ); } + + #[test] + fn test_read_known_rss_sled_plans() { + let known_rss_sled_plans = &["madrid-rss-sled-plan.json"]; + + let path = Utf8PathBuf::from("tests/old-rss-sled-plans"); + let out_path = Utf8PathBuf::from("tests/output/new-rss-sled-plans"); + for sled_plan_basename in known_rss_sled_plans { + println!("checking {:?}", sled_plan_basename); + let contents = + std::fs::read_to_string(path.join(sled_plan_basename)) + .expect("failed to read file"); + let parsed: Plan = + serde_json::from_str(&contents).expect("failed to parse file"); + expectorate::assert_contents( + out_path.join(sled_plan_basename), + &serde_json::to_string_pretty(&parsed).unwrap(), + ); + } + } } diff --git a/sled-agent/src/rack_setup/service.rs index af81df52bb..2788e189cc 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -601,58 +601,55 @@ impl ServiceInner { .map(Into::into) .collect(); - let rack_network_config = match &config.rack_network_config { - Some(config) => { - let value = NexusTypes::RackNetworkConfigV1 { - rack_subnet: config.rack_subnet, - infra_ip_first: config.infra_ip_first, - infra_ip_last: config.infra_ip_last, - ports: config - .ports - .iter() - .map(|config| NexusTypes::PortConfigV1 { - port: config.port.clone(), - routes: config - .routes - .iter() - .map(|r| NexusTypes::RouteConfig { - destination: r.destination, - nexthop: r.nexthop, - }) - .collect(), - addresses: config.addresses.clone(), - switch: config.switch.into(), - uplink_port_speed: config.uplink_port_speed.into(), - uplink_port_fec: config.uplink_port_fec.into(), - autoneg: config.autoneg, - bgp_peers: config - .bgp_peers - .iter() - .map(|b| NexusTypes::BgpPeerConfig { - addr: b.addr, - asn: b.asn, - port: b.port.clone(), - hold_time: b.hold_time, - connect_retry: b.connect_retry, - delay_open: b.delay_open, - idle_hold_time: b.idle_hold_time, - keepalive: b.keepalive, - }) - .collect(), - }) - .collect(), - bgp: config - .bgp - .iter() - .map(|config| NexusTypes::BgpConfig { - asn: config.asn, - originate: config.originate.clone(), - }) - .collect(), - }; - Some(value) + let rack_network_config = { + let config = &config.rack_network_config; + NexusTypes::RackNetworkConfigV1 { + rack_subnet: config.rack_subnet, + infra_ip_first: config.infra_ip_first, + infra_ip_last: config.infra_ip_last, + ports: config + .ports + .iter() + .map(|config| NexusTypes::PortConfigV1 { + port: config.port.clone(), + routes: config + .routes + .iter() + .map(|r| NexusTypes::RouteConfig { + destination: r.destination, + nexthop: r.nexthop, + }) + .collect(), + addresses: config.addresses.clone(), + switch: config.switch.into(), + uplink_port_speed: config.uplink_port_speed.into(), + uplink_port_fec: config.uplink_port_fec.into(), + autoneg: config.autoneg, + bgp_peers: config + .bgp_peers + .iter() + .map(|b| NexusTypes::BgpPeerConfig { + addr: b.addr, + asn: b.asn, + port: b.port.clone(), + hold_time: b.hold_time, + connect_retry:
b.connect_retry, + delay_open: b.delay_open, + idle_hold_time: b.idle_hold_time, + keepalive: b.keepalive, + }) + .collect(), + }) + .collect(), + bgp: config + .bgp + .iter() + .map(|config| NexusTypes::BgpConfig { + asn: config.asn, + originate: config.originate.clone(), + }) + .collect(), } - None => None, }; info!(self.log, "rack_network_config: {:#?}", rack_network_config); @@ -868,14 +865,14 @@ impl ServiceInner { // - Enough peers to create a new plan (if one does not exist) let bootstrap_addrs = match &config.bootstrap_discovery { BootstrapAddressDiscovery::OnlyOurs => { - HashSet::from([local_bootstrap_agent.our_address()]) + BTreeSet::from([local_bootstrap_agent.our_address()]) } BootstrapAddressDiscovery::OnlyThese { addrs } => addrs.clone(), }; let maybe_sled_plan = SledPlan::load(&self.log, storage_manager).await?; if let Some(plan) = &maybe_sled_plan { - let stored_peers: HashSet<Ipv6Addr> = + let stored_peers: BTreeSet<Ipv6Addr> = plan.sleds.keys().map(|a| *a.ip()).collect(); if stored_peers != bootstrap_addrs { let e = concat!( @@ -931,7 +928,7 @@ impl ServiceInner { schema_version: 1, body: EarlyNetworkConfigBody { ntp_servers: config.ntp_servers.clone(), - rack_network_config: config.rack_network_config.clone(), + rack_network_config: Some(config.rack_network_config.clone()), }, }; info!(self.log, "Writing Rack Network Configuration to bootstore"); diff --git a/sled-agent/src/services.rs index 77b6bcbed4..bc40187b38 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -57,7 +57,7 @@ use illumos_utils::running_zone::{ }; use illumos_utils::zfs::ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT; use illumos_utils::zone::AddressRequest; -use illumos_utils::zone::Zones; +use illumos_utils::zpool::ZpoolName; use illumos_utils::{execute, PFEXEC}; use internal_dns::resolver::Resolver; use itertools::Itertools; @@ -80,8 +80,7 @@ use omicron_common::api::internal::shared::{ HostPortConfig, RackNetworkConfig, }; use omicron_common::backoff::{ - retry_notify, retry_policy_internal_service_aggressive, retry_policy_local, - BackoffError, + retry_notify, retry_policy_internal_service_aggressive, BackoffError, }; use omicron_common::ledger::{self, Ledger, Ledgerable}; use omicron_common::nexus_config::{ @@ -101,7 +100,6 @@ use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::BTreeMap; use std::collections::HashSet; -use std::iter::FromIterator; use std::net::{IpAddr, Ipv6Addr, SocketAddr}; use std::str::FromStr; use std::sync::atomic::{AtomicBool, Ordering}; @@ -112,6 +110,11 @@ use tokio::sync::{oneshot, MutexGuard}; use tokio::task::JoinHandle; use uuid::Uuid; +#[cfg(test)] +use illumos_utils::zone::MockZones as Zones; +#[cfg(not(test))] +use illumos_utils::zone::Zones; + const IPV6_UNSPECIFIED: IpAddr = IpAddr::V6(Ipv6Addr::UNSPECIFIED); #[derive(thiserror::Error, Debug)] @@ -160,6 +163,16 @@ pub enum Error { err: illumos_utils::running_zone::RunCommandError, }, + #[error("Cannot list zones")] + ZoneList(#[source] illumos_utils::zone::AdmError), + + #[error("Cannot remove zone")] + ZoneRemoval { + zone_name: String, + #[source] + err: illumos_utils::zone::AdmError, + }, + #[error("Failed to boot zone: {0}")] ZoneBoot(#[from] illumos_utils::running_zone::BootError), @@ -169,6 +182,9 @@ pub enum Error { #[error(transparent)] ZoneInstall(#[from] illumos_utils::running_zone::InstallZoneError), + #[error("Failed to initialize zones: {errors:?}")] + ZoneEnsure { errors: Vec<(String, Error)> }, + #[error("Error contacting ddmd: {0}")]
DdmError(#[from] DdmError), @@ -267,17 +283,47 @@ impl Error { impl From<Error> for omicron_common::api::external::Error { fn from(err: Error) -> Self { match err { - err @ Error::RequestedConfigConflicts(_) => { + Error::RequestedConfigConflicts(_) => { omicron_common::api::external::Error::invalid_request( &err.to_string(), ) } - err @ Error::RequestedConfigOutdated { .. } => { + Error::RequestedConfigOutdated { .. } => { omicron_common::api::external::Error::conflict(&err.to_string()) } - err @ Error::TimeNotSynchronized => { + Error::TimeNotSynchronized => { omicron_common::api::external::Error::unavail(&err.to_string()) } + Error::ZoneEnsure { errors } => { + // As a special case, if any zones failed to timesync, + // prioritize that error. + // + // This conversion to a 503 error was requested in + // https://github.com/oxidecomputer/omicron/issues/4776 , + // and we preserve that behavior here, even though we may + // launch many zones at the same time. + if let Some(err) = errors.iter().find_map(|(_, err)| { + if matches!(err, Error::TimeNotSynchronized) { + Some(err) + } else { + None + } + }) { + omicron_common::api::external::Error::unavail( + &err.to_string(), + ) + } else { + let internal_message = errors + .iter() + .map(|(name, err)| { + format!("failed to start {name}: {err:?}") + }) + .join("\n"); + omicron_common::api::external::Error::InternalError { + internal_message, + } + } + } _ => omicron_common::api::external::Error::InternalError { internal_message: err.to_string(), }, @@ -300,27 +346,6 @@ fn display_zone_init_errors(errors: &[(String, Box<Error>)]) -> String { output } -// Does this zone require time synchronization before it is initialized? -// -// This function is somewhat conservative - the set of services -// that can be launched before timesync has completed is intentionally kept -// small, since it would be easy to add a service that expects time to be -// reasonably synchronized. -fn zone_requires_timesync(zone_type: &OmicronZoneType) -> bool { - match zone_type { - // These zones can be initialized and started before time has been - // synchronized. For the NTP zones, this should be self-evident -- - // we need the NTP zone to actually perform time synchronization! - // - // The DNS zone is a bit of an exception here, since the NTP zone - // itself may rely on DNS lookups as a dependency. - OmicronZoneType::BoundaryNtp { .. } - | OmicronZoneType::InternalNtp { .. } - | OmicronZoneType::InternalDns { .. } => false, - _ => true, - } -} - /// Configuration parameters which modify the [`ServiceManager`]'s behavior. pub struct Config { /// Identifies the sled being configured @@ -343,7 +368,13 @@ const ZONES_LEDGER_FILENAME: &str = "omicron-zones.json"; /// wants for all of its zones) with the locally-determined configuration for /// these zones.
#[derive( - Clone, Debug, serde::Serialize, serde::Deserialize, schemars::JsonSchema, + Clone, + Debug, + Eq, + PartialEq, + serde::Serialize, + serde::Deserialize, + schemars::JsonSchema, )] pub struct OmicronZonesConfigLocal { /// generation of the Omicron-provided part of the configuration @@ -404,7 +435,13 @@ impl OmicronZonesConfigLocal { /// wants for this zone) with any locally-determined configuration (like the /// path to the root filesystem) #[derive( - Clone, Debug, serde::Serialize, serde::Deserialize, schemars::JsonSchema, + Clone, + Debug, + Eq, + PartialEq, + serde::Serialize, + serde::Deserialize, + schemars::JsonSchema, )] pub struct OmicronZoneConfigLocal { pub zone: OmicronZoneConfig, @@ -551,7 +588,33 @@ enum SledLocalZone { }, } -type ZoneMap = BTreeMap<String, RunningZone>; +// The return type for `start_omicron_zones`. +// +// When multiple zones are started concurrently, some can fail while others +// succeed. This structure allows the function to return this nuanced +// information. +#[must_use] +struct StartZonesResult { + // The set of zones which have successfully started. + new_zones: Vec<OmicronZone>, + + // The set of (zone name, error) of zones that failed to start. + errors: Vec<(String, Error)>, +} + +// A running zone and the configuration which started it. +struct OmicronZone { + runtime: RunningZone, + config: OmicronZoneConfigLocal, +} + +impl OmicronZone { + fn name(&self) -> &str { + self.runtime.name() + } +} + +type ZoneMap = BTreeMap<String, OmicronZone>; /// Manages miscellaneous Sled-local services. pub struct ServiceManagerInner { @@ -718,7 +781,7 @@ impl ServiceManager { &self, // This argument attempts to ensure that the caller holds the right // lock. - _map: &MutexGuard<'_, BTreeMap<String, RunningZone>>, + _map: &MutexGuard<'_, ZoneMap>, ) -> Result<Option<Ledger<OmicronZonesConfigLocal>>, Error> { // First, try to load the current software's zone ledger. If that // works, we're done. @@ -893,84 +956,9 @@ impl ServiceManager { let omicron_zones_config = zones_config.clone().to_omicron_zones_config(); - // Initialize internal DNS only first: we need it to look up the - // boundary switch addresses. This dependency is implicit: when we call - // `ensure_all_omicron_zones` below, we eventually land in - // `opte_ports_needed()`, which for some service types (including Ntp - // but _not_ including InternalDns), we perform internal DNS lookups. - let all_zones_request = self - .ensure_all_omicron_zones( - &mut existing_zones, - None, - omicron_zones_config.clone(), - |z: &OmicronZoneConfig| { - matches!(z.zone_type, OmicronZoneType::InternalDns { .. }) - }, - ) - .await?; - - // Initialize NTP services next as they are required for time - // synchronization, which is a pre-requisite for the other services. We - // keep `OmicronZoneType::InternalDns` because - // `ensure_all_omicron_zones` is additive. - let all_zones_request = self - .ensure_all_omicron_zones( - &mut existing_zones, - Some(&all_zones_request), - omicron_zones_config.clone(), - |z: &OmicronZoneConfig| { - matches!( - z.zone_type, - OmicronZoneType::InternalDns { .. } - | OmicronZoneType::BoundaryNtp { .. } - | OmicronZoneType::InternalNtp { .. } - ) - }, - ) - .await?; - - drop(existing_zones); - - info!(&self.inner.log, "Waiting for sled time synchronization"); - - retry_notify( - retry_policy_local(), - || async { - match self.timesync_get().await { - Ok(TimeSync { sync: true, ..
}) => { - info!(&self.inner.log, "Time is synchronized"); - Ok(()) - } - Ok(ts) => Err(BackoffError::transient(format!( - "No sync {:?}", - ts - ))), - Err(e) => Err(BackoffError::transient(format!( - "Error checking for time synchronization: {}", - e - ))), - } - }, - |error, delay| { - warn!( - self.inner.log, - "Time not yet synchronised (retrying in {:?})", - delay; - "error" => ?error - ); - }, - ) - .await - .expect("Expected an infinite retry loop syncing time"); - - let mut existing_zones = self.inner.zones.lock().await; - - // Initialize all remaining services self.ensure_all_omicron_zones( &mut existing_zones, - Some(&all_zones_request), omicron_zones_config, - |_| true, ) .await?; Ok(()) @@ -2688,17 +2676,73 @@ impl ServiceManager { Ok(running_zone) } - // Populates `existing_zones` according to the requests in `services`. - async fn initialize_omicron_zones_locked( + // Ensures that a single Omicron zone is running. + // + // This method is NOT idempotent. + // + // - If the zone already exists, in any form, it is fully removed + // before being initialized. This is primarily intended to remove "partially + // stopped/started" zones with detritus from interfering with a new zone + // being launched. + // - If zones need time to be synchronized before they are initialized + // (e.g., this is a hard requirement for CockroachDb) they can check the + // `time_is_synchronized` argument. + // - `all_u2_pools` provides a snapshot into durable storage on this sled, + // which gives the storage manager an opportunity to validate the zone's + // storage configuration against the reality of the current sled. + async fn start_omicron_zone( &self, - existing_zones: &mut BTreeMap<String, RunningZone>, - requests: &Vec<OmicronZoneConfigLocal>, - ) -> Result<(), Error> { - if let Some(name) = requests - .iter() - .map(|request| request.zone.zone_name()) - .duplicates() - .next() + zone: &OmicronZoneConfig, + time_is_synchronized: bool, + all_u2_pools: &Vec<ZpoolName>, + ) -> Result<OmicronZone, Error> { + // Ensure the zone has been fully removed before we try to boot it. + // + // This ensures that old "partially booted/stopped" zones do not + // interfere with our installation. + self.ensure_removed(&zone).await?; + + // If this zone requires timesync and we aren't ready, fail it early. + if zone.zone_type.requires_timesync() && !time_is_synchronized { + return Err(Error::TimeNotSynchronized); + } + + // Ensure that this zone's storage is ready. + let root = self + .validate_storage_and_pick_mountpoint(&zone, &all_u2_pools) + .await?; + + let config = OmicronZoneConfigLocal { zone: zone.clone(), root }; + + let runtime = self + .initialize_zone( + ZoneArgs::Omicron(&config), + // filesystems= + &[], + // data_links= + &[], + ) + .await?; + + Ok(OmicronZone { runtime, config }) + } + + // Concurrently attempts to start all zones identified by requests. + // + // This method is NOT idempotent. + // + // If we try to start ANY zones concurrently, the result is contained + // in the `StartZonesResult` value. This will contain the set of zones which + // were initialized successfully, as well as the set of zones which failed + // to start.
+ async fn start_omicron_zones( + &self, + requests: impl Iterator<Item = &OmicronZoneConfig> + Clone, + time_is_synchronized: bool, + all_u2_pools: &Vec<ZpoolName>, + ) -> Result<StartZonesResult, Error> { + if let Some(name) = + requests.clone().map(|zone| zone.zone_name()).duplicates().next() { return Err(Error::BadServiceRequest { service: name, @@ -2706,38 +2750,29 @@ }); } - let futures = requests.iter().map(|request| { - async move { - self.initialize_zone( - ZoneArgs::Omicron(request), - // filesystems= - &[], - // data_links= - &[], - ) + let futures = requests.map(|zone| async move { + self.start_omicron_zone(&zone, time_is_synchronized, all_u2_pools) .await - .map_err(|error| (request.zone.zone_name(), error)) - } + .map_err(|err| (zone.zone_name().to_string(), err)) }); + let results = futures::future::join_all(futures).await; + let mut new_zones = Vec::new(); let mut errors = Vec::new(); for result in results { match result { Ok(zone) => { - existing_zones.insert(zone.name().to_string(), zone); + info!(self.inner.log, "Zone started"; "zone" => zone.name()); + new_zones.push(zone); } - Err((zone_name, error)) => { - errors.push((zone_name, Box::new(error))); + Err((name, error)) => { + warn!(self.inner.log, "Zone failed to start"; "zone" => &name); + errors.push((name, error)) } } } - - if !errors.is_empty() { - return Err(Error::ZoneInitialize(errors)); - } - - Ok(()) + Ok(StartZonesResult { new_zones, errors }) } /// Create a zone bundle for the provided zone. @@ -2761,7 +2796,7 @@ impl ServiceManager { return self .inner .zone_bundler - .create(zone, ZoneBundleCause::ExplicitRequest) + .create(&zone.runtime, ZoneBundleCause::ExplicitRequest) .await; } Err(BundleError::NoSuchZone { name: name.to_string() }) @@ -2799,7 +2834,7 @@ impl ServiceManager { /// boot. pub async fn ensure_all_omicron_zones_persistent( &self, - request: OmicronZonesConfig, + mut request: OmicronZonesConfig, ) -> Result<(), Error> { let log = &self.inner.log; @@ -2838,21 +2873,47 @@ impl ServiceManager { // If the generation is the same as what we're running, but the contents // aren't, that's a problem, too. - if ledger_zone_config.omicron_generation == request.generation - && ledger_zone_config.clone().to_omicron_zones_config().zones - != request.zones - { - return Err(Error::RequestedConfigConflicts(request.generation)); + if ledger_zone_config.omicron_generation == request.generation { + // Nexus should send us consistent zone orderings; however, we may + // reorder the zone list inside `ensure_all_omicron_zones`. To avoid + // equality checks failing only because the two lists are ordered + // differently, sort them both here before comparing. + let mut ledger_zones = + ledger_zone_config.clone().to_omicron_zones_config().zones; + + // We sort by ID because we assume no two zones have the same ID. If + // that assumption is wrong, we may return an error here where the + // conflict is solely the list ordering, but in such a case that's the + // least of our problems.
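A toy illustration of that order-insensitive comparison, just before the real sort: sorting both lists by a key assumed to be unique makes `==` ignore ordering differences.

```rust
#[derive(Clone, Debug, PartialEq, Eq)]
struct Zone {
    id: u32,
    name: &'static str,
}

fn main() {
    let mut ledger = vec![Zone { id: 2, name: "ntp" }, Zone { id: 1, name: "dns" }];
    let mut request = vec![Zone { id: 1, name: "dns" }, Zone { id: 2, name: "ntp" }];

    // Unsorted, these compare unequal purely because of order.
    assert_ne!(ledger, request);

    ledger.sort_by_key(|z| z.id);
    request.sort_by_key(|z| z.id);
    assert_eq!(ledger, request);
}
```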
+ ledger_zones.sort_by_key(|z| z.id); + request.zones.sort_by_key(|z| z.id); + + if ledger_zones != request.zones { + return Err(Error::RequestedConfigConflicts( + request.generation, + )); + } } - let new_config = self - .ensure_all_omicron_zones( - &mut existing_zones, - Some(ledger_zone_config), - request, - |_| true, - ) - .await?; + let omicron_generation = request.generation; + let ledger_generation = ledger_zone_config.ledger_generation; + self.ensure_all_omicron_zones(&mut existing_zones, request).await?; + let zones = existing_zones + .values() + .map(|omicron_zone| omicron_zone.config.clone()) + .collect(); + + let new_config = OmicronZonesConfigLocal { + omicron_generation, + ledger_generation, + zones, + }; + + // If the contents of the ledger would be identical, we can avoid + // performing an update and commit. + if *ledger_zone_config == new_config { + return Ok(()); + } // Update the zones in the ledger and write it back to both M.2s *ledger_zone_config = new_config; @@ -2863,44 +2924,48 @@ impl ServiceManager { // Ensures that only the following Omicron zones are running. // - // Does not record any information such that these services are - // re-instantiated on boot. - async fn ensure_all_omicron_zones<F>( + // This method strives to be idempotent. + // + // - Starting and stopping zones is not an atomic operation - it's possible + // that we cannot start a zone after a previous one has been successfully + // created (or destroyed) intentionally. As a result, even in error cases, + // it's possible that the set of `existing_zones` changes. However, this set + // will only change in the direction of `new_request`: zones will only be + // removed if they ARE NOT part of `new_request`, and zones will only be + // added if they ARE part of `new_request`. + // - Zones are not updated in-place: two zone configurations that differ + // in any way are treated as entirely distinct. + // - This method does not record any information such that these services + // are re-instantiated on boot. + async fn ensure_all_omicron_zones( &self, // The MutexGuard here attempts to ensure that the caller has the right // lock held when calling this function. - existing_zones: &mut MutexGuard<'_, BTreeMap<String, RunningZone>>, - old_config: Option<&OmicronZonesConfigLocal>, + existing_zones: &mut MutexGuard<'_, ZoneMap>, new_request: OmicronZonesConfig, - filter: F, - ) -> Result<OmicronZonesConfigLocal, Error> - where - F: Fn(&OmicronZoneConfig) -> bool, - { - let log = &self.inner.log; - + ) -> Result<(), Error> { // Do some data-normalization to ensure we can compare the "requested // set" vs the "existing set" as HashSets. - let old_zones_set: HashSet<OmicronZoneConfig> = old_config - .map(|old_config| { - HashSet::from_iter( - old_config.zones.iter().map(|z| z.zone.clone()), - ) - }) - .unwrap_or_else(HashSet::new); - let requested_zones_set = - HashSet::from_iter(new_request.zones.into_iter().filter(filter)); + let old_zone_configs: HashSet<OmicronZoneConfig> = existing_zones + .values() + .map(|omicron_zone| omicron_zone.config.zone.clone()) + .collect(); + let requested_zones_set: HashSet<OmicronZoneConfig> = + new_request.zones.into_iter().collect(); let zones_to_be_removed = - old_zones_set.difference(&requested_zones_set); - let zones_to_be_added = requested_zones_set.difference(&old_zones_set); - // For each new zone request, ensure that we've sufficiently - // synchronized time.
- // - // NOTE: This imposes a constraint, during initial setup, cold boot, - // etc, that NTP and the internal DNS system it depends on MUST be - // initialized prior to other zones. + // Destroy zones that should not be running + for zone in zones_to_be_removed { + self.zone_bundle_and_try_remove(existing_zones, &zone).await; + } + + // Collect information that's necessary to start new zones + let storage = self.inner.storage.get_latest_resources().await; + let all_u2_pools = storage.all_u2_zpools(); let time_is_synchronized = match self.timesync_get_locked(&existing_zones).await { // Time is synchronized @@ -2908,166 +2973,179 @@ impl ServiceManager { // Time is not synchronized, or we can't check _ => false, }; - for zone in zones_to_be_added.clone() { - if zone_requires_timesync(&zone.zone_type) && !time_is_synchronized - { - return Err(Error::TimeNotSynchronized); - } + + // Concurrently boot all new zones + let StartZonesResult { new_zones, errors } = self + .start_omicron_zones( + zones_to_be_added, + time_is_synchronized, + &all_u2_pools, + ) + .await?; + + // Add the new zones to our tracked zone set + existing_zones.extend( + new_zones.into_iter().map(|zone| (zone.name().to_string(), zone)), + ); + + // If any zones failed to start, exit with an error + if !errors.is_empty() { + return Err(Error::ZoneEnsure { errors }); } + Ok(()) + } - // Destroy zones that should not be running - for zone in zones_to_be_removed { - let expected_zone_name = zone.zone_name(); - if let Some(mut zone) = existing_zones.remove(&expected_zone_name) { - debug!( - log, - "removing an existing zone"; - "zone_name" => &expected_zone_name, + // Attempts to take a zone bundle and remove a zone. + // + // Logs, but does not return an error on failure. + async fn zone_bundle_and_try_remove( + &self, + existing_zones: &mut MutexGuard<'_, ZoneMap>, + zone: &OmicronZoneConfig, + ) { + let log = &self.inner.log; + let expected_zone_name = zone.zone_name(); + let Some(mut zone) = existing_zones.remove(&expected_zone_name) else { + warn!( + log, + "Expected to remove zone, but could not find it"; + "zone_name" => &expected_zone_name, + ); + return; + }; + debug!( + log, + "removing an existing zone"; + "zone_name" => &expected_zone_name, + ); + if let Err(e) = self + .inner + .zone_bundler + .create(&zone.runtime, ZoneBundleCause::UnexpectedZone) + .await + { + error!( + log, + "Failed to take bundle of unexpected zone"; + "zone_name" => &expected_zone_name, + "reason" => ?e, + ); + } + if let Err(e) = zone.runtime.stop().await { + error!(log, "Failed to stop zone {}: {e}", zone.name()); + } + } + + // Ensures that if a zone is about to be installed, it does not exist. 
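The add/remove sets computed above follow the usual set-reconciliation recipe; a minimal sketch with plain `HashSet`s (zone names invented for illustration):

```rust
use std::collections::HashSet;

fn main() {
    let running: HashSet<&str> = ["internal-dns", "ntp"].into_iter().collect();
    let requested: HashSet<&str> = ["ntp", "nexus"].into_iter().collect();

    // Only in the old set: shut these down (after taking a zone bundle).
    let to_remove: HashSet<_> = running.difference(&requested).collect();
    // Only in the new set: boot these.
    let to_add: HashSet<_> = requested.difference(&running).collect();

    assert!(to_remove.contains(&"internal-dns"));
    assert!(to_add.contains(&"nexus"));
    // Zones present in both sets are left untouched: there is no
    // in-place update, matching the doc comment above.
}
```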
+ async fn ensure_removed( + &self, + zone: &OmicronZoneConfig, + ) -> Result<(), Error> { + let zone_name = zone.zone_name(); + match Zones::find(&zone_name).await { + Ok(Some(zone)) => { + warn!( + self.inner.log, + "removing zone"; + "zone" => &zone_name, + "state" => ?zone.state(), ); - if let Err(e) = self - .inner - .zone_bundler - .create(&zone, ZoneBundleCause::UnexpectedZone) - .await + if let Err(e) = + Zones::halt_and_remove_logged(&self.inner.log, &zone_name) + .await { error!( - log, - "Failed to take bundle of unexpected zone"; - "zone_name" => &expected_zone_name, - "reason" => ?e, + self.inner.log, + "Failed to remove zone"; + "zone" => &zone_name, + "error" => %e, ); + return Err(Error::ZoneRemoval { + zone_name: zone_name.to_string(), + err: e, + }); } - if let Err(e) = zone.stop().await { - error!(log, "Failed to stop zone {}: {e}", zone.name()); - } - } else { - warn!(log, "Expected to remove zone, but could not find it"); + return Ok(()); } + Ok(None) => return Ok(()), + Err(err) => return Err(Error::ZoneList(err)), } + } - // Create zones that should be running - let storage = self.inner.storage.get_latest_resources().await; - let all_u2_pools = storage.all_u2_zpools(); - - let mut new_zones = Vec::new(); - for zone in zones_to_be_added { - // Check if we think the zone should already be running - let name = zone.zone_name(); - if existing_zones.contains_key(&name) { - // Make sure the zone actually exists in the right state too - match Zones::find(&name).await { - Ok(Some(zone)) if zone.state() == zone::State::Running => { - info!(log, "skipping running zone"; "zone" => &name); - continue; - } - _ => { - // Mismatch between SA's view and reality, let's try to - // clean up any remanants and try initialize it again - warn!( - log, - "expected to find existing zone in running state"; - "zone" => &name, - ); - if let Err(e) = - existing_zones.remove(&name).unwrap().stop().await - { - error!( - log, - "Failed to stop zone"; - "zone" => &name, - "error" => %e, - ); - } - } - } - } + // Returns a zone filesystem mountpoint, after ensuring that U.2 storage + // is valid. + async fn validate_storage_and_pick_mountpoint( + &self, + zone: &OmicronZoneConfig, + all_u2_pools: &Vec, + ) -> Result { + let name = zone.zone_name(); + + // For each new zone request, we pick a U.2 to store the zone + // filesystem. Note: This isn't known to Nexus right now, so it's a + // local-to-sled decision. + // + // Currently, the zone filesystem should be destroyed between + // reboots, so it's fine to make this decision locally. + let root = if let Some(dataset) = zone.dataset_name() { + // Check that the dataset is actually ready to be used. + let [zoned, canmount, encryption] = + illumos_utils::zfs::Zfs::get_values( + &dataset.full_name(), + &["zoned", "canmount", "encryption"], + ) + .map_err(|err| Error::GetZfsValue { + zone: zone.zone_name(), + source: err, + })?; - // For each new zone request, we pick a U.2 to store the zone - // filesystem. Note: This isn't known to Nexus right now, so it's a - // local-to-sled decision. - // - // Currently, the zone filesystem should be destroyed between - // reboots, so it's fine to make this decision locally. - let root = if let Some(dataset) = zone.dataset_name() { - // Check that the dataset is actually ready to be used. 
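For reference, the property check in `validate_storage_and_pick_mountpoint` corresponds roughly to the `zfs get` CLI; a hedged sketch (the dataset name is invented, and the real code goes through `illumos_utils::zfs` rather than spawning a process):

```rust
use std::process::Command;

fn dataset_props(dataset: &str) -> std::io::Result<Vec<String>> {
    // `-H` drops headers, `-p` asks for parsable values, and `-o value`
    // prints one property value per line, in the order requested.
    let out = Command::new("/usr/sbin/zfs")
        .args(["get", "-Hpo", "value", "zoned,canmount,encryption", dataset])
        .output()?;
    Ok(String::from_utf8_lossy(&out.stdout)
        .lines()
        .map(str::to_string)
        .collect())
}

fn main() -> std::io::Result<()> {
    // An encrypted zone dataset is expected to report: on, on, aes-256-gcm.
    let props = dataset_props("oxp_example/crypt/zone/oxz_oximeter")?;
    println!("{props:?}");
    Ok(())
}
```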
- let [zoned, canmount, encryption] = - illumos_utils::zfs::Zfs::get_values( - &dataset.full_name(), - &["zoned", "canmount", "encryption"], - ) - .map_err(|err| Error::GetZfsValue { + let check_property = |name, actual, expected| { + if actual != expected { + return Err(Error::DatasetNotReady { zone: zone.zone_name(), - source: err, - })?; - - let check_property = |name, actual, expected| { - if actual != expected { - return Err(Error::DatasetNotReady { - zone: zone.zone_name(), - dataset: dataset.full_name(), - prop_name: String::from(name), - prop_value: actual, - prop_value_expected: String::from(expected), - }); - } - return Ok(()); - }; - check_property("zoned", zoned, "on")?; - check_property("canmount", canmount, "on")?; - if dataset.dataset().dataset_should_be_encrypted() { - check_property("encryption", encryption, "aes-256-gcm")?; - } - - // If the zone happens to already manage a dataset, then - // we co-locate the zone dataset on the same zpool. - // - // This slightly reduces the underlying fault domain for the - // service. - let data_pool = dataset.pool(); - if !all_u2_pools.contains(&data_pool) { - warn!( - log, - "zone dataset requested on a zpool which doesn't exist"; - "zone" => &name, - "zpool" => %data_pool - ); - return Err(Error::MissingDevice { - device: format!("zpool: {data_pool}"), + dataset: dataset.full_name(), + prop_name: String::from(name), + prop_value: actual, + prop_value_expected: String::from(expected), }); } - data_pool.dataset_mountpoint(ZONE_DATASET) - } else { - // If the zone it not coupled to other datsets, we pick one - // arbitrarily. - let mut rng = rand::thread_rng(); - all_u2_pools - .choose(&mut rng) - .map(|pool| pool.dataset_mountpoint(ZONE_DATASET)) - .ok_or_else(|| Error::U2NotFound)? - .clone() + return Ok(()); }; - - new_zones.push(OmicronZoneConfigLocal { zone: zone.clone(), root }); - } - - self.initialize_omicron_zones_locked(existing_zones, &new_zones) - .await?; - - if let Some(old_config) = old_config { - for old_zone in &old_config.zones { - if requested_zones_set.contains(&old_zone.zone) { - new_zones.push(old_zone.clone()); - } + check_property("zoned", zoned, "on")?; + check_property("canmount", canmount, "on")?; + if dataset.dataset().dataset_should_be_encrypted() { + check_property("encryption", encryption, "aes-256-gcm")?; } - } - Ok(OmicronZonesConfigLocal { - omicron_generation: new_request.generation, - ledger_generation: old_config - .map(|c| c.ledger_generation) - .unwrap_or_else(Generation::new), - zones: new_zones, - }) + // If the zone happens to already manage a dataset, then + // we co-locate the zone dataset on the same zpool. + // + // This slightly reduces the underlying fault domain for the + // service. + let data_pool = dataset.pool(); + if !all_u2_pools.contains(&data_pool) { + warn!( + self.inner.log, + "zone dataset requested on a zpool which doesn't exist"; + "zone" => &name, + "zpool" => %data_pool + ); + return Err(Error::MissingDevice { + device: format!("zpool: {data_pool}"), + }); + } + data_pool.dataset_mountpoint(ZONE_DATASET) + } else { + // If the zone is not coupled to other datasets, we pick one + // arbitrarily. + let mut rng = rand::thread_rng(); + all_u2_pools + .choose(&mut rng) + .map(|pool| pool.dataset_mountpoint(ZONE_DATASET)) + .ok_or_else(|| Error::U2NotFound)?
+ .clone() + }; + Ok(root) } pub async fn cockroachdb_initialize(&self) -> Result<(), Error> { @@ -3080,7 +3158,7 @@ impl ServiceManager { if zone.name().contains(&ZoneType::CockroachDb.to_string()) { let address = Zones::get_address( Some(zone.name()), - &zone.control_interface(), + &zone.runtime.control_interface(), )? .ip(); let host = &format!("[{address}]:{COCKROACH_PORT}"); @@ -3088,7 +3166,7 @@ impl ServiceManager { log, "Initializing CRDB Cluster - sending request to {host}" ); - if let Err(err) = zone.run_cmd(&[ + if let Err(err) = zone.runtime.run_cmd(&[ "/opt/oxide/cockroachdb/bin/cockroach", "init", "--insecure", @@ -3103,26 +3181,28 @@ impl ServiceManager { } }; info!(log, "Formatting CRDB"); - zone.run_cmd(&[ - "/opt/oxide/cockroachdb/bin/cockroach", - "sql", - "--insecure", - "--host", - host, - "--file", - "/opt/oxide/cockroachdb/sql/dbwipe.sql", - ]) - .map_err(|err| Error::CockroachInit { err })?; - zone.run_cmd(&[ - "/opt/oxide/cockroachdb/bin/cockroach", - "sql", - "--insecure", - "--host", - host, - "--file", - "/opt/oxide/cockroachdb/sql/dbinit.sql", - ]) - .map_err(|err| Error::CockroachInit { err })?; + zone.runtime + .run_cmd(&[ + "/opt/oxide/cockroachdb/bin/cockroach", + "sql", + "--insecure", + "--host", + host, + "--file", + "/opt/oxide/cockroachdb/sql/dbwipe.sql", + ]) + .map_err(|err| Error::CockroachInit { err })?; + zone.runtime + .run_cmd(&[ + "/opt/oxide/cockroachdb/bin/cockroach", + "sql", + "--insecure", + "--host", + host, + "--file", + "/opt/oxide/cockroachdb/sql/dbinit.sql", + ]) + .map_err(|err| Error::CockroachInit { err })?; info!(log, "Formatting CRDB - Completed"); // In the single-sled case, if there are multiple CRDB nodes on @@ -3203,7 +3283,8 @@ impl ServiceManager { // connect to the UNIX socket at // format!("{}/var/run/chrony/chronyd.sock", ntp_zone.root()) - match ntp_zone.run_cmd(&["/usr/bin/chronyc", "-c", "tracking"]) { + match ntp_zone.runtime.run_cmd(&["/usr/bin/chronyc", "-c", "tracking"]) + { Ok(stdout) => { let v: Vec<&str> = stdout.split(',').collect(); @@ -3793,6 +3874,15 @@ mod test { expected_zone_name_prefix: &str, ) -> Vec> { illumos_utils::USE_MOCKS.store(true, Ordering::SeqCst); + + // Ensure zone doesn't already exist + let find_zone_ctx = MockZones::find_context(); + let prefix = expected_zone_name_prefix.to_string(); + find_zone_ctx.expect().return_once(move |zone_name| { + assert!(zone_name.starts_with(&prefix)); + Ok(None) + }); + // Create a VNIC let create_vnic_ctx = MockDladm::create_vnic_context(); create_vnic_ctx.expect().return_once( @@ -3850,6 +3940,7 @@ mod test { }); vec![ + Box::new(find_zone_ctx), Box::new(create_vnic_ctx), Box::new(install_ctx), Box::new(boot_ctx), @@ -3867,6 +3958,11 @@ mod test { // because these functions may return any number of times. fn expect_new_services() -> Vec> { illumos_utils::USE_MOCKS.store(true, Ordering::SeqCst); + + // Ensure zones don't already exist + let find_zone_ctx = MockZones::find_context(); + find_zone_ctx.expect().returning(move |_zone_name| Ok(None)); + // Create a VNIC let create_vnic_ctx = MockDladm::create_vnic_context(); create_vnic_ctx.expect().returning( @@ -3925,6 +4021,7 @@ mod test { }); vec![ + Box::new(find_zone_ctx), Box::new(create_vnic_ctx), Box::new(install_ctx), Box::new(boot_ctx), @@ -4214,9 +4311,24 @@ mod test { OmicronZoneType::Oximeter { address }, ) .await; + + // First, ensure this is the right kind of error. 
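Returning to the time-sync check earlier in this hunk: `chronyc -c tracking` emits one comma-separated record, which the code splits on commas. A small sketch of pulling fields out of such a record (the sample line and field meanings are invented for illustration, not taken from chrony documentation):

```rust
fn main() {
    // Invented sample; real output has more fields.
    let stdout = "A29FC87B,169.254.0.1,4,1708000000.0,-0.000012345";
    let v: Vec<&str> = stdout.split(',').collect();

    let reference_ip = v[1];
    let offset: f64 = v[4].parse().expect("offset should be numeric");
    println!("ref = {reference_ip}, offset = {offset}");
}
```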
+ let err = result.unwrap_err(); + let errors = match &err { + Error::ZoneEnsure { errors } => errors, + err => panic!("unexpected result: {err:?}"), + }; + assert_eq!(errors.len(), 1); assert_matches::assert_matches!( - result, - Err(Error::TimeNotSynchronized) + errors[0].1, + Error::TimeNotSynchronized + ); + + // Next, ensure this still converts to an "unavail" common error + let common_err = omicron_common::api::external::Error::from(err); + assert_matches::assert_matches!( + common_err, + omicron_common::api::external::Error::ServiceUnavailable { .. } ); // Should succeed: we don't care that time has not yet synchronized (for @@ -4521,88 +4633,6 @@ mod test { logctx.cleanup_successful(); } - #[tokio::test] - async fn test_old_ledger_migration_continue() { - // This test is just like "test_old_ledger_migration", except that we - // deploy a new zone after migration and before shutting down the - // service manager. This tests that new changes modify the new, - // migrated config. - let logctx = omicron_test_utils::dev::test_setup_log( - "test_old_ledger_migration_continue", - ); - let test_config = TestConfig::new().await; - - // Before we start the service manager, stuff one of our old-format - // service ledgers into place. - let contents = - include_str!("../tests/old-service-ledgers/rack2-sled10.json"); - std::fs::write( - test_config.config_dir.path().join(SERVICES_LEDGER_FILENAME), - contents, - ) - .expect("failed to copy example old-format services ledger into place"); - - // Now start the service manager. - let helper = - LedgerTestHelper::new(logctx.log.clone(), &test_config).await; - let mgr = helper.clone().new_service_manager(); - LedgerTestHelper::sled_agent_started(&logctx.log, &test_config, &mgr); - - // Trigger the migration code. - let unused = Mutex::new(BTreeMap::new()); - let migrated_ledger = mgr - .load_ledgered_zones(&unused.lock().await) - .await - .expect("failed to load ledgered zones") - .unwrap(); - - // The other test verified that migration has happened normally so let's - // assume it has. Now provision a new zone. - let vv = migrated_ledger.data().omicron_generation.next(); - let id = Uuid::new_v4(); - - let _expectations = expect_new_services(); - let address = - SocketAddrV6::new(Ipv6Addr::LOCALHOST, EXPECTED_PORT, 0, 0); - let mut zones = - migrated_ledger.data().clone().to_omicron_zones_config().zones; - zones.push(OmicronZoneConfig { - id, - underlay_address: Ipv6Addr::LOCALHOST, - zone_type: OmicronZoneType::InternalNtp { - address, - ntp_servers: vec![], - dns_servers: vec![], - domain: None, - }, - }); - mgr.ensure_all_omicron_zones_persistent(OmicronZonesConfig { - generation: vv, - zones, - }) - .await - .expect("failed to add new zone after migration"); - let found = - mgr.omicron_zones_list().await.expect("failed to list zones"); - assert_eq!(found.generation, vv); - assert_eq!(found.zones.len(), migrated_ledger.data().zones.len() + 1); - - // Just to be sure, shut down the manager and create a new one without - // triggering migration again. It should now report one more zone than - // was migrated earlier. 
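A standalone sketch of the classification idea the test above pins down: an aggregate ensure failure whose cause is transient (time not yet synchronized) should surface as a retryable "unavailable" error rather than an internal one. The types here are simplified stand-ins, not the `omicron_common` ones:

```rust
#[derive(Debug)]
enum ZoneError {
    TimeNotSynchronized,
    Other(String),
}

#[derive(Debug)]
enum ExternalError {
    ServiceUnavailable(String),
    Internal(String),
}

fn classify(errors: &[(String, ZoneError)]) -> ExternalError {
    let transient = errors
        .iter()
        .any(|(_, e)| matches!(e, ZoneError::TimeNotSynchronized));
    if transient {
        ExternalError::ServiceUnavailable("sled time not yet synchronized".into())
    } else {
        ExternalError::Internal(format!("{} zone(s) failed to start", errors.len()))
    }
}

fn main() {
    let errors = vec![("oximeter".to_string(), ZoneError::TimeNotSynchronized)];
    assert!(matches!(classify(&errors), ExternalError::ServiceUnavailable(_)));
}
```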
- drop_service_manager(mgr); - - let mgr = helper.new_service_manager(); - LedgerTestHelper::sled_agent_started(&logctx.log, &test_config, &mgr); - let found = - mgr.omicron_zones_list().await.expect("failed to list zones"); - assert_eq!(found.generation, vv); - assert_eq!(found.zones.len(), migrated_ledger.data().zones.len() + 1); - - drop_service_manager(mgr); - logctx.cleanup_successful(); - } - #[tokio::test] async fn test_old_ledger_migration_bad() { let logctx = omicron_test_utils::dev::test_setup_log( diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs index b214667631..fd5995b8f1 100644 --- a/sled-agent/src/sim/server.rs +++ b/sled-agent/src/sim/server.rs @@ -26,6 +26,8 @@ use omicron_common::FileKv; use slog::{info, Drain, Logger}; use std::collections::HashMap; use std::net::IpAddr; +use std::net::Ipv4Addr; +use std::net::Ipv6Addr; use std::net::SocketAddr; use std::net::SocketAddrV6; use std::sync::Arc; @@ -455,7 +457,13 @@ pub async fn run_standalone_server( external_port_count: NexusTypes::ExternalPortDiscovery::Static( HashMap::new(), ), - rack_network_config: None, + rack_network_config: NexusTypes::RackNetworkConfigV1 { + rack_subnet: Ipv6Addr::LOCALHOST.into(), + infra_ip_first: Ipv4Addr::LOCALHOST, + infra_ip_last: Ipv4Addr::LOCALHOST, + ports: Vec::new(), + bgp: Vec::new(), + }, }; handoff_to_nexus(&log, &config, &rack_init_request).await?; diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index eaf354db26..bcc354232e 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -9,7 +9,7 @@ use crate::bootstrap::config::BOOTSTRAP_AGENT_RACK_INIT_PORT; use crate::bootstrap::early_networking::{ EarlyNetworkConfig, EarlyNetworkSetupError, }; -use crate::bootstrap::params::StartSledAgentRequest; +use crate::bootstrap::params::{BaseboardId, StartSledAgentRequest}; use crate::config::Config; use crate::instance_manager::{InstanceManager, ReservoirMode}; use crate::long_running_tasks::LongRunningTaskHandles; @@ -1187,8 +1187,8 @@ pub enum AddSledError { }, #[error("Failed to connect to DDM")] DdmAdminClient(#[source] ddm_admin_client::DdmError), - #[error("Failed to learn bootstrap ip for {0}")] - NotFound(Baseboard), + #[error("Failed to learn bootstrap ip for {0:?}")] + NotFound(BaseboardId), #[error("Failed to initialize {sled_id}: {err}")] BootstrapTcpClient { sled_id: Baseboard, @@ -1199,7 +1199,7 @@ pub enum AddSledError { /// Add a sled to an initialized rack. pub async fn sled_add( log: Logger, - sled_id: Baseboard, + sled_id: BaseboardId, request: StartSledAgentRequest, ) -> Result<(), AddSledError> { // Get all known bootstrap addresses via DDM @@ -1227,16 +1227,20 @@ pub async fn sled_add( }) .collect::>(); - // Execute the futures until we find our matching sled or done searching + // Execute the futures until we find our matching sled or are done searching let mut target_ip = None; + let mut found_baseboard = None; while let Some((ip, result)) = addrs_to_sleds.next().await { match result { Ok(baseboard) => { // Convert from progenitor type back to `sled-hardware` // type. 
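A toy version of the matching step that follows: a discovered sled counts as the requested one only when both serial and part number line up. The types are simplified stand-ins for `BaseboardId` and `Baseboard`, with serial and part numbers borrowed from the sled plan elsewhere in this diff:

```rust
struct BaseboardId {
    serial_number: String,
    part_number: String,
}

struct Baseboard {
    identifier: String,
    model: String,
}

fn is_match(id: &BaseboardId, found: &Baseboard) -> bool {
    id.serial_number == found.identifier && id.part_number == found.model
}

fn main() {
    let id = BaseboardId {
        serial_number: "BRM42220081".into(),
        part_number: "913-0000019".into(),
    };
    let found = Baseboard {
        identifier: "BRM42220081".into(),
        model: "913-0000019".into(),
    };
    assert!(is_match(&id, &found));
}
```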
- let found = baseboard.into_inner().into(); - if sled_id == found { + let found: Baseboard = baseboard.into_inner().into(); + if sled_id.serial_number == found.identifier() + && sled_id.part_number == found.model() + { target_ip = Some(ip); + found_baseboard = Some(found); break; } } @@ -1259,10 +1263,14 @@ pub async fn sled_add( log.new(o!("BootstrapAgentClient" => bootstrap_addr.to_string())), ); + // Safe to unwrap, because we would have bailed when checking target_ip + // above otherwise. baseboard and target_ip are set together. + let baseboard = found_baseboard.unwrap(); + client.start_sled_agent(&request).await.map_err(|err| { - AddSledError::BootstrapTcpClient { sled_id: sled_id.clone(), err } + AddSledError::BootstrapTcpClient { sled_id: baseboard.clone(), err } })?; - info!(log, "Peer agent initialized"; "peer_bootstrap_addr" => %bootstrap_addr, "peer_id" => %sled_id); + info!(log, "Peer agent initialized"; "peer_bootstrap_addr" => %bootstrap_addr, "peer_id" => %baseboard); Ok(()) } diff --git a/sled-agent/tests/old-rss-sled-plans/madrid-rss-sled-plan.json b/sled-agent/tests/old-rss-sled-plans/madrid-rss-sled-plan.json new file mode 100644 index 0000000000..5512247ee8 --- /dev/null +++ b/sled-agent/tests/old-rss-sled-plans/madrid-rss-sled-plan.json @@ -0,0 +1 @@ +{"rack_id":"ed6bcf59-9620-491d-8ebd-4a4eebf2e136","sleds":{"[fdb0:a840:2504:396::1]:12346":{"generation":0,"schema_version":1,"body":{"id":"b3e78a88-0f2e-476e-a8a9-2d8c90a169d6","rack_id":"ed6bcf59-9620-491d-8ebd-4a4eebf2e136","use_trust_quorum":true,"is_lrtq_learner":false,"subnet":{"net":"fd00:1122:3344:103::/64"}}},"[fdb0:a840:2504:157::1]:12346":{"generation":0,"schema_version":1,"body":{"id":"168e1ad6-1e4b-4f7a-b894-157974bd8bb8","rack_id":"ed6bcf59-9620-491d-8ebd-4a4eebf2e136","use_trust_quorum":true,"is_lrtq_learner":false,"subnet":{"net":"fd00:1122:3344:104::/64"}}},"[fdb0:a840:2504:355::1]:12346":{"generation":0,"schema_version":1,"body":{"id":"b9877212-212b-4588-b818-9c7b53c5b143","rack_id":"ed6bcf59-9620-491d-8ebd-4a4eebf2e136","use_trust_quorum":true,"is_lrtq_learner":false,"subnet":{"net":"fd00:1122:3344:102::/64"}}},"[fdb0:a840:2504:3d2::1]:12346":{"generation":0,"schema_version":1,"body":{"id":"c3a0f8be-5b05-4ee8-8c4e-2514de6501b6","rack_id":"ed6bcf59-9620-491d-8ebd-4a4eebf2e136","use_trust_quorum":true,"is_lrtq_learner":false,"subnet":{"net":"fd00:1122:3344:101::/64"}}}},"config":{"rack_subnet":"fd00:1122:3344:100::","trust_quorum_peers":[{"type":"gimlet","identifier":"BRM42220081","model":"913-0000019","revision":6},{"type":"gimlet","identifier":"BRM42220046","model":"913-0000019","revision":6},{"type":"gimlet","identifier":"BRM44220001","model":"913-0000019","revision":6},{"type":"gimlet","identifier":"BRM42220004","model":"913-0000019","revision":6}],"bootstrap_discovery":{"type":"only_these","addrs":["fdb0:a840:2504:3d2::1","fdb0:a840:2504:355::1","fdb0:a840:2504:396::1","fdb0:a840:2504:157::1"]},"ntp_servers":["ntp.eng.oxide.computer"],"dns_servers":["1.1.1.1","9.9.9.9"],"internal_services_ip_pool_ranges":[{"first":"172.20.28.1","last":"172.20.28.10"}],"external_dns_ips":["172.20.28.1"],"external_dns_zone_name":"madrid.eng.oxide.computer","external_certificates":[{"cert":"","key":""}],"recovery_silo":{"silo_name":"recovery","user_name":"recovery","user_password_hash":"$argon2id$v=19$m=98304,t=13,p=1$RUlWc0ZxaHo0WFdrN0N6ZQ$S8p52j85GPvMhR/ek3GL0el/oProgTwWpHJZ8lsQQoY"},"rack_network_config":{"rack_subnet":"fd00:1122:3344:1::/56","infra_ip_first":"172.20.15.37","infra_ip_last":"172.20.15.38","ports":[{"
routes":[{"destination":"0.0.0.0/0","nexthop":"172.20.15.33"}],"addresses":["172.20.15.38/29"],"switch":"switch0","port":"qsfp0","uplink_port_speed":"speed40_g","uplink_port_fec":"none","bgp_peers":[],"autoneg":false},{"routes":[{"destination":"0.0.0.0/0","nexthop":"172.20.15.33"}],"addresses":["172.20.15.37/29"],"switch":"switch1","port":"qsfp0","uplink_port_speed":"speed40_g","uplink_port_fec":"none","bgp_peers":[],"autoneg":false}],"bgp":[]}}} diff --git a/sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json b/sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json new file mode 100644 index 0000000000..69f68c60ad --- /dev/null +++ b/sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json @@ -0,0 +1,164 @@ +{ + "rack_id": "ed6bcf59-9620-491d-8ebd-4a4eebf2e136", + "sleds": { + "[fdb0:a840:2504:157::1]:12346": { + "generation": 0, + "schema_version": 1, + "body": { + "id": "168e1ad6-1e4b-4f7a-b894-157974bd8bb8", + "rack_id": "ed6bcf59-9620-491d-8ebd-4a4eebf2e136", + "use_trust_quorum": true, + "is_lrtq_learner": false, + "subnet": { + "net": "fd00:1122:3344:104::/64" + } + } + }, + "[fdb0:a840:2504:355::1]:12346": { + "generation": 0, + "schema_version": 1, + "body": { + "id": "b9877212-212b-4588-b818-9c7b53c5b143", + "rack_id": "ed6bcf59-9620-491d-8ebd-4a4eebf2e136", + "use_trust_quorum": true, + "is_lrtq_learner": false, + "subnet": { + "net": "fd00:1122:3344:102::/64" + } + } + }, + "[fdb0:a840:2504:396::1]:12346": { + "generation": 0, + "schema_version": 1, + "body": { + "id": "b3e78a88-0f2e-476e-a8a9-2d8c90a169d6", + "rack_id": "ed6bcf59-9620-491d-8ebd-4a4eebf2e136", + "use_trust_quorum": true, + "is_lrtq_learner": false, + "subnet": { + "net": "fd00:1122:3344:103::/64" + } + } + }, + "[fdb0:a840:2504:3d2::1]:12346": { + "generation": 0, + "schema_version": 1, + "body": { + "id": "c3a0f8be-5b05-4ee8-8c4e-2514de6501b6", + "rack_id": "ed6bcf59-9620-491d-8ebd-4a4eebf2e136", + "use_trust_quorum": true, + "is_lrtq_learner": false, + "subnet": { + "net": "fd00:1122:3344:101::/64" + } + } + } + }, + "config": { + "trust_quorum_peers": [ + { + "type": "gimlet", + "identifier": "BRM42220081", + "model": "913-0000019", + "revision": 6 + }, + { + "type": "gimlet", + "identifier": "BRM42220046", + "model": "913-0000019", + "revision": 6 + }, + { + "type": "gimlet", + "identifier": "BRM44220001", + "model": "913-0000019", + "revision": 6 + }, + { + "type": "gimlet", + "identifier": "BRM42220004", + "model": "913-0000019", + "revision": 6 + } + ], + "bootstrap_discovery": { + "type": "only_these", + "addrs": [ + "fdb0:a840:2504:157::1", + "fdb0:a840:2504:355::1", + "fdb0:a840:2504:396::1", + "fdb0:a840:2504:3d2::1" + ] + }, + "ntp_servers": [ + "ntp.eng.oxide.computer" + ], + "dns_servers": [ + "1.1.1.1", + "9.9.9.9" + ], + "internal_services_ip_pool_ranges": [ + { + "first": "172.20.28.1", + "last": "172.20.28.10" + } + ], + "external_dns_ips": [ + "172.20.28.1" + ], + "external_dns_zone_name": "madrid.eng.oxide.computer", + "external_certificates": [ + { + "cert": "", + "key": "" + } + ], + "recovery_silo": { + "silo_name": "recovery", + "user_name": "recovery", + "user_password_hash": "$argon2id$v=19$m=98304,t=13,p=1$RUlWc0ZxaHo0WFdrN0N6ZQ$S8p52j85GPvMhR/ek3GL0el/oProgTwWpHJZ8lsQQoY" + }, + "rack_network_config": { + "rack_subnet": "fd00:1122:3344:1::/56", + "infra_ip_first": "172.20.15.37", + "infra_ip_last": "172.20.15.38", + "ports": [ + { + "routes": [ + { + "destination": "0.0.0.0/0", + "nexthop": "172.20.15.33" + } + ], + "addresses": [ + 
"172.20.15.38/29" + ], + "switch": "switch0", + "port": "qsfp0", + "uplink_port_speed": "speed40_g", + "uplink_port_fec": "none", + "bgp_peers": [], + "autoneg": false + }, + { + "routes": [ + { + "destination": "0.0.0.0/0", + "nexthop": "172.20.15.33" + } + ], + "addresses": [ + "172.20.15.37/29" + ], + "switch": "switch1", + "port": "qsfp0", + "uplink_port_speed": "speed40_g", + "uplink_port_fec": "none", + "bgp_peers": [], + "autoneg": false + } + ], + "bgp": [] + } + } +} \ No newline at end of file diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 8fc2429169..40ed41bfda 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -51,6 +51,7 @@ phantom_disks.period_secs = 30 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 sync_service_zone_nat.period_secs = 30 +region_replacement.period_secs = 30 [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index 15f0a4ebe1..2e259aa42f 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -51,6 +51,7 @@ phantom_disks.period_secs = 30 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 sync_service_zone_nat.period_secs = 30 +region_replacement.period_secs = 30 [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. diff --git a/smf/sled-agent/gimlet-standalone/config-rss.toml b/smf/sled-agent/gimlet-standalone/config-rss.toml index f7a93260e3..6c874d9a70 100644 --- a/smf/sled-agent/gimlet-standalone/config-rss.toml +++ b/smf/sled-agent/gimlet-standalone/config-rss.toml @@ -4,14 +4,6 @@ # Agent API. See the `RackInitializeRequest` type in bootstrap-agent or its # OpenAPI spec (in openapi/bootstrap-agent.json in the root of this workspace). -# The /56 subnet for this rack. This subnet is internal to the rack and fully -# managed by Omicron, so you can pick anything you want within the IPv6 Unique -# Local Address (ULA) range. The rack-specific /56 subnet also implies the -# parent /48 AZ subnet. -# |............| <- This /48 is the AZ Subnet -# |...............| <- This /56 is the Rack Subnet -rack_subnet = "fd00:1122:3344:0100::" - # Only include "our own sled" in the bootstrap network bootstrap_discovery.type = "only_ours" @@ -88,7 +80,14 @@ last = "192.168.1.29" # Configuration to bring up Boundary Services and make Nexus reachable from the # outside. See docs/how-to-run.adoc for more on what to put here. [rack_network_config] -rack_subnet = "fd00:1122:3344:01::/56" +# The /56 subnet for this rack. This subnet is internal to the rack and fully +# managed by Omicron, so you can pick anything you want within the IPv6 Unique +# Local Address (ULA) range. The rack-specific /56 subnet also implies the +# parent /48 AZ subnet. +# |............| <- This /48 is the AZ Subnet +# |...............| <- This /56 is the Rack Subnet +rack_subnet = "fd00:1122:3344:0100::/56" + # A range of IP addresses used by Boundary Services on the external network. In # a real system, these would be addresses of the uplink ports on the Sidecar. # With softnpu, only one address is used. 
diff --git a/smf/sled-agent/non-gimlet/config-rss.toml b/smf/sled-agent/non-gimlet/config-rss.toml index 12cb2afd24..d0b4f94d9f 100644 --- a/smf/sled-agent/non-gimlet/config-rss.toml +++ b/smf/sled-agent/non-gimlet/config-rss.toml @@ -4,14 +4,6 @@ # Agent API. See the `RackInitializeRequest` type in bootstrap-agent or its # OpenAPI spec (in openapi/bootstrap-agent.json in the root of this workspace). -# The /56 subnet for this rack. This subnet is internal to the rack and fully -# managed by Omicron, so you can pick anything you want within the IPv6 Unique -# Local Address (ULA) range. The rack-specific /56 subnet also implies the -# parent /48 AZ subnet. -# |............| <- This /48 is the AZ Subnet -# |...............| <- This /56 is the Rack Subnet -rack_subnet = "fd00:1122:3344:0100::" - # Only include "our own sled" in the bootstrap network bootstrap_discovery.type = "only_ours" @@ -88,7 +80,14 @@ last = "192.168.1.29" # Configuration to bring up Boundary Services and make Nexus reachable from the # outside. See docs/how-to-run.adoc for more on what to put here. [rack_network_config] -rack_subnet = "fd00:1122:3344:01::/56" +# The /56 subnet for this rack. This subnet is internal to the rack and fully +# managed by Omicron, so you can pick anything you want within the IPv6 Unique +# Local Address (ULA) range. The rack-specific /56 subnet also implies the +# parent /48 AZ subnet. +# |............| <- This /48 is the AZ Subnet +# |...............| <- This /56 is the Rack Subnet +rack_subnet = "fd00:1122:3344:0100::/56" + # A range of IP addresses used by Boundary Services on the external network. In # a real system, these would be addresses of the uplink ports on the Sidecar. # With softnpu, only one address is used. diff --git a/tools/ci_download_clickhouse b/tools/ci_download_clickhouse index 03a5bff24c..675566fad7 100755 --- a/tools/ci_download_clickhouse +++ b/tools/ci_download_clickhouse @@ -20,7 +20,7 @@ DOWNLOAD_DIR="$TARGET_DIR/downloads" DEST_DIR="./$TARGET_DIR/clickhouse" # If you change this, you must also update the md5sums below -CIDL_VERSION="v22.8.9.24" +CIDL_VERSION="$(cat "$SOURCE_DIR/clickhouse_version")" source "$SOURCE_DIR/clickhouse_checksums" # Download from manually-populated S3 bucket for now diff --git a/tools/ci_download_cockroachdb b/tools/ci_download_cockroachdb index ca484c000f..5755e7e665 100755 --- a/tools/ci_download_cockroachdb +++ b/tools/ci_download_cockroachdb @@ -13,7 +13,7 @@ set -o errexit SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" ARG0="$(basename "${BASH_SOURCE[0]}")" -# If you change this, you must also update the md5sums below +# If you change this, you must also update the sha256sums below CIDL_VERSION="$(cat "$SOURCE_DIR/cockroachdb_version")" source "$SOURCE_DIR/cockroachdb_checksums" @@ -49,6 +49,7 @@ function main # Configure this program configure_os "$CIDL_OS" CIDL_URL="$CIDL_URL_BASE/$TARBALL_FILENAME" + CIDL_SHA256FUNC="do_sha256sum" # Download the file. echo "URL: $CIDL_URL" @@ -60,9 +61,9 @@ function main local DO_DOWNLOAD="true" if [[ -f "$TARBALL_FILE" ]]; then # If the file exists with a valid checksum, we can skip downloading. 
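The download script's checksum flow (hash the cached tarball, compare against the pinned value, re-download on mismatch) looks like this in Rust, using the `sha2` crate. The file path is invented; the expected sum is the Linux value pinned in `tools/cockroachdb_checksums`:

```rust
use sha2::{Digest, Sha256};

fn sha256_hex(path: &str) -> std::io::Result<String> {
    let bytes = std::fs::read(path)?;
    let digest = Sha256::digest(&bytes);
    Ok(digest.iter().map(|b| format!("{b:02x}")).collect())
}

fn main() -> std::io::Result<()> {
    let expected = "24c321820e7ee45fa07fe91ac138befe13ad860e41c6ed595ce58823205ff4a9";
    let actual = sha256_hex("downloads/cockroach.tgz")?;
    if actual != expected {
        eprintln!("sha256sum mismatch (expected {expected}, found {actual})");
        std::process::exit(1);
    }
    Ok(())
}
```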
- calculated_md5="$($CIDL_MD5FUNC "$TARBALL_FILE")" || \ - fail "failed to calculate md5sum" - if [[ "$calculated_md5" == "$CIDL_MD5" ]]; then + calculated_sha256="$($CIDL_SHA256FUNC "$TARBALL_FILE")" || \ + fail "failed to calculate sha256sum" + if [[ "$calculated_sha256" == "$CIDL_SHA256" ]]; then DO_DOWNLOAD="false" fi fi @@ -72,12 +73,12 @@ function main do_download_curl "$CIDL_URL" "$TARBALL_FILE" || \ fail "failed to download file" - # Verify the md5sum. - calculated_md5="$($CIDL_MD5FUNC "$TARBALL_FILE")" || \ - fail "failed to calculate md5sum" - if [[ "$calculated_md5" != "$CIDL_MD5" ]]; then - fail "md5sum mismatch \ - (expected $CIDL_MD5, found $calculated_md5)" + # Verify the sha256sum. + calculated_sha256="$($CIDL_SHA256FUNC "$TARBALL_FILE")" || \ + fail "failed to calculate sha256sum" + if [[ "$calculated_sha256" != "$CIDL_SHA256" ]]; then + fail "sha256sum mismatch \ + (expected $CIDL_SHA256, found $calculated_sha256)" fi fi @@ -105,24 +106,21 @@ function configure_os darwin*) CIDL_BUILD="darwin-10.9-amd64" CIDL_SUFFIX="tgz" - CIDL_MD5="$CIDL_MD5_DARWIN" - CIDL_MD5FUNC="do_md5" + CIDL_SHA256="$CIDL_SHA256_DARWIN" CIDL_URL_BASE="$CIDL_URL_COCKROACH" CIDL_ASSEMBLE="do_assemble_official" ;; linux-gnu*) CIDL_BUILD="linux-amd64" CIDL_SUFFIX="tgz" - CIDL_MD5="$CIDL_MD5_LINUX" - CIDL_MD5FUNC="do_md5sum" + CIDL_SHA256="$CIDL_SHA256_LINUX" CIDL_URL_BASE="$CIDL_URL_COCKROACH" CIDL_ASSEMBLE="do_assemble_official" ;; solaris*) CIDL_BUILD="illumos" CIDL_SUFFIX="tar.gz" - CIDL_MD5="$CIDL_MD5_ILLUMOS" - CIDL_MD5FUNC="do_md5sum" + CIDL_SHA256="$CIDL_SHA256_ILLUMOS" CIDL_URL_BASE="$CIDL_URL_ILLUMOS" CIDL_ASSEMBLE="do_assemble_illumos" ;; @@ -143,14 +141,9 @@ function do_download_curl curl --silent --show-error --fail --location --output "$2" "$1" } -function do_md5 +function do_sha256sum { - md5 < "$1" -} - -function do_md5sum -{ - md5sum < "$1" | awk '{print $1}' + sha256sum < "$1" | awk '{print $1}' } function do_untar diff --git a/tools/clickhouse_version b/tools/clickhouse_version new file mode 100644 index 0000000000..93b98bf738 --- /dev/null +++ b/tools/clickhouse_version @@ -0,0 +1 @@ +v22.8.9.24 \ No newline at end of file diff --git a/tools/cockroachdb_checksums b/tools/cockroachdb_checksums index 50e873100f..20b6e237f8 100644 --- a/tools/cockroachdb_checksums +++ b/tools/cockroachdb_checksums @@ -1,3 +1,3 @@ -CIDL_MD5_DARWIN="2db972c254b4e3b599e12110520178b5" -CIDL_MD5_LINUX="8c3170883e0a0be1a34b44090c067a8c" -CIDL_MD5_ILLUMOS="d8999aff364e5d70f226e139fda724a3" +CIDL_SHA256_DARWIN="1ca69e0911af11a73305c3c6f4650b912d70754900b5bf7b80a1d361efe36561" +CIDL_SHA256_LINUX="24c321820e7ee45fa07fe91ac138befe13ad860e41c6ed595ce58823205ff4a9" +CIDL_SHA256_ILLUMOS="f151714ba3a6e02caaaa59727482c36085e60d6bd2fa963938e9a3d8c8a77088" diff --git a/tools/maghemite_ddm_openapi_version b/tools/maghemite_ddm_openapi_version index 8ee3001179..6c58d83ea3 100644 --- a/tools/maghemite_ddm_openapi_version +++ b/tools/maghemite_ddm_openapi_version @@ -1,2 +1,2 @@ -COMMIT="712b2487d9b141234af98b6578bc5f77420bdb03" +COMMIT="41a69a11db6cfa8fc0c8686dc2d725708e0586ce" SHA2="0b0dbc2f8bbc5d2d9be92d64c4865f8f9335355aae62f7de9f67f81dfb3f1803" diff --git a/tools/maghemite_mg_openapi_version b/tools/maghemite_mg_openapi_version index 3fa53a9483..896be8d38c 100644 --- a/tools/maghemite_mg_openapi_version +++ b/tools/maghemite_mg_openapi_version @@ -1,2 +1,2 @@ -COMMIT="712b2487d9b141234af98b6578bc5f77420bdb03" +COMMIT="41a69a11db6cfa8fc0c8686dc2d725708e0586ce" 
SHA2="0ac038bbaa54d0ae0ac5ccaeff48f03070618372cca26c9d09b716b909bf9355" diff --git a/tools/maghemite_mgd_checksums b/tools/maghemite_mgd_checksums index 1dacea54dc..8fc4d083f8 100644 --- a/tools/maghemite_mgd_checksums +++ b/tools/maghemite_mgd_checksums @@ -1,2 +1,2 @@ -CIDL_SHA256="2c54146a133b5f12587d9fb89f85ef0a0ca6278efc8c6fe4859782e886e6c774" -MGD_LINUX_SHA256="248732202f5102bf0947f5f91871379b6c6945fe387d4272cebe6e08f1b58184" \ No newline at end of file +CIDL_SHA256="26d34f61589f63be64eaa77a6e9e2db4c95d6675798386a1d61721c1ccc59d4d" +MGD_LINUX_SHA256="b2c823dd714fad67546a0e0c0d4ae56f2fe2e7c43434469b38e13b78de9f6968" \ No newline at end of file diff --git a/wicket-common/src/rack_setup.rs b/wicket-common/src/rack_setup.rs index e3d5fad5fb..f28c0639a9 100644 --- a/wicket-common/src/rack_setup.rs +++ b/wicket-common/src/rack_setup.rs @@ -5,12 +5,24 @@ // Copyright 2023 Oxide Computer Company use omicron_common::address; -use omicron_common::api::internal::shared::RackNetworkConfig; +use omicron_common::api::internal::shared::BgpConfig; +use omicron_common::api::internal::shared::PortConfigV1; use schemars::JsonSchema; use serde::Deserialize; use serde::Serialize; use std::collections::BTreeSet; use std::net::IpAddr; +use std::net::Ipv4Addr; + +/// User-specified parts of +/// [`RackNetworkConfig`](omicron_common::api::internal::shared::RackNetworkConfig). +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] +pub struct UserSpecifiedRackNetworkConfig { + pub infra_ip_first: Ipv4Addr, + pub infra_ip_last: Ipv4Addr, + pub ports: Vec, + pub bgp: Vec, +} // The portion of `CurrentRssUserConfig` that can be posted in one shot; it is // provided by the wicket user uploading a TOML file, currently. @@ -27,5 +39,5 @@ pub struct PutRssUserConfigInsensitive { pub internal_services_ip_pool_ranges: Vec, pub external_dns_ips: Vec, pub external_dns_zone_name: String, - pub rack_network_config: RackNetworkConfig, + pub rack_network_config: UserSpecifiedRackNetworkConfig, } diff --git a/wicket/src/cli/rack_setup/config_template.toml b/wicket/src/cli/rack_setup/config_template.toml index 2886fa01d7..d091237b5f 100644 --- a/wicket/src/cli/rack_setup/config_template.toml +++ b/wicket/src/cli/rack_setup/config_template.toml @@ -40,7 +40,6 @@ bootstrap_sleds = [] # TODO: docs on network config [rack_network_config] -rack_subnet = "" infra_ip_first = "" infra_ip_last = "" diff --git a/wicket/src/cli/rack_setup/config_toml.rs b/wicket/src/cli/rack_setup/config_toml.rs index 5a8e8a560e..d050610c30 100644 --- a/wicket/src/cli/rack_setup/config_toml.rs +++ b/wicket/src/cli/rack_setup/config_toml.rs @@ -19,7 +19,7 @@ use wicket_common::rack_update::SpType; use wicketd_client::types::BootstrapSledDescription; use wicketd_client::types::CurrentRssUserConfigInsensitive; use wicketd_client::types::IpRange; -use wicketd_client::types::RackNetworkConfigV1; +use wicketd_client::types::UserSpecifiedRackNetworkConfig; static TEMPLATE: &str = include_str!("config_template.toml"); @@ -176,7 +176,7 @@ fn build_sleds_array(sleds: &[BootstrapSledDescription]) -> Array { fn populate_network_table( table: &mut Table, - config: Option<&RackNetworkConfigV1>, + config: Option<&UserSpecifiedRackNetworkConfig>, ) { // Helper function to serialize enums into their appropriate string // representations. 
@@ -195,7 +195,6 @@ fn populate_network_table( }; for (property, value) in [ - ("rack_subnet", config.rack_subnet.to_string()), ("infra_ip_first", config.infra_ip_first.to_string()), ("infra_ip_last", config.infra_ip_last.to_string()), ] { @@ -350,7 +349,6 @@ fn populate_network_table( #[cfg(test)] mod tests { use super::*; - use omicron_common::api::internal::shared::RackNetworkConfigV1 as InternalRackNetworkConfig; use std::net::Ipv6Addr; use wicket_common::rack_setup::PutRssUserConfigInsensitive; use wicket_common::rack_update::SpIdentifier; @@ -373,6 +371,7 @@ mod tests { use omicron_common::api::internal::shared::PortSpeed as InternalPortSpeed; use omicron_common::api::internal::shared::RouteConfig as InternalRouteConfig; use omicron_common::api::internal::shared::SwitchLocation as InternalSwitchLocation; + use wicket_common::rack_setup::UserSpecifiedRackNetworkConfig as InternalUserSpecifiedRackNetworkConfig; let rnc = value.rack_network_config.unwrap(); @@ -401,8 +400,7 @@ mod tests { .collect(), external_dns_ips: value.external_dns_ips, ntp_servers: value.ntp_servers, - rack_network_config: InternalRackNetworkConfig { - rack_subnet: rnc.rack_subnet, + rack_network_config: InternalUserSpecifiedRackNetworkConfig { infra_ip_first: rnc.infra_ip_first, infra_ip_last: rnc.infra_ip_last, ports: rnc @@ -514,8 +512,7 @@ mod tests { )], external_dns_ips: vec!["10.0.0.1".parse().unwrap()], ntp_servers: vec!["ntp1.com".into(), "ntp2.com".into()], - rack_network_config: Some(RackNetworkConfigV1 { - rack_subnet: "fd00:1122:3344:01::/56".parse().unwrap(), + rack_network_config: Some(UserSpecifiedRackNetworkConfig { infra_ip_first: "172.30.0.1".parse().unwrap(), infra_ip_last: "172.30.0.10".parse().unwrap(), ports: vec![PortConfigV1 { diff --git a/wicket/src/ui/panes/update.rs b/wicket/src/ui/panes/update.rs index be21984997..c009d597c8 100644 --- a/wicket/src/ui/panes/update.rs +++ b/wicket/src/ui/panes/update.rs @@ -1435,12 +1435,13 @@ impl UpdatePane { Constraint::Length(cell_width), Constraint::Length(cell_width), ]; - let header_table = Table::new(std::iter::empty(), &width_constraints) - .header( - Row::new(vec!["COMPONENT", "VERSION", "TARGET", "STATUS"]) - .style(header_style), - ) - .block(block.clone().title("OVERVIEW (* = active)")); + let header_table = + Table::new(std::iter::empty::(), &width_constraints) + .header( + Row::new(vec!["COMPONENT", "VERSION", "TARGET", "STATUS"]) + .style(header_style), + ) + .block(block.clone().title("OVERVIEW (* = active)")); frame.render_widget(header_table, self.table_headers_rect); // For the selected item, draw the version table. 
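The turbofish added to `std::iter::empty` above exists purely to pin the item type of an otherwise-unconstrained empty iterator; the same inference failure reproduces in plain std code:

```rust
fn main() {
    // Fails to compile: the item type of `empty()` cannot be inferred.
    // let rows = std::iter::empty().collect::<Vec<_>>();

    // Compiles: the turbofish supplies the item type.
    let rows = std::iter::empty::<u32>().collect::<Vec<_>>();
    assert!(rows.is_empty());
}
```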
diff --git a/wicket/src/ui/widgets/ignition.rs b/wicket/src/ui/widgets/ignition.rs index cef942d2c7..1e04c4d02b 100644 --- a/wicket/src/ui/widgets/ignition.rs +++ b/wicket/src/ui/widgets/ignition.rs @@ -61,29 +61,26 @@ impl IgnitionPopup { format!("IGNITION: {}", component.to_string_uppercase()), style::header(true), )]), - body: Text { - lines: vec![ - Line::from(vec![Span::styled( - "Power On", - style::line( - self.selected_command == IgnitionCommand::PowerOn, - ), - )]), - Line::from(vec![Span::styled( - "Power Off", - style::line( - self.selected_command == IgnitionCommand::PowerOff, - ), - )]), - Line::from(vec![Span::styled( - "Power Reset", - style::line( - self.selected_command - == IgnitionCommand::PowerReset, - ), - )]), - ], - }, + body: Text::from(vec![ + Line::from(vec![Span::styled( + "Power On", + style::line( + self.selected_command == IgnitionCommand::PowerOn, + ), + )]), + Line::from(vec![Span::styled( + "Power Off", + style::line( + self.selected_command == IgnitionCommand::PowerOff, + ), + )]), + Line::from(vec![Span::styled( + "Power Reset", + style::line( + self.selected_command == IgnitionCommand::PowerReset, + ), + )]), + ]), buttons: vec![ButtonText::new("Close", "Esc")], } } diff --git a/wicketd/src/http_entrypoints.rs b/wicketd/src/http_entrypoints.rs index 9c1740679f..9748a93bd5 100644 --- a/wicketd/src/http_entrypoints.rs +++ b/wicketd/src/http_entrypoints.rs @@ -32,7 +32,6 @@ use http::StatusCode; use internal_dns::resolver::Resolver; use omicron_common::address; use omicron_common::api::external::SemverVersion; -use omicron_common::api::internal::shared::RackNetworkConfig; use omicron_common::api::internal::shared::SwitchLocation; use omicron_common::update::ArtifactHashId; use omicron_common::update::ArtifactId; @@ -47,6 +46,7 @@ use std::net::IpAddr; use std::net::Ipv6Addr; use std::time::Duration; use wicket_common::rack_setup::PutRssUserConfigInsensitive; +use wicket_common::rack_setup::UserSpecifiedRackNetworkConfig; use wicket_common::update_events::EventReport; use wicket_common::WICKETD_TIMEOUT; @@ -172,7 +172,7 @@ pub struct CurrentRssUserConfigInsensitive { pub internal_services_ip_pool_ranges: Vec, pub external_dns_ips: Vec, pub external_dns_zone_name: String, - pub rack_network_config: Option, + pub rack_network_config: Option, } // This is a summary of the subset of `RackInitializeRequest` that is sensitive; @@ -1189,12 +1189,14 @@ async fn post_start_preflight_uplink_check( let (network_config, dns_servers, ntp_servers) = { let rss_config = rqctx.rss_config.lock().unwrap(); - let network_config = - rss_config.rack_network_config().cloned().ok_or_else(|| { + let network_config = rss_config + .user_specified_rack_network_config() + .cloned() + .ok_or_else(|| { HttpError::for_bad_request( None, "uplink preflight check requires setting \ - the uplink config for RSS" + the uplink config for RSS" .to_string(), ) })?; diff --git a/wicketd/src/preflight_check.rs b/wicketd/src/preflight_check.rs index 75cc5f6e09..4cd17604a0 100644 --- a/wicketd/src/preflight_check.rs +++ b/wicketd/src/preflight_check.rs @@ -2,7 +2,6 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
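The ignition popup change above swaps a struct literal for the `From` conversions ratatui already provides; a minimal standalone equivalent (assuming a recent ratatui with `Line` in `ratatui::text`):

```rust
use ratatui::style::{Modifier, Style};
use ratatui::text::{Line, Span, Text};

fn main() {
    let body = Text::from(vec![
        Line::from(vec![Span::styled(
            "Power On",
            Style::default().add_modifier(Modifier::BOLD),
        )]),
        Line::from(vec![Span::raw("Power Off")]),
        Line::from(vec![Span::raw("Power Reset")]),
    ]);
    assert_eq!(body.lines.len(), 3);
}
```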
-use omicron_common::api::internal::shared::RackNetworkConfig; use omicron_common::api::internal::shared::SwitchLocation; use slog::o; use slog::Logger; @@ -12,6 +11,7 @@ use std::sync::Mutex; use tokio::sync::oneshot; use update_engine::events::EventReport; use update_engine::GenericSpec; +use wicket_common::rack_setup::UserSpecifiedRackNetworkConfig; mod uplink; @@ -44,7 +44,7 @@ impl PreflightCheckerHandler { pub(crate) async fn uplink_start( &self, - network_config: RackNetworkConfig, + network_config: UserSpecifiedRackNetworkConfig, dns_servers: Vec, ntp_servers: Vec, our_switch_location: SwitchLocation, @@ -94,7 +94,7 @@ pub(crate) struct PreflightCheckerBusy; #[derive(Debug)] enum PreflightCheck { Uplink { - network_config: RackNetworkConfig, + network_config: UserSpecifiedRackNetworkConfig, dns_servers: Vec, ntp_servers: Vec, our_switch_location: SwitchLocation, diff --git a/wicketd/src/preflight_check/uplink.rs b/wicketd/src/preflight_check/uplink.rs index 47995f0c10..31d479a5ed 100644 --- a/wicketd/src/preflight_check/uplink.rs +++ b/wicketd/src/preflight_check/uplink.rs @@ -22,7 +22,6 @@ use omicron_common::address::DENDRITE_PORT; use omicron_common::api::internal::shared::PortConfigV1; use omicron_common::api::internal::shared::PortFec as OmicronPortFec; use omicron_common::api::internal::shared::PortSpeed as OmicronPortSpeed; -use omicron_common::api::internal::shared::RackNetworkConfig; use omicron_common::api::internal::shared::SwitchLocation; use omicron_common::OMICRON_DPD_TAG; use schemars::JsonSchema; @@ -49,6 +48,7 @@ use trust_dns_resolver::error::ResolveError; use trust_dns_resolver::error::ResolveErrorKind; use trust_dns_resolver::TokioAsyncResolver; use update_engine::StepSpec; +use wicket_common::rack_setup::UserSpecifiedRackNetworkConfig; const DNS_PORT: u16 = 53; @@ -68,7 +68,7 @@ const IPADM: &str = "/usr/sbin/ipadm"; const ROUTE: &str = "/usr/sbin/route"; pub(super) async fn run_local_uplink_preflight_check( - network_config: RackNetworkConfig, + network_config: UserSpecifiedRackNetworkConfig, dns_servers: Vec, ntp_servers: Vec, our_switch_location: SwitchLocation, diff --git a/wicketd/src/rss_config.rs b/wicketd/src/rss_config.rs index f654597d81..4bc1a6b62b 100644 --- a/wicketd/src/rss_config.rs +++ b/wicketd/src/rss_config.rs @@ -26,7 +26,6 @@ use gateway_client::types::SpType; use omicron_certificates::CertificateError; use omicron_common::address; use omicron_common::address::Ipv4Range; -use omicron_common::api::internal::shared::RackNetworkConfig; use sled_hardware::Baseboard; use slog::warn; use std::collections::BTreeSet; @@ -34,6 +33,7 @@ use std::mem; use std::net::IpAddr; use std::net::Ipv6Addr; use wicket_common::rack_setup::PutRssUserConfigInsensitive; +use wicket_common::rack_setup::UserSpecifiedRackNetworkConfig; // TODO-correctness For now, we always use the same rack subnet when running // RSS. When we get to multirack, this will be wrong, but there are many other @@ -64,7 +64,7 @@ pub(crate) struct CurrentRssConfig { external_dns_zone_name: String, external_certificates: Vec, recovery_silo_password_hash: Option, - rack_network_config: Option, + rack_network_config: Option, // External certificates are uploaded in two separate actions (cert then // key, or vice versa). 
Here we store a partial certificate; once we have @@ -82,7 +82,9 @@ impl CurrentRssConfig { &self.ntp_servers } - pub(crate) fn rack_network_config(&self) -> Option<&RackNetworkConfig> { + pub(crate) fn user_specified_rack_network_config( + &self, + ) -> Option<&UserSpecifiedRackNetworkConfig> { self.rack_network_config.as_ref() } @@ -252,7 +254,6 @@ impl CurrentRssConfig { .collect(); let request = RackInitializeRequest { - rack_subnet: RACK_SUBNET, trust_quorum_peers, bootstrap_discovery: BootstrapAddressDiscovery::OnlyThese( bootstrap_ips, @@ -268,7 +269,7 @@ impl CurrentRssConfig { user_name: UserId(RECOVERY_SILO_USERNAME.into()), user_password_hash, }, - rack_network_config: Some(rack_network_config), + rack_network_config, }; Ok(request) @@ -452,7 +453,7 @@ impl From<&'_ CurrentRssConfig> for CurrentRssUserConfig { } fn validate_rack_network_config( - config: &RackNetworkConfig, + config: &UserSpecifiedRackNetworkConfig, ) -> Result { use bootstrap_agent_client::types::BgpConfig as BaBgpConfig; use bootstrap_agent_client::types::BgpPeerConfig as BaBgpPeerConfig; @@ -497,7 +498,7 @@ fn validate_rack_network_config( // TODO Add more client side checks on `rack_network_config` contents? Ok(bootstrap_agent_client::types::RackNetworkConfigV1 { - rack_subnet: config.rack_subnet, + rack_subnet: RACK_SUBNET.into(), infra_ip_first: config.infra_ip_first, infra_ip_last: config.infra_ip_last, ports: config diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 4e62ba13e3..7038f9c038 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -233,42 +233,42 @@ bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-f dof = { version = "0.3.0", default-features = false, features = ["des"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.x86_64-unknown-linux-gnu.build-dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } dof = { version = "0.3.0", default-features = false, features = ["des"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.x86_64-apple-darwin.dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } errno = { version = "0.3.8", default-features = false, features = ["std"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.x86_64-apple-darwin.build-dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } errno = { version = "0.3.8", default-features = false, features = ["std"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.aarch64-apple-darwin.dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } errno = { version 
= "0.3.8", default-features = false, features = ["std"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.aarch64-apple-darwin.build-dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } errno = { version = "0.3.8", default-features = false, features = ["std"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.x86_64-unknown-illumos.dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } @@ -276,7 +276,7 @@ dof = { version = "0.3.0", default-features = false, features = ["des"] } errno = { version = "0.3.8", default-features = false, features = ["std"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } toml_datetime = { version = "0.6.5", default-features = false, features = ["serde"] } toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", features = ["serde"] } @@ -286,7 +286,7 @@ dof = { version = "0.3.0", default-features = false, features = ["des"] } errno = { version = "0.3.8", default-features = false, features = ["std"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } toml_datetime = { version = "0.6.5", default-features = false, features = ["serde"] } toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", features = ["serde"] }