diff --git a/.envrc b/.envrc index 036459a4a9..48df8e3c63 100644 --- a/.envrc +++ b/.envrc @@ -6,7 +6,7 @@ PATH_add out/clickhouse PATH_add out/dendrite-stub/bin PATH_add out/mgd/root/opt/oxide/mgd/bin -if nix flake show &> /dev/null +if [ "$OMICRON_USE_FLAKE" = 1 ] && nix flake show &> /dev/null then use flake; -fi \ No newline at end of file +fi diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index 92f7af36d5..68d816fc2d 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -24,7 +24,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@e17a4e247d4a36441181d7758c499d97e1e006bd # v2 + uses: taiki-e/install-action@717ed1cb83959ef327137c2f806e1d8597bfca9f # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index fa99017b0d..724f88e7a3 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -26,10 +26,11 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-22.04, macos-12 ] + os: [ ubuntu-22.04, macos-14 ] steps: # This repo is unstable and unnecessary: https://github.com/microsoft/linux-package-repositories/issues/34 - name: Disable packages.microsoft.com repo + if: ${{ startsWith(matrix.os, 'ubuntu') }} run: sudo rm -f /etc/apt/sources.list.d/microsoft-prod.list - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: diff --git a/.gitignore b/.gitignore index fc3cb4133a..fc5fd5f297 100644 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,6 @@ README.html TODO.html logs out -tools/clickhouse* -tools/cockroach* /clickhouse/ /cockroachdb/ smf/nexus/root.json diff --git a/Cargo.lock b/Cargo.lock index 31d3529588..03c7778355 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -332,7 +332,7 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc7b2dbe9169059af0f821e811180fddc971fc210c776c133c7819ccd6e478db" dependencies = [ - "rustix 0.38.30", + "rustix 0.38.31", "tempfile", "windows-sys 0.52.0", ] @@ -462,20 +462,20 @@ dependencies = [ [[package]] name = "bhyve_api" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=ff6c4df2e816eee6e7b2b0488777d30ef35ee217#ff6c4df2e816eee6e7b2b0488777d30ef35ee217" +source = "git+https://github.com/oxidecomputer/propolis?rev=c7cdaf1875d259e29ca50a14b77b0bfd9dfe443d#c7cdaf1875d259e29ca50a14b77b0bfd9dfe443d" dependencies = [ "bhyve_api_sys", "libc", - "strum", + "strum 0.25.0", ] [[package]] name = "bhyve_api_sys" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=ff6c4df2e816eee6e7b2b0488777d30ef35ee217#ff6c4df2e816eee6e7b2b0488777d30ef35ee217" +source = "git+https://github.com/oxidecomputer/propolis?rev=c7cdaf1875d259e29ca50a14b77b0bfd9dfe443d#c7cdaf1875d259e29ca50a14b77b0bfd9dfe443d" dependencies = [ "libc", - "strum", + "strum 0.25.0", ] [[package]] @@ -830,6 +830,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "castaway" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a17ed5635fc8536268e5d4de1e22e81ac34419e5f052d4d51f4e01dcc263fcc" +dependencies = [ + "rustversion", +] + [[package]] name = "cbc" version = "0.1.2" @@ -1049,6 +1058,19 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "compact_str" +version = "0.7.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f86b9c4c00838774a6d902ef931eff7470720c51d90c2e32cfe15dc304737b3f" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "ryu", + "static_assertions", +] + [[package]] name = "console" version = "0.15.8" @@ -1213,20 +1235,6 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7059fff8937831a9ae6f0fe4d658ffabf58f2ca96aa9dec1c889f936f705f216" -[[package]] -name = "crossbeam" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" -dependencies = [ - "cfg-if", - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-epoch", - "crossbeam-queue", - "crossbeam-utils", -] - [[package]] name = "crossbeam-channel" version = "0.5.8" @@ -1261,16 +1269,6 @@ dependencies = [ "scopeguard", ] -[[package]] -name = "crossbeam-queue" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" -dependencies = [ - "cfg-if", - "crossbeam-utils", -] - [[package]] name = "crossbeam-utils" version = "0.8.16" @@ -1310,7 +1308,7 @@ dependencies = [ [[package]] name = "crucible-agent-client" version = "0.0.1" -source = "git+https://github.com/oxidecomputer/crucible?rev=2d4bc11232d53f177c286383926fa5f8c1b2a938#2d4bc11232d53f177c286383926fa5f8c1b2a938" +source = "git+https://github.com/oxidecomputer/crucible?rev=796dce526dd7ed7b52a0429a486ccba4a9da1ce5#796dce526dd7ed7b52a0429a486ccba4a9da1ce5" dependencies = [ "anyhow", "chrono", @@ -1326,7 +1324,7 @@ dependencies = [ [[package]] name = "crucible-pantry-client" version = "0.0.1" -source = "git+https://github.com/oxidecomputer/crucible?rev=2d4bc11232d53f177c286383926fa5f8c1b2a938#2d4bc11232d53f177c286383926fa5f8c1b2a938" +source = "git+https://github.com/oxidecomputer/crucible?rev=796dce526dd7ed7b52a0429a486ccba4a9da1ce5#796dce526dd7ed7b52a0429a486ccba4a9da1ce5" dependencies = [ "anyhow", "chrono", @@ -1343,7 +1341,7 @@ dependencies = [ [[package]] name = "crucible-smf" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/crucible?rev=2d4bc11232d53f177c286383926fa5f8c1b2a938#2d4bc11232d53f177c286383926fa5f8c1b2a938" +source = "git+https://github.com/oxidecomputer/crucible?rev=796dce526dd7ed7b52a0429a486ccba4a9da1ce5#796dce526dd7ed7b52a0429a486ccba4a9da1ce5" dependencies = [ "crucible-workspace-hack", "libc", @@ -1397,6 +1395,27 @@ dependencies = [ "memchr", ] +[[package]] +name = "csv" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + [[package]] name = "ctr" version = "0.9.2" @@ -2020,9 +2039,9 @@ dependencies = [ [[package]] name = "dyn-clone" -version = "1.0.13" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbfc4744c1b8f2a09adc0e55242f60b1af195d88596bd8700be74418c056c555" +checksum = "545b22097d44f8a9581187cdf93de7a71e4722bf51200cfaba810865b49a495d" [[package]] name = "ecdsa" @@ -2224,9 +2243,9 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" 
[[package]] name = "fastrand" -version = "2.0.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" [[package]] name = "fatfs" @@ -2247,7 +2266,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef033ed5e9bad94e55838ca0ca906db0e043f517adda0c8b79c7a8c66c93c1b5" dependencies = [ "cfg-if", - "rustix 0.38.30", + "rustix 0.38.31", "windows-sys 0.48.0", ] @@ -2258,7 +2277,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e5768da2206272c81ef0b5e951a41862938a6070da63bcea197899942d3b947" dependencies = [ "cfg-if", - "rustix 0.38.30", + "rustix 0.38.31", "windows-sys 0.52.0", ] @@ -3654,7 +3673,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi 0.3.2", - "rustix 0.38.30", + "rustix 0.38.31", "windows-sys 0.48.0", ] @@ -4180,6 +4199,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "multimap" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +dependencies = [ + "serde", +] + [[package]] name = "nanorand" version = "0.7.0" @@ -4315,7 +4343,7 @@ dependencies = [ "serde_json", "sled-agent-client", "steno", - "strum", + "strum 0.25.0", "thiserror", "uuid", ] @@ -4387,7 +4415,7 @@ dependencies = [ "slog", "static_assertions", "steno", - "strum", + "strum 0.25.0", "subprocess", "swrite", "term", @@ -4450,7 +4478,7 @@ dependencies = [ "serde_json", "sled-agent-client", "slog", - "strum", + "strum 0.25.0", "thiserror", "tokio", "uuid", @@ -4539,7 +4567,7 @@ dependencies = [ "serde_json", "sled-agent-client", "steno", - "strum", + "strum 0.25.0", "thiserror", "uuid", ] @@ -4856,7 +4884,7 @@ dependencies = [ "serde_urlencoded", "serde_with", "slog", - "strum", + "strum 0.25.0", "test-strategy", "thiserror", "tokio", @@ -4865,21 +4893,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "omicron-deploy" -version = "0.1.0" -dependencies = [ - "anyhow", - "camino", - "clap 4.4.3", - "crossbeam", - "omicron-package", - "omicron-workspace-hack", - "serde", - "serde_derive", - "thiserror", -] - [[package]] name = "omicron-dev" version = "0.1.0" @@ -5059,7 +5072,7 @@ dependencies = [ "slog-term", "sp-sim", "steno", - "strum", + "strum 0.25.0", "subprocess", "tempfile", "term", @@ -5082,9 +5095,12 @@ dependencies = [ "async-bb8-diesel", "chrono", "clap 4.4.3", + "crossterm", "crucible-agent-client", + "csv", "diesel", "dropshot", + "dyn-clone", "expectorate", "futures", "gateway-client", @@ -5093,6 +5109,7 @@ dependencies = [ "humantime", "internal-dns", "ipnetwork", + "multimap", "nexus-client", "nexus-db-model", "nexus-db-queries", @@ -5106,13 +5123,14 @@ dependencies = [ "omicron-workspace-hack", "oximeter-client", "pq-sys", + "ratatui", "regex", "serde", "serde_json", "sled-agent-client", "slog", "slog-error-chain", - "strum", + "strum 0.25.0", "subprocess", "tabled", "textwrap 0.16.0", @@ -5147,7 +5165,7 @@ dependencies = [ "slog-bunyan", "slog-term", "smf", - "strum", + "strum 0.25.0", "swrite", "tar", "thiserror", @@ -5385,7 +5403,7 @@ dependencies = [ "regex-syntax 0.8.2", "reqwest", "ring 0.17.7", - "rustix 0.38.30", + "rustix 0.38.31", "schemars", "semver 1.0.21", "serde", @@ -5671,7 +5689,7 @@ dependencies = 
[ "schemars", "serde", "serde_json", - "strum", + "strum 0.25.0", "thiserror", "trybuild", "uuid", @@ -5724,7 +5742,7 @@ dependencies = [ "slog-async", "slog-dtrace", "slog-term", - "strum", + "strum 0.25.0", "subprocess", "thiserror", "tokio", @@ -5766,7 +5784,7 @@ dependencies = [ "slog-term", "sqlformat", "sqlparser", - "strum", + "strum 0.25.0", "tabled", "tempfile", "thiserror", @@ -5934,27 +5952,26 @@ dependencies = [ [[package]] name = "parse-display" -version = "0.8.2" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6509d08722b53e8dafe97f2027b22ccbe3a5db83cb352931e9716b0aa44bc5c" +checksum = "06af5f9333eb47bd9ba8462d612e37a8328a5cb80b13f0af4de4c3b89f52dee5" dependencies = [ - "once_cell", "parse-display-derive", "regex", + "regex-syntax 0.8.2", ] [[package]] name = "parse-display-derive" -version = "0.8.2" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68517892c8daf78da08c0db777fcc17e07f2f63ef70041718f8a7630ad84f341" +checksum = "dc9252f259500ee570c75adcc4e317fa6f57a1e47747d622e0bf838002a7b790" dependencies = [ - "once_cell", "proc-macro2", "quote", "regex", - "regex-syntax 0.7.5", - "structmeta", + "regex-syntax 0.8.2", + "structmeta 0.3.0", "syn 2.0.48", ] @@ -6613,7 +6630,7 @@ dependencies = [ [[package]] name = "propolis-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=ff6c4df2e816eee6e7b2b0488777d30ef35ee217#ff6c4df2e816eee6e7b2b0488777d30ef35ee217" +source = "git+https://github.com/oxidecomputer/propolis?rev=c7cdaf1875d259e29ca50a14b77b0bfd9dfe443d#c7cdaf1875d259e29ca50a14b77b0bfd9dfe443d" dependencies = [ "async-trait", "base64", @@ -6634,7 +6651,7 @@ dependencies = [ [[package]] name = "propolis-mock-server" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=ff6c4df2e816eee6e7b2b0488777d30ef35ee217#ff6c4df2e816eee6e7b2b0488777d30ef35ee217" +source = "git+https://github.com/oxidecomputer/propolis?rev=c7cdaf1875d259e29ca50a14b77b0bfd9dfe443d#c7cdaf1875d259e29ca50a14b77b0bfd9dfe443d" dependencies = [ "anyhow", "atty", @@ -6664,7 +6681,7 @@ dependencies = [ [[package]] name = "propolis_types" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=ff6c4df2e816eee6e7b2b0488777d30ef35ee217#ff6c4df2e816eee6e7b2b0488777d30ef35ee217" +source = "git+https://github.com/oxidecomputer/propolis?rev=c7cdaf1875d259e29ca50a14b77b0bfd9dfe443d#c7cdaf1875d259e29ca50a14b77b0bfd9dfe443d" dependencies = [ "schemars", "serde", @@ -6852,19 +6869,20 @@ dependencies = [ [[package]] name = "ratatui" -version = "0.25.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5659e52e4ba6e07b2dad9f1158f578ef84a73762625ddb51536019f34d180eb" +checksum = "bcb12f8fbf6c62614b0d56eb352af54f6a22410c3b079eb53ee93c7b97dd31d8" dependencies = [ "bitflags 2.4.0", "cassowary", + "compact_str", "crossterm", "indoc 2.0.3", "itertools 0.12.1", "lru", "paste", "stability", - "strum", + "strum 0.26.1", "unicode-segmentation", "unicode-width", ] @@ -6961,7 +6979,7 @@ dependencies = [ "nu-ansi-term", "serde", "strip-ansi-escapes", - "strum", + "strum 0.25.0", "strum_macros 0.25.2", "thiserror", "unicode-segmentation", @@ -7029,12 +7047,6 @@ version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" -[[package]] -name = "regex-syntax" -version = "0.7.5" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" - [[package]] name = "regex-syntax" version = "0.8.2" @@ -7400,9 +7412,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.30" +version = "0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "322394588aaf33c24007e8bb3238ee3e4c5c09c084ab32bc73890b99ff326bca" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ "bitflags 2.4.0", "errno", @@ -7574,8 +7586,7 @@ dependencies = [ [[package]] name = "samael" version = "0.0.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b75583aad4a51c50fc0af69c230d18078c9d5a69a98d0f6013d01053acf744f4" +source = "git+https://github.com/oxidecomputer/samael?branch=oxide/omicron#9e609a8f6fa0dd84e3bb8f579f46bd780c8be62b" dependencies = [ "base64", "bindgen", @@ -8120,6 +8131,7 @@ dependencies = [ name = "sled-agent-client" version = "0.1.0" dependencies = [ + "anyhow", "async-trait", "chrono", "ipnetwork", @@ -8617,7 +8629,19 @@ checksum = "78ad9e09554f0456d67a69c1584c9798ba733a5b50349a6c0d0948710523922d" dependencies = [ "proc-macro2", "quote", - "structmeta-derive", + "structmeta-derive 0.2.0", + "syn 2.0.48", +] + +[[package]] +name = "structmeta" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e1575d8d40908d70f6fd05537266b90ae71b15dbbe7a8b7dffa2b759306d329" +dependencies = [ + "proc-macro2", + "quote", + "structmeta-derive 0.3.0", "syn 2.0.48", ] @@ -8632,6 +8656,17 @@ dependencies = [ "syn 2.0.48", ] +[[package]] +name = "structmeta-derive" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = "structopt" version = "0.3.26" @@ -8665,6 +8700,15 @@ dependencies = [ "strum_macros 0.25.2", ] +[[package]] +name = "strum" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "723b93e8addf9aa965ebe2d11da6d7540fa2283fcea14b3371ff055f7ba13f5f" +dependencies = [ + "strum_macros 0.26.1", +] + [[package]] name = "strum_macros" version = "0.24.3" @@ -8691,6 +8735,19 @@ dependencies = [ "syn 2.0.48", ] +[[package]] +name = "strum_macros" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a3417fc93d76740d974a01654a09777cb500428cc874ca9f45edfe0c4d4cd18" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.48", +] + [[package]] name = "subprocess" version = "0.2.9" @@ -8855,14 +8912,13 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.9.0" +version = "3.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +checksum = "a365e8cd18e44762ef95d87f284f4b5cd04107fec2ff3052bd6a3e6069669e67" dependencies = [ "cfg-if", "fastrand", - "redox_syscall 0.4.1", - "rustix 0.38.30", + "rustix 0.38.31", "windows-sys 0.52.0", ] @@ -8919,7 +8975,7 @@ checksum = "b8361c808554228ad09bfed70f5c823caf8a3450b6881cc3a38eb57e8c08c1d9" dependencies = [ "proc-macro2", "quote", - "structmeta", + "structmeta 0.2.0", "syn 2.0.48", ] @@ -9599,9 +9655,9 @@ dependencies = [ [[package]] name = "tui-tree-widget" -version = "0.16.0" +version = "0.17.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "136011b328c4f392499a02c4b5b78d509fb297bf9c10f2bda5d11d65cb946e4c" +checksum = "5c317bb061f42d943a2eb118b5de0ee98fc2443f0631e54b24a19de014a28810" dependencies = [ "ratatui", "unicode-width", diff --git a/Cargo.toml b/Cargo.toml index 6e4799d184..d33758fc06 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,6 @@ members = [ "dev-tools/omdb", "dev-tools/omicron-dev", "dev-tools/oxlog", - "dev-tools/thing-flinger", "dev-tools/xtask", "dns-server", "end-to-end-tests", @@ -96,7 +95,6 @@ default-members = [ "dev-tools/omdb", "dev-tools/omicron-dev", "dev-tools/oxlog", - "dev-tools/thing-flinger", # Do not include xtask in the list of default members, because this causes # hakari to not work as well and build times to be longer. # See omicron#4392. @@ -180,9 +178,10 @@ cookie = "0.18" criterion = { version = "0.5.1", features = [ "async_tokio" ] } crossbeam = "0.8" crossterm = { version = "0.27.0", features = ["event-stream"] } -crucible-agent-client = { git = "https://github.com/oxidecomputer/crucible", rev = "2d4bc11232d53f177c286383926fa5f8c1b2a938" } -crucible-pantry-client = { git = "https://github.com/oxidecomputer/crucible", rev = "2d4bc11232d53f177c286383926fa5f8c1b2a938" } -crucible-smf = { git = "https://github.com/oxidecomputer/crucible", rev = "2d4bc11232d53f177c286383926fa5f8c1b2a938" } +crucible-agent-client = { git = "https://github.com/oxidecomputer/crucible", rev = "796dce526dd7ed7b52a0429a486ccba4a9da1ce5" } +crucible-pantry-client = { git = "https://github.com/oxidecomputer/crucible", rev = "796dce526dd7ed7b52a0429a486ccba4a9da1ce5" } +crucible-smf = { git = "https://github.com/oxidecomputer/crucible", rev = "796dce526dd7ed7b52a0429a486ccba4a9da1ce5" } +csv = "1.3.0" curve25519-dalek = "4" datatest-stable = "0.2.3" display-error-chain = "0.2.0" @@ -197,6 +196,7 @@ dns-server = { path = "dns-server" } dns-service-client = { path = "clients/dns-service-client" } dpd-client = { path = "clients/dpd-client" } dropshot = { git = "https://github.com/oxidecomputer/dropshot", branch = "main", features = [ "usdt-probes" ] } +dyn-clone = "1.0.16" either = "1.9.0" expectorate = "1.1.0" fatfs = "0.3.6" @@ -248,6 +248,7 @@ mime_guess = "2.0.4" mockall = "0.12" newtype_derive = "0.1.6" mg-admin-client = { path = "clients/mg-admin-client" } +multimap = "0.8.1" nexus-blueprint-execution = { path = "nexus/blueprint-execution" } nexus-client = { path = "clients/nexus-client" } nexus-db-model = { path = "nexus/db-model" } @@ -293,7 +294,7 @@ oximeter-instruments = { path = "oximeter/instruments" } oximeter-macro-impl = { path = "oximeter/oximeter-macro-impl" } oximeter-producer = { path = "oximeter/producer" } p256 = "0.13" -parse-display = "0.8.2" +parse-display = "0.9.0" partial-io = { version = "0.5.4", features = ["proptest1", "tokio1"] } parse-size = "1.0.0" paste = "1.0.14" @@ -308,13 +309,13 @@ prettyplease = { version = "0.2.16", features = ["verbatim"] } proc-macro2 = "1.0" progenitor = { git = "https://github.com/oxidecomputer/progenitor", branch = "main" } progenitor-client = { git = "https://github.com/oxidecomputer/progenitor", branch = "main" } -bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "ff6c4df2e816eee6e7b2b0488777d30ef35ee217" } -propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "ff6c4df2e816eee6e7b2b0488777d30ef35ee217" } -propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "ff6c4df2e816eee6e7b2b0488777d30ef35ee217" } 
+bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "c7cdaf1875d259e29ca50a14b77b0bfd9dfe443d" } +propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "c7cdaf1875d259e29ca50a14b77b0bfd9dfe443d" } +propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "c7cdaf1875d259e29ca50a14b77b0bfd9dfe443d" } proptest = "1.4.0" quote = "1.0" rand = "0.8.5" -ratatui = "0.25.0" +ratatui = "0.26.1" rayon = "1.8" rcgen = "0.12.1" reedline = "0.28.0" @@ -378,7 +379,7 @@ syn = { version = "2.0" } tabled = "0.15.0" tar = "0.4" tempdir = "0.3" -tempfile = "3.9" +tempfile = "3.10" term = "0.7" termios = "0.3" textwrap = "0.16.0" @@ -400,7 +401,7 @@ trust-dns-server = "0.22" trybuild = "1.0.89" tufaceous = { path = "tufaceous" } tufaceous-lib = { path = "tufaceous-lib" } -tui-tree-widget = "0.16.0" +tui-tree-widget = "0.17.0" unicode-width = "0.1.11" update-common = { path = "update-common" } update-engine = { path = "update-engine" } @@ -611,3 +612,8 @@ branch = "oxide/omicron" # to it. [patch.crates-io.omicron-workspace-hack] path = "workspace-hack" + +# Pulls in https://github.com/njaremko/samael/pull/41 +[patch.crates-io.samael] +git = "https://github.com/oxidecomputer/samael" +branch = "oxide/omicron" diff --git a/bootstore/src/schemes/v0/peer.rs b/bootstore/src/schemes/v0/peer.rs index 5290e64672..1175e64143 100644 --- a/bootstore/src/schemes/v0/peer.rs +++ b/bootstore/src/schemes/v0/peer.rs @@ -62,6 +62,14 @@ pub enum NodeRequestError { }, } +impl From for omicron_common::api::external::Error { + fn from(error: NodeRequestError) -> Self { + omicron_common::api::external::Error::internal_error(&format!( + "{error}" + )) + } +} + /// A request sent to the `Node` task from the `NodeHandle` pub enum NodeApiRequest { /// Initialize a rack at the behest of RSS running on the same scrimlet as diff --git a/clients/ddm-admin-client/build.rs b/clients/ddm-admin-client/build.rs index da74ee9962..c51ec05faa 100644 --- a/clients/ddm-admin-client/build.rs +++ b/clients/ddm-admin-client/build.rs @@ -33,7 +33,9 @@ fn main() -> Result<()> { // Report a relatively verbose error if we haven't downloaded the requisite // openapi spec. let local_path = - format!("../../out/downloads/ddm-admin-{commit}.json"); + env::var("DDM_OPENAPI_PATH").unwrap_or_else(|_| { + format!("../../out/downloads/ddm-admin-{commit}.json") + }); if !Path::new(&local_path).exists() { bail!("{local_path} doesn't exist; rerun `tools/ci_download_maghemite_openapi` (after updating `tools/maghemite_ddm_openapi_version` if the maghemite commit in package-manifest.toml has changed)"); } diff --git a/clients/dpd-client/build.rs b/clients/dpd-client/build.rs index 6a65ab9495..536869b4a2 100644 --- a/clients/dpd-client/build.rs +++ b/clients/dpd-client/build.rs @@ -38,7 +38,10 @@ fn main() -> Result<()> { PackageSource::Prebuilt { commit, .. } => { // Report a relatively verbose error if we haven't downloaded the // requisite openapi spec. 
- let local_path = format!("../../out/downloads/dpd-{commit}.json"); + let local_path = + env::var("DPD_OPENAPI_PATH").unwrap_or_else(|_| { + format!("../../out/downloads/dpd-{commit}.json") + }); if !Path::new(&local_path).exists() { bail!("{local_path} doesn't exist; rerun `tools/ci_download_dendrite_openapi` (after updating `tools/dendrite_openapi_version` if the dendrite commit in package-manifest.toml has changed)"); } diff --git a/clients/mg-admin-client/build.rs b/clients/mg-admin-client/build.rs index dcc7ae61cb..d9886d0ece 100644 --- a/clients/mg-admin-client/build.rs +++ b/clients/mg-admin-client/build.rs @@ -31,8 +31,9 @@ fn main() -> Result<()> { PackageSource::Prebuilt { commit, .. } => { // Report a relatively verbose error if we haven't downloaded the requisite // openapi spec. - let local_path = - format!("../../out/downloads/mg-admin-{commit}.json"); + let local_path = env::var("MG_OPENAPI_PATH").unwrap_or_else(|_| { + format!("../../out/downloads/mg-admin-{commit}.json") + }); if !Path::new(&local_path).exists() { bail!("{local_path} doesn't exist; rerun `tools/ci_download_maghemite_openapi` (after updating `tools/maghemite_mg_openapi_version` if the maghemite commit in package-manifest.toml has changed)"); } diff --git a/clients/sled-agent-client/Cargo.toml b/clients/sled-agent-client/Cargo.toml index 8630030b24..71b94441ed 100644 --- a/clients/sled-agent-client/Cargo.toml +++ b/clients/sled-agent-client/Cargo.toml @@ -5,6 +5,7 @@ edition = "2021" license = "MPL-2.0" [dependencies] +anyhow.workspace = true async-trait.workspace = true chrono.workspace = true omicron-common.workspace = true diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 39de64ec62..eb1e57b11f 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -4,8 +4,11 @@ //! Interface for making API requests to a Sled Agent +use anyhow::Context; use async_trait::async_trait; use std::convert::TryFrom; +use std::net::IpAddr; +use std::net::SocketAddr; use uuid::Uuid; progenitor::generate_api!( @@ -86,6 +89,74 @@ impl types::OmicronZoneType { | types::OmicronZoneType::Oximeter { .. } => false, } } + + /// Identifies whether this is a Nexus zone + pub fn is_nexus(&self) -> bool { + match self { + types::OmicronZoneType::Nexus { .. } => true, + + types::OmicronZoneType::BoundaryNtp { .. } + | types::OmicronZoneType::InternalNtp { .. } + | types::OmicronZoneType::Clickhouse { .. } + | types::OmicronZoneType::ClickhouseKeeper { .. } + | types::OmicronZoneType::CockroachDb { .. } + | types::OmicronZoneType::Crucible { .. } + | types::OmicronZoneType::CruciblePantry { .. } + | types::OmicronZoneType::ExternalDns { .. } + | types::OmicronZoneType::InternalDns { .. } + | types::OmicronZoneType::Oximeter { .. } => false, + } + } + + /// This zone's external IP + pub fn external_ip(&self) -> anyhow::Result<Option<IpAddr>> { + match self { + types::OmicronZoneType::Nexus { external_ip, .. } => { + Ok(Some(*external_ip)) + } + + types::OmicronZoneType::ExternalDns { dns_address, .. } => { + let dns_address = + dns_address.parse::<SocketAddr>().with_context(|| { + format!( + "failed to parse ExternalDns address {dns_address}" + ) + })?; + Ok(Some(dns_address.ip())) + } + + types::OmicronZoneType::BoundaryNtp { snat_cfg, .. } => { + Ok(Some(snat_cfg.ip)) + } + + types::OmicronZoneType::InternalNtp { .. } + | types::OmicronZoneType::Clickhouse { .. } + | types::OmicronZoneType::ClickhouseKeeper { .. } + | types::OmicronZoneType::CockroachDb { ..
} + | types::OmicronZoneType::Crucible { .. } + | types::OmicronZoneType::CruciblePantry { .. } + | types::OmicronZoneType::InternalDns { .. } + | types::OmicronZoneType::Oximeter { .. } => Ok(None), + } + } + + /// The service vNIC providing external connectivity to this zone + pub fn service_vnic(&self) -> Option<&types::NetworkInterface> { + match self { + types::OmicronZoneType::Nexus { nic, .. } + | types::OmicronZoneType::ExternalDns { nic, .. } + | types::OmicronZoneType::BoundaryNtp { nic, .. } => Some(nic), + + types::OmicronZoneType::InternalNtp { .. } + | types::OmicronZoneType::Clickhouse { .. } + | types::OmicronZoneType::ClickhouseKeeper { .. } + | types::OmicronZoneType::CockroachDb { .. } + | types::OmicronZoneType::Crucible { .. } + | types::OmicronZoneType::CruciblePantry { .. } + | types::OmicronZoneType::InternalDns { .. } + | types::OmicronZoneType::Oximeter { .. } => None, + } + } } impl omicron_common::api::external::ClientError for types::Error { @@ -351,7 +422,6 @@ impl From for types::Ipv6Net { impl From<std::net::IpAddr> for types::IpNet { fn from(s: std::net::IpAddr) -> Self { - use std::net::IpAddr; match s { IpAddr::V4(v4) => Self::V4(v4.into()), IpAddr::V6(v6) => Self::V6(v6.into()), diff --git a/clients/wicketd-client/src/lib.rs b/clients/wicketd-client/src/lib.rs index 01c3b04f87..09f9ca1418 100644 --- a/clients/wicketd-client/src/lib.rs +++ b/clients/wicketd-client/src/lib.rs @@ -51,6 +51,7 @@ progenitor::generate_api!( CurrentRssUserConfigInsensitive = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, CurrentRssUserConfigSensitive = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, CurrentRssUserConfig = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, + UserSpecifiedRackNetworkConfig = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, GetLocationResponse = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, }, replace = { diff --git a/common/src/address.rs b/common/src/address.rs index 65a6604daf..152fb9319e 100644 --- a/common/src/address.rs +++ b/common/src/address.rs @@ -24,6 +24,12 @@ pub const MAX_PORT: u16 = u16::MAX; /// minimum possible value for a tcp or udp port pub const MIN_PORT: u16 = u16::MIN; +/// The amount of redundancy for Nexus services. +/// +/// This is used by both RSS (to distribute the initial set of services) and the +/// Reconfigurator (to know whether to add new Nexus zones) +pub const NEXUS_REDUNDANCY: usize = 3; + /// The amount of redundancy for internal DNS servers. /// /// Must be less than or equal to MAX_DNS_REDUNDANCY. @@ -457,6 +463,18 @@ impl TryFrom<(Ipv6Addr, Ipv6Addr)> for IpRange { } } +impl From<Ipv4Range> for IpRange { + fn from(value: Ipv4Range) -> Self { + Self::V4(value) + } +} + +impl From<Ipv6Range> for IpRange { + fn from(value: Ipv6Range) -> Self { + Self::V6(value) + } +} + /// A non-decreasing IPv4 address range, inclusive of both ends. /// /// The first address must be less than or equal to the last address.
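[editor's note — not part of the patch] The two `From` impls added to common/src/address.rs above exist so call sites can convert a single-family range into the family-agnostic `IpRange` with a plain `.into()`. A minimal usage sketch under that assumption (the helper name `collect_ranges` is illustrative, not from this change):

    use omicron_common::address::{IpRange, Ipv4Range, Ipv6Range};

    // Accept either address family; both range types now satisfy
    // `Into<IpRange>`, so no match on the family is needed here.
    fn collect_ranges(v4: Ipv4Range, v6: Ipv6Range) -> Vec<IpRange> {
        vec![v4.into(), v6.into()]
    }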
diff --git a/common/src/nexus_config.rs b/common/src/nexus_config.rs index 24f4c34797..2545d4cb91 100644 --- a/common/src/nexus_config.rs +++ b/common/src/nexus_config.rs @@ -340,6 +340,8 @@ pub struct BackgroundTaskConfig { pub sync_service_zone_nat: SyncServiceZoneNatConfig, /// configuration for the bfd manager task pub bfd_manager: BfdManagerConfig, + /// configuration for region replacement task + pub region_replacement: RegionReplacementConfig, } #[serde_as] @@ -444,6 +446,14 @@ pub struct BlueprintTasksConfig { pub period_secs_execute: Duration, } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct RegionReplacementConfig { + /// period (in seconds) for periodic activations of this background task + #[serde_as(as = "DurationSeconds<u64>")] + pub period_secs: Duration, +} + /// Configuration for a nexus server #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct PackageConfig { @@ -548,8 +558,8 @@ mod test { ConfigDropshotWithTls, ConsoleConfig, Database, DeploymentConfig, DnsTasksConfig, DpdConfig, ExternalEndpointsConfig, InternalDns, InventoryConfig, LoadError, LoadErrorKind, MgdConfig, NatCleanupConfig, - PackageConfig, PhantomDiskConfig, SchemeName, TimeseriesDbConfig, - Tunables, UpdatesConfig, + PackageConfig, PhantomDiskConfig, RegionReplacementConfig, SchemeName, + TimeseriesDbConfig, Tunables, UpdatesConfig, }; use crate::address::{Ipv6Subnet, RACK_PREFIX}; use crate::api::internal::shared::SwitchLocation; @@ -706,6 +716,7 @@ mod test { blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 sync_service_zone_nat.period_secs = 30 + region_replacement.period_secs = 30 [default_region_allocation_strategy] type = "random" seed = 0 @@ -819,7 +830,10 @@ mod test { }, sync_service_zone_nat: SyncServiceZoneNatConfig { period_secs: Duration::from_secs(30) - } + }, + region_replacement: RegionReplacementConfig { + period_secs: Duration::from_secs(30), + }, }, default_region_allocation_strategy: crate::nexus_config::RegionAllocationStrategy::Random { @@ -882,6 +896,7 @@ mod test { blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 sync_service_zone_nat.period_secs = 30 + region_replacement.period_secs = 30 [default_region_allocation_strategy] type = "random" "##, diff --git a/dev-tools/omdb/Cargo.toml b/dev-tools/omdb/Cargo.toml index e08d5f9477..3f566f55ee 100644 --- a/dev-tools/omdb/Cargo.toml +++ b/dev-tools/omdb/Cargo.toml @@ -12,9 +12,12 @@ anyhow.workspace = true async-bb8-diesel.workspace = true chrono.workspace = true clap.workspace = true +crossterm.workspace = true crucible-agent-client.workspace = true +csv.workspace = true diesel.workspace = true dropshot.workspace = true +dyn-clone.workspace = true futures.workspace = true gateway-client.workspace = true gateway-messages.workspace = true @@ -29,6 +32,7 @@ omicron-common.workspace = true oximeter-client.workspace = true # See omicron-rpaths for more about the "pq-sys" dependency.
pq-sys = "*" +ratatui.workspace = true serde.workspace = true serde_json.workspace = true sled-agent-client.workspace = true @@ -43,6 +47,7 @@ uuid.workspace = true ipnetwork.workspace = true omicron-workspace-hack.workspace = true nexus-test-utils.workspace = true +multimap.workspace = true [dev-dependencies] expectorate.workspace = true diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index c2a4250595..9c41c25cc0 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -160,6 +160,8 @@ pub struct DbFetchOptions { /// Subcommands that query or update the database #[derive(Debug, Subcommand)] enum DbCommands { + /// Print information about the rack + Rack(RackArgs), /// Print information about disks Disks(DiskArgs), /// Print information about internal and external DNS @@ -180,6 +182,18 @@ enum DbCommands { Validate(ValidateArgs), } +#[derive(Debug, Args)] +struct RackArgs { + #[command(subcommand)] + command: RackCommands, +} + +#[derive(Debug, Subcommand)] +enum RackCommands { + /// Summarize current racks + List, +} + #[derive(Debug, Args)] struct DiskArgs { #[command(subcommand)] @@ -399,13 +413,16 @@ impl DbArgs { // here. We will then check the schema version explicitly and warn the // user if it doesn't match. let datastore = Arc::new( - DataStore::new_unchecked(pool) + DataStore::new_unchecked(log.clone(), pool) .map_err(|e| anyhow!(e).context("creating datastore"))?, ); check_schema_version(&datastore).await; let opctx = OpContext::for_tests(log.clone(), datastore.clone()); match &self.command { + DbCommands::Rack(RackArgs { command: RackCommands::List }) => { + cmd_db_rack_list(&opctx, &datastore, &self.fetch_opts).await + } DbCommands::Disks(DiskArgs { command: DiskCommands::Info(uuid), }) => cmd_db_disk_info(&opctx, &datastore, uuid).await, @@ -619,6 +636,50 @@ async fn cmd_db_disk_list( Ok(()) } +/// Run `omdb db rack info`. +async fn cmd_db_rack_list( + opctx: &OpContext, + datastore: &DataStore, + fetch_opts: &DbFetchOptions, +) -> Result<(), anyhow::Error> { + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct RackRow { + id: String, + initialized: bool, + tuf_base_url: String, + rack_subnet: String, + } + + let ctx = || "listing racks".to_string(); + + let limit = fetch_opts.fetch_limit; + let rack_list = datastore + .rack_list(opctx, &first_page(limit)) + .await + .context("listing racks")?; + check_limit(&rack_list, limit, ctx); + + let rows = rack_list.into_iter().map(|rack| RackRow { + id: rack.id().to_string(), + initialized: rack.initialized, + tuf_base_url: rack.tuf_base_url.unwrap_or_else(|| "-".to_string()), + rack_subnet: rack + .rack_subnet + .map(|subnet| subnet.to_string()) + .unwrap_or_else(|| "-".to_string()), + }); + + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + + println!("{}", table); + + Ok(()) +} + /// Run `omdb db disk info `. 
async fn cmd_db_disk_info( opctx: &OpContext, diff --git a/dev-tools/omdb/src/bin/omdb/mgs.rs b/dev-tools/omdb/src/bin/omdb/mgs.rs index 770cba9f62..ece4c4f109 100644 --- a/dev-tools/omdb/src/bin/omdb/mgs.rs +++ b/dev-tools/omdb/src/bin/omdb/mgs.rs @@ -22,6 +22,12 @@ use gateway_client::types::SpState; use gateway_client::types::SpType; use tabled::Tabled; +mod dashboard; +mod sensors; + +use dashboard::DashboardArgs; +use sensors::SensorsArgs; + /// Arguments to the "omdb mgs" subcommand #[derive(Debug, Args)] pub struct MgsArgs { @@ -35,19 +41,25 @@ pub struct MgsArgs { #[derive(Debug, Subcommand)] enum MgsCommands { + /// Dashboard of SPs + Dashboard(DashboardArgs), + /// Show information about devices and components visible to MGS Inventory(InventoryArgs), + + /// Show information about sensors, as gleaned by MGS + Sensors(SensorsArgs), } #[derive(Debug, Args)] struct InventoryArgs {} impl MgsArgs { - pub(crate) async fn run_cmd( + async fn mgs_client( &self, omdb: &Omdb, log: &slog::Logger, - ) -> Result<(), anyhow::Error> { + ) -> Result<gateway_client::Client, anyhow::Error> { let mgs_url = match &self.mgs_url { Some(cli_or_env_url) => cli_or_env_url.clone(), None => { @@ -68,11 +80,24 @@ impl MgsArgs { } }; eprintln!("note: using MGS URL {}", &mgs_url); - let mgs_client = gateway_client::Client::new(&mgs_url, log.clone()); + Ok(gateway_client::Client::new(&mgs_url, log.clone())) + } + pub(crate) async fn run_cmd( + &self, + omdb: &Omdb, + log: &slog::Logger, + ) -> Result<(), anyhow::Error> { match &self.command { - MgsCommands::Inventory(inventory_args) => { - cmd_mgs_inventory(&mgs_client, inventory_args).await + MgsCommands::Dashboard(args) => { + dashboard::cmd_mgs_dashboard(omdb, log, self, args).await + } + MgsCommands::Inventory(args) => { + let mgs_client = self.mgs_client(omdb, log).await?; + cmd_mgs_inventory(&mgs_client, args).await + } + MgsCommands::Sensors(args) => { + sensors::cmd_mgs_sensors(omdb, log, self, args).await + } } } @@ -156,6 +181,10 @@ fn sp_type_to_str(s: &SpType) -> &'static str { } } +fn sp_to_string(s: &SpIdentifier) -> String { + format!("{} {}", sp_type_to_str(&s.type_), s.slot) +} + fn show_sp_ids(sp_ids: &[SpIdentifier]) -> Result<(), anyhow::Error> { #[derive(Tabled)] #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] diff --git a/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs b/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs new file mode 100644 index 0000000000..153618b7c0 --- /dev/null +++ b/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs @@ -0,0 +1,1113 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//!
Code for the MGS dashboard subcommand +use anyhow::{Context, Result}; +use chrono::{Local, Offset, TimeZone}; +use crossterm::{ + event::{ + self, DisableMouseCapture, EnableMouseCapture, Event, KeyCode, + KeyModifiers, + }, + execute, + terminal::{ + disable_raw_mode, enable_raw_mode, EnterAlternateScreen, + LeaveAlternateScreen, + }, +}; +use dyn_clone::DynClone; +use ratatui::{ + backend::{Backend, CrosstermBackend}, + layout::{Alignment, Constraint, Direction, Layout, Rect}, + style::{Color, Modifier, Style}, + symbols, + text::{Line, Span}, + widgets::{ + Axis, Block, Borders, Chart, Dataset, List, ListItem, ListState, + Paragraph, + }, + Frame, Terminal, +}; + +use crate::mgs::sensors::{ + sensor_data, sensor_metadata, SensorId, SensorInput, SensorMetadata, + SensorValues, SensorsArgs, +}; +use crate::mgs::sp_to_string; +use clap::Args; +use gateway_client::types::MeasurementKind; +use gateway_client::types::SpIdentifier; +use multimap::MultiMap; +use std::collections::HashMap; +use std::fs::File; +use std::io; +use std::time::{Duration, Instant, SystemTime}; + +#[derive(Debug, Args)] +pub(crate) struct DashboardArgs { + #[clap(flatten)] + sensors_args: SensorsArgs, + + /// simulate real-time with input + #[clap(long)] + simulate_realtime: bool, +} + +struct StatefulList { + state: ListState, + n: usize, +} + +impl StatefulList { + fn next(&mut self) { + self.state.select(match self.state.selected() { + Some(ndx) => Some((ndx + 1) % self.n), + None => Some(0), + }); + } + + fn previous(&mut self) { + self.state.select(match self.state.selected() { + Some(0) => Some(self.n - 1), + Some(ndx) => Some(ndx - 1), + None => Some(0), + }); + } + + fn unselect(&mut self) { + self.state.select(None); + } + + fn selected(&self) -> Option<usize> { + self.state.selected() + } +} + +struct Series { + name: String, + color: Color, + data: Vec<(f64, f64)>, + raw: Vec<Option<f32>>, +} + +trait Attributes: DynClone { + fn label(&self) -> String; + fn legend_label(&self) -> String; + fn x_axis_label(&self) -> String { + "Time".to_string() + } + fn y_axis_label(&self) -> String; + fn axis_value(&self, val: f64) -> String; + fn legend_value(&self, val: f64) -> String; + + fn increase(&mut self, _ndx: usize) -> Option<u8> { + None + } + + fn decrease(&mut self, _ndx: usize) -> Option<u8> { + None + } + + fn clear(&mut self) {} +} + +dyn_clone::clone_trait_object!(Attributes); + +#[derive(Clone)] +struct TempGraph; + +impl Attributes for TempGraph { + fn label(&self) -> String { + "Temperature".to_string() + } + fn legend_label(&self) -> String { + "Sensors".to_string() + } + + fn y_axis_label(&self) -> String { + "Degrees Celsius".to_string() + } + + fn axis_value(&self, val: f64) -> String { + format!("{:2.0}°", val) + } + + fn legend_value(&self, val: f64) -> String { + format!("{:4.2}°", val) + } +} + +#[derive(Clone)] +struct FanGraph; + +impl Attributes for FanGraph { + fn label(&self) -> String { + "Fan speed".to_string() + } + fn legend_label(&self) -> String { + "Fans".to_string() + } + + fn y_axis_label(&self) -> String { + "RPM".to_string() + } + + fn axis_value(&self, val: f64) -> String { + format!("{:3.1}K", val / 1000.0) + } + + fn legend_value(&self, val: f64) -> String { + format!("{:.0}", val) + } +} + +#[derive(Clone)] +struct CurrentGraph; + +impl Attributes for CurrentGraph { + fn label(&self) -> String { + "Output current".to_string() + } + + fn legend_label(&self) -> String { + "Regulators".to_string() + } + + fn y_axis_label(&self) -> String { + "Rails".to_string() + } + + fn axis_value(&self, val: f64) ->
String { + format!("{:2.2}A", val) + } + + fn legend_value(&self, val: f64) -> String { + format!("{:3.2}A", val) + } +} + +#[derive(Clone)] +struct VoltageGraph; + +impl Attributes for VoltageGraph { + fn label(&self) -> String { + "Voltage".to_string() + } + + fn legend_label(&self) -> String { + "Rails".to_string() + } + + fn y_axis_label(&self) -> String { + "Volts".to_string() + } + + fn axis_value(&self, val: f64) -> String { + format!("{:2.2}V", val) + } + + fn legend_value(&self, val: f64) -> String { + format!("{:3.2}V", val) + } +} + +#[derive(Clone)] +struct SensorGraph; + +impl Attributes for SensorGraph { + fn label(&self) -> String { + "Sensor output".to_string() + } + + fn legend_label(&self) -> String { + "Sensors".to_string() + } + + fn y_axis_label(&self) -> String { + "Units".to_string() + } + + fn axis_value(&self, val: f64) -> String { + format!("{:2.2}", val) + } + + fn legend_value(&self, val: f64) -> String { + format!("{:3.2}", val) + } +} + +struct Graph { + series: Vec<Series>, + legend: StatefulList, + time: usize, + width: usize, + offs: usize, + interpolate: usize, + bounds: [f64; 2], + attributes: Box<dyn Attributes>, +} + +impl Graph { + fn new(all: &[String], attr: Box<dyn Attributes>) -> Result<Self> { + let mut series = vec![]; + + let colors = [ + Color::Yellow, + Color::Green, + Color::Magenta, + Color::White, + Color::Red, + Color::LightRed, + Color::Blue, + Color::LightMagenta, + Color::LightYellow, + Color::LightCyan, + Color::LightGreen, + Color::LightBlue, + Color::LightRed, + ]; + + for (ndx, s) in all.iter().enumerate() { + series.push(Series { + name: s.to_string(), + color: colors[ndx % colors.len()], + data: Vec::new(), + raw: Vec::new(), + }) + } + + Ok(Graph { + series, + legend: StatefulList { state: ListState::default(), n: all.len() }, + time: 0, + width: 600, + offs: 0, + interpolate: 0, + bounds: [20.0, 120.0], + attributes: attr, + }) + } + + fn flip(from: &[(&Self, String)], series_ndx: usize) -> Self { + let rep = from[0].0; + let mut series = vec![]; + + let colors = [ + Color::Yellow, + Color::Green, + Color::Magenta, + Color::White, + Color::Red, + Color::LightRed, + Color::Blue, + Color::LightMagenta, + Color::LightYellow, + Color::LightCyan, + Color::LightGreen, + Color::LightBlue, + Color::LightRed, + ]; + + for (ndx, (graph, name)) in from.iter().enumerate() { + series.push(Series { + name: name.clone(), + color: colors[ndx % colors.len()], + data: graph.series[series_ndx].data.clone(), + raw: graph.series[series_ndx].raw.clone(), + }); + } + + Graph { + series, + legend: StatefulList { state: ListState::default(), n: from.len() }, + time: rep.time, + width: rep.width, + offs: rep.offs, + interpolate: rep.interpolate, + bounds: rep.bounds, + attributes: rep.attributes.clone(), + } + } + + fn data(&mut self, data: &[Option<f32>]) { + for (ndx, s) in self.series.iter_mut().enumerate() { + s.raw.push(data[ndx]); + } + + self.time += 1; + + if self.offs > 0 { + self.offs += 1; + } + } + + fn update_data(&mut self) { + for s in &mut self.series { + s.data = Vec::new(); + } + + for i in 0..self.width { + if self.time < (self.width - i) + self.offs { + continue; + } + + let offs = self.time - (self.width - i) - self.offs; + + for (_ndx, s) in &mut self.series.iter_mut().enumerate() { + if let Some(datum) = s.raw[offs] { + let point = (i as f64, datum as f64); + + if self.interpolate != 0 { + if let Some(last) = s.data.last() { + let x_delta = point.0 - last.0; + let slope = (point.1 - last.1) / x_delta; + let x_inc = x_delta / self.interpolate as f64; + + for x in 0..self.interpolate
{ + s.data.push(( + point.0 + x as f64 * x_inc, + point.1 + (slope * x_inc), + )); + } + } + } + + s.data.push((i as f64, datum as f64)); + } + } + } + + self.update_bounds(); + } + + fn update_bounds(&mut self) { + let selected = self.legend.state.selected(); + let mut min = None; + let mut max = None; + + for (ndx, s) in self.series.iter().enumerate() { + if let Some(selected) = selected { + if ndx != selected { + continue; + } + } + + for (_, datum) in &s.data { + min = match min { + Some(min) if datum < min => Some(datum), + None => Some(datum), + _ => min, + }; + + max = match max { + Some(max) if datum > max => Some(datum), + None => Some(datum), + _ => max, + }; + } + } + + if let Some(min) = min { + self.bounds[0] = ((min * 0.85) / 2.0) * 2.0; + } + + if self.bounds[0] < 0.0 { + self.bounds[0] = 0.0; + } + + if let Some(max) = max { + self.bounds[1] = ((max * 1.15) / 2.0) * 2.0; + } + } + + fn previous(&mut self) { + self.legend.previous(); + } + + fn next(&mut self) { + self.legend.next(); + } + + fn unselect(&mut self) { + self.legend.unselect(); + } + + fn selected(&self) -> Option<usize> { + self.legend.selected() + } + + fn set_interpolate(&mut self) { + let interpolate = (1000.0 - self.width as f64) / self.width as f64; + + if interpolate >= 1.0 { + self.interpolate = interpolate as usize; + } else { + self.interpolate = 0; + } + } + + fn zoom_in(&mut self) { + self.width = (self.width as f64 * 0.8) as usize; + self.set_interpolate(); + } + + fn zoom_out(&mut self) { + self.width = (self.width as f64 * 1.25) as usize; + self.set_interpolate(); + } + + fn time_right(&mut self) { + let delta = (self.width as f64 * 0.25) as usize; + + if delta > self.offs { + self.offs = 0; + } else { + self.offs -= delta; + } + } + + fn time_left(&mut self) { + self.offs += (self.width as f64 * 0.25) as usize; + } +} + +struct Dashboard { + graphs: HashMap<(SpIdentifier, MeasurementKind), Graph>, + flipped: HashMap<MeasurementKind, Graph>, + sids: HashMap<(SpIdentifier, MeasurementKind), Vec<SensorId>>, + kinds: Vec<MeasurementKind>, + selected_kind: usize, + sps: Vec<SpIdentifier>, + selected_sp: usize, + status: String, + time: u64, +} + +impl Dashboard { + fn new(metadata: &SensorMetadata) -> Result<Self> { + let mut sps = + metadata.sensors_by_sp.keys().copied().collect::<Vec<_>>(); + let mut graphs = HashMap::new(); + let mut sids = HashMap::new(); + sps.sort(); + + let kinds = vec![ + MeasurementKind::Temperature, + MeasurementKind::Speed, + MeasurementKind::Current, + ]; + + for &sp in sps.iter() { + let sensors = metadata.sensors_by_sp.get_vec(&sp).unwrap(); + let mut by_kind = MultiMap::new(); + + for sid in sensors { + let (_, s, _) = metadata.sensors_by_id.get(sid).unwrap(); + by_kind.insert(s.kind, (s.name.clone(), *sid)); + } + + let keys = by_kind.keys().copied().collect::<Vec<_>>(); + + for k in keys { + let mut v = by_kind.remove(&k).unwrap(); + v.sort(); + + let labels = + v.iter().map(|(n, _)| n.clone()).collect::<Vec<_>>(); + + graphs.insert( + (sp, k), + Graph::new( + labels.as_slice(), + match k { + MeasurementKind::Temperature => Box::new(TempGraph), + MeasurementKind::Current => Box::new(CurrentGraph), + MeasurementKind::Speed => Box::new(FanGraph), + MeasurementKind::Voltage => Box::new(VoltageGraph), + _ => Box::new(SensorGraph), + }, + )?, + ); + + sids.insert( + (sp, k), + v.iter().map(|(_, sid)| *sid).collect::<Vec<_>>(), + ); + } + } + + let status = sp_to_string(&sps[0]); + + Ok(Dashboard { + graphs, + flipped: HashMap::new(), + sids, + kinds, + selected_kind: 0, + sps, + selected_sp: 0, + status, + time: secs()?, + }) + } + + fn status(&self) -> Vec<(&str, &str)> {
vec![("Status", &self.status)] + } + + fn update_data(&mut self) { + for graph in self.graphs.values_mut() { + graph.update_data(); + } + + for graph in self.flipped.values_mut() { + graph.update_data(); + } + } + + fn up(&mut self) { + let selected_kind = self.kinds[self.selected_kind]; + let type_ = self.sps[self.selected_sp].type_; + + if let Some(flipped) = self.flipped.get_mut(&selected_kind) { + flipped.previous(); + return; + } + + for sp in self.sps.iter().filter(|&s| s.type_ == type_) { + self.graphs.get_mut(&(*sp, selected_kind)).unwrap().previous(); + } + } + + fn down(&mut self) { + let selected_kind = self.kinds[self.selected_kind]; + let type_ = self.sps[self.selected_sp].type_; + + if let Some(flipped) = self.flipped.get_mut(&selected_kind) { + flipped.next(); + return; + } + + for sp in self.sps.iter().filter(|&s| s.type_ == type_) { + self.graphs.get_mut(&(*sp, selected_kind)).unwrap().next(); + } + } + + fn esc(&mut self) { + let selected_kind = self.kinds[self.selected_kind]; + let type_ = self.sps[self.selected_sp].type_; + + if let Some(flipped) = self.flipped.get_mut(&selected_kind) { + flipped.unselect(); + return; + } + + for sp in self.sps.iter().filter(|&s| s.type_ == type_) { + self.graphs.get_mut(&(*sp, selected_kind)).unwrap().unselect(); + } + } + + fn left(&mut self) { + if self.selected_sp == 0 { + self.selected_sp = self.sps.len() - 1; + } else { + self.selected_sp -= 1; + } + + self.status = sp_to_string(&self.sps[self.selected_sp]); + } + + fn right(&mut self) { + self.selected_sp = (self.selected_sp + 1) % self.sps.len(); + self.status = sp_to_string(&self.sps[self.selected_sp]); + } + + fn time_left(&mut self) { + for graph in self.graphs.values_mut() { + graph.time_left(); + } + + for graph in self.flipped.values_mut() { + graph.time_left(); + } + } + + fn time_right(&mut self) { + for graph in self.graphs.values_mut() { + graph.time_right(); + } + + for graph in self.flipped.values_mut() { + graph.time_right(); + } + } + + fn flip(&mut self) { + let selected_kind = self.kinds[self.selected_kind]; + let type_ = self.sps[self.selected_sp].type_; + + if self.flipped.remove(&selected_kind).is_some() { + return; + } + + let sp = self.sps[self.selected_sp]; + + let graph = self.graphs.get(&(sp, selected_kind)).unwrap(); + + if let Some(ndx) = graph.selected() { + let mut from = vec![]; + + for sp in self.sps.iter().filter(|&s| s.type_ == type_) { + from.push(( + self.graphs.get(&(*sp, selected_kind)).unwrap(), + sp_to_string(sp), + )); + } + + self.flipped + .insert(selected_kind, Graph::flip(from.as_slice(), ndx)); + } + } + + fn tab(&mut self) { + self.selected_kind = (self.selected_kind + 1) % self.kinds.len(); + } + + fn zoom_in(&mut self) { + for graph in self.graphs.values_mut() { + graph.zoom_in(); + } + + for graph in self.flipped.values_mut() { + graph.zoom_in(); + } + } + + fn zoom_out(&mut self) { + for graph in self.graphs.values_mut() { + graph.zoom_out(); + } + + for graph in self.flipped.values_mut() { + graph.zoom_out(); + } + } + + fn gap(&mut self, length: u64) { + let mut gap: Vec> = vec![]; + + for (graph, sids) in &self.sids { + while gap.len() < sids.len() { + gap.push(None); + } + + let graph = self.graphs.get_mut(graph).unwrap(); + + for _ in 0..length { + graph.data(&gap[0..sids.len()]); + } + } + } + + fn values(&mut self, values: &SensorValues) { + for (graph, sids) in &self.sids { + let mut data = vec![]; + + for sid in sids { + if let Some(value) = values.values.get(sid) { + data.push(*value); + } else { + data.push(None); + } + 
} + + let graph = self.graphs.get_mut(graph).unwrap(); + graph.data(data.as_slice()); + } + + self.time = values.time; + } +} + +fn run_dashboard<B: Backend>( + terminal: &mut Terminal<B>, + dashboard: &mut Dashboard, + force_update: bool, +) -> Result<bool> { + let update = if crossterm::event::poll(Duration::from_secs(0))? { + if let Event::Key(key) = event::read()? { + match key.code { + KeyCode::Char('q') => return Ok(true), + KeyCode::Char('+') => dashboard.zoom_in(), + KeyCode::Char('-') => dashboard.zoom_out(), + KeyCode::Char('<') => dashboard.time_left(), + KeyCode::Char('>') => dashboard.time_right(), + KeyCode::Char('!') => dashboard.flip(), + KeyCode::Char('l') => { + // + // ^L -- form feed -- is historically used to clear and + // redraw the screen. And, notably, it is what dtach(1) + // will send when attaching to a dashboard. If we + // see ^L, clear the terminal to force a total redraw. + // + if key.modifiers == KeyModifiers::CONTROL { + terminal.clear()?; + } + } + KeyCode::Up => dashboard.up(), + KeyCode::Down => dashboard.down(), + KeyCode::Right => dashboard.right(), + KeyCode::Left => dashboard.left(), + KeyCode::Esc => dashboard.esc(), + KeyCode::Tab => dashboard.tab(), + _ => {} + } + } + true + } else { + force_update + }; + + if update { + dashboard.update_data(); + terminal.draw(|f| draw(f, dashboard))?; + } + + Ok(false) +} + +fn secs() -> Result<u64> { + let now = SystemTime::now().duration_since(SystemTime::UNIX_EPOCH)?; + Ok(now.as_secs()) +} + +/// +/// Runs `omdb mgs dashboard` +/// +pub(crate) async fn cmd_mgs_dashboard( + omdb: &crate::Omdb, + log: &slog::Logger, + mgs_args: &crate::mgs::MgsArgs, + args: &DashboardArgs, +) -> Result<(), anyhow::Error> { + let mut input = if let Some(ref input) = args.sensors_args.input { + let file = File::open(input) + .with_context(|| format!("failed to open {input}"))?; + SensorInput::CsvReader( + csv::Reader::from_reader(file), + csv::Position::new(), + ) + } else { + SensorInput::MgsClient(mgs_args.mgs_client(omdb, log).await?)
+ }; + + let (metadata, values) = + sensor_metadata(&mut input, &args.sensors_args).await?; + + let mut dashboard = Dashboard::new(&metadata)?; + let mut last = values.time; + let mut force = true; + let mut update = true; + + dashboard.values(&values); + + if args.sensors_args.input.is_some() && !args.simulate_realtime { + loop { + let values = sensor_data(&mut input, &metadata).await?; + + if values.time == 0 { + break; + } + + if values.time != last + 1 { + dashboard.gap(values.time - last - 1); + } + + last = values.time; + dashboard.values(&values); + } + + update = false; + } + + // setup terminal + enable_raw_mode()?; + let mut stdout = io::stdout(); + execute!(stdout, EnterAlternateScreen, EnableMouseCapture)?; + let backend = CrosstermBackend::new(stdout); + let mut terminal = Terminal::new(backend)?; + + let res = 'outer: loop { + match run_dashboard(&mut terminal, &mut dashboard, force) { + Err(err) => break Err(err), + Ok(true) => break Ok(()), + _ => {} + } + + force = false; + + let now = match secs() { + Err(err) => break Err(err), + Ok(now) => now, + }; + + if update && now != last { + let kicked = Instant::now(); + let f = sensor_data(&mut input, &metadata); + last = now; + + while Instant::now().duration_since(kicked).as_millis() < 800 { + tokio::time::sleep(Duration::from_millis(10)).await; + + match run_dashboard(&mut terminal, &mut dashboard, force) { + Err(err) => break 'outer Err(err), + Ok(true) => break 'outer Ok(()), + _ => {} + } + } + + let values = match f.await { + Err(err) => break Err(err), + Ok(v) => v, + }; + + dashboard.values(&values); + force = true; + continue; + } + + tokio::time::sleep(Duration::from_millis(10)).await; + }; + + // restore terminal + disable_raw_mode()?; + execute!( + terminal.backend_mut(), + LeaveAlternateScreen, + DisableMouseCapture + )?; + terminal.show_cursor()?; + + if let Err(err) = res { + println!("{err:?}"); + } + + Ok(()) +} + +fn draw_graph(f: &mut Frame, parent: Rect, graph: &mut Graph, now: u64) { + // + // We want the right panel to be 31 characters wide (a left-justified 20 + // and a right justified 8 + margins), but we don't want it to consume + // more than 80%; calculate accordingly. + // + let r = std::cmp::min((31 * 100) / parent.width, 80); + + let chunks = Layout::default() + .direction(Direction::Horizontal) + .constraints( + [Constraint::Percentage(100 - r), Constraint::Percentage(r)] + .as_ref(), + ) + .split(parent); + + let latest = now as i64 - graph.offs as i64; + let earliest = Local.timestamp_opt(latest - graph.width as i64, 0).unwrap(); + let latest = Local.timestamp_opt(latest, 0).unwrap(); + + // + // We want a format that preserves horizontal real estate just a tad more + // than .to_rfc3339_opts()... 
+ // + let fmt = "%Y-%m-%d %H:%M:%S"; + + let tz_offset = earliest.offset().fix().local_minus_utc(); + let tz = if tz_offset != 0 { + let hours = tz_offset / 3600; + let minutes = (tz_offset % 3600) / 60; + + if minutes != 0 { + format!("Z{:+}:{:02}", hours, minutes.abs()) + } else { + format!("Z{:+}", hours) + } + } else { + "Z".to_string() + }; + + let x_labels = vec![ + Span::styled( + format!("{}{}", earliest.format(fmt), tz), + Style::default().add_modifier(Modifier::BOLD), + ), + Span::styled( + format!("{}{}", latest.format(fmt), tz), + Style::default().add_modifier(Modifier::BOLD), + ), + ]; + + let mut datasets = vec![]; + let selected = graph.legend.state.selected(); + + for (ndx, s) in graph.series.iter().enumerate() { + if let Some(selected) = selected { + if ndx != selected { + continue; + } + } + + datasets.push( + Dataset::default() + .name(&*s.name) + .marker(symbols::Marker::Braille) + .style(Style::default().fg(s.color)) + .data(&s.data), + ); + } + + let chart = Chart::new(datasets) + .block( + Block::default() + .title(Span::styled( + graph.attributes.label(), + Style::default() + .fg(Color::Cyan) + .add_modifier(Modifier::BOLD), + )) + .borders(Borders::ALL), + ) + .x_axis( + Axis::default() + .title(graph.attributes.x_axis_label()) + .style(Style::default().fg(Color::Gray)) + .labels(x_labels) + .bounds([0.0, graph.width as f64]) + .labels_alignment(Alignment::Right), + ) + .y_axis( + Axis::default() + .title(graph.attributes.y_axis_label()) + .style(Style::default().fg(Color::Gray)) + .labels(vec![ + Span::styled( + graph.attributes.axis_value(graph.bounds[0]), + Style::default().add_modifier(Modifier::BOLD), + ), + Span::styled( + graph.attributes.axis_value(graph.bounds[1]), + Style::default().add_modifier(Modifier::BOLD), + ), + ]) + .bounds(graph.bounds), + ); + + f.render_widget(chart, chunks[0]); + + let mut rows = vec![]; + + for s in &graph.series { + let val = match s.raw.last() { + None | Some(None) => "-".to_string(), + Some(Some(val)) => graph.attributes.legend_value((*val).into()), + }; + + rows.push(ListItem::new(Line::from(vec![ + Span::styled( + format!("{:<20}", s.name), + Style::default().fg(s.color), + ), + Span::styled(format!("{:>8}", val), Style::default().fg(s.color)), + ]))); + } + + let list = List::new(rows) + .block( + Block::default() + .borders(Borders::ALL) + .title(graph.attributes.legend_label()), + ) + .highlight_style( + Style::default() + .bg(Color::LightGreen) + .fg(Color::Black) + .add_modifier(Modifier::BOLD), + ); + + // We can now render the item list + f.render_stateful_widget(list, chunks[1], &mut graph.legend.state); +} + +fn draw_graphs(f: &mut Frame, parent: Rect, dashboard: &mut Dashboard) { + let screen = Layout::default() + .direction(Direction::Vertical) + .constraints( + [ + Constraint::Ratio(1, 2), + Constraint::Ratio(1, 4), + Constraint::Ratio(1, 4), + ] + .as_ref(), + ) + .split(parent); + + let sp = dashboard.sps[dashboard.selected_sp]; + + for (i, k) in dashboard.kinds.iter().enumerate() { + if let Some(graph) = dashboard.flipped.get_mut(k) { + draw_graph(f, screen[i], graph, dashboard.time); + } else { + draw_graph( + f, + screen[i], + dashboard.graphs.get_mut(&(sp, *k)).unwrap(), + dashboard.time, + ); + } + } +} + +fn draw_status(f: &mut Frame, parent: Rect, status: &[(&str, &str)]) { + let mut bar = vec![]; + + for i in 0..status.len() { + let s = &status[i]; + + bar.push(Span::styled( + s.0, + Style::default().add_modifier(Modifier::BOLD), + )); + + bar.push(Span::styled( + ": ", + 
Style::default().add_modifier(Modifier::BOLD), + )); + + bar.push(Span::raw(s.1)); + + if i < status.len() - 1 { + bar.push(Span::raw(" | ")); + } + } + + let text = vec![Line::from(bar)]; + + let para = Paragraph::new(text) + .alignment(Alignment::Right) + .style(Style::default().fg(Color::White).bg(Color::Black)); + + f.render_widget(para, parent); +} + +fn draw(f: &mut Frame, dashboard: &mut Dashboard) { + let size = f.size(); + + let screen = Layout::default() + .direction(Direction::Vertical) + .constraints([Constraint::Min(1), Constraint::Length(1)].as_ref()) + .split(size); + + draw_graphs(f, screen[0], dashboard); + draw_status(f, screen[1], &dashboard.status()); +} diff --git a/dev-tools/omdb/src/bin/omdb/mgs/sensors.rs b/dev-tools/omdb/src/bin/omdb/mgs/sensors.rs new file mode 100644 index 0000000000..d00bebd96c --- /dev/null +++ b/dev-tools/omdb/src/bin/omdb/mgs/sensors.rs @@ -0,0 +1,950 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Implementation of the "mgs sensors" subcommand + +use anyhow::{bail, Context}; +use clap::Args; +use gateway_client::types::MeasurementErrorCode; +use gateway_client::types::MeasurementKind; +use gateway_client::types::SpComponentDetails; +use gateway_client::types::SpIdentifier; +use gateway_client::types::SpIgnition; +use gateway_client::types::SpType; +use multimap::MultiMap; +use std::collections::{HashMap, HashSet}; +use std::fs::File; +use std::sync::Arc; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +#[derive(Debug, Args)] +pub(crate) struct SensorsArgs { + /// verbose messages + #[clap(long, short)] + pub verbose: bool, + + /// restrict to specified sled(s) + #[clap(long, use_value_delimiter = true)] + pub sled: Vec<u32>, + + /// exclude sleds rather than include them + #[clap(long, short)] + pub exclude: bool, + + /// include switches + #[clap(long)] + pub switches: bool, + + /// include PSC + #[clap(long)] + pub psc: bool, + + /// print sensors every second + #[clap(long, short)] + pub sleep: bool, + + /// parseable output + #[clap(long, short)] + pub parseable: bool, + + /// show latencies + #[clap(long)] + pub show_latencies: bool, + + /// restrict sensors by type of sensor + #[clap( + long, + short, + value_name = "sensor type", + use_value_delimiter = true + )] + pub types: Option<Vec<String>>, + + /// restrict sensors by name + #[clap( + long, + short, + value_name = "sensor name", + use_value_delimiter = true + )] + pub named: Option<Vec<String>>, + + /// simulate using specified file as input + #[clap(long, short)] + pub input: Option<String>, + + /// start time, if using an input file + #[clap(long, value_name = "time", requires = "input")] + pub start: Option<u64>, + + /// end time, if using an input file + #[clap(long, value_name = "time", requires = "input")] + pub end: Option<u64>, + + /// duration, if using an input file + #[clap( + long, + value_name = "seconds", + requires = "input", + conflicts_with = "end" + )] + pub duration: Option<u64>, +} + +impl SensorsArgs { + fn matches_sp(&self, sp: &SpIdentifier) -> bool { + match sp.type_ { + SpType::Sled => { + let matched = if !self.sled.is_empty() { + self.sled.contains(&sp.slot) + } else { + true + }; + + matched != self.exclude + } + SpType::Switch => self.switches, + SpType::Power => self.psc, + } + } +} + +#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub(crate) struct Sensor { + pub name: String, + pub kind: MeasurementKind, +} +
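+ // + // For illustration (values made up), a CSV --input file, as parsed by + // sp_info_csv() below, is shaped like this -- the KIND column uses the + // strings that from_string() accepts, "X" marks a failed reading, and + // "-" marks a device that is absent: + // + // TIME,SENSOR,KIND,SLED-0,SLED-1 + // 1705014049,Southwest,temp,41.00,42.06 + // 1705014049,V12_SYS_A2,voltage,12.03,X + //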
+impl Sensor { + fn units(&self) -> &str { + match self.kind { + MeasurementKind::Temperature => "°C", + MeasurementKind::Current | MeasurementKind::InputCurrent => "A", + MeasurementKind::Voltage | MeasurementKind::InputVoltage => "V", + MeasurementKind::Speed => "RPM", + MeasurementKind::Power => "W", + } + } + + fn format(&self, value: f32, parseable: bool) -> String { + if parseable { + format!("{value}") + } else { + match self.kind { + MeasurementKind::Speed => { + // + // This space is deliberate: other units (°C, V, A) look + // more natural when directly attached to their value -- + // but RPM looks decidedly unnatural without a space. + // + format!("{value:0} RPM") + } + _ => { + format!("{value:.2}{}", self.units()) + } + } + } + } + + fn to_kind_string(&self) -> &str { + match self.kind { + MeasurementKind::Temperature => "temp", + MeasurementKind::Power => "power", + MeasurementKind::Current => "current", + MeasurementKind::Voltage => "voltage", + MeasurementKind::InputCurrent => "input-current", + MeasurementKind::InputVoltage => "input-voltage", + MeasurementKind::Speed => "speed", + } + } + + fn from_string(name: &str, kind: &str) -> Option<Self> { + let k = match kind { + "temp" | "temperature" => Some(MeasurementKind::Temperature), + "power" => Some(MeasurementKind::Power), + "current" => Some(MeasurementKind::Current), + "voltage" => Some(MeasurementKind::Voltage), + "input-current" => Some(MeasurementKind::InputCurrent), + "input-voltage" => Some(MeasurementKind::InputVoltage), + "speed" => Some(MeasurementKind::Speed), + _ => None, + }; + + k.map(|kind| Sensor { name: name.to_string(), kind }) + } +} + +pub(crate) enum SensorInput { + MgsClient(gateway_client::Client), + CsvReader(csv::Reader<File>, csv::Position), +} + +#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub struct SensorId(u32); + +#[derive(Debug)] +pub(crate) struct SensorMetadata { + pub sensors_by_sensor: MultiMap<Sensor, SensorId>, + pub sensors_by_sensor_and_sp: + HashMap<Sensor, HashMap<SpIdentifier, SensorId>>, + pub sensors_by_id: + HashMap<SensorId, (SpIdentifier, Sensor, DeviceIdentifier)>, + pub sensors_by_sp: MultiMap<SpIdentifier, SensorId>, + pub work_by_sp: + HashMap<SpIdentifier, Vec<(DeviceIdentifier, Vec<SensorId>)>>, + #[allow(dead_code)] + pub start_time: Option<u64>, + pub end_time: Option<u64>, +} + +struct SensorSpInfo { + info: Vec<(SpIdentifier, SpInfo)>, + time: u64, + latencies: Option<HashMap<SpIdentifier, Duration>>, +} + +pub(crate) struct SensorValues { + pub values: HashMap<SensorId, Option<f32>>, + pub latencies: Option<HashMap<SpIdentifier, Duration>>, + pub time: u64, +} + +/// +/// We identify a device as either a physical device (i.e., when connecting +/// to MGS), or as a field in the CSV header (i.e., when processing data +/// postmortem). It's handy to have this as an enum to allow most of the code +/// to be agnostic to the underlying source, but callers of ['device'] and +/// ['field'] are expected to know which of these they're dealing with. +/// +#[derive(Clone, Debug, Hash, PartialEq, Eq)] +pub(crate) enum DeviceIdentifier { + Field(usize), + Device(String), +} + +impl DeviceIdentifier { + fn device(&self) -> &String { + match self { + Self::Device(ref device) => device, + _ => panic!(), + } + } + + fn field(&self) -> usize { + match self { + Self::Field(field) => *field, + _ => panic!(), + } + } +} + +struct SpInfo { + devices: MultiMap<DeviceIdentifier, (Sensor, Option<f32>)>, + timestamps: Vec<std::time::Instant>, +} + +async fn sp_info( + mgs_client: gateway_client::Client, + type_: SpType, + slot: u32, +) -> Result<SpInfo, anyhow::Error> { + let mut devices = MultiMap::new(); + let mut timestamps = vec![]; + + timestamps.push(std::time::Instant::now()); + + // + // First, get a component list.
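+ // (This is a single request to MGS; the loop below then issues one more + // sp_component_get() request per component to fetch its details.)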
+ // + let components = mgs_client.sp_component_list(type_, slot).await?; + timestamps.push(std::time::Instant::now()); + + // + // Now, for every component, we're going to get its details: for those + // that are sensors (and contain measurements), we will store the name + // of the sensor as well as the retrieved value. + // + for c in &components.components { + for s in mgs_client + .sp_component_get(type_, slot, &c.component) + .await? + .iter() + .filter_map(|detail| match detail { + SpComponentDetails::Measurement { kind, name, value } => Some( + (Sensor { name: name.clone(), kind: *kind }, Some(*value)), + ), + SpComponentDetails::MeasurementError { kind, name, error } => { + match error { + MeasurementErrorCode::NoReading + | MeasurementErrorCode::NotPresent => None, + _ => Some(( + Sensor { name: name.clone(), kind: *kind }, + None, + )), + } + } + _ => None, + }) + { + devices.insert(DeviceIdentifier::Device(c.component.clone()), s); + } + } + + timestamps.push(std::time::Instant::now()); + + Ok(SpInfo { devices, timestamps }) +} + +async fn sp_info_mgs( + mgs_client: &gateway_client::Client, + args: &SensorsArgs, +) -> Result<SensorSpInfo, anyhow::Error> { + let mut rval = vec![]; + let mut latencies = HashMap::new(); + + // + // First, get all of the SPs that we can see via Ignition + // + let all_sp_list = + mgs_client.ignition_list().await.context("listing ignition")?; + + let mut sp_list = all_sp_list + .iter() + .filter_map(|ignition| { + if matches!(ignition.details, SpIgnition::Yes { .. }) + && ignition.id.type_ == SpType::Sled + { + if args.matches_sp(&ignition.id) { + return Some(ignition.id); + } + } + None + }) + .collect::<Vec<_>>(); + + if args.switches { + sp_list.push(SpIdentifier { type_: SpType::Switch, slot: 0 }); + sp_list.push(SpIdentifier { type_: SpType::Switch, slot: 1 }); + } + + if args.psc { + sp_list.push(SpIdentifier { type_: SpType::Power, slot: 0 }); + } + + sp_list.sort(); + + let now = std::time::Instant::now(); + + let mut handles = vec![]; + for sp_id in sp_list { + let handle = + tokio::spawn(sp_info(mgs_client.clone(), sp_id.type_, sp_id.slot)); + + handles.push((sp_id, handle)); + } + + for (sp_id, handle) in handles { + match handle.await.unwrap() { + Ok(info) => { + let l0 = info.timestamps[1].duration_since(info.timestamps[0]); + let l1 = info.timestamps[2].duration_since(info.timestamps[1]); + + if args.verbose { + eprintln!( + "mgs: latencies for {sp_id:?}: {l1:.1?} {l0:.1?}", + ); + } + + latencies.insert(sp_id, l0 + l1); + rval.push((sp_id, info)); + } + + Err(err) => { + eprintln!("failed to read devices for {:?}: {:?}", sp_id, err); + } + } + } + + if args.verbose { + eprintln!("total discovery time {:?}", now.elapsed()); + } + + Ok(SensorSpInfo { + info: rval, + time: SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs(), + latencies: Some(latencies), + }) +} + +fn sp_info_csv( + reader: &mut csv::Reader<File>, + position: &mut csv::Position, + args: &SensorsArgs, +) -> Result<SensorSpInfo, anyhow::Error> { + let mut sps = vec![]; + let headers = reader.headers()?; + + let expected = ["TIME", "SENSOR", "KIND"]; + let len = expected.len(); + let hlen = headers.len(); + + if hlen < len { + bail!("expected at least {len} fields (found {headers:?})"); + } + + for ndx in 0..len { + if &headers[ndx] != expected[ndx] { + bail!( + "malformed headers: expected {}, found {} ({headers:?})", + &expected[ndx], + &headers[ndx] + ); + } + } + + for ndx in len..hlen { + let field = &headers[ndx]; + let parts: Vec<&str> = field.splitn(2, '-').collect(); + + if parts.len() != 2 { + bail!("malformed field
\"{field}\""); + } + + let type_ = match parts[0] { + "SLED" => SpType::Sled, + "SWITCH" => SpType::Switch, + "POWER" => SpType::Power, + _ => { + bail!("unknown type {}", parts[0]); + } + }; + + let slot = parts[1].parse::().or_else(|_| { + bail!("invalid slot in \"{field}\""); + })?; + + let sp = SpIdentifier { type_, slot }; + + if args.matches_sp(&sp) { + sps.push(Some(sp)); + } else { + sps.push(None); + } + } + + let mut iter = reader.records(); + let mut sensors = HashSet::new(); + let mut by_sp = MultiMap::new(); + let mut time = None; + + loop { + *position = iter.reader().position().clone(); + + if let Some(record) = iter.next() { + let record = record?; + + if record.len() != hlen { + bail!("bad record length at line {}", position.line()); + } + + if time.is_none() { + let t = record[0].parse::().or_else(|_| { + bail!("bad time at line {}", position.line()); + })?; + + if let Some(start) = args.start { + if t < start { + continue; + } + } + + if let Some(end) = args.end { + if let Some(start) = args.start { + if start > end { + bail!( + "specified start time is later than end time" + ); + } + } + + if t > end { + bail!( + "specified end time ({end}) is earlier \ + than time of earliest record ({t})" + ); + } + } + + time = Some(t); + } + + if let Some(sensor) = Sensor::from_string(&record[1], &record[2]) { + if sensors.get(&sensor).is_some() { + break; + } + + sensors.insert(sensor.clone()); + + for (ndx, sp) in sps.iter().enumerate() { + if let Some(sp) = sp { + let value = match record[ndx + len].parse::() { + Ok(value) => Some(value), + _ => { + // + // We want to distinguish between the device + // having an error ("X") and it being absent + // ("-"); if it's absent, we don't want to add + // it at all. + // + match &record[ndx + len] { + "X" => {} + "-" => continue, + _ => { + bail!( + "line {}: unrecognized value \ + \"{}\" in field {}", + position.line(), + record[ndx + len].to_string(), + ndx + len + ); + } + } + + None + } + }; + + by_sp.insert(sp, (sensor.clone(), value)); + } + } + } + } else { + break; + } + } + + if time.is_none() { + bail!("no data found"); + } + + let mut rval = vec![]; + + for (field, sp) in sps.iter().enumerate() { + let mut devices = MultiMap::new(); + + if let Some(sp) = sp { + if let Some(v) = by_sp.remove(sp) { + devices.insert_many(DeviceIdentifier::Field(field + len), v); + } + + rval.push((*sp, SpInfo { devices, timestamps: vec![] })); + } + } + + Ok(SensorSpInfo { info: rval, time: time.unwrap(), latencies: None }) +} + +pub(crate) async fn sensor_metadata( + input: &mut SensorInput, + args: &SensorsArgs, +) -> Result<(Arc, SensorValues), anyhow::Error> { + let by_kind = if let Some(types) = &args.types { + let mut h = HashSet::new(); + + for t in types { + h.insert(match Sensor::from_string("", t) { + None => bail!("invalid sensor kind {t}"), + Some(s) => s.kind, + }); + } + + Some(h) + } else { + None + }; + + let by_name = args + .named + .as_ref() + .map(|named| named.into_iter().collect::>()); + + let info = match input { + SensorInput::MgsClient(ref mgs_client) => { + sp_info_mgs(mgs_client, args).await? + } + SensorInput::CsvReader(reader, position) => { + sp_info_csv(reader, position, args)? 
+ } + }; + + let mut sensors_by_sensor = MultiMap::new(); + let mut sensors_by_sensor_and_sp = HashMap::new(); + let mut sensors_by_id = HashMap::new(); + let mut sensors_by_sp = MultiMap::new(); + let mut values = HashMap::new(); + let mut work_by_sp = HashMap::new(); + + let mut current = 0; + let time = info.time; + + for (sp_id, info) in info.info { + let mut sp_work = vec![]; + + for (device, sensors) in info.devices { + let mut device_work = vec![]; + + for (sensor, value) in sensors { + if let Some(ref by_kind) = by_kind { + if by_kind.get(&sensor.kind).is_none() { + continue; + } + } + + if let Some(ref by_name) = by_name { + if by_name.get(&sensor.name).is_none() { + continue; + } + } + + let id = SensorId(current); + current += 1; + + sensors_by_id + .insert(id, (sp_id, sensor.clone(), device.clone())); + + if value.is_none() && args.verbose { + eprintln!( + "mgs: error for {sp_id:?} on {sensor:?} ({device:?})" + ); + } + + sensors_by_sensor.insert(sensor.clone(), id); + + let by_sp = sensors_by_sensor_and_sp + .entry(sensor) + .or_insert_with(|| HashMap::new()); + by_sp.insert(sp_id, id); + sensors_by_sp.insert(sp_id, id); + values.insert(id, value); + + device_work.push(id); + } + + sp_work.push((device, device_work)); + } + + work_by_sp.insert(sp_id, sp_work); + } + + Ok(( + Arc::new(SensorMetadata { + sensors_by_sensor, + sensors_by_sensor_and_sp, + sensors_by_id, + sensors_by_sp, + work_by_sp, + start_time: args.start, + end_time: match args.end { + Some(end) => Some(end), + None => args.duration.map(|duration| time + duration), + }, + }), + SensorValues { values, time, latencies: info.latencies }, + )) +} + +async fn sp_read_sensors( + mgs_client: &gateway_client::Client, + id: &SpIdentifier, + metadata: &SensorMetadata, +) -> Result<(Vec<(SensorId, Option<f32>)>, Duration), anyhow::Error> { + let work = metadata.work_by_sp.get(id).unwrap(); + let mut rval = vec![]; + + let start = std::time::Instant::now(); + + for (component, ids) in work.iter() { + for (value, id) in mgs_client + .sp_component_get(id.type_, id.slot, component.device()) + .await? + .iter() + .filter_map(|detail| match detail { + SpComponentDetails::Measurement { kind: _, name: _, value } => { + Some(Some(*value)) + } + SpComponentDetails::MeasurementError { error, ..
} => { + match error { + MeasurementErrorCode::NoReading + | MeasurementErrorCode::NotPresent => None, + _ => Some(None), + } + } + _ => None, + }) + .zip(ids.iter()) + { + rval.push((*id, value)); + } + } + + Ok((rval, start.elapsed())) +} + +async fn sp_data_mgs( + mgs_client: &gateway_client::Client, + metadata: &Arc<SensorMetadata>, +) -> Result<SensorValues, anyhow::Error> { + let mut values = HashMap::new(); + let mut latencies = HashMap::new(); + let mut handles = vec![]; + + for sp_id in metadata.sensors_by_sp.keys() { + let mgs_client = mgs_client.clone(); + let id = *sp_id; + let metadata = Arc::clone(&metadata); + + let handle = tokio::spawn(async move { + sp_read_sensors(&mgs_client, &id, &metadata).await + }); + + handles.push((id, handle)); + } + + for (id, handle) in handles { + let (rval, latency) = handle.await.unwrap()?; + + latencies.insert(id, latency); + + for (id, value) in rval { + values.insert(id, value); + } + } + + Ok(SensorValues { + values, + latencies: Some(latencies), + time: SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs(), + }) +} + +fn sp_data_csv( + reader: &mut csv::Reader<File>, + position: &mut csv::Position, + metadata: &SensorMetadata, +) -> Result<SensorValues, anyhow::Error> { + let headers = reader.headers()?; + let hlen = headers.len(); + let mut values = HashMap::new(); + + reader.seek(position.clone())?; + let mut iter = reader.records(); + + let mut time = None; + + loop { + *position = iter.reader().position().clone(); + + if let Some(record) = iter.next() { + let record = record?; + + if record.len() != hlen { + bail!("bad record length at line {}", position.line()); + } + + let now = record[0].parse::<u64>().or_else(|_| { + bail!("bad time at line {}", position.line()); + })?; + + if let Some(time) = time { + if now != time { + break; + } + } else { + if let Some(end) = metadata.end_time { + if now > end { + time = Some(0); + break; + } + } + + time = Some(now); + } + + if let Some(sensor) = Sensor::from_string(&record[1], &record[2]) { + if let Some(ids) = metadata.sensors_by_sensor.get_vec(&sensor) { + for id in ids { + let (_, _, d) = metadata.sensors_by_id.get(id).unwrap(); + let value = match record[d.field()].parse::<f32>() { + Ok(value) => Some(value), + _ => None, + }; + + values.insert(*id, value); + } + } + } else { + bail!("bad sensor at line {}", position.line()); + } + } else { + time = Some(0); + break; + } + } + + Ok(SensorValues { values, latencies: None, time: time.unwrap() }) +} + +pub(crate) async fn sensor_data( + input: &mut SensorInput, + metadata: &Arc<SensorMetadata>, +) -> Result<SensorValues, anyhow::Error> { + match input { + SensorInput::MgsClient(ref mgs_client) => { + sp_data_mgs(mgs_client, metadata).await + } + SensorInput::CsvReader(reader, position) => { + sp_data_csv(reader, position, &metadata) + } + } +} + +/// +/// Runs `omdb mgs sensors` +/// +pub(crate) async fn cmd_mgs_sensors( + omdb: &crate::Omdb, + log: &slog::Logger, + mgs_args: &crate::mgs::MgsArgs, + args: &SensorsArgs, +) -> Result<(), anyhow::Error> { + let mut input = if let Some(ref input) = args.input { + let file = File::open(input) + .with_context(|| format!("failed to open {input}"))?; + SensorInput::CsvReader( + csv::Reader::from_reader(file), + csv::Position::new(), + ) + } else { + SensorInput::MgsClient(mgs_args.mgs_client(omdb, log).await?)
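+ // (A CSV file previously captured with --parseable output can stand in + // for a live MGS client here.)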
+ }; + + let (metadata, mut values) = sensor_metadata(&mut input, args).await?; + + let mut sensors = metadata.sensors_by_sensor.keys().collect::<Vec<_>>(); + sensors.sort(); + + let mut sps = metadata.sensors_by_sp.keys().collect::<Vec<_>>(); + sps.sort(); + + let print_value = |v| { + if args.parseable { + print!(",{v}"); + } else { + print!(" {v:>8}"); + } + }; + + let print_header = || { + if !args.parseable { + print!("{:20} ", "NAME"); + } else { + print!("TIME,SENSOR,KIND"); + } + + for sp in &sps { + print_value(format!( + "{}-{}", + crate::mgs::sp_type_to_str(&sp.type_).to_uppercase(), + sp.slot + )); + } + + println!(); + }; + + let print_name = |sensor: &Sensor, now: u64| { + if !args.parseable { + print!("{:20} ", sensor.name); + } else { + print!("{now},{},{}", sensor.name, sensor.to_kind_string()); + } + }; + + let print_latency = |now: u64| { + if !args.parseable { + print!("{:20} ", "LATENCY"); + } else { + print!("{now},{},{}", "LATENCY", "latency"); + } + }; + + let mut wakeup = + tokio::time::Instant::now() + tokio::time::Duration::from_millis(1000); + + print_header(); + + loop { + for sensor in &sensors { + print_name(sensor, values.time); + + let by_sp = metadata.sensors_by_sensor_and_sp.get(sensor).unwrap(); + + for sp in &sps { + print_value(if let Some(id) = by_sp.get(sp) { + if let Some(value) = values.values.get(id) { + match value { + Some(value) => { + sensor.format(*value, args.parseable) + } + None => "X".to_string(), + } + } else { + "?".to_string() + } + } else { + "-".to_string() + }); + } + + println!(); + } + + if args.show_latencies { + if let Some(latencies) = values.latencies { + print_latency(values.time); + + for sp in &sps { + print_value(if let Some(latency) = latencies.get(sp) { + format!("{}ms", latency.as_millis()) + } else { + "?".to_string() + }); + } + } + + println!(); + } + + if !args.sleep { + if args.input.is_none() { + break; + } + } else { + tokio::time::sleep_until(wakeup).await; + wakeup += tokio::time::Duration::from_millis(1000); + } + + values = sensor_data(&mut input, &metadata).await?; + + if args.input.is_some() && values.time == 0 { + break; + } + + if !args.parseable { + print_header(); + } + } + + Ok(()) +} diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index e7ca4323d9..aed7d86ba0 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -639,6 +639,32 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { ); } }; + } else if name == "region_replacement" { + #[derive(Deserialize)] + struct TaskSuccess { + /// how many region replacements were started ok + region_replacement_started_ok: usize, + + /// how many region replacements could not be started + region_replacement_started_err: usize, + } + + match serde_json::from_value::<TaskSuccess>(details.clone()) { + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ), + Ok(success) => { + println!( + " number of region replacements started ok: {}", + success.region_replacement_started_ok + ); + println!( + " number of region replacement start errors: {}", + success.region_replacement_started_err + ); + } + }; } else { println!( "warning: unknown background task: {:?} \ diff --git a/dev-tools/omdb/src/bin/omdb/sled_agent.rs b/dev-tools/omdb/src/bin/omdb/sled_agent.rs index c413a2ba43..2d9e19d253 100644 --- a/dev-tools/omdb/src/bin/omdb/sled_agent.rs +++ b/dev-tools/omdb/src/bin/omdb/sled_agent.rs @@ -31,6 +31,10 @@ enum SledAgentCommands { /// print
information about zpools #[clap(subcommand)] Zpools(ZpoolCommands), + + /// print information about the local bootstore node + #[clap(subcommand)] + Bootstore(BootstoreCommands), } #[derive(Debug, Subcommand)] @@ -45,6 +49,12 @@ enum ZpoolCommands { List, } +#[derive(Debug, Subcommand)] +enum BootstoreCommands { + /// Show the internal state of the local bootstore node + Status, +} + impl SledAgentArgs { /// Run a `omdb sled-agent` subcommand. pub(crate) async fn run_cmd( @@ -70,6 +80,9 @@ impl SledAgentArgs { SledAgentCommands::Zpools(ZpoolCommands::List) => { cmd_zpools_list(&client).await } + SledAgentCommands::Bootstore(BootstoreCommands::Status) => { + cmd_bootstore_status(&client).await + } } } } @@ -110,3 +123,46 @@ async fn cmd_zpools_list( Ok(()) } + +/// Runs `omdb sled-agent bootstore status` +async fn cmd_bootstore_status( + client: &sled_agent_client::Client, +) -> Result<(), anyhow::Error> { + let status = client.bootstore_status().await.context("bootstore status")?; + println!("fsm ledger generation: {}", status.fsm_ledger_generation); + println!( + "network config ledger generation: {:?}", + status.network_config_ledger_generation + ); + println!("fsm state: {}", status.fsm_state); + println!("peers (found by ddmd):"); + if status.peers.is_empty() { + println!(" <none>"); + } + for peer in status.peers.iter() { + println!(" {peer}"); + } + println!("established connections:"); + if status.established_connections.is_empty() { + println!(" <none>"); + } + for c in status.established_connections.iter() { + println!(" {:?} : {}", c.baseboard, c.addr); + } + println!("accepted connections:"); + if status.accepted_connections.is_empty() { + println!(" <none>"); + } + for addr in status.accepted_connections.iter() { + println!(" {addr}"); + } + println!("negotiating connections:"); + if status.negotiating_connections.is_empty() { + println!(" <none>"); + } + for addr in status.negotiating_connections.iter() { + println!(" {addr}"); + } + + Ok(()) +} diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 72e9d2e8fc..3e6e89d508 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -57,11 +57,11 @@ task: "dns_propagation_internal" task: "dns_servers_external" - watches list of external DNS servers stored in CockroachDB + watches list of external DNS servers stored in internal DNS task: "dns_servers_internal" - watches list of internal DNS servers stored in CockroachDB + watches list of internal DNS servers stored in internal DNS task: "external_endpoints" @@ -83,6 +83,10 @@ task: "phantom_disks" detects and un-deletes phantom disks +task: "region_replacement" + detects if a region requires replacing and begins the process + + task: "service_zone_nat_tracker" ensures service zone nat records are recorded in NAT RPW table @@ -143,11 +147,11 @@ task: "dns_propagation_internal" task: "dns_servers_external" - watches list of external DNS servers stored in CockroachDB + watches list of external DNS servers stored in internal DNS task: "dns_servers_internal" - watches list of internal DNS servers stored in CockroachDB + watches list of internal DNS servers stored in internal DNS task: "external_endpoints" @@ -169,6 +173,10 @@ task: "phantom_disks" detects and un-deletes phantom disks +task: "region_replacement" + detects if a region requires replacing and begins the process + + task: "service_zone_nat_tracker" ensures service zone nat records are recorded in NAT RPW table @@ -216,11 +224,11 @@ task: "dns_propagation_internal" task: "dns_servers_external" - watches
list of external DNS servers stored in CockroachDB + watches list of external DNS servers stored in internal DNS task: "dns_servers_internal" - watches list of internal DNS servers stored in CockroachDB + watches list of internal DNS servers stored in internal DNS task: "external_endpoints" @@ -242,6 +250,10 @@ task: "phantom_disks" detects and un-deletes phantom disks +task: "region_replacement" + detects if a region requires replacing and begins the process + + task: "service_zone_nat_tracker" ensures service zone nat records are recorded in NAT RPW table diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 416b669068..3086c98f32 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -251,11 +251,11 @@ task: "dns_propagation_internal" task: "dns_servers_external" - watches list of external DNS servers stored in CockroachDB + watches list of external DNS servers stored in internal DNS task: "dns_servers_internal" - watches list of internal DNS servers stored in CockroachDB + watches list of internal DNS servers stored in internal DNS task: "external_endpoints" @@ -277,6 +277,10 @@ task: "phantom_disks" detects and un-deletes phantom disks +task: "region_replacement" + detects if a region requires replacing and begins the process + + task: "service_zone_nat_tracker" ensures service zone nat records are recorded in NAT RPW table @@ -309,7 +313,7 @@ task: "dns_servers_internal" task: "dns_propagation_internal" configured period: every 1m currently executing: no - last completed activation: iter 5, triggered by a dependent task completing + last completed activation: iter 4, triggered by a dependent task completing started at (s ago) and ran for ms attempt to propagate generation: 1 @@ -337,7 +341,7 @@ task: "dns_servers_external" task: "dns_propagation_external" configured period: every 1m currently executing: no - last completed activation: iter 5, triggered by a dependent task completing + last completed activation: iter 4, triggered by a dependent task completing started at (s ago) and ran for ms attempt to propagate generation: 2 @@ -407,6 +411,14 @@ task: "phantom_disks" number of phantom disks deleted: 0 number of phantom disk delete errors: 0 +task: "region_replacement" + configured period: every 30s + currently executing: no + last completed activation: iter 2, triggered by an explicit signal + started at (s ago) and ran for ms + number of region replacements started ok: 0 + number of region replacement start errors: 0 + task: "service_zone_nat_tracker" configured period: every 30s currently executing: no diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 2790b0ef83..bb7da1be57 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -90,6 +90,7 @@ Query the control plane database (CockroachDB) Usage: omdb db [OPTIONS] Commands: + rack Print information about the rack disks Print information about disks dns Print information about internal and external DNS inventory Print information about collected hardware/software inventory @@ -118,6 +119,7 @@ Query the control plane database (CockroachDB) Usage: omdb db [OPTIONS] Commands: + rack Print information about the rack disks Print information about disks dns Print information about internal and external DNS inventory Print information about collected hardware/software inventory @@ -270,7 +272,9 @@ Debug a specific Management Gateway Service instance Usage: omdb mgs [OPTIONS] 
Commands: + dashboard Dashboard of SPs inventory Show information about devices and components visible to MGS + sensors Show information about sensors, as gleaned by MGS help Print this message or the help of the given subcommand(s) Options: @@ -327,9 +331,10 @@ Debug a specific Sled Usage: omdb sled-agent [OPTIONS] Commands: - zones print information about zones - zpools print information about zpools - help Print this message or the help of the given subcommand(s) + zones print information about zones + zpools print information about zpools + bootstore print information about the local bootstore node + help Print this message or the help of the given subcommand(s) Options: --sled-agent-url URL of the Sled internal API [env: OMDB_SLED_AGENT_URL=] diff --git a/dev-tools/thing-flinger/.gitignore b/dev-tools/thing-flinger/.gitignore deleted file mode 100644 index ea8c4bf7f3..0000000000 --- a/dev-tools/thing-flinger/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/target diff --git a/dev-tools/thing-flinger/Cargo.toml b/dev-tools/thing-flinger/Cargo.toml deleted file mode 100644 index a427685871..0000000000 --- a/dev-tools/thing-flinger/Cargo.toml +++ /dev/null @@ -1,21 +0,0 @@ -[package] -name = "omicron-deploy" -description = "Tools for deploying Omicron software to target machines" -version = "0.1.0" -edition = "2021" -license = "MPL-2.0" - -[dependencies] -anyhow.workspace = true -camino.workspace = true -clap.workspace = true -crossbeam.workspace = true -omicron-package.workspace = true -serde.workspace = true -serde_derive.workspace = true -thiserror.workspace = true -omicron-workspace-hack.workspace = true - -[[bin]] -name = "thing-flinger" -doc = false diff --git a/dev-tools/thing-flinger/README.adoc b/dev-tools/thing-flinger/README.adoc deleted file mode 100644 index 9966a7b747..0000000000 --- a/dev-tools/thing-flinger/README.adoc +++ /dev/null @@ -1,222 +0,0 @@ -Omicron is a complex piece of software consisting of many build and install-time dependencies. It's -intended to run primarily on illumos based systems, and as such is built to use runtime facilities -of illumos, such as https://illumos.org/man/5/smf[SMF]. Furthermore, Omicron is fundamentally a -distributed system, with its components intended to run on multiple servers communicating over the -network. In order to secure the system, certain cryptographic primitives, such as asymmetric key -pairs and shared secrets are required. Due to the nature of these cryptographic primitives, there is -a requirement for the distribution or creation of files unique to a specific server, such that no -other server has access to those files. Examples of this are private keys, and threshold key -shares, although other non-cryptographic unique files may also become necessary over time. - -In order to satisfy the above requirements of building and deploying a complex distributed system -consisting of unique, private files, two CLI tools have been created: - - . link:src/bin/omicron-package.rs[omicron-package] - build, package, install on local machine - . link:src/bin/thing-flinger.rs[thing-flinger] - build, package, deploy to remote machines - - -If a user is working on their local illumos based machine, and only wants to run -omicron in single node mode, they should follow the install instruction in -the link:../README.adoc[Omicron README] and use `omicron-package`. 
If the user -wishes for a more complete workflow, where they can code on their local laptop, -use a remote build machine, and install to multiple machines for a more realistic -deployment, they should use `thing-flinger`. - -The remainder of this document will describe a typical workflow for using -thing-flinger, pointing out room for improvement. - -== Environment and Configuration - - - +------------------+ +------------------+ - | | | | - | | | | - | Client |----------------> Builder | - | | | | - | | | | - +------------------+ +------------------+ - | - | - | - | - +---------------------------+--------------------------+ - | | | - | | | - | | | - +--------v---------+ +---------v--------+ +---------v--------+ - | | | | | | - | | | | | | - | Deployed Server | | Deployed Server | | Deployed Server | - | | | | | | - | | | | | | - +------------------+ +------------------+ +------------------+ - - -`thing-flinger` defines three types of nodes: - - * Client - Where a user typically edits their code and runs thing-flinger. This can run any OS. - * Builder - A Helios box where Omicron is built and packaged - * Deployed Server - Helios machines where Omicron will be installed and run - -It's not at all necessary for these to be separate nodes. For example, a client and builder can be -the same machine, as long as it's a Helios box. Same goes for Builder and a deployment server. The -benefit of this separation though, is that it allows editing on something like a laptop, without -having to worry about setting up a development environment on an illumos based host. - -Machine topology is configured in a `TOML` file that is passed on the command line. All illumos -machines are listed under `servers`, and just the names are used for configuring a builder and -deployment servers. An link:src/bin/deployment-example.toml[example] is provided. - -Thing flinger works over SSH, and so the user must have the public key of their client configured -for their account on all servers. SSH agent forwarding is used to prevent the need for the keys of -the builder to also be on the other servers, thus minimizing needed server configuration. - -== Typical Workflow - -=== Prerequisites - -Ensure you have an account on all illumos boxes, with the client public key in -`~/.ssh/authorized_keys`. - -.The build machine must have Rust and cargo installed, as well as -all the dependencies for Omicron installed. Following the *prerequisites* in the -https://github.com/oxidecomputer/omicron/#build-and-run[Build and run] section of the main Omicron -README is probably a good idea. - -==== Update `config-rss.toml` - -Currently rack setup is driven by a configuration file that lives at -`smf/sled-agent/non-gimlet/config-rss.toml` in the root of this repository. The committed -configuration of that file contains a single `requests` entry (with many -services inside it), which means it will start services on only one sled. To -start services (e.g., nexus) on multiple sleds, add additional entries to that -configuration file before proceeding. - -=== Command Based Workflow - -==== sync -Copy your source code to the builder. - -`+cargo run --bin thing-flinger -- -c sync+` - -==== Install Prerequisites -Install necessary build and runtime dependencies (including downloading prebuilt -binaries like Clickhouse and CockroachDB) on the builder and all deployment -targets. This step only needs to be performed once, absent any changes to the -dependencies, but is idempotent so may be run multiple times. 
- -`+cargo run --bin thing-flinger -- -c install-prereqs+` - -==== check (optional) -Run `cargo check` on the builder against the copy of `omicron` that was sync'd -to it in the previous step. - -`+cargo run --bin thing-flinger -- -c build check+` - -==== package -Build and package omicron using `omicron-package` on the builder. - -`+cargo run --bin thing-flinger -- -c build package+` - -==== overlay -Create files that are unique to each deployment server. - -`+cargo run --bin thing-flinger -- -c overlay+` - -==== install -Install omicron to all machines, in parallel. This consists of copying the packaged omicron tarballs -along with overlay files, and omicron-package and its manifest to a `staging` directory on each -deployment server, and then running omicron-package, installing overlay files, and restarting -services. - -`+cargo run --bin thing-flinger -- -c deploy install+` - -==== uninstall -Uninstall omicron from all machines. - -`+cargo run --bin thing-flinger -- -c deploy uninstall+` - -=== Current Limitations - -`thing-flinger` is an early prototype. It has served so far to demonstrate that unique files, -specifically secret shares, can be created and distributed over ssh, and that omicron can be -installed remotely using `omicron-package`. It is not currently complete enough to fully test a -distributed omicron setup, as the underlying dependencies are not configured yet. Specifically, -`CockroachDB` and perhaps `Clickhouse`, need to be configured to run in multiple server mode. It's -anticipated that the `overlay` feature of `thing-flinger` can be used to generate and distribute -configs for this. - -=== Design rationale - -`thing-flinger` is a command line program written in rust. It was written this way to build upon -`omicron-package`, which is also in rust, as that is our default language of choice at Oxide. -`thing-flinger` is based around SSH, as that is the minimal viable requirement for a test tool such -as this. Additionally, it provides for the most straightforward implementation, and takes the least -effort to use securely. This particular implementation wraps the openssh ssh client via -`std::process::Command`, rather than using the `ssh2` crate, because ssh2, as a wrapper around -`libssh`, does not support agent-forwarding. - -== Notes on Using VMs as Deployed Servers on a Linux Host - -TODO: This section should be fleshed out more and potentially lifted to its own -document; for now this is a collection of rough notes. - ---- - -It's possible to use a Linux libvirt host running multiple helios VMs as the -builder/deployment server targets, but it requires some additional setup beyond -`https://github.com/oxidecomputer/helios-engvm[helios-engvm]`. - -`thing-flinger` does not have any support for running the -`tools/create_virtual_hardware.sh` script; this will need to be done by hand on -each VM. - ---- - -To enable communication between the VMs over their IPv6 bootstrap networks: - -1. Enable IPv6 and DHCP on the virtual network libvirt uses for the VMs; e.g., - -```xml - - - - - -``` - -After booting the VMs with this enabled, they should be able to ping each other -over their acquired IPv6 addresses, but connecting to each other over the -`bootstrap6` interface that sled-agent creates will fail. - -2. Explicitly add routes in the Linux host for the `bootstrap6` addresses, -specifying the virtual interface libvirt created that is used by the VMs. 
- -``` -bash% sudo ip -6 route add fdb0:5254:13:7331::1/64 dev virbr1 -bash% sudo ip -6 route add fdb0:5254:f0:acfd::1/64 dev virbr1 -``` - -3. Once the sled-agents advance sufficiently to set up `sled6` interfaces, -routes need to be added for them both in the Linux host and in the Helios VMs. -Assuming two sleds with these interfaces: - -``` -# VM 1 -vioif0/sled6 static ok fd00:1122:3344:1::1/64 -# VM 2 -vioif0/sled6 static ok fd00:1122:3344:2::1/64 -``` - -The Linux host needs to be told to route that subnet to the appropriate virtual -interface: - -``` -bash% ip -6 route add fd00:1122:3344::1/48 dev virbr1 -``` - -and each Helios VM needs to be told to route that subnet to the host gateway: - -``` -vm% pfexec route add -inet6 fd00:1122:3344::/48 $IPV6_HOST_GATEWAY_ADDR -``` diff --git a/dev-tools/thing-flinger/src/bin/deployment-example.toml b/dev-tools/thing-flinger/src/bin/deployment-example.toml deleted file mode 100644 index 6d85de2ba6..0000000000 --- a/dev-tools/thing-flinger/src/bin/deployment-example.toml +++ /dev/null @@ -1,36 +0,0 @@ -# This manifest describes the servers that omicron will be installed to, along -# with any ancillary information specific to a given server. -# -# It is ingested by the `thing-flinger` tool. - -# This must be an absolute path. It refers to the path to Omicron on the -# machine where thing-flinger is being executed. -omicron_path = "/local/path/to/omicron" - -[builder] -# `server` must refer to one of the `servers` in the servers table -server = "foo" -# This must be an absolute path. It refers to the path to Omicron on the -# builder server. -omicron_path = "/remote/path/to/omicron" - -[deployment] -# which server is responsible for running the rack setup service; must -# refer to one of the `servers` in the servers table -rss_server = "foo" -# Location where files to install will be placed before running -# `omicron-package install` -# -# This must be an absolute path -# We specifically allow for $HOME in validating the absolute path -staging_dir = "$HOME/omicron_staging" -# which servers to deploy -servers = ["foo", "bar"] - -[servers.foo] -username = "me" -addr = "foo" - -[servers.bar] -username = "me" -addr = "bar" diff --git a/dev-tools/thing-flinger/src/bin/thing-flinger.rs b/dev-tools/thing-flinger/src/bin/thing-flinger.rs deleted file mode 100644 index 43b137790d..0000000000 --- a/dev-tools/thing-flinger/src/bin/thing-flinger.rs +++ /dev/null @@ -1,968 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Utility for deploying Omicron to remote machines - -use omicron_package::{parse, BuildCommand, DeployCommand}; - -use camino::{Utf8Path, Utf8PathBuf}; -use std::collections::{BTreeMap, BTreeSet}; -use std::process::Command; - -use anyhow::{Context, Result}; -use clap::{Parser, Subcommand}; -use crossbeam::thread::{self, ScopedJoinHandle}; -use serde_derive::Deserialize; -use thiserror::Error; - -// A server on which omicron source should be compiled into packages. -#[derive(Deserialize, Debug)] -struct Builder { - server: String, - omicron_path: Utf8PathBuf, -} - -// A server on which an omicron package is deployed. 
-#[derive(Deserialize, Debug, Eq, PartialEq)] -struct Server { - username: String, - addr: String, -} - -#[derive(Deserialize, Debug)] -struct Deployment { - rss_server: String, - staging_dir: Utf8PathBuf, - servers: BTreeSet, -} - -#[derive(Debug, Deserialize)] -struct Config { - omicron_path: Utf8PathBuf, - builder: Builder, - servers: BTreeMap, - deployment: Deployment, - - #[serde(default)] - rss_config_path: Option, - - #[serde(default)] - debug: bool, -} - -impl Config { - fn release_arg(&self) -> &str { - if self.debug { - "" - } else { - "--release" - } - } - - fn deployment_servers(&self) -> impl Iterator { - self.servers.iter().filter_map(|(name, s)| { - if self.deployment.servers.contains(name) { - Some(s) - } else { - None - } - }) - } -} - -fn parse_into_set(src: &str) -> Result, &'static str> { - Ok(src.split_whitespace().map(|s| s.to_owned()).collect()) -} - -#[derive(Debug, Subcommand)] -enum SubCommand { - /// Run the given command on the given servers, or all servers if none are - /// specified. - /// - /// Be careful! - Exec { - /// The command to run - #[clap(short, long, action)] - cmd: String, - - /// The servers to run the command on - #[clap(short, long, value_parser = parse_into_set)] - servers: Option>, - }, - - /// Install necessary prerequisites on the "builder" server and all "deploy" - /// servers. - InstallPrereqs, - - /// Sync our local source to the build host - Sync, - - /// Runs a command on the "builder" server. - #[clap(name = "build", subcommand)] - Builder(BuildCommand), - - /// Runs a command on all the "deploy" servers. - #[clap(subcommand)] - Deploy(DeployCommand), - - /// Create an overlay directory tree for each deployment server - /// - /// Each directory tree contains unique files for the given server that will - /// be populated in the svc/pkg dir. - /// - /// This is a separate subcommand so that we can reconstruct overlays - /// without rebuilding or repackaging. - Overlay, -} - -#[derive(Debug, Parser)] -#[clap( - name = "thing-flinger", - about = "A tool for synchronizing packages and configs between machines" -)] -struct Args { - /// The path to the deployment manifest TOML file - #[clap( - short, - long, - help = "Path to deployment manifest toml file", - action - )] - config: Utf8PathBuf, - - #[clap( - short, - long, - help = "The name of the build target to use for this command" - )] - target: String, - - /// The output directory, where artifacts should be built and staged - #[clap(long = "artifacts", default_value = "out/")] - artifact_dir: Utf8PathBuf, - - #[clap(subcommand)] - subcommand: SubCommand, -} - -/// Errors which can be returned when executing subcommands -#[derive(Error, Debug)] -enum FlingError { - #[error("Servers not listed in configuration: {0:?}")] - InvalidServers(Vec), - - /// Failed to rsync omicron to build host - #[error("Failed to sync {src} with {dst}")] - FailedSync { src: String, dst: String }, - - /// The given path must be absolute - #[error("Path for {field} must be absolute")] - NotAbsolutePath { field: &'static str }, -} - -// How should `ssh_exec` be run? 
-enum SshStrategy { - // Forward agent and source .profile - Forward, - - // Don't forward agent, but source .profile - NoForward, - - // Don't forward agent and don't source .profile - NoForwardNoProfile, -} - -impl SshStrategy { - fn forward_agent(&self) -> bool { - match self { - SshStrategy::Forward => true, - _ => false, - } - } - - fn source_profile(&self) -> bool { - match self { - SshStrategy::Forward | &SshStrategy::NoForward => true, - _ => false, - } - } -} - -// TODO: run in parallel when that option is given -fn do_exec( - config: &Config, - cmd: String, - servers: Option>, -) -> Result<()> { - if let Some(ref servers) = servers { - validate_servers(servers, &config.servers)?; - - for name in servers { - let server = &config.servers[name]; - ssh_exec(&server, &cmd, SshStrategy::NoForward)?; - } - } else { - for (_, server) in config.servers.iter() { - ssh_exec(&server, &cmd, SshStrategy::NoForward)?; - } - } - Ok(()) -} - -// start an `rsync` command with args common to all our uses -fn rsync_common() -> Command { - let mut cmd = Command::new("rsync"); - cmd.arg("-az") - .arg("-e") - .arg("ssh -o StrictHostKeyChecking=no") - .arg("--delete") - .arg("--progress") - .arg("--out-format") - .arg("File changed: %o %t %f"); - cmd -} - -fn do_sync(config: &Config) -> Result<()> { - let builder = - config.servers.get(&config.builder.server).ok_or_else(|| { - FlingError::InvalidServers(vec![config.builder.server.clone()]) - })?; - - // For rsync to copy from the source appropriately we must guarantee a - // trailing slash. - let src = format!( - "{}/", - config.omicron_path.canonicalize_utf8().with_context(|| format!( - "could not canonicalize {}", - config.omicron_path - ))? - ); - let dst = format!( - "{}@{}:{}", - builder.username, builder.addr, config.builder.omicron_path - ); - - println!("Synchronizing source files to: {}", dst); - let mut cmd = rsync_common(); - - // exclude build and development environment artifacts - cmd.arg("--exclude") - .arg("target/") - .arg("--exclude") - .arg("*.vdev") - .arg("--exclude") - .arg("*.swp") - .arg("--exclude") - .arg(".git/") - .arg("--exclude") - .arg("out/"); - - // exclude `config-rss.toml`, which needs to be sent to only one target - // system. we handle this in `do_overlay` below. 
- cmd.arg("--exclude").arg("**/config-rss.toml"); - - // finish with src/dst - cmd.arg(&src).arg(&dst); - let status = - cmd.status().context(format!("Failed to run command: ({:?})", cmd))?; - if !status.success() { - return Err(FlingError::FailedSync { src, dst }.into()); - } - - Ok(()) -} - -fn copy_to_deployment_staging_dir( - config: &Config, - src: String, - description: &str, -) -> Result<()> { - let partial_cmd = || { - let mut cmd = rsync_common(); - cmd.arg("--relative"); - cmd.arg(&src); - cmd - }; - - // A function for each deployment server to run in parallel - let fns = config.deployment_servers().map(|server| { - || { - let dst = format!( - "{}@{}:{}", - server.username, server.addr, config.deployment.staging_dir - ); - let mut cmd = partial_cmd(); - cmd.arg(&dst); - let status = cmd - .status() - .context(format!("Failed to run command: ({:?})", cmd))?; - if !status.success() { - return Err( - FlingError::FailedSync { src: src.clone(), dst }.into() - ); - } - Ok(()) - } - }); - - let named_fns = config.deployment.servers.iter().zip(fns); - run_in_parallel(description, named_fns); - - Ok(()) -} - -fn rsync_config_needed_for_tools(config: &Config) -> Result<()> { - let src = format!( - // the `./` here is load-bearing; it interacts with `--relative` to tell - // rsync to create `smf/sled-agent` but none of its parents - "{}/./smf/sled-agent/", - config.omicron_path.canonicalize_utf8().with_context(|| format!( - "could not canonicalize {}", - config.omicron_path - ))? - ); - - copy_to_deployment_staging_dir(config, src, "Copy smf/sled-agent dir") -} - -fn rsync_tools_dir_to_deployment_servers(config: &Config) -> Result<()> { - // we need to rsync `./tools/*` to each of the deployment targets (the - // "builder" already has it via `do_sync()`), and then run `pfexec - // tools/install_prerequisites.sh` on each system. - let src = format!( - // the `./` here is load-bearing; it interacts with `--relative` to tell - // rsync to create `tools` but none of its parents - "{}/./tools/", - config.omicron_path.canonicalize_utf8().with_context(|| format!( - "could not canonicalize {}", - config.omicron_path - ))? 
- ); - copy_to_deployment_staging_dir(config, src, "Copy tools dir") -} - -fn do_install_prereqs(config: &Config) -> Result<()> { - rsync_config_needed_for_tools(config)?; - rsync_tools_dir_to_deployment_servers(config)?; - install_rustup_on_deployment_servers(config); - create_virtual_hardware_on_deployment_servers(config); - create_external_tls_cert_on_builder(config)?; - - // Create a set of servers to install prereqs to - let builder = &config.servers[&config.builder.server]; - let build_server = (builder, &config.builder.omicron_path); - let all_servers = std::iter::once(build_server).chain( - config.deployment_servers().filter_map(|server| { - // Don't duplicate the builder - if server.addr != builder.addr { - Some((server, &config.deployment.staging_dir)) - } else { - None - } - }), - ); - - let server_names = std::iter::once(&config.builder.server).chain( - config - .deployment - .servers - .iter() - .filter(|s| **s != config.builder.server), - ); - - // Install functions to run in parallel on each server - let fns = all_servers.map(|(server, root_path)| { - || { - // -y: assume yes instead of prompting - // -p: skip check that deps end up in $PATH - let (script, script_type) = if *server == *builder { - ("install_builder_prerequisites.sh -y -p", "builder") - } else { - ("install_runner_prerequisites.sh -y", "runner") - }; - - let cmd = format!( - "cd {} && mkdir -p out && pfexec ./tools/{}", - root_path.clone(), - script - ); - println!( - "Install {} prerequisites on {}", - script_type, server.addr - ); - ssh_exec(server, &cmd, SshStrategy::NoForward) - } - }); - - let named_fns = server_names.zip(fns); - run_in_parallel("Install prerequisites", named_fns); - - Ok(()) -} - -fn create_external_tls_cert_on_builder(config: &Config) -> Result<()> { - let builder = &config.servers[&config.builder.server]; - let cmd = format!( - "cd {} && ./tools/create_self_signed_cert.sh", - config.builder.omicron_path, - ); - ssh_exec(&builder, &cmd, SshStrategy::NoForward) -} - -fn create_virtual_hardware_on_deployment_servers(config: &Config) { - let cmd = format!( - "cd {} && pfexec ./tools/create_virtual_hardware.sh", - config.deployment.staging_dir - ); - let fns = config.deployment_servers().map(|server| { - || { - println!("Create virtual hardware on {}", server.addr); - ssh_exec(server, &cmd, SshStrategy::NoForward) - } - }); - - let named_fns = config.deployment.servers.iter().zip(fns); - run_in_parallel("Create virtual hardware", named_fns); -} - -fn install_rustup_on_deployment_servers(config: &Config) { - let cmd = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y"; - let fns = config.deployment_servers().map(|server| { - || ssh_exec(server, cmd, SshStrategy::NoForwardNoProfile) - }); - - let named_fns = config.deployment.servers.iter().zip(fns); - run_in_parallel("Install rustup", named_fns); -} - -// Build omicron-package and omicron-deploy on the builder -// -// We need to build omicron-deploy for overlay file generation -fn do_build_minimal(config: &Config) -> Result<()> { - let server = &config.servers[&config.builder.server]; - let cmd = format!( - "cd {} && cargo build {} -p {} -p {}", - config.builder.omicron_path, - config.release_arg(), - "omicron-package", - "omicron-deploy" - ); - ssh_exec(&server, &cmd, SshStrategy::NoForward) -} - -fn do_package(config: &Config, artifact_dir: Utf8PathBuf) -> Result<()> { - let builder = &config.servers[&config.builder.server]; - - // We use a bash login shell to get a proper environment, so we have a path to - // 
postgres, and $DEP_PQ_LIBDIRS is filled in. This is required for building - // nexus. - // - // See https://github.com/oxidecomputer/omicron/blob/8757ec542ea4ffbadd6f26094ed4ba357715d70d/rpaths/src/lib.rs - let cmd = format!( - "bash -lc \ - 'cd {} && \ - cargo run {} --bin omicron-package -- package --out {}'", - config.builder.omicron_path, - config.release_arg(), - artifact_dir, - ); - - ssh_exec(&builder, &cmd, SshStrategy::NoForward) -} - -fn do_dot(_config: &Config) -> Result<()> { - anyhow::bail!("\"dot\" command is not supported for thing-flinger"); -} - -fn do_check(config: &Config) -> Result<()> { - let builder = &config.servers[&config.builder.server]; - - let cmd = format!( - "bash -lc \ - 'cd {} && \ - cargo run {} --bin omicron-package -- check'", - config.builder.omicron_path, - config.release_arg(), - ); - - ssh_exec(&builder, &cmd, SshStrategy::NoForward) -} - -fn do_uninstall(config: &Config) -> Result<()> { - let builder = &config.servers[&config.builder.server]; - for server in config.deployment_servers() { - copy_omicron_package_binary_to_staging(config, builder, server)?; - - // Run `omicron-package uninstall` on the deployment server - let cmd = format!( - "cd {} && pfexec ./omicron-package uninstall", - config.deployment.staging_dir, - ); - println!("$ {}", cmd); - ssh_exec(&server, &cmd, SshStrategy::Forward)?; - } - Ok(()) -} - -fn do_clean( - config: &Config, - artifact_dir: Utf8PathBuf, - install_dir: Utf8PathBuf, -) -> Result<()> { - let mut deployment_src = Utf8PathBuf::from(&config.deployment.staging_dir); - deployment_src.push(&artifact_dir); - let builder = &config.servers[&config.builder.server]; - for server in config.deployment_servers() { - copy_omicron_package_binary_to_staging(config, builder, server)?; - - // Run `omicron-package uninstall` on the deployment server - let cmd = format!( - "cd {} && pfexec ./omicron-package clean --in {} --out {}", - config.deployment.staging_dir, deployment_src, install_dir, - ); - println!("$ {}", cmd); - ssh_exec(&server, &cmd, SshStrategy::Forward)?; - } - Ok(()) -} - -fn run_in_parallel<'a, F>(op: &str, cmds: impl Iterator) -where - F: FnOnce() -> Result<()> + Send, -{ - thread::scope(|s| { - let named_handles: Vec<(_, ScopedJoinHandle<'_, Result<()>>)> = cmds - .map(|(server_name, f)| (server_name, s.spawn(|_| f()))) - .collect(); - - // Join all the handles and print the install status - for (server_name, handle) in named_handles { - match handle.join() { - Ok(Ok(())) => { - println!("{} completed for server: {}", op, server_name) - } - Ok(Err(e)) => { - println!( - "{} failed for server: {} with error: {}", - op, server_name, e - ) - } - Err(_) => { - println!( - "{} failed for server: {}. 
Thread panicked.", - op, server_name - ) - } - } - } - }) - .unwrap(); -} - -fn do_install( - config: &Config, - artifact_dir: &Utf8Path, - install_dir: &Utf8Path, -) { - let builder = &config.servers[&config.builder.server]; - let mut pkg_dir = Utf8PathBuf::from(&config.builder.omicron_path); - pkg_dir.push(artifact_dir); - - let fns = config.deployment.servers.iter().map(|server_name| { - (server_name, || { - single_server_install( - config, - &artifact_dir, - &install_dir, - pkg_dir.as_str(), - builder, - server_name, - ) - }) - }); - - run_in_parallel("Install", fns); -} - -fn do_overlay(config: &Config) -> Result<()> { - let builder = &config.servers[&config.builder.server]; - let mut root_path = Utf8PathBuf::from(&config.builder.omicron_path); - // TODO: This needs to match the artifact_dir in `package` - root_path.push("out/overlay"); - - // Build a list of directories for each server to be deployed and tag which - // one is the server to run RSS; e.g., for servers ["foo", "bar", "baz"] - // with root_path "/my/path", we produce - // [ - // "/my/path/foo/sled-agent/pkg", - // "/my/path/bar/sled-agent/pkg", - // "/my/path/baz/sled-agent/pkg", - // ] - // As we're doing so, record which directory is the one for the server that - // will run RSS. - let mut rss_server_dir = None; - - for server_name in &config.deployment.servers { - let mut dir = root_path.clone(); - dir.push(server_name); - dir.push("sled-agent/pkg"); - if *server_name == config.deployment.rss_server { - rss_server_dir = Some(dir.clone()); - break; - } - } - - // we know exactly one of the servers matches `rss_server` from our config - // validation, so we can unwrap here - let rss_server_dir = rss_server_dir.unwrap(); - - overlay_rss_config(builder, config, &rss_server_dir)?; - - Ok(()) -} - -fn overlay_rss_config( - builder: &Server, - config: &Config, - rss_server_dir: &Utf8Path, -) -> Result<()> { - // Sync `config-rss.toml` to the directory for the RSS server on the - // builder. 
- let src = if let Some(src) = &config.rss_config_path { - src.clone() - } else { - config.omicron_path.join("smf/sled-agent/non-gimlet/config-rss.toml") - }; - let dst = format!( - "{}@{}:{}/config-rss.toml", - builder.username, builder.addr, rss_server_dir - ); - - let mut cmd = rsync_common(); - cmd.arg(&src).arg(&dst); - - let status = - cmd.status().context(format!("Failed to run command: ({:?})", cmd))?; - if !status.success() { - return Err(FlingError::FailedSync { src: src.to_string(), dst }.into()); - } - - Ok(()) -} - -fn single_server_install( - config: &Config, - artifact_dir: &Utf8Path, - install_dir: &Utf8Path, - pkg_dir: &str, - builder: &Server, - server_name: &str, -) -> Result<()> { - let server = &config.servers[server_name]; - - println!( - "COPYING packages from builder ({}) -> deploy server ({})", - builder.addr, server_name - ); - copy_package_artifacts_to_staging(config, pkg_dir, builder, server)?; - - println!( - "COPYING deploy tool from builder ({}) -> deploy server ({})", - builder.addr, server_name - ); - copy_omicron_package_binary_to_staging(config, builder, server)?; - - println!( - "COPYING manifest from builder ({}) -> deploy server ({})", - builder.addr, server_name - ); - copy_package_manifest_to_staging(config, builder, server)?; - - println!("UNPACKING packages on deploy server ({})", server_name); - run_omicron_package_unpack_from_staging( - config, - server, - &artifact_dir, - &install_dir, - )?; - - println!( - "COPYING overlay files from builder ({}) -> deploy server ({})", - builder.addr, server_name - ); - copy_overlay_files_to_staging( - config, - pkg_dir, - builder, - server, - server_name, - )?; - - println!("INSTALLING overlay files into the install directory of the deploy server ({})", server_name); - install_overlay_files_from_staging(config, server, &install_dir)?; - - println!("STARTING services on the deploy server ({})", server_name); - run_omicron_package_activate_from_staging(config, server, &install_dir) -} - -// Copy package artifacts as a result of `omicron-package package` from the -// builder to the deployment server staging directory. -// -// This staging directory acts as an intermediate location where -// packages may reside prior to being installed. 
-fn copy_package_artifacts_to_staging( - config: &Config, - pkg_dir: &str, - builder: &Server, - destination: &Server, -) -> Result<()> { - let cmd = format!( - "rsync -avz -e 'ssh -o StrictHostKeyChecking=no' \ - --include 'out/' \ - --include 'out/*.tar' \ - --include 'out/*.tar.gz' \ - --exclude '*' \ - {} {}@{}:{}", - pkg_dir, - destination.username, - destination.addr, - config.deployment.staging_dir - ); - println!("$ {}", cmd); - ssh_exec(builder, &cmd, SshStrategy::Forward) -} - -fn copy_omicron_package_binary_to_staging( - config: &Config, - builder: &Server, - destination: &Server, -) -> Result<()> { - let mut bin_path = Utf8PathBuf::from(&config.builder.omicron_path); - bin_path.push(format!( - "target/{}/omicron-package", - if config.debug { "debug" } else { "release" } - )); - let cmd = format!( - "rsync -avz {} {}@{}:{}", - bin_path, - destination.username, - destination.addr, - config.deployment.staging_dir - ); - println!("$ {}", cmd); - ssh_exec(builder, &cmd, SshStrategy::Forward) -} - -fn copy_package_manifest_to_staging( - config: &Config, - builder: &Server, - destination: &Server, -) -> Result<()> { - let mut path = Utf8PathBuf::from(&config.builder.omicron_path); - path.push("package-manifest.toml"); - let cmd = format!( - "rsync {} {}@{}:{}", - path, - destination.username, - destination.addr, - config.deployment.staging_dir - ); - println!("$ {}", cmd); - ssh_exec(builder, &cmd, SshStrategy::Forward) -} - -fn run_omicron_package_activate_from_staging( - config: &Config, - destination: &Server, - install_dir: &Utf8Path, -) -> Result<()> { - // Run `omicron-package activate` on the deployment server - let cmd = format!( - "cd {} && pfexec ./omicron-package activate --out {}", - config.deployment.staging_dir, install_dir, - ); - - println!("$ {}", cmd); - ssh_exec(destination, &cmd, SshStrategy::Forward) -} - -fn run_omicron_package_unpack_from_staging( - config: &Config, - destination: &Server, - artifact_dir: &Utf8Path, - install_dir: &Utf8Path, -) -> Result<()> { - let mut deployment_src = Utf8PathBuf::from(&config.deployment.staging_dir); - deployment_src.push(&artifact_dir); - - // Run `omicron-package unpack` on the deployment server - let cmd = format!( - "cd {} && pfexec ./omicron-package unpack --in {} --out {}", - config.deployment.staging_dir, deployment_src, install_dir, - ); - - println!("$ {}", cmd); - ssh_exec(destination, &cmd, SshStrategy::Forward) -} - -fn copy_overlay_files_to_staging( - config: &Config, - pkg_dir: &str, - builder: &Server, - destination: &Server, - destination_name: &str, -) -> Result<()> { - let cmd = format!( - "rsync -avz {}/overlay/{}/ {}@{}:{}/overlay/", - pkg_dir, - destination_name, - destination.username, - destination.addr, - config.deployment.staging_dir - ); - println!("$ {}", cmd); - ssh_exec(builder, &cmd, SshStrategy::Forward) -} - -fn install_overlay_files_from_staging( - config: &Config, - destination: &Server, - install_dir: &Utf8Path, -) -> Result<()> { - let cmd = format!( - "pfexec cp -r {}/overlay/* {}", - config.deployment.staging_dir, install_dir - ); - println!("$ {}", cmd); - ssh_exec(&destination, &cmd, SshStrategy::NoForward) -} - -fn ssh_exec( - server: &Server, - remote_cmd: &str, - strategy: SshStrategy, -) -> Result<()> { - let remote_cmd = if strategy.source_profile() { - // Source .profile, so we have access to cargo. Rustup installs knowledge - // about the cargo path here. - String::from(". 
$HOME/.profile && ") + remote_cmd - } else { - remote_cmd.into() - }; - - let mut cmd = Command::new("ssh"); - if strategy.forward_agent() { - cmd.arg("-A"); - } - cmd.arg("-o") - .arg("StrictHostKeyChecking=no") - .arg("-l") - .arg(&server.username) - .arg(&server.addr) - .arg(&remote_cmd); - - // If the builder is the same as the client, this will likely not be set, - // as the keys will reside on the builder. - if let Some(auth_sock) = std::env::var_os("SSH_AUTH_SOCK") { - cmd.env("SSH_AUTH_SOCK", auth_sock); - } - let exit_status = cmd - .status() - .context(format!("Failed to run {} on {}", remote_cmd, server.addr))?; - if !exit_status.success() { - anyhow::bail!("Command failed: {}", exit_status); - } - - Ok(()) -} - -fn validate_servers( - chosen: &BTreeSet, - all: &BTreeMap, -) -> Result<(), FlingError> { - let all = all.keys().cloned().collect(); - let diff: Vec = chosen.difference(&all).cloned().collect(); - if !diff.is_empty() { - Err(FlingError::InvalidServers(diff)) - } else { - Ok(()) - } -} - -fn validate_absolute_path( - path: &Utf8Path, - field: &'static str, -) -> Result<(), FlingError> { - if path.is_absolute() || path.starts_with("$HOME") { - Ok(()) - } else { - Err(FlingError::NotAbsolutePath { field }) - } -} - -fn validate(config: &Config) -> Result<(), FlingError> { - validate_absolute_path(&config.omicron_path, "omicron_path")?; - validate_absolute_path( - &config.builder.omicron_path, - "builder.omicron_path", - )?; - validate_absolute_path( - &config.deployment.staging_dir, - "deployment.staging_dir", - )?; - - validate_servers( - &BTreeSet::from([ - config.builder.server.clone(), - config.deployment.rss_server.clone(), - ]), - &config.servers, - ) -} - -fn main() -> Result<()> { - let args = Args::try_parse()?; - let config = parse::<_, Config>(args.config)?; - - validate(&config)?; - - match args.subcommand { - SubCommand::Exec { cmd, servers } => { - do_exec(&config, cmd, servers)?; - } - SubCommand::Sync => do_sync(&config)?, - SubCommand::InstallPrereqs => do_install_prereqs(&config)?, - SubCommand::Builder(BuildCommand::Target { .. }) => { - todo!("Setting target not supported through thing-flinger") - } - SubCommand::Builder(BuildCommand::Package { .. }) => { - do_package(&config, args.artifact_dir)?; - } - SubCommand::Builder(BuildCommand::Stamp { .. }) => { - anyhow::bail!("Distributed package stamping not supported") - } - SubCommand::Builder(BuildCommand::Check) => do_check(&config)?, - SubCommand::Builder(BuildCommand::Dot) => { - do_dot(&config)?; - } - SubCommand::Deploy(DeployCommand::Install { install_dir }) => { - do_build_minimal(&config)?; - do_install(&config, &args.artifact_dir, &install_dir); - } - SubCommand::Deploy(DeployCommand::Uninstall) => { - do_build_minimal(&config)?; - do_uninstall(&config)?; - } - SubCommand::Deploy(DeployCommand::Clean { install_dir }) => { - do_build_minimal(&config)?; - do_clean(&config, args.artifact_dir, install_dir)?; - } - // TODO: It doesn't really make sense to allow the user direct access - // to these low level operations in thing-flinger. Should we not use - // the DeployCommand from omicron-package directly? 
-        SubCommand::Deploy(_) => anyhow::bail!("Unsupported action"),
-        SubCommand::Overlay => do_overlay(&config)?,
-    }
-    Ok(())
-}
diff --git a/docs/clickhouse-debugging.adoc b/docs/clickhouse-debugging.adoc
new file mode 100644
index 0000000000..a906d1841f
--- /dev/null
+++ b/docs/clickhouse-debugging.adoc
@@ -0,0 +1,199 @@
+:showtitle:
+:numbered:
+:toc: left
+
+= Omicron Clickhouse Debugging Guide
+
+This is a guide for debugging Clickhouse on a variety of environments.
+
+If you have advice that is not covered here, consider adding it!
+
+== Debugging on a Live System
+
+The following provides instructions for connecting to a Clickhouse shell on a running system.
+
+. **Find the zone running Clickhouse**. This can be accomplished by running `zoneadm list -cv`, and finding the zone with a prefix of `oxz_clickhouse`. If you're running on a multi-machine system (e.g., dogfood, colo, etc.) and you have access to the `pilot` binary, you can ask all sleds at once for the location of Clickhouse with:
+// '+' for list continuation to insert code blocks while keeping the list order
++
+[source,bash]
+----
+# Run from the switch zone.
+$ pilot host exec -c "zoneadm list -c | grep clickhouse" 0-31
+----
+. **Log into that zone**. This can be done using:
++
+[source,bash]
+----
+# Run from the switch zone
+$ pilot host login
+
+# Run from the machine with the Clickhouse zone
+$ pfexec zlogin oxz_clickhouse_
+----
+
+. **Identify the IP address of Clickhouse**. This is possible using `ipadm`:
++
+[source,bash]
+----
+# Run from within the Clickhouse zone
+$ ipadm
+ADDROBJ TYPE STATE ADDR
+lo0/v4 static ok 127.0.0.1/8
+lo0/v6 static ok ::1/128
+oxControlService8/ll addrconf ok fe80::8:20ff:fe35:6b0a%oxControlService8/10
+oxControlService8/omicron6 static ok fd00:1122:3344:107::4/64 <-- It's this one!
+----
+. **Log into Clickhouse using the CLI**
++
+[source,bash]
+----
+# Run from within the Clickhouse zone
+$ /opt/oxide/clickhouse/clickhouse client --host fd00:1122:3344:107::4
+ClickHouse client version 22.8.9.1.
+Connecting to fd00:1122:3344:107::4:9000 as user default.
+Connected to ClickHouse server version 22.8.9 revision 54460.
+
+oxz_clickhouse_aa646c82-c6d7-4d0c-8401-150130927759.local :)
+----
+. **Inspect the database**. At this point, you've successfully accessed the Clickhouse shell.
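+A trivial query such as `SELECT 1` is a quick way to confirm the session is live before digging further:
++
+[source,bash]
+----
+oxz_clickhouse_aa646c82-c6d7-4d0c-8401-150130927759.local :) SELECT 1
+----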
+The `oximeter` database is likely the most useful one for inspection: ++ +[source,bash] +---- +oxz_clickhouse_aa646c82-c6d7-4d0c-8401-150130927759.local :) USE oximeter; +oxz_clickhouse_aa646c82-c6d7-4d0c-8401-150130927759.local :) SHOW TABLES + +SHOW TABLES + +Query id: a8c82507-6179-40ee-8e51-4801ca5ff6f8 + +┌─name───────────────────────┐ +│ fields_bool │ +│ fields_i16 │ +│ fields_i32 │ +│ fields_i64 │ +│ fields_i8 │ +│ fields_ipaddr │ +│ fields_string │ +│ fields_u16 │ +│ fields_u32 │ +│ fields_u64 │ +│ fields_u8 │ +│ fields_uuid │ +│ measurements_bool │ +│ measurements_bytes │ +│ measurements_cumulativef32 │ +│ measurements_cumulativef64 │ +│ measurements_cumulativei64 │ +│ measurements_cumulativeu64 │ +│ measurements_f32 │ +│ measurements_f64 │ +│ measurements_histogramf32 │ +│ measurements_histogramf64 │ +│ measurements_histogrami16 │ +│ measurements_histogrami32 │ +│ measurements_histogrami64 │ +│ measurements_histogrami8 │ +│ measurements_histogramu16 │ +│ measurements_histogramu32 │ +│ measurements_histogramu64 │ +│ measurements_histogramu8 │ +│ measurements_i16 │ +│ measurements_i32 │ +│ measurements_i64 │ +│ measurements_i8 │ +│ measurements_string │ +│ measurements_u16 │ +│ measurements_u32 │ +│ measurements_u64 │ +│ measurements_u8 │ +│ timeseries_schema │ +│ version │ +└────────────────────────────┘ +41 rows in set. Elapsed: 0.002 sec. +---- +. **Query for your schema**. The `timeseries_schema` table can provide some additional context for your particular +measurement. The rest of this document will contain an example looking for a very specific "transaction retry" +timeseries, but you can substitute these values with your own. If we know even part of the timeseries name (like the word "transaction") we can search for it with the following: ++ +[source,bash] +---- +oxz_clickhouse_aa646c82-c6d7-4d0c-8401-150130927759.local :) SELECT timeseries_name,fields.type,fields.source,datum_type FROM timeseries_schema WHERE timeseries_name LIKE '%transaction%' + +SELECT + timeseries_name, + fields.type, + fields.source, + datum_type +FROM timeseries_schema +WHERE timeseries_name LIKE '%transaction%' + +Query id: 09e6086f-fc5d-4905-abed-013be55d6706 + +┌─timeseries_name─────────────────┬─fields.type──────┬─fields.source───────┬─datum_type─┐ +│ database_transaction:retry_data │ ['U32','String'] │ ['Metric','Target'] │ F64 │ +└─────────────────────────────────┴──────────────────┴─────────────────────┴────────────┘ + +1 row in set. Elapsed: 0.003 sec. +---- +This tells us the following: first, our timeseries has fields (see: `fields.type`) from `fields_u32` and `fields_string`. Next, it also emits measurements (see: `datum_type`) into `measurements_f64`. + +. **Query for your data**. This next step is extremely specific to your particular timeseries. +However, for this "database_transaction:retry_data" example, we need to query for data related +to this timeseries from `fields_u32`, `fields_string`, and `measurements_f64`. This information +should be inferable from the query to the `timeseries_schema` table. 
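+In general, the `fields_*` and `measurements_*` tables for a timeseries are
+correlated by joining on `timeseries_key`: each `fields_*` row holds one field
+value and each `measurements_*` row holds one datum, so a schema with `U32`
+and `String` fields joins `fields_u32` and `fields_string` against the
+measurements table, as in the query below.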
+ ++ +[source,bash] +---- +oxz_clickhouse_aa646c82-c6d7-4d0c-8401-150130927759.local :) SELECT + fields_string.field_value as transaction_name, + fields_u32.field_value as attempt, + measurements_f64.datum as attempt_duration, + measurements_f64.timestamp +FROM measurements_f64 +INNER JOIN fields_string ON fields_string.timeseries_key = measurements_f64.timeseries_key +INNER JOIN fields_u32 ON fields_u32.timeseries_key = measurements_f64.timeseries_key +WHERE measurements_f64.timeseries_name = 'database_transaction:retry_data' +ORDER BY measurements_f64.timestamp ASC + +Query id: 813c994e-058c-4af2-9d3a-11cf9f222cbf + +┌─transaction_name─────────┬─attempt─┬─attempt_duration─┬────measurements_f64.timestamp─┐ +│ sled_reservation_create │ 1 │ 0.014977911 │ 2024-01-11 22:41:13.667101491 │ +│ sled_reservation_create │ 1 │ 0.01660099 │ 2024-01-11 22:41:13.667610290 │ +│ sled_reservation_create │ 1 │ 0.014088819 │ 2024-01-11 22:41:13.672007505 │ +│ sled_reservation_create │ 1 │ 0.01501511 │ 2024-01-11 22:41:13.673713738 │ +│ sled_reservation_create │ 2 │ 0.156134143 │ 2024-01-11 22:41:13.843218486 │ +│ sled_reservation_create │ 2 │ 0.150804944 │ 2024-01-11 22:41:13.855771487 │ +│ sled_reservation_create │ 2 │ 0.17012195 │ 2024-01-11 22:41:13.855798649 │ +│ sled_reservation_create │ 1 │ 0.205570224 │ 2024-01-11 22:41:13.872957153 │ +│ sled_reservation_create │ 3 │ 0.006690087 │ 2024-01-11 22:41:13.891856215 │ +│ sled_reservation_create │ 4 │ 0.012846307 │ 2024-01-11 22:41:13.955465361 │ +│ sled_reservation_create │ 1 │ 0.020482506 │ 2024-01-18 23:22:48.146559108 │ +│ sled_reservation_create │ 1 │ 0.008722631 │ 2024-01-19 05:26:07.397242186 │ +│ sled_reservation_create │ 1 │ 0.007484627 │ 2024-01-19 05:26:07.590876948 │ +│ sled_reservation_create │ 1 │ 0.008384388 │ 2024-01-19 05:27:42.833060701 │ +│ sled_reservation_create │ 1 │ 0.009016489 │ 2024-01-19 05:28:15.860577501 │ +│ sled_reservation_create │ 1 │ 0.017649607 │ 2024-01-29 08:21:59.599608552 │ +│ sled_reservation_create │ 1 │ 0.017026628 │ 2024-01-29 08:23:30.278820785 │ +│ volume_create │ 1 │ 0.025257548 │ 2024-01-29 13:03:44.799614376 │ +│ volume_checkout │ 1 │ 0.009869392 │ 2024-01-29 13:03:49.827578682 │ +│ sled_reservation_create │ 1 │ 0.018168935 │ 2024-01-29 13:03:56.876826535 │ +│ volume_checkout │ 1 │ 0.007425083 │ 2024-01-29 13:27:17.949365703 │ +│ sled_reservation_create │ 1 │ 0.017133937 │ 2024-01-29 13:27:39.534955222 │ +│ sled_reservation_create │ 1 │ 0.028159647 │ 2024-01-29 13:27:39.593375890 │ +│ sled_reservation_create │ 1 │ 0.053410541 │ 2024-01-29 13:27:39.593709195 │ +│ sled_reservation_create │ 2 │ 0.080795694 │ 2024-01-29 13:27:39.717689230 │ +│ sled_reservation_create │ 1 │ 0.071597836 │ 2024-01-29 13:27:39.722071303 │ +│ regions_hard_delete │ 1 │ 0.019350474 │ 2024-01-31 13:51:58.056808199 │ +│ sled_reservation_create │ 1 │ 0.032482692 │ 2024-02-01 06:41:51.647937599 │ +│ volume_checkout │ 1 │ 0.009380859 │ 2024-02-01 07:03:04.971258393 │ +│ sled_reservation_create │ 1 │ 0.018020138 │ 2024-02-01 07:04:17.110928203 │ +│ regions_hard_delete │ 1 │ 0.011993838 │ 2024-02-01 08:32:56.113587884 │ +│ volume_checkout │ 1 │ 0.223425122 │ 2024-02-01 15:47:31.240008185 │ +│ volume_checkout │ 1 │ 0.454675525 │ 2024-02-01 15:47:31.480408091 │ +│ volume_checkout │ 1 │ 0.445790132 │ 2024-02-01 15:47:31.480943824 │ +│ volume_checkout │ 2 │ 0.206526747 │ 2024-02-01 15:47:31.481037611 │ +└──────────────────────────┴─────────┴──────────────────┴───────────────────────────────┘ +---- diff --git a/docs/how-to-run.adoc 
b/docs/how-to-run.adoc index 6a0b8f79d5..e286fe3730 100644 --- a/docs/how-to-run.adoc +++ b/docs/how-to-run.adoc @@ -277,7 +277,7 @@ The below example demonstrates a single static gateway route; in-depth explanati [rack_network_config] # An internal-only IPv6 address block which contains AZ-wide services. # This does not need to be changed. -rack_subnet = "fd00:1122:3344:01::/56" +rack_subnet = "fd00:1122:3344:0100::/56" # A range of IP addresses used by Boundary Services on the network. In a real # system, these would be addresses of the uplink ports on the Sidecar. With # softnpu, only one address is used. diff --git a/flake.lock b/flake.lock index 2c24a13714..f2dfc1b532 100644 --- a/flake.lock +++ b/flake.lock @@ -36,16 +36,13 @@ }, "root": { "inputs": { - "flake-utils": "flake-utils", "nixpkgs": "nixpkgs", "rust-overlay": "rust-overlay" } }, "rust-overlay": { "inputs": { - "flake-utils": [ - "flake-utils" - ], + "flake-utils": "flake-utils", "nixpkgs": [ "nixpkgs" ] diff --git a/flake.nix b/flake.nix index 65329cbbf7..8897d9428d 100644 --- a/flake.nix +++ b/flake.nix @@ -3,62 +3,435 @@ inputs = { nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; - flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { url = "github:oxalica/rust-overlay"; - inputs = { - nixpkgs.follows = "nixpkgs"; - flake-utils.follows = "flake-utils"; - }; + inputs.nixpkgs.follows = "nixpkgs"; }; }; - outputs = { self, nixpkgs, flake-utils, rust-overlay }: - flake-utils.lib.eachDefaultSystem - (system: + outputs = { self, nixpkgs, rust-overlay, ... }: + let + overlays = [ (import rust-overlay) ]; + pkgs = import nixpkgs { + inherit overlays; + system = "x86_64-linux"; + }; + # use the Rust toolchain defined in the `rust-toolchain.toml` file. + rustToolchain = (pkgs.pkgsBuildHost.rust-bin.fromRustupToolchainFile ./rust-toolchain.toml).override { + extensions = [ + "rust-src" # for rust-analyzer + ]; + }; + + buildInputs = with pkgs; [ + # libs + openssl + postgresql + xmlsec + sqlite + libclang + libxml2 + libtool + ]; + + nativeBuildInputs = with pkgs; [ + rustToolchain + cmake + stdenv + pkg-config + ]; + + openAPIVersion = with pkgs.lib; path: + let + file = strings.fileContents path; + parts = strings.splitString "\n" file; + extractHash = prefix: (line: trivial.pipe line [ + (elemAt parts) + (strings.removeSuffix "\"") + (strings.removePrefix "${prefix}=\"") + ]); + in + { + commit = extractHash "COMMIT" 0; + sha = extractHash "SHA2" 1; + }; + + downloadBuildomat = + let baseURL = "https://buildomat.eng.oxide.computer/public/file/oxidecomputer"; + in { kind, repo, file, commit, sha }: + builtins.fetchurl { + url = "${baseURL}/${repo}/${kind}/${commit}/${file}"; + sha256 = sha; + }; + + downloadOpenAPI = { repo, file, version }: + downloadBuildomat + { + inherit repo file; + kind = "openapi"; + commit = pkgs.lib.debug.traceValFn + (v: "${file}: commit=${v}") + version.commit; + sha = version.sha; + }; + + dendriteVersion = openAPIVersion + ./tools/dendrite_openapi_version; + mgVersion = openAPIVersion + ./tools/maghemite_mg_openapi_version; + + + dendriteOpenAPI = downloadOpenAPI + { + repo = "dendrite"; + file = "dpd.json"; + version = dendriteVersion; + }; + + ddmOpenAPI = downloadOpenAPI + { + repo = "maghemite"; + file = "ddm-admin.json"; + version = openAPIVersion ./tools/maghemite_ddm_openapi_version; + }; + + mgOpenAPI = downloadOpenAPI + { + repo = "maghemite"; + file = "mg-admin.json"; + version = mgVersion; + }; + + # given a list of strings of the form `PREFIX="SHA256"`, finds the string + # 
starting with the provided `name` and returns the hash for that prefix. + findSha = with pkgs.lib; + shas: (name: + let + upperName = strings.toUpper name; + prefix = "${upperName}=\""; + in + trivial.pipe shas [ + (lists.findFirst (strings.hasPrefix prefix) "") + (strings.removePrefix prefix) + (strings.removeSuffix "\"") + ]); + + dendrite-stub = with pkgs.lib; let - overlays = [ (import rust-overlay) ]; - pkgs = import nixpkgs { - inherit system overlays; + commit = dendriteVersion.commit; + repo = "dendrite"; + stubShas = + let + file = builtins.readFile + ./tools/dendrite_stub_checksums; + in + strings.splitString "\n" file; + findStubSha = name: findSha stubShas "CIDL_SHA256_${name}"; + fetchLinuxBin = file: + downloadBuildomat { + inherit commit file repo; + sha = findStubSha "linux_${file}"; + kind = "linux-bin"; + }; + + # get stuff + tarball = downloadBuildomat + { + inherit commit repo; + sha = findStubSha "illumos"; + kind = "image"; + file = "dendrite-stub.tar.gz"; + }; + swadm = fetchLinuxBin + "swadm"; + dpd = fetchLinuxBin + "dpd"; + in + with pkgs; stdenv.mkDerivation + { + name = "dendrite-stub"; + version = commit; + src = tarball; + nativeBuildInputs = [ + # patch the binary to use the right dynamic library paths. + autoPatchelfHook + ]; + + buildInputs = [ + glibc + gcc-unwrapped + openssl + ]; + + installPhase = + let + binPath = "root/opt/oxide/dendrite/bin"; + in + '' + mkdir -p $out/${binPath} + cp -r . $out/root + cp ${swadm} $out/${binPath}/swadm + chmod +x $out/${binPath}/swadm + cp ${dpd} $out/${binPath}/dpd + chmod +x $out/${binPath}/dpd + + mkdir -p $out/bin + ln -s $out/${binPath}/swadm $out/bin/swadm + ln -s $out/${binPath}/dpd $out/bin/dpd + ''; }; - # use the Rust toolchain defined in the `rust-toolchain.toml` file. - rustToolchain = pkgs.pkgsBuildHost.rust-bin.fromRustupToolchainFile ./rust-toolchain.toml; - nativeBuildInputs = with pkgs; [ - rustToolchain - cmake - stdenv - pkg-config - ]; - buildInputs = with pkgs; [ - # libs - openssl - postgresql - xmlsec - sqlite - libclang - libxml2 - ]; + + mgd = with pkgs.lib; + let + commit = mgVersion.commit; + repo = "maghemite"; + shas = + let + file = builtins.readFile + ./tools/maghemite_mgd_checksums; + in + strings.splitString + "\n" + file; + # get stuff + tarball = downloadBuildomat + { + inherit commit repo; + sha = findSha shas "CIDL_SHA256"; + kind = "image"; + file = "mgd.tar.gz"; + }; + linuxBin = + downloadBuildomat + { + inherit commit repo; + sha = findSha shas "MGD_LINUX_SHA256"; + kind = "linux"; + file = "mgd"; + }; in with pkgs; - { - devShells.default = mkShell.override + stdenv.mkDerivation + { + name = "mgd"; + src = tarball; + version = commit; + nativeBuildInputs = [ + # patch the binary to use the right dynamic library paths. + autoPatchelfHook + ]; + + buildInputs = [ + glibc + gcc-unwrapped + ]; + + installPhase = + let + binPath = "root/opt/oxide/mgd/bin"; + in + '' + mkdir -p $out/${binPath} + cp -r . $out/root + cp ${linuxBin} $out/${binPath}/mgd + chmod +x $out/${binPath}/mgd + + mkdir -p $out/bin + ln -s $out/${binPath}/mgd $out/bin/mgd + ''; + }; + + # reads the version for Clickhouse or Cockroachdb from the + # `tools/clickhouse_version` and `tools/cockroachdb_version` files. 
+ readVersionFile = with pkgs.lib; file: trivial.pipe ./tools/${file} [ + (builtins.readFile) + (strings.removeSuffix "\n") + (strings.removePrefix "v") + (debug.traceValFn (v: "${file}: ${v}")) + ]; + + clickhouse = with pkgs; + let + name = "clickhouse"; + version = readVersionFile "${name}_version"; + # N.B. that unlike maghemite and dendrite, the Clickhouse hashes + # in `tools/clickhouse_checksums` are MD5 rather than SHA256, so we + # can't give Nix those hashes and must instead determine it ourselves. + # this means that we will have to update this SHA if the clickhouse + # version changes. + sha256 = "1lgxwh67apgl386ilpg0iy5xkyz12q4lgnz08zswjbxv88ra0qxj"; + src = builtins.fetchurl { - # use Clang as the C compiler for all C libraries - stdenv = clangStdenv; - } + inherit sha256; + url = "https://oxide-clickhouse-build.s3.us-west-2.amazonaws.com/${name}-v${version}.linux.tar.gz"; + }; + in + stdenv.mkDerivation + { + inherit src name version; + sourceRoot = "."; + nativeBuildInputs = [ + # patch the binary to use the right dynamic library paths. + autoPatchelfHook + ]; + + buildInputs = [ + glibc + gcc-unwrapped + ]; + installPhase = '' + mkdir -p $out/bin + mkdir -p $out/etc + cp ./${name} $out/bin/${name} + cp ./._config.xml $out/bin/config.xml + ''; + }; + + cockroachdb = with pkgs; + let + name = "cockroachdb"; + binName = "cockroach"; + version = readVersionFile "${name}_version"; + sha256 = + let + shaFile = builtins.readFile ./tools/${name}_checksums; + shas = lib.strings.splitString "\n" shaFile; + in + findSha shas "CIDL_SHA256_LINUX"; + src = builtins.fetchurl { - inherit buildInputs nativeBuildInputs; + inherit sha256; + url = "https://binaries.cockroachdb.com/${binName}-v${version}.linux-amd64.tgz"; + }; + in + stdenv.mkDerivation + { + inherit name src version; + nativeBuildInputs = [ + # patch the binary to use the right dynamic library paths. + autoPatchelfHook + ]; - name = "omicron"; - DEP_PQ_LIBDIRS = " ${postgresql.lib}/lib"; - LIBCLANG_PATH = "${libclang.lib}/lib"; - OPENSSL_DIR = "${openssl.dev}"; - OPENSSL_LIB_DIR = "${openssl.out}/lib"; + buildInputs = [ + glibc + # gcc-unwrapped + ]; + installPhase = '' + mkdir -p $out/bin + cp ./${binName} $out/bin/${binName} + ''; + }; + in + { + packages.x86_64-linux = { + inherit dendrite-stub mgd cockroachdb clickhouse; + }; - # Needed by rustfmt-wrapper, see: - # https://github.com/oxidecomputer/rustfmt-wrapper/blob/main/src/lib.rs - RUSTFMT = "${rustToolchain}/bin/rustfmt"; + checks.x86_64-linux = with pkgs; + let + # produces a check derivation that ensures a package's executable has + # the expected version. + mkVersionCheck = { pkg, cmd }: runCommand "check-${pkg.name}-version" + { + PATH = "${pkg.out}"; + } '' + actualVersion=$(${pkg.out}/bin/${cmd}) + if [ "$actualVersion" != "${pkg.version}" ]; then + echo "expected ${pkg.name} version \"${pkg.version}\", got \"$actualVersion\"" + exit 1 + fi + + # the check derivation must have an output. + touch $out + ''; + # produces a check derivation that ensures a package's executable + # runs. 
+ mkExecCheck = { pkg, cmd }: runCommand "check-${pkg.name}-${cmd}-exec" + { } '' + ${pkg.out}/bin/${cmd} && touch $out + ''; + in + { + clickhouseVersion = mkVersionCheck + { + pkg = clickhouse; + cmd = "clickhouse server --version | cut -d ' ' -f 4"; }; - } - ); + + cockroachdbVersion = mkVersionCheck + { + pkg = cockroachdb; + cmd = "cockroach version --build-tag | tr -d 'v'"; + }; + + mgdCanExec = mkExecCheck { + pkg = mgd; + cmd = "mgd help"; + }; + + dpdCanExec = mkExecCheck { + pkg = dendrite-stub; + cmd = "dpd help"; + }; + + swadmCanExec = mkExecCheck { + pkg = dendrite-stub; + cmd = "swadm help"; + }; + }; + + devShells.x86_64-linux.default = + pkgs.mkShell.override + { + # use Clang as the C compiler for all C libraries + stdenv = pkgs.clangStdenv; + } + { + inherit buildInputs; + nativeBuildInputs = nativeBuildInputs ++ [ + # Dendrite and maghemite, for running tests. + dendrite-stub + mgd + clickhouse + cockroachdb + ]; + + name = "omicron"; + DEP_PQ_LIBDIRS = "${pkgs.postgresql.lib}/lib"; + LIBCLANG_PATH = "${pkgs.libclang.lib}/lib"; + OPENSSL_DIR = "${pkgs.openssl.dev}"; + OPENSSL_LIB_DIR = "${pkgs.openssl.out}/lib"; + + MG_OPENAPI_PATH = mgOpenAPI; + DDM_OPENAPI_PATH = ddmOpenAPI; + DPD_OPENAPI_PATH = dendriteOpenAPI; + + # Needed by rustfmt-wrapper, see: + # https://github.com/oxidecomputer/rustfmt-wrapper/blob/main/src/lib.rs + RUSTFMT = "${rustToolchain}/bin/rustfmt"; + + shellHook = '' + rm out/mgd + rm out/dendrite-stub + rm -r out/clickhouse + rm -r out/cockroachdb + + mkdir -p out/clickhouse + mkdir -p out/cockroachdb/ + + ln -s ${mgd.out} -T out/mgd + ln -s ${dendrite-stub.out} -T out/dendrite-stub + ln -s ${clickhouse.out}/bin/clickhouse out/clickhouse/clickhouse + ln -s ${clickhouse.out}/etc/config.xml out/clickhouse + ln -s ${cockroachdb.out}/bin out/cockroachdb/bin + ''; + }; + }; } + + + + + + + diff --git a/internal-dns/src/names.rs b/internal-dns/src/names.rs index d88ac2f8ac..8cafe4ac97 100644 --- a/internal-dns/src/names.rs +++ b/internal-dns/src/names.rs @@ -62,7 +62,7 @@ impl ServiceName { /// Returns the DNS name for this service, ignoring the zone part of the DNS /// name - pub(crate) fn dns_name(&self) -> String { + pub fn dns_name(&self) -> String { match self { ServiceName::Clickhouse | ServiceName::ClickhouseKeeper diff --git a/nexus/db-model/src/ipv4_nat_entry.rs b/nexus/db-model/src/ipv4_nat_entry.rs index 6a74444411..c3763346c6 100644 --- a/nexus/db-model/src/ipv4_nat_entry.rs +++ b/nexus/db-model/src/ipv4_nat_entry.rs @@ -1,7 +1,10 @@ use std::net::{Ipv4Addr, Ipv6Addr}; use super::MacAddr; -use crate::{schema::ipv4_nat_entry, Ipv4Net, Ipv6Net, SqlU16, Vni}; +use crate::{ + schema::ipv4_nat_changes, schema::ipv4_nat_entry, Ipv4Net, Ipv6Net, SqlU16, + Vni, +}; use chrono::{DateTime, Utc}; use omicron_common::api::external; use schemars::JsonSchema; @@ -48,6 +51,20 @@ impl Ipv4NatEntry { } } +/// Summary of changes to ipv4 nat entries. 
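+///
+/// Each row is read from the `ipv4_nat_changes` database view, which folds
+/// additions and deletions of NAT entries into a single stream ordered by
+/// `version`.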
+#[derive(Queryable, Debug, Clone, Selectable, Serialize, Deserialize)]
+#[diesel(table_name = ipv4_nat_changes)]
+pub struct Ipv4NatChange {
+    pub external_address: Ipv4Net,
+    pub first_port: SqlU16,
+    pub last_port: SqlU16,
+    pub sled_address: Ipv6Net,
+    pub vni: Vni,
+    pub mac: MacAddr,
+    pub version: i64,
+    pub deleted: bool,
+}
+
 /// NAT Record
 #[derive(Clone, Debug, Serialize, JsonSchema)]
 pub struct Ipv4NatEntryView {
@@ -61,22 +78,17 @@ pub struct Ipv4NatEntryView {
     pub deleted: bool,
 }
 
-impl From<Ipv4NatEntry> for Ipv4NatEntryView {
-    fn from(value: Ipv4NatEntry) -> Self {
-        let (gen, deleted) = match value.version_removed {
-            Some(gen) => (gen, true),
-            None => (value.version_added, false),
-        };
-
+impl From<Ipv4NatChange> for Ipv4NatEntryView {
+    fn from(value: Ipv4NatChange) -> Self {
         Self {
             external_address: value.external_address.ip(),
-            first_port: value.first_port(),
-            last_port: value.last_port(),
+            first_port: value.first_port.into(),
+            last_port: value.last_port.into(),
             sled_address: value.sled_address.ip(),
             vni: value.vni.0,
             mac: *value.mac,
-            gen,
-            deleted,
+            gen: value.version,
+            deleted: value.deleted,
         }
     }
 }
diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs
index a2b5a539c5..7e460006da 100644
--- a/nexus/db-model/src/schema.rs
+++ b/nexus/db-model/src/schema.rs
@@ -13,7 +13,7 @@ use omicron_common::api::external::SemverVersion;
 ///
 /// This should be updated whenever the schema is changed. For more details,
 /// refer to: schema/crdb/README.adoc
-pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(32, 0, 0);
+pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(33, 0, 1);
 
 table! {
     disk (id) {
@@ -546,6 +546,20 @@ table! {
     }
 }
 
+// View used for summarizing changes to ipv4_nat_entry
+table! {
+    ipv4_nat_changes (version) {
+        external_address -> Inet,
+        first_port -> Int4,
+        last_port -> Int4,
+        sled_address -> Inet,
+        vni -> Int4,
+        mac -> Int8,
+        version -> Int8,
+        deleted -> Bool,
+    }
+}
+
 // This is the sequence used for the version number
 // in ipv4_nat_entry.
 table!
{ diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index e4625222bf..bf037d53f5 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -1061,6 +1061,7 @@ mod tests { use nexus_test_utils::db::test_setup_database; use nexus_types::deployment::Policy; use nexus_types::deployment::SledResources; + use nexus_types::external_api::views::SledProvisionState; use nexus_types::inventory::Collection; use omicron_common::address::Ipv6Subnet; use omicron_common::api::external::Generation; @@ -1070,7 +1071,11 @@ mod tests { use std::mem; use std::net::Ipv6Addr; - static EMPTY_POLICY: Policy = Policy { sleds: BTreeMap::new() }; + static EMPTY_POLICY: Policy = Policy { + sleds: BTreeMap::new(), + service_ip_pool_ranges: Vec::new(), + target_nexus_zone_count: 0, + }; // This is a not-super-future-maintainer-friendly helper to check that all // the subtables related to blueprints have been pruned of a specific @@ -1120,7 +1125,11 @@ mod tests { }) .collect(); let ip = ip.unwrap_or_else(|| thread_rng().gen::().into()); - SledResources { zpools, subnet: Ipv6Subnet::new(ip) } + SledResources { + provision_state: SledProvisionState::Provisionable, + zpools, + subnet: Ipv6Subnet::new(ip), + } } // Create a `Policy` that contains all the sleds found in `collection` @@ -1140,6 +1149,11 @@ mod tests { ) }) .collect(), + service_ip_pool_ranges: Vec::new(), + target_nexus_zone_count: collection + .all_omicron_zones() + .filter(|z| z.zone_type.is_nexus()) + .count(), } } @@ -1335,7 +1349,8 @@ mod tests { Generation::new(), &policy, "test", - ); + ) + .expect("failed to create builder"); // Add zones to our new sled. assert_eq!( @@ -1485,6 +1500,7 @@ mod tests { &EMPTY_POLICY, "test2", ) + .expect("failed to create builder") .build(); let blueprint3 = BlueprintBuilder::new_based_on( &blueprint1, @@ -1492,6 +1508,7 @@ mod tests { &EMPTY_POLICY, "test3", ) + .expect("failed to create builder") .build(); assert_eq!(blueprint1.parent_blueprint_id, None); assert_eq!(blueprint2.parent_blueprint_id, Some(blueprint1.id)); @@ -1587,6 +1604,7 @@ mod tests { &EMPTY_POLICY, "test3", ) + .expect("failed to create builder") .build(); assert_eq!(blueprint4.parent_blueprint_id, Some(blueprint3.id)); datastore.blueprint_insert(&opctx, &blueprint4).await.unwrap(); diff --git a/nexus/db-queries/src/db/datastore/ipv4_nat_entry.rs b/nexus/db-queries/src/db/datastore/ipv4_nat_entry.rs index 81229162d0..27a6bad32f 100644 --- a/nexus/db-queries/src/db/datastore/ipv4_nat_entry.rs +++ b/nexus/db-queries/src/db/datastore/ipv4_nat_entry.rs @@ -9,6 +9,7 @@ use chrono::{DateTime, Utc}; use diesel::prelude::*; use diesel::sql_types::BigInt; use nexus_db_model::ExternalIp; +use nexus_db_model::Ipv4NatChange; use nexus_db_model::Ipv4NatEntryView; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DeleteResult; @@ -317,10 +318,19 @@ impl DataStore { version: i64, limit: u32, ) -> ListResultVec { - let nat_entries = - self.ipv4_nat_list_since_version(opctx, version, limit).await?; + use db::schema::ipv4_nat_changes::dsl; + + let nat_changes = dsl::ipv4_nat_changes + .filter(dsl::version.gt(version)) + .limit(limit as i64) + .order_by(dsl::version) + .select(Ipv4NatChange::as_select()) + .load_async(&*self.pool_connection_authorized(opctx).await?) 
+ .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + let nat_entries: Vec = - nat_entries.iter().map(|e| e.clone().into()).collect(); + nat_changes.iter().map(|e| e.clone().into()).collect(); Ok(nat_entries) } @@ -367,7 +377,7 @@ fn ipv4_nat_next_version() -> diesel::expression::SqlLiteral { #[cfg(test)] mod test { - use std::str::FromStr; + use std::{net::Ipv4Addr, str::FromStr}; use crate::db::datastore::datastore_test; use chrono::Utc; @@ -375,6 +385,7 @@ mod test { use nexus_test_utils::db::test_setup_database; use omicron_common::api::external; use omicron_test_utils::dev; + use rand::seq::IteratorRandom; // Test our ability to track additions and deletions since a given version number #[tokio::test] @@ -802,4 +813,154 @@ mod test { db.cleanup().await.unwrap(); logctx.cleanup_successful(); } + + // Test our ability to return all changes interleaved in the correct order + #[tokio::test] + async fn ipv4_nat_changeset() { + let logctx = dev::test_setup_log("test_nat_version_tracking"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + // We should not have any NAT entries at this moment + let initial_state = + datastore.ipv4_nat_list_since_version(&opctx, 0, 10).await.unwrap(); + + assert!(initial_state.is_empty()); + assert_eq!( + datastore.ipv4_nat_current_version(&opctx).await.unwrap(), + 0 + ); + + let addresses = (0..=255).map(|i| { + let addr = Ipv4Addr::new(10, 0, 0, i); + let net = ipnetwork::Ipv4Network::new(addr, 32).unwrap(); + external::Ipv4Net(net) + }); + + let sled_address = external::Ipv6Net( + ipnetwork::Ipv6Network::try_from("fd00:1122:3344:104::1").unwrap(), + ); + + let nat_entries = addresses.map(|external_address| { + // build a bunch of nat entries + Ipv4NatValues { + external_address: external_address.into(), + first_port: u16::MIN.into(), + last_port: u16::MAX.into(), + sled_address: sled_address.into(), + vni: Vni(external::Vni::random()), + mac: MacAddr(external::MacAddr::random_guest()), + } + }); + + let mut db_records = vec![]; + + // create the nat entries + for entry in nat_entries { + let result = datastore + .ensure_ipv4_nat_entry(&opctx, entry.clone()) + .await + .unwrap(); + + db_records.push(result); + } + + // delete a subset of the entries + for entry in + db_records.iter().choose_multiple(&mut rand::thread_rng(), 50) + { + datastore.ipv4_nat_delete(&opctx, entry).await.unwrap(); + } + + // get the new state of all nat entries + // note that this is not the method under test + let db_records = datastore + .ipv4_nat_list_since_version(&opctx, 0, 300) + .await + .unwrap(); + + // Count the actual number of changes seen. + // This check is required because we _were_ getting changes in ascending order, + // but some entries were being skipped. We want to ensure we are getting + // *all* of the changes in ascending order. 
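+        //
+        // The changeset API's contract, as implemented above: each call to
+        // `ipv4_nat_changeset` returns at most `limit` changes whose versions
+        // are strictly greater than the cursor passed in, ordered by version,
+        // so advancing the cursor to the last `gen` seen must eventually
+        // visit every change exactly once.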
+ let mut total_changes = 0; + + // ensure that the changeset is ordered, displaying the correct + // version numbers, and displaying the correct `deleted` status + let mut version = 0; + let limit = 100; + let mut changes = + datastore.ipv4_nat_changeset(&opctx, version, limit).await.unwrap(); + + while !changes.is_empty() { + // check ordering + assert!(changes + .windows(2) + .all(|entries| entries[0].gen < entries[1].gen)); + + // check deleted status and version numbers + changes.iter().for_each(|change| match change.deleted { + true => { + // version should match a deleted entry + let deleted_nat = db_records + .iter() + .find(|entry| entry.version_removed == Some(change.gen)) + .expect("did not find a deleted nat entry with a matching version number"); + + assert_eq!( + deleted_nat.external_address.ip(), + change.external_address + ); + assert_eq!( + deleted_nat.first_port, + change.first_port.into() + ); + assert_eq!(deleted_nat.last_port, change.last_port.into()); + assert_eq!( + deleted_nat.sled_address.ip(), + change.sled_address + ); + assert_eq!(*deleted_nat.mac, change.mac); + assert_eq!(deleted_nat.vni.0, change.vni); + } + false => { + // version should match an active nat entry + let added_nat = db_records + .iter() + .find(|entry| entry.version_added == change.gen) + .expect("did not find an active nat entry with a matching version number"); + + assert!(added_nat.version_removed.is_none()); + + assert_eq!( + added_nat.external_address.ip(), + change.external_address + ); + assert_eq!(added_nat.first_port, change.first_port.into()); + assert_eq!(added_nat.last_port, change.last_port.into()); + assert_eq!( + added_nat.sled_address.ip(), + change.sled_address + ); + assert_eq!(*added_nat.mac, change.mac); + assert_eq!(added_nat.vni.0, change.vni); + } + }); + + // bump the count of changes seen + total_changes += changes.len(); + + version = changes.last().unwrap().gen; + changes = datastore + .ipv4_nat_changeset(&opctx, version, limit) + .await + .unwrap(); + } + + // did we see everything? + assert_eq!(total_changes, db_records.len()); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } } diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 4dfee7f2a5..5f05aa1760 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -159,6 +159,7 @@ pub type DataStoreConnection<'a> = bb8::PooledConnection<'a, ConnectionManager>; pub struct DataStore { + log: Logger, pool: Arc, virtual_provisioning_collection_producer: crate::provisioning::Producer, transaction_retry_producer: crate::transaction_retry::Producer, @@ -173,8 +174,9 @@ impl DataStore { /// Ignores the underlying DB version. Should be used with caution, as usage /// of this method can construct a Datastore which does not understand /// the underlying CockroachDB schema. Data corruption could result. - pub fn new_unchecked(pool: Arc) -> Result { + pub fn new_unchecked(log: Logger, pool: Arc) -> Result { let datastore = DataStore { + log, pool, virtual_provisioning_collection_producer: crate::provisioning::Producer::new(), @@ -193,7 +195,8 @@ impl DataStore { pool: Arc, config: Option<&SchemaConfig>, ) -> Result { - let datastore = Self::new_unchecked(pool)?; + let datastore = + Self::new_unchecked(log.new(o!("component" => "datastore")), pool)?; // Keep looping until we find that the schema matches our expectation. 
 const EXPECTED_VERSION: SemverVersion =
@@ -239,6 +242,7 @@ impl DataStore {
         name: &'static str,
     ) -> crate::transaction_retry::RetryHelper {
         crate::transaction_retry::RetryHelper::new(
+            &self.log,
             &self.transaction_retry_producer,
             name,
         )
diff --git a/nexus/db-queries/src/transaction_retry.rs b/nexus/db-queries/src/transaction_retry.rs
index c474b729f8..6b5098158b 100644
--- a/nexus/db-queries/src/transaction_retry.rs
+++ b/nexus/db-queries/src/transaction_retry.rs
@@ -9,6 +9,7 @@ use chrono::Utc;
 use diesel::result::Error as DieselError;
 use oximeter::{types::Sample, Metric, MetricsError, Target};
 use rand::{thread_rng, Rng};
+use slog::{info, warn, Logger};
 use std::sync::{Arc, Mutex};
 use std::time::Duration;
@@ -60,6 +61,10 @@ impl RetryHelperInner {
         Self { start: Utc::now(), attempts: 1 }
     }
 
+    fn has_retried(&self) -> bool {
+        self.attempts > 1
+    }
+
     fn tick(&mut self) -> Self {
         let start = self.start;
         let attempts = self.attempts;
@@ -74,6 +79,7 @@ impl RetryHelperInner {
 /// Helper utility for tracking retry attempts and latency.
 /// Intended to be used from within "transaction_async_with_retry".
 pub struct RetryHelper {
+    log: Logger,
     producer: Producer,
     name: &'static str,
     inner: Mutex<RetryHelperInner>,
@@ -86,8 +92,13 @@ const MAX_RETRY_ATTEMPTS: u32 = 10;
 impl RetryHelper {
     /// Creates a new RetryHelper, and starts a timer tracking the transaction
     /// duration.
-    pub(crate) fn new(producer: &Producer, name: &'static str) -> Self {
+    pub(crate) fn new(
+        log: &Logger,
+        producer: &Producer,
+        name: &'static str,
+    ) -> Self {
         Self {
+            log: log.new(o!("transaction" => name)),
             producer: producer.clone(),
             name,
             inner: Mutex::new(RetryHelperInner::new()),
@@ -107,7 +118,21 @@ impl RetryHelper {
             + Send
             + Sync,
     {
-        conn.transaction_async_with_retry(f, self.as_callback()).await
+        let this = Arc::new(self);
+        let result = conn
+            .transaction_async_with_retry(f, this.clone().as_callback())
+            .await;
+
+        let retry_info = this.inner.lock().unwrap();
+        if retry_info.has_retried() {
+            info!(
+                this.log,
+                "transaction completed";
+                "attempts" => retry_info.attempts,
+            );
+        }
+
+        result
     }
 
     // Called upon retryable transaction failure.
@@ -143,6 +168,12 @@ impl RetryHelper {
             let mut rng = thread_rng();
             rng.gen_range(MIN_RETRY_BACKOFF..MAX_RETRY_BACKOFF)
         };
+
+        warn!(
+            self.log,
+            "Retryable transaction failure";
+            "retry_after (ms)" => duration.as_millis(),
+        );
         tokio::time::sleep(duration).await;
 
         // Now that we've finished sleeping, reset the timer and bump the number
@@ -151,14 +182,13 @@ impl RetryHelper {
         return inner.attempts < MAX_RETRY_ATTEMPTS;
     }
 
-    /// Converts this function to a retryable callback that can be used from
-    /// "transaction_async_with_retry".
-    pub(crate) fn as_callback(
-        self,
+    // Converts this function to a retryable callback that can be used from
+    // "transaction_async_with_retry".
+    fn as_callback(
+        self: Arc<Self>,
     ) -> impl Fn() -> futures::future::BoxFuture<'static, bool> {
-        let r = Arc::new(self);
         move || {
-            let r = r.clone();
+            let r = self.clone();
             Box::pin(async move { r.retry_callback().await })
         }
     }
diff --git a/nexus/deployment/src/blueprint_builder.rs b/nexus/deployment/src/blueprint_builder.rs
index 9d6df2218f..86a9b8da6e 100644
--- a/nexus/deployment/src/blueprint_builder.rs
+++ b/nexus/deployment/src/blueprint_builder.rs
@@ -6,11 +6,14 @@ use crate::ip_allocator::IpAllocator;
 use anyhow::anyhow;
+use anyhow::bail;
 use internal_dns::config::Host;
 use internal_dns::config::ZoneVariant;
 use ipnet::IpAdd;
 use nexus_inventory::now_db_precision;
 use nexus_types::deployment::Blueprint;
+use nexus_types::deployment::NetworkInterface;
+use nexus_types::deployment::NetworkInterfaceKind;
 use nexus_types::deployment::OmicronZoneConfig;
 use nexus_types::deployment::OmicronZoneDataset;
 use nexus_types::deployment::OmicronZoneType;
@@ -23,11 +26,20 @@ use omicron_common::address::get_internal_dns_server_addresses;
 use omicron_common::address::get_sled_address;
 use omicron_common::address::get_switch_zone_address;
 use omicron_common::address::CP_SERVICES_RESERVED_ADDRESSES;
+use omicron_common::address::NEXUS_OPTE_IPV4_SUBNET;
+use omicron_common::address::NEXUS_OPTE_IPV6_SUBNET;
 use omicron_common::address::NTP_PORT;
 use omicron_common::address::SLED_RESERVED_ADDRESSES;
 use omicron_common::api::external::Generation;
+use omicron_common::api::external::IpNet;
+use omicron_common::api::external::MacAddr;
+use omicron_common::api::external::Vni;
+use omicron_common::nexus_config::NUM_INITIAL_RESERVED_IP_ADDRESSES;
 use std::collections::BTreeMap;
 use std::collections::BTreeSet;
+use std::collections::HashSet;
+use std::net::IpAddr;
+use std::net::Ipv4Addr;
 use std::net::Ipv6Addr;
 use std::net::SocketAddrV6;
 use thiserror::Error;
@@ -38,6 +50,14 @@ use uuid::Uuid;
 pub enum Error {
     #[error("sled {sled_id}: ran out of available addresses for sled")]
     OutOfAddresses { sled_id: Uuid },
+    #[error("no Nexus zones exist in parent blueprint")]
+    NoNexusZonesInParentBlueprint,
+    #[error("no external service IP addresses are available")]
+    NoExternalServiceIpAvailable,
+    #[error("no system MAC addresses are available")]
+    NoSystemMacAddressAvailable,
+    #[error("exhausted available Nexus IP addresses")]
+    ExhaustedNexusIps,
     #[error("programming error in planner")]
     Planner(#[from] anyhow::Error),
 }
@@ -52,6 +72,16 @@ pub enum Ensure {
     NotNeeded,
 }
 
+/// Describes whether an idempotent "ensure" operation resulted in multiple
+/// actions taken or no action was necessary
+#[derive(Debug, Clone, Copy, Eq, PartialEq)]
+pub enum EnsureMultiple {
+    /// action was taken, and multiple items were added
+    Added(usize),
+    /// no action was necessary
+    NotNeeded,
+}
+
 /// Helper for assembling a blueprint
 ///
 /// There are two basic ways to assemble a new blueprint:
@@ -78,10 +108,20 @@ pub struct BlueprintBuilder<'a> {
     // These fields will become part of the final blueprint. See the
     // corresponding fields in `Blueprint`.
-    omicron_zones: BTreeMap<Uuid, OmicronZonesConfig>,
+    zones: BlueprintZones<'a>,
     zones_in_service: BTreeSet<Uuid>,
     creator: String,
     comments: Vec<String>,
+
+    // These fields mirror how RSS chooses addresses for zone NICs.
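+    // Each of these iterators is constructed in `new_based_on` below so that
+    // it skips any address already claimed by a zone in the parent blueprint.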
+    nexus_v4_ips: Box<dyn Iterator<Item = Ipv4Addr> + Send>,
+    nexus_v6_ips: Box<dyn Iterator<Item = Ipv6Addr> + Send>,
+
+    // Iterator of available external IPs for service zones
+    available_external_ips: Box<dyn Iterator<Item = IpAddr> + Send + 'a>,
+
+    // Iterator of available MAC addresses in the system address range
+    available_system_macs: Box<dyn Iterator<Item = MacAddr>>,
 }
 
 impl<'a> BlueprintBuilder<'a> {
@@ -150,55 +190,123 @@ impl<'a> BlueprintBuilder<'a> {
         internal_dns_version: Generation,
         policy: &'a Policy,
         creator: &str,
-    ) -> BlueprintBuilder<'a> {
-        BlueprintBuilder {
+    ) -> anyhow::Result<BlueprintBuilder<'a>> {
+        // Scan through the parent blueprint and build several sets of "used
+        // resources". When adding new control plane zones to a sled, we may
+        // need to allocate new resources to that zone. However, allocation at
+        // this point is entirely optimistic and theoretical: our caller may
+        // discard the blueprint we create without ever making it the new
+        // target, or it might be an arbitrarily long time before it becomes the
+        // target. We need to be able to make allocation decisions that we
+        // expect the blueprint executor to be able to realize successfully if
+        // and when we become the target, but we cannot _actually_ perform
+        // resource allocation.
+        //
+        // To do this, we look at our parent blueprint's used resources, and
+        // then choose new resources that aren't already in use (if possible; if
+        // we need to allocate a new resource and the parent blueprint appears
+        // to be using all the resources of that kind, our blueprint generation
+        // will fail).
+        //
+        // For example, RSS assigns Nexus NIC IPs by stepping through a list of
+        // addresses based on `NEXUS_OPTE_IPVx_SUBNET` (as in the iterators
+        // below). We use the same list of addresses, but additionally need to
+        // filter out the existing IPs for any Nexus instances that already
+        // exist.
+        //
+        // Note that by building these iterators up front based on
+        // `parent_blueprint`, we cannot reuse resources in a case where we
+        // remove a zone that used a resource and then add another zone that
+        // wants the same kind of resource. We don't support zone removal yet,
+        // but expect this to be okay: we don't anticipate removal and addition
+        // to frequently be combined into the exact same blueprint, particularly
+        // in a way that expects the addition to reuse resources from the
+        // removal; we won't want to attempt to reuse resources from a zone
+        // until we know it's been fully removed.
+        let mut existing_nexus_v4_ips: HashSet<Ipv4Addr> = HashSet::new();
+        let mut existing_nexus_v6_ips: HashSet<Ipv6Addr> = HashSet::new();
+        let mut used_external_ips: HashSet<IpAddr> = HashSet::new();
+        let mut used_macs: HashSet<MacAddr> = HashSet::new();
+
+        for (_, z) in parent_blueprint.all_omicron_zones() {
+            if let OmicronZoneType::Nexus { nic, .. } = &z.zone_type {
+                match nic.ip {
+                    IpAddr::V4(ip) => {
+                        if !existing_nexus_v4_ips.insert(ip) {
+                            bail!("duplicate Nexus NIC IP: {ip}");
+                        }
+                    }
+                    IpAddr::V6(ip) => {
+                        if !existing_nexus_v6_ips.insert(ip) {
+                            bail!("duplicate Nexus NIC IP: {ip}");
+                        }
+                    }
+                }
+            }
+            if let Some(external_ip) = z.zone_type.external_ip()?
{ + if !used_external_ips.insert(external_ip) { + bail!("duplicate external IP: {external_ip}"); + } + } + if let Some(nic) = z.zone_type.service_vnic() { + if !used_macs.insert(nic.mac) { + bail!("duplicate service vNIC MAC: {}", nic.mac); + } + } + } + + // TODO-performance Building these iterators as "walk through the list + // and skip anything we've used already" is fine as long as we're + // talking about a small number of resources (e.g., single-digit number + // of Nexus instances), but wouldn't be ideal if we have many resources + // we need to skip. We could do something smarter here based on the sets + // of used resources we built above if needed. + let nexus_v4_ips = Box::new( + NEXUS_OPTE_IPV4_SUBNET + .0 + .iter() + .skip(NUM_INITIAL_RESERVED_IP_ADDRESSES) + .filter(move |ip| !existing_nexus_v4_ips.contains(ip)), + ); + let nexus_v6_ips = Box::new( + NEXUS_OPTE_IPV6_SUBNET + .0 + .iter() + .skip(NUM_INITIAL_RESERVED_IP_ADDRESSES) + .filter(move |ip| !existing_nexus_v6_ips.contains(ip)), + ); + let available_external_ips = Box::new( + policy + .service_ip_pool_ranges + .iter() + .flat_map(|r| r.iter()) + .filter(move |ip| !used_external_ips.contains(ip)), + ); + let available_system_macs = Box::new( + MacAddr::iter_system().filter(move |mac| !used_macs.contains(mac)), + ); + + Ok(BlueprintBuilder { parent_blueprint, internal_dns_version, policy, sled_ip_allocators: BTreeMap::new(), - omicron_zones: BTreeMap::new(), + zones: BlueprintZones::new(parent_blueprint), zones_in_service: parent_blueprint.zones_in_service.clone(), creator: creator.to_owned(), comments: Vec::new(), - } + nexus_v4_ips, + nexus_v6_ips, + available_external_ips, + available_system_macs, + }) } /// Assemble a final [`Blueprint`] based on the contents of the builder - pub fn build(mut self) -> Blueprint { + pub fn build(self) -> Blueprint { // Collect the Omicron zones config for each in-service sled. - let omicron_zones = self - .policy - .sleds - .keys() - .map(|sled_id| { - // Start with self.omicron_zones, which contains entries for any - // sled whose zones config is changing in this blueprint. - let mut zones = self - .omicron_zones - .remove(sled_id) - // If it's not there, use the config from the parent - // blueprint. - .or_else(|| { - self.parent_blueprint - .omicron_zones - .get(sled_id) - .cloned() - }) - // If it's not there either, then this must be a new sled - // and we haven't added any zones to it yet. Use the - // standard initial config. - .unwrap_or_else(|| OmicronZonesConfig { - generation: Generation::new(), - zones: vec![], - }); - - // This is not strictly necessary. But for testing, it's - // helpful for things to be in sorted order. - zones.zones.sort_by_key(|zone| zone.id); - - (*sled_id, zones) - }) - .collect(); + let omicron_zones = + self.zones.into_omicron_zones(self.policy.sleds.keys().copied()); Blueprint { id: Uuid::new_v4(), omicron_zones, @@ -228,13 +336,9 @@ impl<'a> BlueprintBuilder<'a> { ) -> Result { // If there's already an NTP zone on this sled, do nothing. let has_ntp = self - .parent_blueprint - .omicron_zones - .get(&sled_id) - .map(|found_zones| { - found_zones.zones.iter().any(|z| z.zone_type.is_ntp()) - }) - .unwrap_or(false); + .zones + .current_sled_zones(sled_id) + .any(|z| z.zone_type.is_ntp()); if has_ntp { return Ok(Ensure::NotNeeded); } @@ -292,20 +396,14 @@ impl<'a> BlueprintBuilder<'a> { pool_name: ZpoolName, ) -> Result { // If this sled already has a Crucible zone on this pool, do nothing. 
- let has_crucible_on_this_pool = self - .parent_blueprint - .omicron_zones - .get(&sled_id) - .map(|found_zones| { - found_zones.zones.iter().any(|z| { - matches!( - &z.zone_type, - OmicronZoneType::Crucible { dataset, .. } - if dataset.pool_name == pool_name - ) - }) - }) - .unwrap_or(false); + let has_crucible_on_this_pool = + self.zones.current_sled_zones(sled_id).any(|z| { + matches!( + &z.zone_type, + OmicronZoneType::Crucible { dataset, .. } + if dataset.pool_name == pool_name + ) + }); if has_crucible_on_this_pool { return Ok(Ensure::NotNeeded); } @@ -335,6 +433,127 @@ impl<'a> BlueprintBuilder<'a> { Ok(Ensure::Added) } + /// Return the number of Nexus zones that would be configured to run on the + /// given sled if this builder generated a blueprint + /// + /// This value may change before a blueprint is actually generated if + /// further changes are made to the builder. + pub fn sled_num_nexus_zones(&self, sled_id: Uuid) -> usize { + self.zones + .current_sled_zones(sled_id) + .filter(|z| z.zone_type.is_nexus()) + .count() + } + + pub fn sled_ensure_zone_multiple_nexus( + &mut self, + sled_id: Uuid, + desired_zone_count: usize, + ) -> Result<EnsureMultiple, Error> { + // How many Nexus zones do we need to add? + let nexus_count = self.sled_num_nexus_zones(sled_id); + let num_nexus_to_add = match desired_zone_count.checked_sub(nexus_count) + { + Some(0) => return Ok(EnsureMultiple::NotNeeded), + Some(n) => n, + None => { + return Err(Error::Planner(anyhow!( + "removing a Nexus zone not yet supported \ + (sled {sled_id} has {nexus_count}; \ + planner wants {desired_zone_count})" + ))); + } + }; + + // Whether Nexus should use TLS and what the external DNS servers it + // should use are currently provided at rack-setup time, and should be + // consistent across all Nexus instances. We'll assume we can copy them + // from any other Nexus zone in our parent blueprint. + // + // TODO-correctness Once these properties can be changed by a rack + // operator, this will need more work. At a minimum, if such a change + // goes through the blueprint system (which seems likely), we'll need to + // check that if this builder is being used to make such a change, the + // change is also reflected here in any new zones. Perhaps these + // settings should be part of `Policy` instead? + let (external_tls, external_dns_servers) = self + .parent_blueprint + .omicron_zones + .values() + .find_map(|sled_zones| { + sled_zones.zones.iter().find_map(|z| match &z.zone_type { + OmicronZoneType::Nexus { + external_tls, + external_dns_servers, + .. + } => Some((*external_tls, external_dns_servers.clone())), + _ => None, + }) + }) + .ok_or(Error::NoNexusZonesInParentBlueprint)?; + + for _ in 0..num_nexus_to_add { + let nexus_id = Uuid::new_v4(); + let external_ip = self + .available_external_ips + .next() + .ok_or(Error::NoExternalServiceIpAvailable)?; + + let nic = { + let (ip, subnet) = match external_ip { + IpAddr::V4(_) => ( + self.nexus_v4_ips + .next() + .ok_or(Error::ExhaustedNexusIps)? + .into(), + IpNet::from(*NEXUS_OPTE_IPV4_SUBNET).into(), + ), + IpAddr::V6(_) => ( + self.nexus_v6_ips + .next() + .ok_or(Error::ExhaustedNexusIps)?
+ .into(), + IpNet::from(*NEXUS_OPTE_IPV6_SUBNET).into(), + ), + }; + let mac = self + .available_system_macs + .next() + .ok_or(Error::NoSystemMacAddressAvailable)?; + NetworkInterface { + id: Uuid::new_v4(), + kind: NetworkInterfaceKind::Service(nexus_id), + name: format!("nexus-{nexus_id}").parse().unwrap(), + ip, + mac, + subnet, + vni: Vni::SERVICES_VNI, + primary: true, + slot: 0, + } + }; + + let ip = self.sled_alloc_ip(sled_id)?; + let port = omicron_common::address::NEXUS_INTERNAL_PORT; + let internal_address = + SocketAddrV6::new(ip, port, 0, 0).to_string(); + let zone = OmicronZoneConfig { + id: nexus_id, + underlay_address: ip, + zone_type: OmicronZoneType::Nexus { + internal_address, + external_ip, + nic, + external_tls, + external_dns_servers: external_dns_servers.clone(), + }, + }; + self.sled_add_zone(sled_id, zone)?; + } + + Ok(EnsureMultiple::Added(num_nexus_to_add)) + } + fn sled_add_zone( &mut self, sled_id: Uuid, @@ -350,27 +569,7 @@ impl<'a> BlueprintBuilder<'a> { ))); } - let sled_zones = - self.omicron_zones.entry(sled_id).or_insert_with(|| { - if let Some(old_sled_zones) = - self.parent_blueprint.omicron_zones.get(&sled_id) - { - OmicronZonesConfig { - generation: old_sled_zones.generation.next(), - zones: old_sled_zones.zones.clone(), - } - } else { - // The first generation is reserved to mean the one - // containing no zones. See - // OMICRON_ZONES_CONFIG_INITIAL_GENERATION. So we start - // with the next one. - OmicronZonesConfig { - generation: Generation::new().next(), - zones: vec![], - } - } - }); - + let sled_zones = self.zones.change_sled_zones(sled_id); sled_zones.zones.push(zone); Ok(()) } @@ -404,16 +603,14 @@ impl<'a> BlueprintBuilder<'a> { // Record each of the sled's zones' underlay addresses as // allocated. - if let Some(sled_zones) = self.omicron_zones.get(&sled_id) { - for z in &sled_zones.zones { - allocator.reserve(z.underlay_address); - } + for z in self.zones.current_sled_zones(sled_id) { + allocator.reserve(z.underlay_address); } allocator }); - allocator.alloc().ok_or_else(|| Error::OutOfAddresses { sled_id }) + allocator.alloc().ok_or(Error::OutOfAddresses { sled_id }) } fn sled_resources(&self, sled_id: Uuid) -> Result<&SledResources, Error> { @@ -426,28 +623,118 @@ impl<'a> BlueprintBuilder<'a> { } } +/// Helper for working with sets of zones on each sled +/// +/// Tracking the set of zones is slightly non-trivial because we need to bump +/// the per-sled generation number iff the zones are changed. So we need to +/// keep track of whether we've changed the zones relative to the parent +/// blueprint. We do this by keeping a copy of any `OmicronZonesConfig` that +/// we've changed and a _reference_ to the parent blueprint's zones. This +/// struct makes it easy for callers to iterate over the right set of zones. +struct BlueprintZones<'a> { + changed_zones: BTreeMap<Uuid, OmicronZonesConfig>, + parent_zones: &'a BTreeMap<Uuid, OmicronZonesConfig>, +} + +impl<'a> BlueprintZones<'a> { + pub fn new(parent_blueprint: &'a Blueprint) -> BlueprintZones { + BlueprintZones { + changed_zones: BTreeMap::new(), + parent_zones: &parent_blueprint.omicron_zones, + } + } + + /// Returns a mutable reference to a sled's Omicron zones *because* we're + /// going to change them. It's essential that the caller _does_ change them + /// because we will have bumped the generation number and we don't want to + /// do that if no changes are being made.
+ pub fn change_sled_zones( + &mut self, + sled_id: Uuid, + ) -> &mut OmicronZonesConfig { + self.changed_zones.entry(sled_id).or_insert_with(|| { + if let Some(old_sled_zones) = self.parent_zones.get(&sled_id) { + OmicronZonesConfig { + generation: old_sled_zones.generation.next(), + zones: old_sled_zones.zones.clone(), + } + } else { + // The first generation is reserved to mean the one + // containing no zones. See + // OMICRON_ZONES_CONFIG_INITIAL_GENERATION. So we start + // with the next one. + OmicronZonesConfig { + generation: Generation::new().next(), + zones: vec![], + } + } + }) + } + + /// Iterates over the list of Omicron zones currently configured for this + /// sled in the blueprint that's being built + pub fn current_sled_zones( + &self, + sled_id: Uuid, + ) -> Box<dyn Iterator<Item = &OmicronZoneConfig> + '_> { + if let Some(sled_zones) = self + .changed_zones + .get(&sled_id) + .or_else(|| self.parent_zones.get(&sled_id)) + { + Box::new(sled_zones.zones.iter()) + } else { + Box::new(std::iter::empty()) + } + } + + /// Produces an owned map of zones for the requested sleds + pub fn into_omicron_zones( + mut self, + sled_ids: impl Iterator<Item = Uuid>, + ) -> BTreeMap<Uuid, OmicronZonesConfig> { + sled_ids + .map(|sled_id| { + // Start with self.changed_zones, which contains entries for any + // sled whose zones config is changing in this blueprint. + let mut zones = self + .changed_zones + .remove(&sled_id) + // If it's not there, use the config from the parent + // blueprint. + .or_else(|| self.parent_zones.get(&sled_id).cloned()) + // If it's not there either, then this must be a new sled + // and we haven't added any zones to it yet. Use the + // standard initial config. + .unwrap_or_else(|| OmicronZonesConfig { + generation: Generation::new(), + zones: vec![], + }); + + // This is not strictly necessary. But for testing, it's + // helpful for things to be in sorted order.
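// ---------------------------------------------------------------------
// Editorial sketch (not part of the change above): `BlueprintZones` is a
// copy-on-write overlay -- reads fall through to the parent, and the first
// write clones the parent's entry and bumps its generation exactly once.
// A minimal stand-alone version over plain types; all names below are
// illustrative, not Omicron's.
use std::collections::BTreeMap;

#[derive(Clone)]
struct SledConfig {
    generation: u64,
    zones: Vec<String>,
}

struct ZoneOverlay<'a> {
    changed: BTreeMap<u32, SledConfig>,
    parent: &'a BTreeMap<u32, SledConfig>,
}

impl<'a> ZoneOverlay<'a> {
    // Callers ask for a mutable entry only when they will actually change
    // it; the generation bump happens on first access, not per mutation.
    fn change(&mut self, sled: u32) -> &mut SledConfig {
        let parent = self.parent;
        self.changed.entry(sled).or_insert_with(|| {
            let mut c = parent.get(&sled).cloned().unwrap_or(SledConfig {
                generation: 0,
                zones: Vec::new(),
            });
            c.generation += 1;
            c
        })
    }
}
// ---------------------------------------------------------------------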
+ zones.zones.sort_by_key(|zone| zone.id); + + (sled_id, zones) + }) + .collect() + } +} + #[cfg(test)] pub mod test { - use super::BlueprintBuilder; - use ipnet::IpAdd; - use nexus_types::deployment::Policy; - use nexus_types::deployment::SledResources; - use nexus_types::deployment::ZpoolName; - use nexus_types::inventory::Collection; + use super::*; + use nexus_types::external_api::views::SledProvisionState; + use omicron_common::address::IpRange; + use omicron_common::address::Ipv4Range; use omicron_common::address::Ipv6Subnet; use omicron_common::address::SLED_PREFIX; use omicron_common::api::external::ByteCount; - use omicron_common::api::external::Generation; use sled_agent_client::types::{ Baseboard, Inventory, OmicronZoneConfig, OmicronZoneDataset, OmicronZoneType, OmicronZonesConfig, SledRole, }; - use std::collections::BTreeMap; - use std::collections::BTreeSet; - use std::net::Ipv6Addr; - use std::net::SocketAddrV6; use std::str::FromStr; - use uuid::Uuid; /// Returns a collection and policy describing a pretty simple system pub fn example() -> (Collection, Policy) { @@ -458,7 +745,32 @@ pub mod test { "a5f3db3a-61aa-4f90-ad3e-02833c253bf5", "0d168386-2551-44e8-98dd-ae7a7570f8a0", ]; - let mut policy = Policy { sleds: BTreeMap::new() }; + let mut policy = Policy { + sleds: BTreeMap::new(), + // IPs from TEST-NET-1 (RFC 5737) + service_ip_pool_ranges: vec![Ipv4Range::new( + "192.0.2.2".parse().unwrap(), + "192.0.2.20".parse().unwrap(), + ) + .unwrap() + .into()], + target_nexus_zone_count: 3, + }; + let mut service_ip_pool_range = policy.service_ip_pool_ranges[0].iter(); + let mut nexus_nic_ips = NEXUS_OPTE_IPV4_SUBNET + .iter() + .skip(NUM_INITIAL_RESERVED_IP_ADDRESSES); + let mut nexus_nic_macs = { + let mut used = HashSet::new(); + std::iter::from_fn(move || { + let mut mac = MacAddr::random_system(); + while !used.insert(mac) { + mac = MacAddr::random_system(); + } + Some(mac) + }) + }; + for sled_id_str in sled_ids.iter() { let sled_id: Uuid = sled_id_str.parse().unwrap(); let sled_ip = policy_add_sled(&mut policy, sled_id); @@ -486,19 +798,58 @@ pub mod test { .unwrap(); let zpools = &policy.sleds.get(&sled_id).unwrap().zpools; - let ip1 = sled_ip.saturating_add(1); - let zones: Vec<_> = std::iter::once(OmicronZoneConfig { - id: Uuid::new_v4(), - underlay_address: sled_ip.saturating_add(1), - zone_type: OmicronZoneType::InternalNtp { - address: SocketAddrV6::new(ip1, 12345, 0, 0).to_string(), - dns_servers: vec![], - domain: None, - ntp_servers: vec![], - }, + let mut sled_ips = + std::iter::successors(Some(sled_ip.saturating_add(1)), |ip| { + println!("sled_ips iterator: currently {ip:?}"); + Some(ip.saturating_add(1)) + }); + let zones: Vec<_> = std::iter::once({ + let ip = sled_ips.next().unwrap(); + OmicronZoneConfig { + id: Uuid::new_v4(), + underlay_address: ip, + zone_type: OmicronZoneType::InternalNtp { + address: SocketAddrV6::new(ip, 12345, 0, 0).to_string(), + dns_servers: vec![], + domain: None, + ntp_servers: vec![], + }, + } }) - .chain(zpools.iter().enumerate().map(|(i, zpool_name)| { - let ip = sled_ip.saturating_add(u128::try_from(i + 2).unwrap()); + .chain(std::iter::once({ + let id = Uuid::new_v4(); + let ip = sled_ips.next().unwrap(); + let external_ip = + service_ip_pool_range.next().expect("no service IPs left"); + let nic_ip = + nexus_nic_ips.next().expect("no nexus nic IPs left"); + OmicronZoneConfig { + id, + underlay_address: ip, + zone_type: OmicronZoneType::Nexus { + internal_address: SocketAddrV6::new(ip, 12346, 0, 0) + .to_string(), + 
external_ip, + nic: NetworkInterface { + id: Uuid::new_v4(), + kind: NetworkInterfaceKind::Service(id), + name: format!("nexus-{id}").parse().unwrap(), + ip: nic_ip.into(), + mac: nexus_nic_macs + .next() + .expect("no nexus nic MACs left"), + subnet: IpNet::from(*NEXUS_OPTE_IPV4_SUBNET).into(), + vni: Vni::SERVICES_VNI, + primary: true, + slot: 0, + }, + external_tls: false, + external_dns_servers: Vec::new(), + }, + } + })) + .chain(zpools.iter().map(|zpool_name| { + let ip = sled_ips.next().unwrap(); OmicronZoneConfig { id: Uuid::new_v4(), underlay_address: ip, @@ -546,10 +897,36 @@ pub mod test { .collect(); let subnet = Ipv6Subnet::<SLED_PREFIX>::new(sled_ip); - policy.sleds.insert(sled_id, SledResources { zpools, subnet }); + policy.sleds.insert( + sled_id, + SledResources { + provision_state: SledProvisionState::Provisionable, + zpools, + subnet, + }, + ); sled_ip } + /// Checks various conditions that should be true for all blueprints + pub fn verify_blueprint(blueprint: &Blueprint) { + let mut underlay_ips: BTreeMap<Ipv6Addr, &OmicronZoneConfig> = + BTreeMap::new(); + for sled_zones in blueprint.omicron_zones.values() { + for zone in &sled_zones.zones { + if let Some(previous) = + underlay_ips.insert(zone.underlay_address, zone) + { + panic!( + "found duplicate underlay IP {} in zones {} and \ + {}\n\nblueprint: {:#?}", + zone.underlay_address, zone.id, previous.id, blueprint + ); + } + } + } + } + #[test] fn test_initial() { // Test creating a blueprint from a collection and verifying that it @@ -563,6 +940,7 @@ pub mod test { "the_test", ) .expect("failed to create initial blueprint"); + verify_blueprint(&blueprint_initial); // Since collections don't include what was in service, we have to // provide that ourselves. For our purposes though we don't care. @@ -583,8 +961,10 @@ pub mod test { Generation::new(), &policy, "test_basic", - ); + ) + .expect("failed to create builder"); let blueprint = builder.build(); + verify_blueprint(&blueprint); let diff = blueprint_initial.diff_sleds(&blueprint); println!( "initial blueprint -> next blueprint (expected no changes):\n{}", @@ -605,13 +985,15 @@ pub mod test { "the_test", ) .expect("failed to create initial blueprint"); + verify_blueprint(&blueprint1); let mut builder = BlueprintBuilder::new_based_on( &blueprint1, Generation::new(), &policy, "test_basic", - ); + ) + .expect("failed to create builder"); // The initial blueprint should have internal NTP zones on all the // existing sleds, plus Crucible zones on all pools. So if we ensure // all these zones exist, we should see no change. @@ -626,6 +1008,7 @@ pub mod test { } let blueprint2 = builder.build(); + verify_blueprint(&blueprint2); let diff = blueprint1.diff_sleds(&blueprint2); println!( "initial blueprint -> next blueprint (expected no changes):\n{}", @@ -643,7 +1026,8 @@ pub mod test { Generation::new(), &policy, "test_basic", - ); + ) + .expect("failed to create builder"); builder.sled_ensure_zone_ntp(new_sled_id).unwrap(); let new_sled_resources = policy.sleds.get(&new_sled_id).unwrap(); for pool_name in &new_sled_resources.zpools { @@ -653,6 +1037,7 @@ pub mod test { } let blueprint3 = builder.build(); + verify_blueprint(&blueprint3); let diff = blueprint2.diff_sleds(&blueprint3); println!("expecting new NTP and Crucible zones:\n{}", diff); @@ -708,4 +1093,316 @@ pub mod test { .collect::<BTreeSet<_>>(); assert_eq!(crucible_pool_names, new_sled_resources.zpools); } + + #[test] + fn test_add_nexus_with_no_existing_nexus_zones() { + let (mut collection, policy) = example(); + + // We don't care about the internal DNS version here.
+ let internal_dns_version = Generation::new(); + + // Adding a new Nexus zone currently requires copying settings from an + // existing Nexus zone. If we remove all Nexus zones from the + // collection, create a blueprint, then try to add a Nexus zone, it + // should fail. + for zones in collection.omicron_zones.values_mut() { + zones.zones.zones.retain(|z| { + !matches!(z.zone_type, OmicronZoneType::Nexus { .. }) + }); + } + + let parent = BlueprintBuilder::build_initial_from_collection( + &collection, + internal_dns_version, + &policy, + "test", + ) + .expect("failed to create initial blueprint"); + + let mut builder = BlueprintBuilder::new_based_on( + &parent, + internal_dns_version, + &policy, + "test", + ) + .expect("failed to create builder"); + + let err = builder + .sled_ensure_zone_multiple_nexus( + collection + .omicron_zones + .keys() + .next() + .copied() + .expect("no sleds present"), + 1, + ) + .unwrap_err(); + + assert!( + matches!(err, Error::NoNexusZonesInParentBlueprint), + "unexpected error {err}" + ); + } + + #[test] + fn test_add_nexus_error_cases() { + let (mut collection, policy) = example(); + + // We don't care about the internal DNS version here. + let internal_dns_version = Generation::new(); + + // Remove the Nexus zone from one of the sleds so that + // `sled_ensure_zone_multiple_nexus` can attempt to add a Nexus zone to + // `sled_id`. + let sled_id = { + let mut selected_sled_id = None; + for (sled_id, zones) in &mut collection.omicron_zones { + let nzones_before_retain = zones.zones.zones.len(); + zones.zones.zones.retain(|z| { + !matches!(z.zone_type, OmicronZoneType::Nexus { .. }) + }); + if zones.zones.zones.len() < nzones_before_retain { + selected_sled_id = Some(*sled_id); + break; + } + } + selected_sled_id.expect("found no sleds with Nexus zone") + }; + + let parent = BlueprintBuilder::build_initial_from_collection( + &collection, + Generation::new(), + &policy, + "test", + ) + .expect("failed to create initial blueprint"); + + { + // Attempting to add Nexus to the sled we removed it from (with no + // other changes to the environment) should succeed. + let mut builder = BlueprintBuilder::new_based_on( + &parent, + internal_dns_version, + &policy, + "test", + ) + .expect("failed to create builder"); + let added = builder + .sled_ensure_zone_multiple_nexus(sled_id, 1) + .expect("failed to ensure nexus zone"); + + assert_eq!(added, EnsureMultiple::Added(1)); + } + + { + // Attempting to add multiple Nexus zones to the sled we removed it + // from (with no other changes to the environment) should also + // succeed. + let mut builder = BlueprintBuilder::new_based_on( + &parent, + internal_dns_version, + &policy, + "test", + ) + .expect("failed to create builder"); + let added = builder + .sled_ensure_zone_multiple_nexus(sled_id, 3) + .expect("failed to ensure nexus zone"); + + assert_eq!(added, EnsureMultiple::Added(3)); + } + + { + // Replace the policy's external service IP pool ranges with ranges + // that are already in use by existing zones. Attempting to add a + // Nexus with no remaining external IPs should fail.
+ let mut policy = policy.clone(); + let mut used_ip_ranges = Vec::new(); + for (_, z) in parent.all_omicron_zones() { + if let Some(ip) = z + .zone_type + .external_ip() + .expect("failed to check for external IP") + { + used_ip_ranges.push(IpRange::from(ip)); + } + } + assert!(!used_ip_ranges.is_empty()); + policy.service_ip_pool_ranges = used_ip_ranges; + + let mut builder = BlueprintBuilder::new_based_on( + &parent, + internal_dns_version, + &policy, + "test", + ) + .expect("failed to create builder"); + let err = builder + .sled_ensure_zone_multiple_nexus(sled_id, 1) + .unwrap_err(); + + assert!( + matches!(err, Error::NoExternalServiceIpAvailable), + "unexpected error {err}" + ); + } + + // We're not testing the `ExhaustedNexusIps` error case (where we've run + // out of Nexus OPTE addresses), because it's fairly difficult to induce + // that from outside: we would need to start from a parent blueprint + // that contained a Nexus instance for every IP in the + // `NEXUS_OPTE_*_SUBNET`. We could hack around that by creating the + // `BlueprintBuilder` and mucking with its internals, but that doesn't + // seem like a particularly useful test either. + } + + #[test] + fn test_invalid_parent_blueprint_two_zones_with_same_external_ip() { + let (mut collection, policy) = example(); + + // We should fail if the parent blueprint claims to contain two + // zones with the same external IP. Skim through the zones, copy the + // external IP from one Nexus zone, then assign it to a later Nexus + // zone. + let mut found_second_nexus_zone = false; + let mut nexus_external_ip = None; + + 'outer: for zones in collection.omicron_zones.values_mut() { + for z in zones.zones.zones.iter_mut() { + if let OmicronZoneType::Nexus { external_ip, .. } = + &mut z.zone_type + { + if let Some(ip) = nexus_external_ip { + *external_ip = ip; + found_second_nexus_zone = true; + break 'outer; + } else { + nexus_external_ip = Some(*external_ip); + continue 'outer; + } + } + } + } + assert!(found_second_nexus_zone, "only one Nexus zone present?"); + + let parent = BlueprintBuilder::build_initial_from_collection( + &collection, + Generation::new(), + &policy, + "test", + ) + .unwrap(); + + match BlueprintBuilder::new_based_on( + &parent, + Generation::new(), + &policy, + "test", + ) { + Ok(_) => panic!("unexpected success"), + Err(err) => assert!( + err.to_string().contains("duplicate external IP"), + "unexpected error: {err:#}" + ), + }; + } + + #[test] + fn test_invalid_parent_blueprint_two_nexus_zones_with_same_nic_ip() { + let (mut collection, policy) = example(); + + // We should fail if the parent blueprint claims to contain two + // Nexus zones with the same NIC IP. Skim through the zones, copy + // the NIC IP from one Nexus zone, then assign it to a later + // Nexus zone. + let mut found_second_nexus_zone = false; + let mut nexus_nic_ip = None; + + 'outer: for zones in collection.omicron_zones.values_mut() { + for z in zones.zones.zones.iter_mut() { + if let OmicronZoneType::Nexus { nic, ..
} = &mut z.zone_type { + if let Some(ip) = nexus_nic_ip { + nic.ip = ip; + found_second_nexus_zone = true; + break 'outer; + } else { + nexus_nic_ip = Some(nic.ip); + continue 'outer; + } + } + } + } + assert!(found_second_nexus_zone, "only one Nexus zone present?"); + + let parent = BlueprintBuilder::build_initial_from_collection( + &collection, + Generation::new(), + &policy, + "test", + ) + .unwrap(); + + match BlueprintBuilder::new_based_on( + &parent, + Generation::new(), + &policy, + "test", + ) { + Ok(_) => panic!("unexpected success"), + Err(err) => assert!( + err.to_string().contains("duplicate Nexus NIC IP"), + "unexpected error: {err:#}" + ), + }; + } + + #[test] + fn test_invalid_parent_blueprint_two_zones_with_same_vnic_mac() { + let (mut collection, policy) = example(); + + // We should fail if the parent blueprint claims to contain two + // zones with the same service vNIC MAC address. Skim through the + // zones, copy the NIC MAC from one Nexus zone, then assign it to a + // later Nexus zone. + let mut found_second_nexus_zone = false; + let mut nexus_nic_mac = None; + + 'outer: for zones in collection.omicron_zones.values_mut() { + for z in zones.zones.zones.iter_mut() { + if let OmicronZoneType::Nexus { nic, .. } = &mut z.zone_type { + if let Some(mac) = nexus_nic_mac { + nic.mac = mac; + found_second_nexus_zone = true; + break 'outer; + } else { + nexus_nic_mac = Some(nic.mac); + continue 'outer; + } + } + } + } + assert!(found_second_nexus_zone, "only one Nexus zone present?"); + + let parent = BlueprintBuilder::build_initial_from_collection( + &collection, + Generation::new(), + &policy, + "test", + ) + .unwrap(); + + match BlueprintBuilder::new_based_on( + &parent, + Generation::new(), + &policy, + "test", + ) { + Ok(_) => panic!("unexpected success"), + Err(err) => assert!( + err.to_string().contains("duplicate service vNIC MAC"), + "unexpected error: {err:#}" + ), + }; + } } diff --git a/nexus/deployment/src/lib.rs b/nexus/deployment/src/lib.rs index fd182ae613..546f2c1dc1 100644 --- a/nexus/deployment/src/lib.rs +++ b/nexus/deployment/src/lib.rs @@ -57,7 +57,7 @@ //! The Planner //! //! fleet policy (latest inventory) (latest blueprint) -//! \ | / +//! \ | / //! \ | / //! +----------+ | +----------/ //! | | | @@ -85,7 +85,7 @@ //! The Executor (better name?) //! //! latest committed blueprint latest inventory -//! | | +//! | | //! | | //! +----+ +----+ //! | | diff --git a/nexus/deployment/src/planner.rs b/nexus/deployment/src/planner.rs index 8ea6c0ba19..7973157068 100644 --- a/nexus/deployment/src/planner.rs +++ b/nexus/deployment/src/planner.rs @@ -8,12 +8,17 @@ use crate::blueprint_builder::BlueprintBuilder; use crate::blueprint_builder::Ensure; +use crate::blueprint_builder::EnsureMultiple; use crate::blueprint_builder::Error; use nexus_types::deployment::Blueprint; use nexus_types::deployment::Policy; +use nexus_types::external_api::views::SledProvisionState; use nexus_types::inventory::Collection; use omicron_common::api::external::Generation; -use slog::{info, Logger}; +use slog::{info, warn, Logger}; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use uuid::Uuid; pub struct Planner<'a> { log: Logger, @@ -41,14 +46,14 @@ impl<'a> Planner<'a> { // NOTE: Right now, we just assume that this is the latest inventory // collection. See the comment on the corresponding field in `Planner`. 
inventory: &'a Collection, - ) -> Planner<'a> { + ) -> anyhow::Result<Planner<'a>> { let blueprint = BlueprintBuilder::new_based_on( parent_blueprint, internal_dns_version, policy, creator, - ); - Planner { log, policy, blueprint, inventory } + )?; + Ok(Planner { log, policy, blueprint, inventory }) } pub fn plan(mut self) -> Result<Blueprint, Error> { @@ -67,6 +72,17 @@ impl<'a> Planner<'a> { // added and where they should go. And the blueprint builder will need // to grow the ability to provision one. + // After we make our initial pass through the sleds below to check for + // zones every sled should have (NTP, Crucible), we'll start making + // decisions about placing other service zones. We need to _exclude_ any + // sleds for which we just added an NTP zone, as we won't be able to add + // additional services to them until that NTP zone has been brought up. + // + // We will not mark sleds getting Crucible zones as ineligible; other + // control plane service zones starting concurrently with Crucible zones + // is fine. + let mut sleds_ineligible_for_services = BTreeSet::new(); + for (sled_id, sled_info) in &self.policy.sleds { // Check for an NTP zone. Every sled should have one. If it's not // there, all we can do is provision that one zone. We have to wait @@ -76,13 +92,14 @@ impl<'a> Planner<'a> { info!( &self.log, "found sled missing NTP zone (will add one)"; - "sled_id" => ?sled_id + "sled_id" => %sled_id ); self.blueprint .comment(&format!("sled {}: add NTP zone", sled_id)); // Don't make any other changes to this sled. However, this // change is compatible with any other changes to other sleds, // so we can "continue" here rather than "break". + sleds_ineligible_for_services.insert(*sled_id); continue; } @@ -106,7 +123,7 @@ impl<'a> Planner<'a> { let has_ntp_inventory = self .inventory .omicron_zones - .get(&sled_id) + .get(sled_id) .map(|sled_zones| { sled_zones.zones.zones.iter().any(|z| z.zone_type.is_ntp()) }) @@ -116,7 +133,7 @@ impl<'a> Planner<'a> { &self.log, "parent blueprint contains NTP zone, but it's not in \ inventory yet"; - "sled_id" => ?sled_id, + "sled_id" => %sled_id, ); continue; } @@ -151,6 +168,139 @@ impl<'a> Planner<'a> { } } + // We've now placed all the services that should always exist on all + // sleds. Before moving on to make decisions about placing services that + // are _not_ present on all sleds, check the provision state of all our + // sleds so we can avoid any non-provisionable sleds under the + // assumption that there is something amiss with them. + sleds_ineligible_for_services.extend( + self.policy.sleds.iter().filter_map(|(sled_id, sled_info)| { + match sled_info.provision_state { + SledProvisionState::Provisionable => None, + SledProvisionState::NonProvisionable => Some(*sled_id), + } + }), + ); + + self.ensure_correct_number_of_nexus_zones( + &sleds_ineligible_for_services, + )?; + + Ok(()) + } + + fn ensure_correct_number_of_nexus_zones( + &mut self, + sleds_ineligible_for_services: &BTreeSet<Uuid>, + ) -> Result<(), Error> { + // Bin every sled by the number of Nexus zones it currently has while + // counting the total number of Nexus zones. + let mut num_total_nexus = 0; + let mut sleds_by_num_nexus: BTreeMap<usize, Vec<Uuid>> = + BTreeMap::new(); + for &sled_id in self.policy.sleds.keys() { + let num_nexus = self.blueprint.sled_num_nexus_zones(sled_id); + num_total_nexus += num_nexus; + + // Only bin this sled if we're allowed to use it.
If we have a sled + // we're not allowed to use that's already running a Nexus (seems + // fishy!), we counted its Nexus above but will ignore it here. + if !sleds_ineligible_for_services.contains(&sled_id) { + sleds_by_num_nexus.entry(num_nexus).or_default().push(sled_id); + } + } + + // TODO-correctness What should we do if we have _too many_ Nexus + // instances? For now, just log the number of zones any time we have + // at least the minimum number. + let nexus_to_add = + self.policy.target_nexus_zone_count.saturating_sub(num_total_nexus); + if nexus_to_add == 0 { + info!( + self.log, "sufficient Nexus zones exist in plan"; + "desired_count" => self.policy.target_nexus_zone_count, + "current_count" => num_total_nexus, + ); + return Ok(()); + } + + // Ensure we have at least one sled on which we can add Nexus zones. If + // we don't, we have nothing else to do. This isn't a hard error, + // because we might be waiting for NTP on all eligible sleds (although + // it would be weird, since we're presumably running from within Nexus + // on some sled). + if sleds_by_num_nexus.is_empty() { + warn!(self.log, "want to add Nexus zones, but no eligible sleds"); + return Ok(()); + } + + // Build a map of sled -> new nexus zone count. + let mut sleds_to_change: BTreeMap<Uuid, usize> = BTreeMap::new(); + + 'outer: for _ in 0..nexus_to_add { + // `sleds_by_num_nexus` is sorted by key already, and we want to + // pick from the lowest-numbered bin. We can just loop over its + // keys, expecting to stop on the first iteration, with the only + // exception being when we've removed all the sleds from a bin. + for (&num_nexus, sleds) in sleds_by_num_nexus.iter_mut() { + // `sleds` contains all sleds with the minimum number of Nexus + // zones. Pick one arbitrarily but deterministically. + let Some(sled_id) = sleds.pop() else { + // We already drained this bin; move on. + continue; + }; + + // This insert might overwrite an old value for this sled (e.g., + // in the "we have 1 sled and need to add many Nexus instances + // to it" case). That's fine. + sleds_to_change.insert(sled_id, num_nexus + 1); + + // Put this sled back in our map, but now with one more Nexus. + sleds_by_num_nexus + .entry(num_nexus + 1) + .or_default() + .push(sled_id); + + continue 'outer; + } + + // This should be unreachable: it's only possible if we fail to find + // a nonempty vec in `sleds_by_num_nexus`, and we checked above that + // `sleds_by_num_nexus` is not empty. + unreachable!("logic error finding sleds for Nexus"); + } + + // For each sled we need to change, actually do so. + let mut total_added = 0; + for (sled_id, new_nexus_count) in sleds_to_change { + match self + .blueprint + .sled_ensure_zone_multiple_nexus(sled_id, new_nexus_count)? + { + EnsureMultiple::Added(n) => { + info!( + self.log, "will add {n} Nexus zone(s) to sled"; + "sled_id" => %sled_id, + ); + total_added += n; + } + // This is only possible if we asked the sled to ensure the same + // number of zones it already has, but that's impossible based + // on the way we built up `sleds_to_change`. + EnsureMultiple::NotNeeded => unreachable!( + "sled on which we added Nexus zones did not add any" + ), + } + } + + // Double check that we didn't make any arithmetic mistakes. If we've + // arrived here, we think we've added the number of Nexus zones we + // needed to.
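// ---------------------------------------------------------------------
// Editorial sketch (not part of the change above): the placement loop in
// `ensure_correct_number_of_nexus_zones` spreads new zones by always
// drawing from the lowest-occupancy bin of a BTreeMap keyed by zone
// count. A stand-alone version with plain `u32` sled ids (illustrative,
// not Omicron's types):
use std::collections::BTreeMap;

fn spread(
    mut bins: BTreeMap<usize, Vec<u32>>,
    to_add: usize,
) -> BTreeMap<u32, usize> {
    let mut changes = BTreeMap::new();
    for _ in 0..to_add {
        // BTreeMap iterates keys in ascending order, so the first
        // non-empty bin holds the sleds with the fewest zones.
        let (count, sled) = {
            let (&count, sleds) = bins
                .iter_mut()
                .find(|(_, sleds)| !sleds.is_empty())
                .expect("no eligible sleds");
            (count, sleds.pop().unwrap())
        };
        // Overwriting an earlier entry for this sled is fine; the map
        // always records the sled's latest target count.
        changes.insert(sled, count + 1);
        bins.entry(count + 1).or_default().push(sled);
    }
    changes
}
// ---------------------------------------------------------------------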
+ assert_eq!( + total_added, nexus_to_add, + "internal error counting Nexus zones" + ); + Ok(()) } } @@ -160,8 +310,10 @@ mod test { use super::Planner; use crate::blueprint_builder::test::example; use crate::blueprint_builder::test::policy_add_sled; + use crate::blueprint_builder::test::verify_blueprint; use crate::blueprint_builder::BlueprintBuilder; use nexus_inventory::now_db_precision; + use nexus_types::external_api::views::SledProvisionState; use nexus_types::inventory::OmicronZoneType; use nexus_types::inventory::OmicronZonesFound; use omicron_common::api::external::Generation; @@ -187,6 +339,7 @@ mod test { "the_test", ) .expect("failed to create initial blueprint"); + verify_blueprint(&blueprint1); // Now run the planner. It should do nothing because our initial // system didn't have any issues that the planner currently knows how to @@ -199,6 +352,7 @@ mod test { "no-op?", &collection, ) + .expect("failed to create planner") .plan() .expect("failed to plan"); @@ -207,6 +361,7 @@ mod test { assert_eq!(diff.sleds_added().count(), 0); assert_eq!(diff.sleds_removed().count(), 0); assert_eq!(diff.sleds_changed().count(), 0); + verify_blueprint(&blueprint2); // Now add a new sled. let new_sled_id = @@ -222,6 +377,7 @@ mod test { "test: add NTP?", &collection, ) + .expect("failed to create planner") .plan() .expect("failed to plan"); @@ -241,6 +397,7 @@ mod test { )); assert_eq!(diff.sleds_removed().count(), 0); assert_eq!(diff.sleds_changed().count(), 0); + verify_blueprint(&blueprint3); // Check that with no change in inventory, the planner makes no changes. // It needs to wait for inventory to reflect the new NTP zone before @@ -253,6 +410,7 @@ mod test { "test: add nothing more", &collection, ) + .expect("failed to create planner") .plan() .expect("failed to plan"); let diff = blueprint3.diff_sleds(&blueprint4); @@ -260,6 +418,7 @@ mod test { assert_eq!(diff.sleds_added().count(), 0); assert_eq!(diff.sleds_removed().count(), 0); assert_eq!(diff.sleds_changed().count(), 0); + verify_blueprint(&blueprint4); // Now update the inventory to have the requested NTP zone. assert!(collection @@ -288,6 +447,7 @@ mod test { "test: add Crucible zones?", &collection, ) + .expect("failed to create planner") .plan() .expect("failed to plan"); @@ -309,11 +469,12 @@ mod test { assert_eq!(zones.len(), 3); for zone in &zones { let OmicronZoneType::Crucible { .. } = zone.zone_type else { - panic!("unexpectedly added a non-Crucible zone"); + panic!("unexpectedly added a non-Crucible zone: {zone:?}"); }; } + verify_blueprint(&blueprint5); - // Check that there are no more steps + // Check that there are no more steps. let blueprint6 = Planner::new_based_on( logctx.log.clone(), &blueprint5, @@ -322,6 +483,7 @@ mod test { "test: no-op?", &collection, ) + .expect("failed to create planner") .plan() .expect("failed to plan"); @@ -330,6 +492,266 @@ mod test { assert_eq!(diff.sleds_added().count(), 0); assert_eq!(diff.sleds_removed().count(), 0); assert_eq!(diff.sleds_changed().count(), 0); + verify_blueprint(&blueprint6); + + logctx.cleanup_successful(); + } + + /// Check that the planner will add more Nexus zones to a single sled, if + /// needed + #[test] + fn test_add_multiple_nexus_to_one_sled() { + let logctx = test_setup_log("planner_add_multiple_nexus_to_one_sled"); + + // For our purposes, we don't care about the internal DNS generation. + let internal_dns_version = Generation::new(); + + // Use our example inventory collection as a starting point, but strip + // it down to just one sled. 
+ let (sled_id, collection, mut policy) = { + let (mut collection, mut policy) = example(); + + // Pick one sled ID to keep and remove the rest. + let keep_sled_id = + policy.sleds.keys().next().copied().expect("no sleds"); + policy.sleds.retain(|&k, _v| keep_sled_id == k); + collection.sled_agents.retain(|&k, _v| keep_sled_id == k); + collection.omicron_zones.retain(|&k, _v| keep_sled_id == k); + + assert_eq!(collection.sled_agents.len(), 1); + assert_eq!(collection.omicron_zones.len(), 1); + + (keep_sled_id, collection, policy) + }; + + // Build the initial blueprint. + let blueprint1 = BlueprintBuilder::build_initial_from_collection( + &collection, + internal_dns_version, + &policy, + "the_test", + ) + .expect("failed to create initial blueprint"); + + // This blueprint should only have 1 Nexus instance on the one sled we + // kept. + assert_eq!(blueprint1.omicron_zones.len(), 1); + assert_eq!( + blueprint1 + .omicron_zones + .get(&sled_id) + .expect("missing kept sled") + .zones + .iter() + .filter(|z| z.zone_type.is_nexus()) + .count(), + 1 + ); + + // Now run the planner. It should add additional Nexus instances to the + // one sled we have. + policy.target_nexus_zone_count = 5; + let blueprint2 = Planner::new_based_on( + logctx.log.clone(), + &blueprint1, + internal_dns_version, + &policy, + "add more Nexus", + &collection, + ) + .expect("failed to create planner") + .plan() + .expect("failed to plan"); + + let diff = blueprint1.diff_sleds(&blueprint2); + println!("1 -> 2 (added additional Nexus zones):\n{}", diff); + assert_eq!(diff.sleds_added().count(), 0); + assert_eq!(diff.sleds_removed().count(), 0); + let mut sleds = diff.sleds_changed().collect::<Vec<_>>(); + assert_eq!(sleds.len(), 1); + let (changed_sled_id, sled_changes) = sleds.pop().unwrap(); + assert_eq!(changed_sled_id, sled_id); + assert_eq!(sled_changes.zones_removed().count(), 0); + assert_eq!(sled_changes.zones_changed().count(), 0); + let zones = sled_changes.zones_added().collect::<Vec<_>>(); + assert_eq!(zones.len(), policy.target_nexus_zone_count - 1); + for zone in &zones { + let OmicronZoneType::Nexus { .. } = zone.zone_type else { + panic!("unexpectedly added a non-Nexus zone: {zone:?}"); + }; + } + + logctx.cleanup_successful(); + } + + /// Check that the planner will spread additional Nexus zones out across + /// sleds as it adds them + #[test] + fn test_spread_additional_nexus_zones_across_sleds() { + let logctx = test_setup_log( + "planner_spread_additional_nexus_zones_across_sleds", + ); + + // Use our example inventory collection as a starting point. + let (collection, mut policy) = example(); + + // Build the initial blueprint. + let blueprint1 = BlueprintBuilder::build_initial_from_collection( + &collection, + Generation::new(), + &policy, + "the_test", + ) + .expect("failed to create initial blueprint"); + + // This blueprint should only have 3 Nexus zones: one on each sled. + assert_eq!(blueprint1.omicron_zones.len(), 3); + for sled_config in blueprint1.omicron_zones.values() { + assert_eq!( + sled_config + .zones + .iter() + .filter(|z| z.zone_type.is_nexus()) + .count(), + 1 + ); + } + + // Now run the planner with a high number of target Nexus zones.
+ policy.target_nexus_zone_count = 14; + let blueprint2 = Planner::new_based_on( + logctx.log.clone(), + &blueprint1, + Generation::new(), + &policy, + "add more Nexus", + &collection, + ) + .expect("failed to create planner") + .plan() + .expect("failed to plan"); + + let diff = blueprint1.diff_sleds(&blueprint2); + println!("1 -> 2 (added additional Nexus zones):\n{}", diff); + assert_eq!(diff.sleds_added().count(), 0); + assert_eq!(diff.sleds_removed().count(), 0); + let sleds = diff.sleds_changed().collect::<Vec<_>>(); + + // All 3 sleds should get additional Nexus zones. We expect a total of + // 11 new Nexus zones, which should be spread evenly across the three + // sleds (two should get 4 and one should get 3). + assert_eq!(sleds.len(), 3); + let mut total_new_nexus_zones = 0; + for (sled_id, sled_changes) in sleds { + assert_eq!(sled_changes.zones_removed().count(), 0); + assert_eq!(sled_changes.zones_changed().count(), 0); + let zones = sled_changes.zones_added().collect::<Vec<_>>(); + match zones.len() { + n @ (3 | 4) => { + total_new_nexus_zones += n; + } + n => { + panic!("unexpected number of zones added to {sled_id}: {n}") + } + } + for zone in &zones { + let OmicronZoneType::Nexus { .. } = zone.zone_type else { + panic!("unexpectedly added a non-Nexus zone: {zone:?}"); + }; + } + } + assert_eq!(total_new_nexus_zones, 11); + + logctx.cleanup_successful(); + } + + /// Check that the planner will skip non-provisionable sleds when allocating + /// extra Nexus zones + #[test] + fn test_nexus_allocation_skips_nonprovisionable_sleds() { + let logctx = test_setup_log( + "planner_nexus_allocation_skips_nonprovisionable_sleds", + ); + + // Use our example inventory collection as a starting point. + let (collection, mut policy) = example(); + + // Build the initial blueprint. + let blueprint1 = BlueprintBuilder::build_initial_from_collection( + &collection, + Generation::new(), + &policy, + "the_test", + ) + .expect("failed to create initial blueprint"); + + // This blueprint should only have 3 Nexus zones: one on each sled. + assert_eq!(blueprint1.omicron_zones.len(), 3); + for sled_config in blueprint1.omicron_zones.values() { + assert_eq!( + sled_config + .zones + .iter() + .filter(|z| z.zone_type.is_nexus()) + .count(), + 1 + ); + } + + // Arbitrarily choose one of the sleds and mark it non-provisionable. + let nonprovisionable_sled_id = { + let (sled_id, resources) = + policy.sleds.iter_mut().next().expect("no sleds"); + resources.provision_state = SledProvisionState::NonProvisionable; + *sled_id + }; + + // Now run the planner with a high number of target Nexus zones. + policy.target_nexus_zone_count = 14; + let blueprint2 = Planner::new_based_on( + logctx.log.clone(), + &blueprint1, + Generation::new(), + &policy, + "add more Nexus", + &collection, + ) + .expect("failed to create planner") + .plan() + .expect("failed to plan"); + + let diff = blueprint1.diff_sleds(&blueprint2); + println!("1 -> 2 (added additional Nexus zones):\n{}", diff); + assert_eq!(diff.sleds_added().count(), 0); + assert_eq!(diff.sleds_removed().count(), 0); + let sleds = diff.sleds_changed().collect::<Vec<_>>(); + + // Only 2 of the 3 sleds should get additional Nexus zones. We expect a + // total of 11 new Nexus zones, which should be spread evenly across the + // two sleds (one gets 6 and the other gets 5), while the + // non-provisionable sled should be unchanged.
+ assert_eq!(sleds.len(), 2); + let mut total_new_nexus_zones = 0; + for (sled_id, sled_changes) in sleds { + assert!(sled_id != nonprovisionable_sled_id); + assert_eq!(sled_changes.zones_removed().count(), 0); + assert_eq!(sled_changes.zones_changed().count(), 0); + let zones = sled_changes.zones_added().collect::<Vec<_>>(); + match zones.len() { + n @ (5 | 6) => { + total_new_nexus_zones += n; + } + n => { + panic!("unexpected number of zones added to {sled_id}: {n}") + } + } + for zone in &zones { + let OmicronZoneType::Nexus { .. } = zone.zone_type else { + panic!("unexpectedly added a non-Nexus zone: {zone:?}"); + }; + } + } + assert_eq!(total_new_nexus_zones, 11); logctx.cleanup_successful(); } diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index 4263c34f3d..ac9d894050 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -109,6 +109,7 @@ phantom_disks.period_secs = 30 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 sync_service_zone_nat.period_secs = 30 +region_replacement.period_secs = 30 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/src/app/background/common.rs b/nexus/src/app/background/common.rs index f954a35639..e0d8f32316 100644 --- a/nexus/src/app/background/common.rs +++ b/nexus/src/app/background/common.rs @@ -467,6 +467,7 @@ mod test { use super::BackgroundTask; use super::Driver; use crate::app::background::common::ActivationReason; + use crate::app::sagas::SagaRequest; use assert_matches::assert_matches; use chrono::Utc; use futures::future::BoxFuture; @@ -477,6 +478,7 @@ mod test { use std::time::Instant; use tokio::sync::mpsc; use tokio::sync::mpsc::error::TryRecvError; + use tokio::sync::mpsc::Sender; use tokio::sync::watch; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext<crate::Server>; @@ -814,4 +816,82 @@ mod test { // such a task that would allow us to reliably distinguish between these // two without also spending a lot of wall-clock time on this test. } + + /// Simple BackgroundTask impl that sends a test-only SagaRequest + struct SagaRequestTask { + saga_request: Sender<SagaRequest>, + } + + impl SagaRequestTask { + fn new(saga_request: Sender<SagaRequest>) -> SagaRequestTask { + SagaRequestTask { saga_request } + } + } + + impl BackgroundTask for SagaRequestTask { + fn activate<'a>( + &'a mut self, + _: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async { + let _ = self.saga_request.send(SagaRequest::TestOnly).await; + serde_json::Value::Null + } + .boxed() + } + } + + #[nexus_test(server = crate::Server)] + async fn test_saga_request_flow(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.apictx().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let (saga_request, mut saga_request_recv) = SagaRequest::channel(); + let t1 = SagaRequestTask::new(saga_request); + + let mut driver = Driver::new(); + let (_dep_tx1, dep_rx1) = watch::channel(0); + + let h1 = driver.register( + "t1".to_string(), + "test saga request flow task".to_string(), + Duration::from_secs(300), // should not fire in this test + Box::new(t1), + opctx.child(std::collections::BTreeMap::new()), + vec![Box::new(dep_rx1.clone())], + ); + + assert!(matches!( + saga_request_recv.try_recv(), + Err(mpsc::error::TryRecvError::Empty), + )); + + driver.activate(&h1); + + // wait 1 second for the saga request to arrive + tokio::select!
{ + _ = tokio::time::sleep(tokio::time::Duration::from_secs(1)) => { + assert!(false); + } + + saga_request = saga_request_recv.recv() => { + match saga_request { + None => { + assert!(false); + } + + Some(saga_request) => { + assert!(matches!( + saga_request, + SagaRequest::TestOnly, + )); + } + } + } + } + } } diff --git a/nexus/src/app/background/dns_config.rs b/nexus/src/app/background/dns_config.rs index 959cf1843e..be18ac3612 100644 --- a/nexus/src/app/background/dns_config.rs +++ b/nexus/src/app/background/dns_config.rs @@ -159,7 +159,6 @@ impl BackgroundTask for DnsConfigWatcher { mod test { use crate::app::background::common::BackgroundTask; use crate::app::background::dns_config::DnsConfigWatcher; - use crate::app::background::init::test::read_internal_dns_zone_id; use crate::app::background::init::test::write_test_dns_generation; use assert_matches::assert_matches; use async_bb8_diesel::AsyncRunQueryDsl; @@ -197,9 +196,7 @@ mod test { // Now write generation 2, activate again, and verify that the update // was sent to the watcher. - let internal_dns_zone_id = - read_internal_dns_zone_id(&opctx, &datastore).await; - write_test_dns_generation(&datastore, internal_dns_zone_id).await; + write_test_dns_generation(&opctx, &datastore).await; assert_eq!(watcher.borrow().as_ref().unwrap().generation, 1); let value = task.activate(&opctx).await; assert_eq!(watcher.borrow().as_ref().unwrap().generation, 2); diff --git a/nexus/src/app/background/dns_servers.rs b/nexus/src/app/background/dns_servers.rs index 97fb3510b7..8f4cce4ee0 100644 --- a/nexus/src/app/background/dns_servers.rs +++ b/nexus/src/app/background/dns_servers.rs @@ -7,22 +7,15 @@ use super::common::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; +use internal_dns::names::ServiceName; +use internal_dns::resolver::Resolver; use nexus_db_model::DnsGroup; -use nexus_db_model::ServiceKind; use nexus_db_queries::context::OpContext; -use nexus_db_queries::db::DataStore; -use omicron_common::api::external::DataPageParams; use serde::Serialize; use serde_json::json; -use std::net::{SocketAddr, SocketAddrV6}; -use std::num::NonZeroU32; -use std::sync::Arc; +use std::net::SocketAddr; use tokio::sync::watch; -// This constraint could be relaxed by paginating through the list of servers, -// but we don't expect to have this many servers any time soon. -const MAX_DNS_SERVERS: usize = 10; - #[derive(Debug, Clone, Eq, PartialEq, Serialize)] pub struct DnsServersList { pub addresses: Vec<SocketAddr>, } /// Background task that keeps track of the latest list of DNS servers for a DNS /// group pub struct DnsServersWatcher { - datastore: Arc<DataStore>, dns_group: DnsGroup, + resolver: Resolver, last: Option<DnsServersList>, tx: watch::Sender<Option<DnsServersList>>, rx: watch::Receiver<Option<DnsServersList>>, } impl DnsServersWatcher { - pub fn new( - datastore: Arc<DataStore>, - dns_group: DnsGroup, - ) -> DnsServersWatcher { + pub fn new(dns_group: DnsGroup, resolver: Resolver) -> DnsServersWatcher { let (tx, rx) = watch::channel(None); - DnsServersWatcher { datastore, dns_group, last: None, tx, rx } + DnsServersWatcher { dns_group, last: None, tx, rx, resolver } } /// Exposes the latest list of DNS servers for this DNS group @@ -75,58 +65,38 @@ impl BackgroundTask for DnsServersWatcher { }; // Read the latest service configuration for this DNS group.
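// ---------------------------------------------------------------------
// Editorial sketch (not part of the change above): the reworked watcher
// follows a common "resolve, compare, publish" shape -- look the value
// up, then notify a tokio watch channel only when it changed. A minimal
// sketch with the lookup stubbed out as a future parameter; names below
// are illustrative, not Omicron's.
use std::net::SocketAddrV6;
use tokio::sync::watch;

async fn refresh(
    lookup: impl std::future::Future<Output = Vec<SocketAddrV6>>,
    last: &mut Option<Vec<SocketAddrV6>>,
    tx: &watch::Sender<Option<Vec<SocketAddrV6>>>,
) {
    let addresses = lookup.await;
    // Only wake watchers when the list actually changed.
    if last.as_ref() != Some(&addresses) {
        *last = Some(addresses.clone());
        tx.send_replace(Some(addresses));
    }
}
// ---------------------------------------------------------------------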
- let service_kind = match self.dns_group { - DnsGroup::Internal => ServiceKind::InternalDns, - DnsGroup::External => ServiceKind::ExternalDns, + let service_name = match self.dns_group { + DnsGroup::Internal => ServiceName::InternalDns, + DnsGroup::External => ServiceName::ExternalDns, }; - let pagparams = DataPageParams { - marker: None, - limit: NonZeroU32::try_from( - u32::try_from(MAX_DNS_SERVERS).unwrap(), - ) - .unwrap(), - direction: dropshot::PaginationOrder::Ascending, + let result = self.resolver.lookup_all_socket_v6(service_name).await; + let addresses = match result { + Err(error) => { + warn!( + &log, + "failed to lookup DNS servers"; + "error" => format!("{:#}", error) + ); + return json!({ + "error": + format!( + "failed to read list of DNS servers: {:#}", + error + ) + }); + } + Ok(addresses) => { + // TODO(eliza): it would be nicer if `Resolver` had a method + // returning an iterator instead of a `Vec`, so we didn't + // have to drain the Vec and then collect it into a new + // one... + addresses.into_iter().map(SocketAddr::V6).collect() + } }; - let result = self - .datastore - .services_list_kind(opctx, service_kind, &pagparams) - .await; - - if let Err(error) = result { - warn!( - &log, - "failed to read list of DNS servers"; - "error" => format!("{:#}", error) - ); - return json!({ - "error": - format!( - "failed to read list of DNS servers: {:#}", - error - ) - }); - } - - let services = result.unwrap(); - if services.len() >= MAX_DNS_SERVERS { - warn!( - &log, - "found {} servers, which is more than MAX_DNS_SERVERS \ - ({}). There may be more that will not be used.", - services.len(), - MAX_DNS_SERVERS - ); - } - - let new_config = DnsServersList { - addresses: services - .into_iter() - .map(|s| SocketAddrV6::new(*s.ip, *s.port, 0, 0).into()) - .collect(), - }; - let new_addrs_dbg = format!("{:?}", new_config); + let new_config = DnsServersList { addresses }; + let new_addrs_dbg = format!("{new_config:?}"); let rv = serde_json::to_value(&new_config).unwrap_or_else(|error| { json!({ @@ -177,119 +147,3 @@ impl BackgroundTask for DnsServersWatcher { .boxed() } } - -#[cfg(test)] -mod test { - use crate::app::background::common::BackgroundTask; - use crate::app::background::dns_servers::DnsServersList; - use crate::app::background::dns_servers::DnsServersWatcher; - use crate::app::background::dns_servers::MAX_DNS_SERVERS; - use assert_matches::assert_matches; - use async_bb8_diesel::AsyncRunQueryDsl; - use diesel::ExpressionMethods; - use diesel::QueryDsl; - use nexus_db_model::DnsGroup; - use nexus_db_queries::context::OpContext; - use nexus_db_queries::db::model::Service; - use nexus_db_queries::db::model::ServiceKind; - use nexus_test_utils_macros::nexus_test; - use std::net::Ipv6Addr; - use std::net::SocketAddrV6; - use uuid::Uuid; - - type ControlPlaneTestContext = - nexus_test_utils::ControlPlaneTestContext<crate::Server>; - - #[nexus_test(server = crate::Server)] - async fn test_basic(cptestctx: &ControlPlaneTestContext) { - let nexus = &cptestctx.server.apictx().nexus; - let datastore = nexus.datastore(); - let opctx = OpContext::for_tests( - cptestctx.logctx.log.clone(), - datastore.clone(), - ); - - // Verify the initial state. - let mut task = - DnsServersWatcher::new(datastore.clone(), DnsGroup::Internal); - let watcher = task.watcher(); - assert_matches!(*watcher.borrow(), None); - - // The datastore from the ControlPlaneTestContext is initialized with - // one DNS server.
- let _ = task.activate(&opctx).await; - assert_matches!(*watcher.borrow(), Some(DnsServersList { - ref addresses - }) if addresses.len() == 1); - - // If we add another server, we should see it. - { - use nexus_db_queries::db::schema::service::dsl; - diesel::insert_into(dsl::service) - .values(Service::new( - Uuid::new_v4(), - Uuid::new_v4(), - Some(Uuid::new_v4()), - SocketAddrV6::new(Ipv6Addr::LOCALHOST, 1, 0, 0), - ServiceKind::InternalDns, - )) - .execute_async( - &*datastore.pool_connection_for_tests().await.unwrap(), - ) - .await - .unwrap(); - } - - let _ = task.activate(&opctx).await; - assert_matches!(*watcher.borrow(), Some(DnsServersList { - ref addresses - }) if addresses.len() == 2); - - // If we add MAX_DNS_SERVERS more servers, we should see - // MAX_DNS_SERVERS. - { - use nexus_db_queries::db::schema::service::dsl; - let new_services = (0..u16::try_from(MAX_DNS_SERVERS).unwrap()) - .map(|i| { - Service::new( - Uuid::new_v4(), - Uuid::new_v4(), - Some(Uuid::new_v4()), - SocketAddrV6::new(Ipv6Addr::LOCALHOST, i + 2, 0, 0), - ServiceKind::InternalDns, - ) - }) - .collect::<Vec<_>>(); - - diesel::insert_into(dsl::service) - .values(new_services) - .execute_async( - &*datastore.pool_connection_for_tests().await.unwrap(), - ) - .await - .unwrap(); - } - - let _ = task.activate(&opctx).await; - assert_matches!(*watcher.borrow(), Some(DnsServersList { - ref addresses - }) if addresses.len() == MAX_DNS_SERVERS); - - // Now delete all the servers and try again. - { - use nexus_db_queries::db::schema::service::dsl; - diesel::delete( - dsl::service.filter(dsl::kind.eq(ServiceKind::InternalDns)), - ) - .execute_async( - &*datastore.pool_connection_for_tests().await.unwrap(), - ) - .await - .unwrap(); - } - let _ = task.activate(&opctx).await; - assert_matches!(*watcher.borrow(), Some(DnsServersList { - ref addresses - }) if addresses.is_empty()); - } -} diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index e588ac88fe..846051a068 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -15,7 +15,9 @@ use super::external_endpoints; use super::inventory_collection; use super::nat_cleanup; use super::phantom_disks; +use super::region_replacement; use super::sync_service_zone_nat::ServiceZoneNatTracker; +use crate::app::sagas::SagaRequest; use nexus_db_model::DnsGroup; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; @@ -25,6 +27,7 @@ use omicron_common::nexus_config::DnsTasksConfig; use std::collections::BTreeMap; use std::collections::HashMap; use std::sync::Arc; +use tokio::sync::mpsc::Sender; use uuid::Uuid; /// Describes ongoing background tasks and provides interfaces for working with @@ -72,10 +75,15 @@ pub struct BackgroundTasks { /// task handle for the service zone nat tracker pub task_service_zone_nat_tracker: common::TaskHandle, + + /// task handle for the task that detects if regions need replacement and + /// begins the process + pub task_region_replacement: common::TaskHandle, } impl BackgroundTasks { /// Kick off all background tasks + #[allow(clippy::too_many_arguments)] pub fn start( opctx: &OpContext, datastore: Arc<DataStore>, @@ -84,6 +92,7 @@ impl BackgroundTasks { mgd_clients: &HashMap<SwitchLocation, Arc<mg_admin_client::Client>>, nexus_id: Uuid, resolver: internal_dns::resolver::Resolver, + saga_request: Sender<SagaRequest>, ) -> BackgroundTasks { let mut driver = common::Driver::new(); @@ -92,6 +101,7 @@ impl BackgroundTasks { opctx, datastore.clone(), DnsGroup::Internal, + resolver.clone(), &config.dns_internal, ); let (task_external_dns_config,
task_external_dns_servers) = init_dns( @@ -99,6 +109,7 @@ impl BackgroundTasks { opctx, datastore.clone(), DnsGroup::External, + resolver.clone(), &config.dns_external, ); @@ -244,6 +255,26 @@ impl BackgroundTasks { ) }; + // Background task: detect if a region needs replacement and begin the + // process + let task_region_replacement = { + let detector = region_replacement::RegionReplacementDetector::new( + datastore, + saga_request.clone(), + ); + + let task = driver.register( + String::from("region_replacement"), + String::from("detects if a region requires replacing and begins the process"), + config.region_replacement.period_secs, + Box::new(detector), + opctx.child(BTreeMap::new()), + vec![], + ); + + task + }; + BackgroundTasks { driver, task_internal_dns_config, @@ -259,6 +290,7 @@ impl BackgroundTasks { task_blueprint_loader, task_blueprint_executor, task_service_zone_nat_tracker, + task_region_replacement, } } @@ -272,6 +304,7 @@ fn init_dns( opctx: &OpContext, datastore: Arc<DataStore>, dns_group: DnsGroup, + resolver: internal_dns::resolver::Resolver, config: &DnsTasksConfig, ) -> (common::TaskHandle, common::TaskHandle) { let dns_group_name = dns_group.to_string(); @@ -292,13 +325,13 @@ fn init_dns( ); // Background task: DNS server list watcher - let dns_servers = dns_servers::DnsServersWatcher::new(datastore, dns_group); + let dns_servers = dns_servers::DnsServersWatcher::new(dns_group, resolver); let dns_servers_watcher = dns_servers.watcher(); let task_name_servers = format!("dns_servers_{}", dns_group); let task_servers = driver.register( task_name_servers.clone(), format!( - "watches list of {} DNS servers stored in CockroachDB", + "watches list of {} DNS servers stored in internal DNS", dns_group, ), config.period_secs_servers, @@ -332,22 +365,17 @@ fn init_dns( #[cfg(test)] pub mod test { - use async_bb8_diesel::AsyncRunQueryDsl; use dropshot::HandlerTaskMode; use nexus_db_model::DnsGroup; - use nexus_db_model::Generation; use nexus_db_queries::context::OpContext; + use nexus_db_queries::db::datastore::DnsVersionUpdateBuilder; use nexus_db_queries::db::DataStore; use nexus_test_utils_macros::nexus_test; use nexus_types::internal_api::params as nexus_params; - use nexus_types::internal_api::params::ServiceKind; - use omicron_common::api::external::DataPageParams; use omicron_test_utils::dev::poll; use std::net::SocketAddr; - use std::num::NonZeroU32; use std::time::Duration; use tempfile::TempDir; - use uuid::Uuid; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext<crate::Server>; @@ -394,12 +422,23 @@ pub mod test { .expect("failed to get initial DNS server config"); assert_eq!(config.generation, 1); - // We'll need the id of the internal DNS zone. - let internal_dns_zone_id = - read_internal_dns_zone_id(&opctx, datastore).await; + let internal_dns_srv_name = + internal_dns::ServiceName::InternalDns.dns_name(); + + let initial_srv_record = { + let zone = + config.zones.get(0).expect("DNS config must have a zone"); + let Some(record) = zone.records.get(&internal_dns_srv_name) else { + panic!("zone must have a record for {internal_dns_srv_name}") + }; + match record.get(0) { + Some(dns_service_client::types::DnsRecord::Srv(srv)) => srv, + record => panic!("expected a SRV record for {internal_dns_srv_name}, found {record:?}"), + } + }; // Now spin up another DNS server, add it to the list of servers, and - // make sure that DNS gets propagated to it.
Note that we shouldn't // have to explicitly activate the background task because inserting a // new service ought to do that for us. let log = &cptestctx.logctx.log; @@ -438,29 +477,76 @@ pub mod test { SocketAddr::V4(_) => panic!("expected v6 address"), SocketAddr::V6(a) => a, }; + + // In order to test that DNS gets propagated to a newly-added server, we + // first need to update the source of truth about DNS (the database). + // Then we need to wait for that to get propagated (by this same + // mechanism) to the existing DNS servers. Only then would we expect + // the mechanism to see the new DNS server and then propagate + // configuration to it. + let update = { + use nexus_params::{DnsRecord, Srv}; + + let target = "my-great-dns-server.host"; + + let mut update = test_dns_update_builder(); + update.remove_name(internal_dns_srv_name.clone()).unwrap(); + update + .add_name( + internal_dns_srv_name, + vec![ + DnsRecord::Srv(Srv { + prio: 0, + weight: 0, + port: new_dns_addr.port(), + target: format!( + "{target}.control-plane.oxide.internal" + ), + }), + DnsRecord::Srv(initial_srv_record.clone()), + ], + ) + .unwrap(); + update + .add_name( + target.to_string(), + vec![DnsRecord::Aaaa(*new_dns_addr.ip())], + ) + .unwrap(); + update + }; + write_dns_update(&opctx, datastore, update).await; + info!(&cptestctx.logctx.log, "updated new dns records"); + + // Activate the internal DNS propagation pipeline. nexus - .upsert_service( - &opctx, - Uuid::new_v4(), - cptestctx.sled_agent.sled_agent.id, - Some(Uuid::new_v4()), - new_dns_addr, - ServiceKind::InternalDns.into(), - ) - .await - .unwrap(); + .background_tasks + .activate(&nexus.background_tasks.task_internal_dns_config); + + wait_propagate_dns( + &cptestctx.logctx.log, + "initial", + initial_dns_dropshot_server.local_addr(), + 2, + ) + .await; + + // Discover the new internal DNS server from internal DNS. + nexus + .background_tasks + .activate(&nexus.background_tasks.task_internal_dns_servers); wait_propagate_dns( &cptestctx.logctx.log, "new", new_dns_dropshot_server.local_addr(), - 1, + 2, ) .await; - // Now, write version 2 of the internal DNS configuration with one + // Now, write version 3 of the internal DNS configuration with one // additional record. - write_test_dns_generation(datastore, internal_dns_zone_id).await; + write_test_dns_generation(&opctx, datastore).await; // Activate the internal DNS propagation pipeline. 
nexus @@ -472,7 +558,7 @@ pub mod test { &cptestctx.logctx.log, "initial", initial_dns_dropshot_server.local_addr(), - 2, + 3, ) .await; @@ -480,7 +566,7 @@ pub mod test { &cptestctx.logctx.log, "new", new_dns_dropshot_server.local_addr(), - 2, + 3, ) .await; } @@ -493,15 +579,16 @@ pub mod test { generation: u64, ) { println!( - "waiting for propagation of generation {} to {} DNS server ({})", - generation, label, addr + "waiting for propagation of generation {generation} to {label} \ + DNS server ({addr})", ); let client = dns_service_client::Client::new( &format!("http://{}", addr), log.clone(), ); - poll::wait_for_condition( + let poll_max = Duration::from_secs(30); + let result = poll::wait_for_condition( || async { match client.dns_config_get().await { Err(error) => { @@ -519,87 +606,51 @@ pub mod test { } }, &Duration::from_millis(50), - &Duration::from_secs(30), + &poll_max, ) - .await - .expect("DNS config not propagated in expected time"); + .await; + if let Err(err) = result { + panic!( + "DNS generation {generation} not propagated to \ + {label} DNS server ({addr}) within {poll_max:?}: {err}" + ); + } else { + println!( + "DNS generation {generation} propagated to {label} \ + DNS server ({addr}) successfully." + ); + } } - pub(crate) async fn write_test_dns_generation( + pub(crate) async fn write_dns_update( + opctx: &OpContext, datastore: &DataStore, - internal_dns_zone_id: Uuid, + update: DnsVersionUpdateBuilder, ) { - { - let conn = datastore.pool_connection_for_tests().await.unwrap(); - let _: Result<(), _> = datastore - .transaction_retry_wrapper("write_test_dns_generation") - .transaction(&conn, |conn| async move { - { - use nexus_db_queries::db::model::DnsVersion; - use nexus_db_queries::db::schema::dns_version::dsl; - - diesel::insert_into(dsl::dns_version) - .values(DnsVersion { - dns_group: DnsGroup::Internal, - version: Generation(2u32.try_into().unwrap()), - time_created: chrono::Utc::now(), - creator: String::from("test suite"), - comment: String::from("test suite"), - }) - .execute_async(&conn) - .await - .unwrap(); - } - - { - use nexus_db_queries::db::model::DnsName; - use nexus_db_queries::db::schema::dns_name::dsl; - - diesel::insert_into(dsl::dns_name) - .values( - DnsName::new( - internal_dns_zone_id, - String::from("we-got-beets"), - Generation(2u32.try_into().unwrap()), - None, - vec![nexus_params::DnsRecord::Aaaa( - "fe80::3".parse().unwrap(), - )], - ) - .unwrap(), - ) - .execute_async(&conn) - .await - .unwrap(); - } - - Ok(()) - }) - .await; - } + let conn = datastore.pool_connection_for_tests().await.unwrap(); + info!(opctx.log, "writing DNS update..."); + datastore.dns_update_incremental(opctx, &conn, update).await.unwrap(); } - pub(crate) async fn read_internal_dns_zone_id( + pub(crate) async fn write_test_dns_generation( opctx: &OpContext, datastore: &DataStore, - ) -> Uuid { - let dns_zones = datastore - .dns_zones_list( - &opctx, - DnsGroup::Internal, - &DataPageParams { - marker: None, - direction: dropshot::PaginationOrder::Ascending, - limit: NonZeroU32::new(2).unwrap(), - }, + ) { + let mut update = test_dns_update_builder(); + update + .add_name( + "we-got-beets".to_string(), + vec![nexus_params::DnsRecord::Aaaa("fe80::3".parse().unwrap())], ) - .await .unwrap(); - assert_eq!( - dns_zones.len(), - 1, - "expected exactly one internal DNS zone" - ); - dns_zones[0].id + write_dns_update(opctx, datastore, update).await + } + + fn test_dns_update_builder() -> DnsVersionUpdateBuilder { + DnsVersionUpdateBuilder::new( + DnsGroup::Internal, + "test 
suite DNS update".to_string(), + "test suite".to_string(), + ) } } diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs index 2c5fa0ab3c..27cdddfe15 100644 --- a/nexus/src/app/background/mod.rs +++ b/nexus/src/app/background/mod.rs @@ -16,6 +16,7 @@ mod init; mod inventory_collection; mod nat_cleanup; mod phantom_disks; +mod region_replacement; mod status; mod sync_service_zone_nat; diff --git a/nexus/src/app/background/region_replacement.rs b/nexus/src/app/background/region_replacement.rs new file mode 100644 index 0000000000..fc92f888b9 --- /dev/null +++ b/nexus/src/app/background/region_replacement.rs @@ -0,0 +1,57 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background task for detecting regions that need replacing and beginning that +//! process +//! +//! TODO this is currently a placeholder for a future PR + +use super::common::BackgroundTask; +use crate::app::sagas::SagaRequest; +use futures::future::BoxFuture; +use futures::FutureExt; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use serde_json::json; +use std::sync::Arc; +use tokio::sync::mpsc::Sender; + +pub struct RegionReplacementDetector { + _datastore: Arc, + _saga_request: Sender, +} + +impl RegionReplacementDetector { + pub fn new( + datastore: Arc, + saga_request: Sender, + ) -> Self { + RegionReplacementDetector { + _datastore: datastore, + _saga_request: saga_request, + } + } +} + +impl BackgroundTask for RegionReplacementDetector { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async { + let log = &opctx.log; + warn!(&log, "region replacement task started"); + + // TODO + + warn!(&log, "region replacement task done"); + + json!({ + "region_replacement_started_ok": 0, + "region_replacement_started_err": 0, + }) + } + .boxed() + } +} diff --git a/nexus/src/app/deployment.rs b/nexus/src/app/deployment.rs index 624ae8d93e..61ce803d13 100644 --- a/nexus/src/app/deployment.rs +++ b/nexus/src/app/deployment.rs @@ -20,7 +20,9 @@ use nexus_types::deployment::SledResources; use nexus_types::deployment::ZpoolName; use nexus_types::identity::Asset; use nexus_types::inventory::Collection; +use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; +use omicron_common::address::NEXUS_REDUNDANCY; use omicron_common::address::SLED_PREFIX; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; @@ -157,11 +159,39 @@ impl super::Nexus { let zpools = zpools_by_sled_id .remove(&sled_id) .unwrap_or_else(BTreeSet::new); - let sled_info = SledResources { subnet, zpools }; + let sled_info = SledResources { + provision_state: sled_row.provision_state().into(), + subnet, + zpools, + }; (sled_id, sled_info) }) .collect(); + let service_ip_pool_ranges = { + let (authz_service_ip_pool, _) = + datastore.ip_pools_service_lookup(opctx).await?; + + let mut ip_ranges = Vec::new(); + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + while let Some(p) = paginator.next() { + let batch = datastore + .ip_pool_list_ranges( + opctx, + &authz_service_ip_pool, + &p.current_pagparams(), + ) + .await?; + // The use of `last_address` here assumes `paginator` is sorting + // in Ascending order (which it does - see the implementation of + // `current_pagparams()`). 
+ paginator = p.found_batch(&batch, &|r| r.last_address); + ip_ranges.extend(batch.iter().map(IpRange::from)); + } + + ip_ranges + }; + // The choice of which inventory collection to use here is not // necessarily trivial. Inventory collections may be incomplete due to // transient (or even persistent) errors. It's not yet clear what @@ -192,7 +222,11 @@ impl super::Nexus { Ok(PlanningContext { creator, - policy: Policy { sleds }, + policy: Policy { + sleds, + service_ip_pool_ranges, + target_nexus_zone_count: NEXUS_REDUNDANCY, + }, inventory, internal_dns_version: *dns_version.version, }) @@ -257,7 +291,12 @@ impl super::Nexus { &planning_context.policy, &planning_context.creator, &inventory, - ); + ) + .map_err(|error| { + Error::internal_error(&format!( + "error creating blueprint planner: {error:#}", + )) + })?; let blueprint = planner.plan().map_err(|error| { Error::internal_error(&format!( "error generating blueprint: {}", diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index c9ca4db73e..7a9a26b05f 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -6,6 +6,7 @@ use self::external_endpoints::NexusCertResolver; use crate::app::oximeter::LazyTimeseriesClient; +use crate::app::sagas::SagaRequest; use crate::config; use crate::populate::populate_start; use crate::populate::PopulateArgs; @@ -362,6 +363,8 @@ impl Nexus { Arc::clone(&db_datastore), ); + let (saga_request, mut saga_request_recv) = SagaRequest::channel(); + let background_tasks = background::BackgroundTasks::start( &background_ctx, Arc::clone(&db_datastore), @@ -370,6 +373,7 @@ impl Nexus { &mg_clients, config.deployment.id, resolver.clone(), + saga_request, ); let external_resolver = { @@ -484,6 +488,29 @@ impl Nexus { } }); + // Spawn a task to receive SagaRequests from RPWs, and execute them + { + let nexus = nexus.clone(); + tokio::spawn(async move { + loop { + match saga_request_recv.recv().await { + None => { + // If this channel is closed, then RPWs will not be + // able to request that sagas be run. This will + // likely only occur when Nexus itself is shutting + // down, so emit an error and exit the task. + error!(&nexus.log, "saga request channel closed!"); + break; + } + + Some(saga_request) => { + nexus.handle_saga_request(saga_request).await; + } + } + } + }); + } + Ok(nexus) } @@ -828,6 +855,17 @@ impl Nexus { pub(crate) async fn resolver(&self) -> internal_dns::resolver::Resolver { self.internal_resolver.clone() } + + /// Reliable persistent workflows can request that sagas be executed by + /// sending a SagaRequest to a supplied channel. Execute those here. 
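Since the saga-request wiring above spans several hunks, here is a minimal, self-contained sketch of the pattern: a bounded mpsc channel whose sender is handed to background tasks and whose receiver is drained by a dispatcher loop. The `RegionReplacement` variant, the `try_send` choice, and the standalone `main` are illustrative assumptions (the real `SagaRequest` currently has only a `#[cfg(test)]` variant); the queue depth of 128 and the exit-on-`None` behavior come from the diff. Assumes tokio with the `rt` and `macros` features.

use tokio::sync::mpsc;

#[derive(Debug)]
enum SagaRequest {
    // Hypothetical variant for illustration; not in the diff.
    RegionReplacement { region_id: u32 },
}

#[tokio::main]
async fn main() {
    // Bounded queue, mirroring MAX_QUEUED_SAGA_REQUESTS above.
    let (tx, mut rx) = mpsc::channel::<SagaRequest>(128);

    // A background task (RPW) requests a saga. `try_send` never blocks; if
    // the queue is full the request is dropped and logged rather than
    // stalling the task's activation.
    if let Err(e) = tx.try_send(SagaRequest::RegionReplacement { region_id: 7 }) {
        eprintln!("saga request not queued: {e}");
    }
    drop(tx); // close the channel so the dispatcher loop below exits

    // The dispatcher loop Nexus spawns: handle requests until every sender
    // has been dropped, at which point recv() returns None.
    while let Some(req) = rx.recv().await {
        println!("would run saga for {req:?}");
    }
    println!("saga request channel closed");
}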
+ pub(crate) async fn handle_saga_request(&self, saga_request: SagaRequest) { + match saga_request { + #[cfg(test)] + SagaRequest::TestOnly => { + unimplemented!(); + } + } + } } /// For unimplemented endpoints, indicates whether the resource identified diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 569153f23e..a4d559f823 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -52,7 +52,6 @@ use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::Name; use omicron_common::api::external::NameOrId; -use omicron_common::api::external::ResourceType; use omicron_common::api::internal::shared::ExternalPortDiscovery; use sled_agent_client::types::AddSledRequest; use sled_agent_client::types::EarlyNetworkConfigBody; @@ -213,11 +212,7 @@ impl super::Nexus { mapped_fleet_roles, }; - let rack_network_config = request.rack_network_config.as_ref().ok_or( - Error::invalid_request( - "cannot initialize a rack without a network config", - ), - )?; + let rack_network_config = &request.rack_network_config; self.db_datastore .rack_set_initialized( @@ -337,289 +332,278 @@ impl super::Nexus { // Currently calling some of the apis directly, but should we be using sagas // going forward via self.run_saga()? Note that self.create_runnable_saga and // self.execute_saga are currently not available within this scope. - info!(self.log, "Checking for Rack Network Configuration"); - if let Some(rack_network_config) = &request.rack_network_config { - info!(self.log, "Recording Rack Network Configuration"); - let address_lot_name = - Name::from_str("initial-infra").map_err(|e| { - Error::internal_error(&format!( - "unable to use `initial-infra` as `Name`: {e}" - )) - })?; - let identity = IdentityMetadataCreateParams { - name: address_lot_name.clone(), - description: "initial infrastructure ip address lot" - .to_string(), - }; + info!(self.log, "Recording Rack Network Configuration"); + let address_lot_name = + Name::from_str("initial-infra").map_err(|e| { + Error::internal_error(&format!( + "unable to use `initial-infra` as `Name`: {e}" + )) + })?; + let identity = IdentityMetadataCreateParams { + name: address_lot_name.clone(), + description: "initial infrastructure ip address lot".to_string(), + }; - let kind = AddressLotKind::Infra; + let kind = AddressLotKind::Infra; - let first_address = IpAddr::V4(rack_network_config.infra_ip_first); - let last_address = IpAddr::V4(rack_network_config.infra_ip_last); - let ipv4_block = - AddressLotBlockCreate { first_address, last_address }; + let first_address = IpAddr::V4(rack_network_config.infra_ip_first); + let last_address = IpAddr::V4(rack_network_config.infra_ip_last); + let ipv4_block = AddressLotBlockCreate { first_address, last_address }; - let blocks = vec![ipv4_block]; + let blocks = vec![ipv4_block]; - let address_lot_params = - AddressLotCreate { identity, kind, blocks }; + let address_lot_params = AddressLotCreate { identity, kind, blocks }; - match self - .db_datastore - .address_lot_create(opctx, &address_lot_params) - .await - { - Ok(_) => Ok(()), - Err(e) => match e { - Error::ObjectAlreadyExists { - type_name: _, - object_name: _, - } => Ok(()), - _ => Err(e), - }, - }?; + match self + .db_datastore + .address_lot_create(opctx, &address_lot_params) + .await + { + Ok(_) => Ok(()), + Err(e) => match e { + Error::ObjectAlreadyExists { type_name: _, object_name: _ } => { + Ok(()) + } + _ => Err(e), + }, + }?; - let mut bgp_configs = HashMap::new(); + let mut 
bgp_configs = HashMap::new(); - for bgp_config in &rack_network_config.bgp { - bgp_configs.insert(bgp_config.asn, bgp_config.clone()); + for bgp_config in &rack_network_config.bgp { + bgp_configs.insert(bgp_config.asn, bgp_config.clone()); - let bgp_config_name: Name = - format!("as{}", bgp_config.asn).parse().unwrap(); + let bgp_config_name: Name = + format!("as{}", bgp_config.asn).parse().unwrap(); - let announce_set_name: Name = - format!("as{}-announce", bgp_config.asn).parse().unwrap(); + let announce_set_name: Name = + format!("as{}-announce", bgp_config.asn).parse().unwrap(); - let address_lot_name: Name = - format!("as{}-lot", bgp_config.asn).parse().unwrap(); + let address_lot_name: Name = + format!("as{}-lot", bgp_config.asn).parse().unwrap(); - self.db_datastore - .address_lot_create( - &opctx, - &AddressLotCreate { - identity: IdentityMetadataCreateParams { - name: address_lot_name, - description: format!( - "Address lot for announce set in as {}", - bgp_config.asn - ), - }, - kind: AddressLotKind::Infra, - blocks: bgp_config - .originate - .iter() - .map(|o| AddressLotBlockCreate { - first_address: o.network().into(), - last_address: o.broadcast().into(), - }) - .collect(), + self.db_datastore + .address_lot_create( + &opctx, + &AddressLotCreate { + identity: IdentityMetadataCreateParams { + name: address_lot_name, + description: format!( + "Address lot for announce set in as {}", + bgp_config.asn + ), }, - ) - .await - .map_err(|e| { - Error::internal_error(&format!( - "unable to create address lot for BGP as {}: {}", - bgp_config.asn, e - )) - })?; - - self.db_datastore - .bgp_create_announce_set( - &opctx, - &BgpAnnounceSetCreate { - identity: IdentityMetadataCreateParams { - name: announce_set_name.clone(), - description: format!( - "Announce set for AS {}", - bgp_config.asn - ), - }, - announcement: bgp_config - .originate - .iter() - .map(|x| BgpAnnouncementCreate { - address_lot_block: NameOrId::Name( - format!("as{}", bgp_config.asn) - .parse() - .unwrap(), - ), - network: IpNetwork::from(*x).into(), - }) - .collect(), + kind: AddressLotKind::Infra, + blocks: bgp_config + .originate + .iter() + .map(|o| AddressLotBlockCreate { + first_address: o.network().into(), + last_address: o.broadcast().into(), + }) + .collect(), + }, + ) + .await + .map_err(|e| { + Error::internal_error(&format!( + "unable to create address lot for BGP as {}: {}", + bgp_config.asn, e + )) + })?; + + self.db_datastore + .bgp_create_announce_set( + &opctx, + &BgpAnnounceSetCreate { + identity: IdentityMetadataCreateParams { + name: announce_set_name.clone(), + description: format!( + "Announce set for AS {}", + bgp_config.asn + ), }, - ) - .await - .map_err(|e| { - Error::internal_error(&format!( - "unable to create bgp announce set for as {}: {}", - bgp_config.asn, e - )) - })?; - - self.db_datastore - .bgp_config_set( - &opctx, - &BgpConfigCreate { - identity: IdentityMetadataCreateParams { - name: bgp_config_name, - description: format!( - "BGP config for AS {}", - bgp_config.asn + announcement: bgp_config + .originate + .iter() + .map(|x| BgpAnnouncementCreate { + address_lot_block: NameOrId::Name( + format!("as{}", bgp_config.asn) + .parse() + .unwrap(), ), - }, - asn: bgp_config.asn, - bgp_announce_set_id: announce_set_name.into(), - vrf: None, - }, - ) - .await - .map_err(|e| { - Error::internal_error(&format!( - "unable to set bgp config for as {}: {}", - bgp_config.asn, e - )) - })?; - } + network: IpNetwork::from(*x).into(), + }) + .collect(), + }, + ) + .await + .map_err(|e| { + 
Error::internal_error(&format!( + "unable to create bgp announce set for as {}: {}", + bgp_config.asn, e + )) + })?; - for (idx, uplink_config) in - rack_network_config.ports.iter().enumerate() - { - let switch = uplink_config.switch.to_string(); - let switch_location = Name::from_str(&switch).map_err(|e| { + self.db_datastore + .bgp_config_set( + &opctx, + &BgpConfigCreate { + identity: IdentityMetadataCreateParams { + name: bgp_config_name, + description: format!( + "BGP config for AS {}", + bgp_config.asn + ), + }, + asn: bgp_config.asn, + bgp_announce_set_id: announce_set_name.into(), + vrf: None, + }, + ) + .await + .map_err(|e| { Error::internal_error(&format!( - "unable to use {switch} as Name: {e}" + "unable to set bgp config for as {}: {}", + bgp_config.asn, e )) })?; + } - let uplink_name = format!("default-uplink{idx}"); - let name = Name::from_str(&uplink_name).unwrap(); + for (idx, uplink_config) in rack_network_config.ports.iter().enumerate() + { + let switch = uplink_config.switch.to_string(); + let switch_location = Name::from_str(&switch).map_err(|e| { + Error::internal_error(&format!( + "unable to use {switch} as Name: {e}" + )) + })?; - let identity = IdentityMetadataCreateParams { - name: name.clone(), - description: "initial uplink configuration".to_string(), - }; + let uplink_name = format!("default-uplink{idx}"); + let name = Name::from_str(&uplink_name).unwrap(); - let port_config = SwitchPortConfigCreate { - geometry: nexus_types::external_api::params::SwitchPortGeometry::Qsfp28x1, - }; + let identity = IdentityMetadataCreateParams { + name: name.clone(), + description: "initial uplink configuration".to_string(), + }; - let mut port_settings_params = SwitchPortSettingsCreate { - identity, - port_config, - groups: vec![], - links: HashMap::new(), - interfaces: HashMap::new(), - routes: HashMap::new(), - bgp_peers: HashMap::new(), - addresses: HashMap::new(), + let port_config = SwitchPortConfigCreate { + geometry: nexus_types::external_api::params::SwitchPortGeometry::Qsfp28x1, }; - let addresses: Vec
= uplink_config - .addresses - .iter() - .map(|a| Address { - address_lot: NameOrId::Name(address_lot_name.clone()), - address: (*a).into(), - }) - .collect(); - - port_settings_params - .addresses - .insert("phy0".to_string(), AddressConfig { addresses }); - - let routes: Vec = uplink_config - .routes - .iter() - .map(|r| Route { - dst: r.destination.into(), - gw: r.nexthop, - vid: None, - }) - .collect(); - - port_settings_params - .routes - .insert("phy0".to_string(), RouteConfig { routes }); - - let peers: Vec = uplink_config - .bgp_peers - .iter() - .map(|r| BgpPeer { - bgp_announce_set: NameOrId::Name( - format!("as{}-announce", r.asn).parse().unwrap(), - ), - bgp_config: NameOrId::Name( - format!("as{}", r.asn).parse().unwrap(), - ), - interface_name: "phy0".into(), - addr: r.addr.into(), - hold_time: r.hold_time.unwrap_or(6) as u32, - idle_hold_time: r.idle_hold_time.unwrap_or(3) as u32, - delay_open: r.delay_open.unwrap_or(0) as u32, - connect_retry: r.connect_retry.unwrap_or(3) as u32, - keepalive: r.keepalive.unwrap_or(2) as u32, - }) - .collect(); + let mut port_settings_params = SwitchPortSettingsCreate { + identity, + port_config, + groups: vec![], + links: HashMap::new(), + interfaces: HashMap::new(), + routes: HashMap::new(), + bgp_peers: HashMap::new(), + addresses: HashMap::new(), + }; - port_settings_params - .bgp_peers - .insert("phy0".to_string(), BgpPeerConfig { peers }); + let addresses: Vec
= uplink_config + .addresses + .iter() + .map(|a| Address { + address_lot: NameOrId::Name(address_lot_name.clone()), + address: (*a).into(), + }) + .collect(); + + port_settings_params + .addresses + .insert("phy0".to_string(), AddressConfig { addresses }); + + let routes: Vec = uplink_config + .routes + .iter() + .map(|r| Route { + dst: r.destination.into(), + gw: r.nexthop, + vid: None, + }) + .collect(); + + port_settings_params + .routes + .insert("phy0".to_string(), RouteConfig { routes }); + + let peers: Vec = uplink_config + .bgp_peers + .iter() + .map(|r| BgpPeer { + bgp_announce_set: NameOrId::Name( + format!("as{}-announce", r.asn).parse().unwrap(), + ), + bgp_config: NameOrId::Name( + format!("as{}", r.asn).parse().unwrap(), + ), + interface_name: "phy0".into(), + addr: r.addr.into(), + hold_time: r.hold_time.unwrap_or(6) as u32, + idle_hold_time: r.idle_hold_time.unwrap_or(3) as u32, + delay_open: r.delay_open.unwrap_or(0) as u32, + connect_retry: r.connect_retry.unwrap_or(3) as u32, + keepalive: r.keepalive.unwrap_or(2) as u32, + }) + .collect(); + + port_settings_params + .bgp_peers + .insert("phy0".to_string(), BgpPeerConfig { peers }); + + let link = LinkConfigCreate { + mtu: 1500, //TODO https://github.com/oxidecomputer/omicron/issues/2274 + lldp: LldpServiceConfigCreate { + enabled: false, + lldp_config: None, + }, + fec: uplink_config.uplink_port_fec.into(), + speed: uplink_config.uplink_port_speed.into(), + autoneg: uplink_config.autoneg, + }; - let link = LinkConfigCreate { - mtu: 1500, //TODO https://github.com/oxidecomputer/omicron/issues/2274 - lldp: LldpServiceConfigCreate { - enabled: false, - lldp_config: None, - }, - fec: uplink_config.uplink_port_fec.into(), - speed: uplink_config.uplink_port_speed.into(), - autoneg: uplink_config.autoneg, - }; + port_settings_params.links.insert("phy".to_string(), link); - port_settings_params.links.insert("phy".to_string(), link); + match self + .db_datastore + .switch_port_settings_create(opctx, &port_settings_params, None) + .await + { + Ok(_) | Err(Error::ObjectAlreadyExists { .. }) => Ok(()), + Err(e) => Err(e), + }?; - match self - .db_datastore - .switch_port_settings_create( - opctx, - &port_settings_params, - None, - ) - .await - { - Ok(_) | Err(Error::ObjectAlreadyExists { .. 
}) => Ok(()), - Err(e) => Err(e), - }?; - - let port_settings_id = self - .db_datastore - .switch_port_settings_get_id( - opctx, - nexus_db_model::Name(name.clone()), - ) - .await?; + let port_settings_id = self + .db_datastore + .switch_port_settings_get_id( + opctx, + nexus_db_model::Name(name.clone()), + ) + .await?; - let switch_port_id = self - .db_datastore - .switch_port_get_id( - opctx, - rack_id, - switch_location.into(), - Name::from_str(&uplink_config.port).unwrap().into(), - ) - .await?; + let switch_port_id = self + .db_datastore + .switch_port_get_id( + opctx, + rack_id, + switch_location.into(), + Name::from_str(&uplink_config.port).unwrap().into(), + ) + .await?; + + self.db_datastore + .switch_port_set_settings_id( + opctx, + switch_port_id, + Some(port_settings_id), + db::datastore::UpdatePrecondition::Null, + ) + .await?; + } // TODO - https://github.com/oxidecomputer/omicron/issues/3277 + // record port speed - self.db_datastore - .switch_port_set_settings_id( - opctx, - switch_port_id, - Some(port_settings_id), - db::datastore::UpdatePrecondition::Null, - ) - .await?; - } // TODO - https://github.com/oxidecomputer/omicron/issues/3277 - // record port speed - }; self.initial_bootstore_sync(&opctx).await?; Ok(()) @@ -871,36 +855,15 @@ impl super::Nexus { ) .await?; - // Grab the SPs from the last collection - let collection = - self.db_datastore.inventory_get_latest_collection(opctx).await?; - - // If there isn't a collection, we don't know about the sled - let Some(collection) = collection else { - return Err(Error::unavail("no inventory data available")); - }; - - // Find the revision - let Some(sp) = collection.sps.get(&baseboard_id) else { - return Err(Error::ObjectNotFound { - type_name: ResourceType::Sled, - lookup_type: - omicron_common::api::external::LookupType::ByCompositeId( - format!("{sled:?}"), - ), - }); - }; - - // Convert the baseboard as necessary - let baseboard = sled_agent_client::types::Baseboard::Gimlet { - identifier: sled.serial.clone(), - model: sled.part.clone(), - revision: sp.baseboard_revision.into(), + // Convert `UninitializedSledId` to the sled-agent type + let baseboard_id = sled_agent_client::types::BaseboardId { + serial_number: sled.serial.clone(), + part_number: sled.part.clone(), }; // Make the call to sled-agent let req = AddSledRequest { - sled_id: baseboard, + sled_id: baseboard_id, start_request: StartSledAgentRequest { generation: 0, schema_version: 1, diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index 1bd85ecf32..e9f800c61b 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -17,6 +17,7 @@ use steno::ActionContext; use steno::ActionError; use steno::SagaType; use thiserror::Error; +use tokio::sync::mpsc; use uuid::Uuid; pub mod disk_create; @@ -408,3 +409,23 @@ where ) .await } + +/// Reliable persistent workflows can request that sagas be run as part of their +/// activation by sending a SagaRequest through a supplied channel to Nexus. +pub enum SagaRequest { + #[cfg(test)] + TestOnly, +} + +impl SagaRequest { + pub fn channel() -> (mpsc::Sender, mpsc::Receiver) + { + // Limit the maximum number of saga requests that background tasks can + // queue for Nexus to run. + // + // Note this value was chosen arbitrarily! 
+ const MAX_QUEUED_SAGA_REQUESTS: usize = 128; + + mpsc::channel(MAX_QUEUED_SAGA_REQUESTS) + } +} diff --git a/nexus/src/app/sagas/switch_port_settings_apply.rs b/nexus/src/app/sagas/switch_port_settings_apply.rs index 9d0573f6b0..9e2331f416 100644 --- a/nexus/src/app/sagas/switch_port_settings_apply.rs +++ b/nexus/src/app/sagas/switch_port_settings_apply.rs @@ -56,14 +56,14 @@ declare_saga_actions! { + spa_ensure_switch_port_settings - spa_undo_ensure_switch_port_settings } - ENSURE_SWITCH_ROUTES -> "ensure_switch_routes" { - + spa_ensure_switch_routes - - spa_undo_ensure_switch_routes - } ENSURE_SWITCH_PORT_UPLINK -> "ensure_switch_port_uplink" { + spa_ensure_switch_port_uplink - spa_undo_ensure_switch_port_uplink } + ENSURE_SWITCH_ROUTES -> "ensure_switch_routes" { + + spa_ensure_switch_routes + - spa_undo_ensure_switch_routes + } ENSURE_SWITCH_PORT_BGP_SETTINGS -> "ensure_switch_port_bgp_settings" { + spa_ensure_switch_port_bgp_settings - spa_undo_ensure_switch_port_bgp_settings @@ -95,6 +95,7 @@ impl NexusSaga for SagaSwitchPortSettingsApply { builder.append(get_switch_port_settings_action()); builder.append(ensure_switch_port_settings_action()); builder.append(ensure_switch_port_uplink_action()); + builder.append(ensure_switch_routes_action()); builder.append(ensure_switch_port_bgp_settings_action()); builder.append(ensure_switch_port_bootstore_network_settings_action()); Ok(builder.build()?) @@ -238,8 +239,8 @@ async fn spa_ensure_switch_routes( IpAddr::V4(v4) => v4, IpAddr::V6(_) => continue, }; - let prefix = match r.gw.ip() { - IpAddr::V4(v4) => Prefix4 { value: v4, length: r.gw.prefix() }, + let prefix = match r.dst.ip() { + IpAddr::V4(v4) => Prefix4 { value: v4, length: r.dst.prefix() }, IpAddr::V6(_) => continue, }; let sr = StaticRoute4 { nexthop, prefix }; diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index 943490ac04..ec3f11dc6f 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -23,9 +23,6 @@ use std::net::SocketAddrV6; use std::sync::Arc; use uuid::Uuid; -#[cfg(test)] -use nexus_db_queries::db::model::ServiceKind; - impl super::Nexus { // Sleds pub fn sled_lookup<'a>( @@ -44,7 +41,7 @@ impl super::Nexus { // unless the DNS lookups at sled-agent are only for rack-local nexuses. pub(crate) async fn upsert_sled( &self, - opctx: &OpContext, + _opctx: &OpContext, id: Uuid, info: SledAgentStartupInfo, ) -> Result<(), Error> { @@ -73,10 +70,16 @@ impl super::Nexus { ); self.db_datastore.sled_upsert(sled).await?; - // Make sure any firewall rules for serices that may - // be running on this sled get plumbed - self.plumb_service_firewall_rules(opctx, &[id]).await?; + Ok(()) + } + pub(crate) async fn sled_request_firewall_rules( + &self, + opctx: &OpContext, + id: Uuid, + ) -> Result<(), Error> { + info!(self.log, "requesting firewall rules"; "sled_uuid" => id.to_string()); + self.plumb_service_firewall_rules(opctx, &[id]).await?; Ok(()) } @@ -276,41 +279,6 @@ impl super::Nexus { Ok(()) } - // Services - - /// Upserts a Service into the database, updating it if it already exists. 
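One change above is worth spelling out: in `spa_ensure_switch_routes`, the route prefix is now built from `r.dst` rather than `r.gw`, because a static route's prefix must describe the destination network, while the gateway only supplies the nexthop. A sketch with local stand-in types (shaped like, but not, the real `Prefix4`/`StaticRoute4`):

use std::net::Ipv4Addr;

// Local stand-ins shaped like the types used in spa_ensure_switch_routes.
struct Prefix4 { value: Ipv4Addr, length: u8 }
struct StaticRoute4 { nexthop: Ipv4Addr, prefix: Prefix4 }

fn main() {
    // "0.0.0.0/0 via 192.168.1.1": destination network plus gateway address.
    let dst = (Ipv4Addr::UNSPECIFIED, 0u8);
    let gw = Ipv4Addr::new(192, 168, 1, 1);

    let sr = StaticRoute4 {
        nexthop: gw,
        // The fixed code takes value/length from the destination. Taking
        // them from the gateway (the old bug) would program a route for the
        // gateway's own address instead of the intended default route.
        prefix: Prefix4 { value: dst.0, length: dst.1 },
    };
    println!(
        "static route {}/{} via {}",
        sr.prefix.value, sr.prefix.length, sr.nexthop
    );
}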
- #[cfg(test)] - pub(crate) async fn upsert_service( - &self, - opctx: &OpContext, - id: Uuid, - sled_id: Uuid, - zone_id: Option, - address: SocketAddrV6, - kind: ServiceKind, - ) -> Result<(), Error> { - info!( - self.log, - "upserting service"; - "sled_id" => sled_id.to_string(), - "service_id" => id.to_string(), - "address" => address.to_string(), - ); - let service = - db::model::Service::new(id, sled_id, zone_id, address, kind); - self.db_datastore.service_upsert(opctx, service).await?; - - if kind == ServiceKind::ExternalDns { - self.background_tasks - .activate(&self.background_tasks.task_external_dns_servers); - } else if kind == ServiceKind::InternalDns { - self.background_tasks - .activate(&self.background_tasks.task_internal_dns_servers); - } - - Ok(()) - } - /// Ensure firewall rules for internal services get reflected on all the relevant sleds. pub(crate) async fn plumb_service_firewall_rules( &self, diff --git a/nexus/src/bin/schema-updater.rs b/nexus/src/bin/schema-updater.rs index db179dc7f6..d016bd0421 100644 --- a/nexus/src/bin/schema-updater.rs +++ b/nexus/src/bin/schema-updater.rs @@ -76,7 +76,8 @@ async fn main() -> anyhow::Result<()> { // We use the unchecked constructor of the datastore because we // don't want to block on someone else applying an upgrade. - let datastore = DataStore::new_unchecked(pool).map_err(|e| anyhow!(e))?; + let datastore = + DataStore::new_unchecked(log.clone(), pool).map_err(|e| anyhow!(e))?; match args.cmd { Cmd::List => { diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index ccd8cebad6..3a9e957328 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -411,7 +411,7 @@ async fn ping( Ok(HttpResponseOk(views::Ping { status: views::PingStatus::Ok })) } -/// Fetch the top-level IAM policy +/// Fetch top-level IAM policy #[endpoint { method = GET, path = "/v1/system/policy", @@ -430,7 +430,7 @@ async fn system_policy_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update the top-level IAM policy +/// Update top-level IAM policy #[endpoint { method = PUT, path = "/v1/system/policy", @@ -454,7 +454,7 @@ async fn system_policy_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch the current silo's IAM policy +/// Fetch current silo's IAM policy #[endpoint { method = GET, path = "/v1/policy", @@ -481,7 +481,7 @@ pub(crate) async fn policy_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update the current silo's IAM policy +/// Update current silo's IAM policy #[endpoint { method = PUT, path = "/v1/policy", @@ -513,7 +513,7 @@ async fn policy_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// View the resource utilization of the user's current silo +/// Fetch resource utilization for user's current silo #[endpoint { method = GET, path = "/v1/utilization", @@ -535,7 +535,7 @@ async fn utilization_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// View the current utilization of a given silo +/// Fetch current utilization for given silo #[endpoint { method = GET, path = "/v1/system/utilization/silos/{silo}", @@ -628,7 +628,7 @@ async fn system_quotas_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// View the resource quotas of a given silo +/// Fetch resource quotas for silo #[endpoint { method = GET, 
path = "/v1/system/silos/{silo}/quotas", @@ -651,7 +651,7 @@ async fn silo_quotas_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update the resource quotas of a given silo +/// Update resource quotas for silo /// /// If a quota value is not specified, it will remain unchanged. #[endpoint { @@ -735,9 +735,9 @@ async fn silo_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a silo +/// Fetch silo /// -/// Fetch a silo by name or ID. +/// Fetch silo by name or ID. #[endpoint { method = GET, path = "/v1/system/silos/{silo}", @@ -759,7 +759,11 @@ async fn silo_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List IP pools available within silo +/// List IP pools linked to silo +/// +/// Linked IP pools are available to users in the specified silo. A silo can +/// have at most one default pool. IPs are allocated from the default pool when +/// users ask for one without specifying a pool. #[endpoint { method = GET, path = "/v1/system/silos/{silo}/ip-pools", @@ -803,7 +807,7 @@ async fn silo_ip_pool_list( /// Delete a silo /// -/// Delete a silo by name. +/// Delete a silo by name or ID. #[endpoint { method = DELETE, path = "/v1/system/silos/{silo}", @@ -825,7 +829,7 @@ async fn silo_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a silo's IAM policy +/// Fetch silo IAM policy #[endpoint { method = GET, path = "/v1/system/silos/{silo}/policy", @@ -847,7 +851,7 @@ async fn silo_policy_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a silo's IAM policy +/// Update silo IAM policy #[endpoint { method = PUT, path = "/v1/system/silos/{silo}/policy", @@ -877,7 +881,7 @@ async fn silo_policy_update( // Silo-specific user endpoints -/// List built-in (system) users in a silo +/// List built-in (system) users in silo #[endpoint { method = GET, path = "/v1/system/users", @@ -918,7 +922,7 @@ struct UserParam { user_id: Uuid, } -/// Fetch a built-in (system) user +/// Fetch built-in (system) user #[endpoint { method = GET, path = "/v1/system/users/{user_id}", @@ -982,7 +986,7 @@ async fn silo_identity_provider_list( // Silo SAML identity providers -/// Create a SAML IdP +/// Create SAML IdP #[endpoint { method = POST, path = "/v1/system/identity-providers/saml", @@ -1011,7 +1015,7 @@ async fn saml_identity_provider_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a SAML IdP +/// Fetch SAML IdP #[endpoint { method = GET, path = "/v1/system/identity-providers/saml/{provider}", @@ -1049,7 +1053,7 @@ async fn saml_identity_provider_view( // "Local" Identity Provider -/// Create a user +/// Create user /// /// Users can only be created in Silos with `provision_type` == `Fixed`. /// Otherwise, Silo users are just-in-time (JIT) provisioned when a user first @@ -1082,7 +1086,7 @@ async fn local_idp_user_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a user +/// Delete user #[endpoint { method = DELETE, path = "/v1/system/identity-providers/local/users/{user_id}", @@ -1106,7 +1110,7 @@ async fn local_idp_user_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Set or invalidate a user's password +/// Set or invalidate user's password /// /// Passwords can only be updated for users in Silos with identity mode /// `LocalOnly`. 
@@ -1174,7 +1178,7 @@ async fn project_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a project +/// Create project #[endpoint { method = POST, path = "/v1/projects", @@ -1195,7 +1199,7 @@ async fn project_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a project +/// Fetch project #[endpoint { method = GET, path = "/v1/projects/{project}", @@ -1219,7 +1223,7 @@ async fn project_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a project +/// Delete project #[endpoint { method = DELETE, path = "/v1/projects/{project}", @@ -1276,7 +1280,7 @@ async fn project_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a project's IAM policy +/// Fetch project's IAM policy #[endpoint { method = GET, path = "/v1/projects/{project}/policy", @@ -1301,7 +1305,7 @@ async fn project_policy_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a project's IAM policy +/// Update project's IAM policy #[endpoint { method = PUT, path = "/v1/projects/{project}/policy", @@ -1367,7 +1371,7 @@ async fn project_ip_pool_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an IP pool +/// Fetch IP pool #[endpoint { method = GET, path = "/v1/ip-pools/{pool}", @@ -1430,7 +1434,7 @@ pub struct IpPoolPathParam { pub pool_name: Name, } -/// Create an IP pool +/// Create IP pool #[endpoint { method = POST, path = "/v1/system/ip-pools", @@ -1451,7 +1455,7 @@ async fn ip_pool_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an IP pool +/// Fetch IP pool #[endpoint { method = GET, path = "/v1/system/ip-pools/{pool}", @@ -1475,7 +1479,7 @@ async fn ip_pool_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete an IP pool +/// Delete IP pool #[endpoint { method = DELETE, path = "/v1/system/ip-pools/{pool}", @@ -1497,7 +1501,7 @@ async fn ip_pool_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update an IP pool +/// Update IP pool #[endpoint { method = PUT, path = "/v1/system/ip-pools/{pool}", @@ -1521,7 +1525,7 @@ async fn ip_pool_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List an IP pool's linked silos +/// List IP pool's linked silos #[endpoint { method = GET, path = "/v1/system/ip-pools/{pool}/silos", @@ -1569,7 +1573,11 @@ async fn ip_pool_silo_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Make an IP pool available within a silo +/// Link IP pool to silo +/// +/// Users in linked silos can allocate external IPs from this pool for their +/// instances. A silo can have at most one default pool. IPs are allocated from +/// the default pool when users ask for one without specifying a pool. #[endpoint { method = POST, path = "/v1/system/ip-pools/{pool}/silos", @@ -1595,7 +1603,7 @@ async fn ip_pool_silo_link( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Unlink an IP pool from a silo +/// Unlink IP pool from silo /// /// Will fail if there are any outstanding IPs allocated in the silo. 
#[endpoint { @@ -1620,10 +1628,12 @@ async fn ip_pool_silo_unlink( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Make an IP pool default or not-default for a silo +/// Make IP pool default for silo /// -/// When a pool is made default for a silo, any existing default will remain -/// linked to the silo, but will no longer be the default. +/// When a user asks for an IP (e.g., at instance create time) without +/// specifying a pool, the IP comes from the default pool if a default is +/// configured. When a pool is made the default for a silo, any existing default +/// will remain linked to the silo, but will no longer be the default. #[endpoint { method = PUT, path = "/v1/system/ip-pools/{pool}/silos/{silo}", @@ -1650,7 +1660,7 @@ async fn ip_pool_silo_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch the IP pool used for Oxide services +/// Fetch Oxide service IP pool #[endpoint { method = GET, path = "/v1/system/ip-pools-service", @@ -1671,9 +1681,9 @@ async fn ip_pool_service_view( type IpPoolRangePaginationParams = PaginationParams; -/// List ranges for an IP pool +/// List ranges for IP pool /// -/// List ranges for an IP pool. Ranges are ordered by their first address. +/// Ranges are ordered by their first address. #[endpoint { method = GET, path = "/v1/system/ip-pools/{pool}/ranges", @@ -1717,7 +1727,7 @@ async fn ip_pool_range_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Add a range to an IP pool +/// Add range to IP pool #[endpoint { method = POST, path = "/v1/system/ip-pools/{pool}/ranges/add", @@ -1741,7 +1751,7 @@ async fn ip_pool_range_add( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Remove a range from an IP pool +/// Remove range from IP pool #[endpoint { method = POST, path = "/v1/system/ip-pools/{pool}/ranges/remove", @@ -1765,10 +1775,9 @@ async fn ip_pool_range_remove( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List ranges for the IP pool used for Oxide services +/// List IP ranges for the Oxide service pool /// -/// List ranges for the IP pool used for Oxide services. Ranges are ordered by -/// their first address. +/// Ranges are ordered by their first address. 
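Both range-listing endpoints above page through results keyed on each range's first address. A simplified sketch of that keyset pagination (local types here; the real code uses `DataPageParams` and the `Paginator`/`found_batch` loop shown in the deployment.rs hunk earlier):

use std::net::Ipv4Addr;

#[derive(Clone, Copy, Debug)]
struct IpPoolRange { first: Ipv4Addr, last: Ipv4Addr }

// One page: ranges whose first address is strictly greater than the marker,
// in ascending order of first address.
fn list_page(
    ranges: &[IpPoolRange],
    marker: Option<Ipv4Addr>,
    limit: usize,
) -> Vec<IpPoolRange> {
    let mut sorted: Vec<_> = ranges.to_vec();
    sorted.sort_by_key(|r| r.first);
    sorted
        .into_iter()
        .filter(|r| marker.map_or(true, |m| r.first > m))
        .take(limit)
        .collect()
}

fn main() {
    let ranges = [
        IpPoolRange { first: Ipv4Addr::new(10, 0, 0, 1), last: Ipv4Addr::new(10, 0, 0, 5) },
        IpPoolRange { first: Ipv4Addr::new(10, 0, 1, 1), last: Ipv4Addr::new(10, 0, 1, 5) },
        IpPoolRange { first: Ipv4Addr::new(10, 0, 2, 1), last: Ipv4Addr::new(10, 0, 2, 5) },
    ];
    // Walk the list one page at a time, carrying the last first-address seen
    // as the marker for the next page.
    let mut marker = None;
    loop {
        let page = list_page(&ranges, marker, 2);
        let Some(last) = page.last() else { break };
        marker = Some(last.first);
        println!("page: {page:?}");
    }
}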
#[endpoint { method = GET, path = "/v1/system/ip-pools-service/ranges", @@ -1809,7 +1818,7 @@ async fn ip_pool_service_range_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Add a range to an IP pool used for Oxide services +/// Add IP range to Oxide service pool #[endpoint { method = POST, path = "/v1/system/ip-pools-service/ranges/add", @@ -1830,7 +1839,7 @@ async fn ip_pool_service_range_add( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Remove a range from an IP pool used for Oxide services +/// Remove IP range from Oxide service pool #[endpoint { method = POST, path = "/v1/system/ip-pools-service/ranges/remove", @@ -1885,7 +1894,7 @@ async fn floating_ip_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a floating IP +/// Create floating IP #[endpoint { method = POST, path = "/v1/floating-ips", @@ -1911,7 +1920,7 @@ async fn floating_ip_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a floating IP +/// Delete floating IP #[endpoint { method = DELETE, path = "/v1/floating-ips/{floating_ip}", @@ -1941,7 +1950,7 @@ async fn floating_ip_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a floating IP +/// Fetch floating IP #[endpoint { method = GET, path = "/v1/floating-ips/{floating_ip}", @@ -1971,7 +1980,9 @@ async fn floating_ip_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Attach a floating IP to an instance or other resource +/// Attach floating IP +/// +/// Attach floating IP to an instance or other resource. #[endpoint { method = POST, path = "/v1/floating-ips/{floating_ip}/attach", @@ -2005,7 +2016,7 @@ async fn floating_ip_attach( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Detach a floating IP from an instance or other resource +/// Detach floating IP +/// +/// Detach floating IP from instance or other resource.
#[endpoint { method = POST, path = "/v1/floating-ips/{floating_ip}/detach", @@ -2097,7 +2110,7 @@ async fn disk_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a disk +/// Fetch disk #[endpoint { method = GET, path = "/v1/disks/{disk}", @@ -2123,7 +2136,7 @@ async fn disk_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a disk +/// Delete disk #[endpoint { method = DELETE, path = "/v1/disks/{disk}", @@ -2211,7 +2224,7 @@ async fn disk_metrics_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Start importing blocks into a disk +/// Start importing blocks into disk /// /// Start the process of importing blocks into a disk #[endpoint { @@ -2242,7 +2255,7 @@ async fn disk_bulk_write_import_start( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Import blocks into a disk +/// Import blocks into disk #[endpoint { method = POST, path = "/v1/disks/{disk}/bulk-write", @@ -2273,7 +2286,7 @@ async fn disk_bulk_write_import( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Stop importing blocks into a disk +/// Stop importing blocks into disk /// /// Stop the process of importing blocks into a disk #[endpoint { @@ -2371,7 +2384,7 @@ async fn instance_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create an instance +/// Create instance #[endpoint { method = POST, path = "/v1/instances", @@ -2401,7 +2414,7 @@ async fn instance_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an instance +/// Fetch instance #[endpoint { method = GET, path = "/v1/instances/{instance}", @@ -2435,7 +2448,7 @@ async fn instance_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete an instance +/// Delete instance #[endpoint { method = DELETE, path = "/v1/instances/{instance}", @@ -2531,7 +2544,7 @@ async fn instance_reboot( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Boot an instance +/// Boot instance #[endpoint { method = POST, path = "/v1/instances/{instance}/start", @@ -2560,7 +2573,7 @@ async fn instance_start( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Stop an instance +/// Stop instance #[endpoint { method = POST, path = "/v1/instances/{instance}/stop", @@ -2589,7 +2602,7 @@ async fn instance_stop( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an instance's serial console +/// Fetch instance serial console #[endpoint { method = GET, path = "/v1/instances/{instance}/serial-console", @@ -2620,7 +2633,7 @@ async fn instance_serial_console( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Stream an instance's serial console +/// Stream instance serial console #[channel { protocol = WEBSOCKETS, path = "/v1/instances/{instance}/serial-console/stream", @@ -2672,9 +2685,10 @@ async fn instance_serial_console_stream( } } -/// List the SSH public keys added to the instance via cloud-init during instance creation +/// List SSH public keys for instance /// -/// Note that this list is a snapshot in time and will not reflect updates made after +/// List SSH public keys injected via cloud-init during instance creation. Note +/// that this list is a snapshot in time and will not reflect updates made after /// the instance is created. 
#[endpoint { method = GET, @@ -2716,7 +2730,7 @@ async fn instance_ssh_public_key_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List an instance's disks +/// List disks for instance #[endpoint { method = GET, path = "/v1/instances/{instance}/disks", @@ -2757,7 +2771,7 @@ async fn instance_disk_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Attach a disk to an instance +/// Attach disk to instance #[endpoint { method = POST, path = "/v1/instances/{instance}/disks/attach", @@ -2789,7 +2803,7 @@ async fn instance_disk_attach( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Detach a disk from an instance +/// Detach disk from instance #[endpoint { method = POST, path = "/v1/instances/{instance}/disks/detach", @@ -2860,7 +2874,7 @@ async fn certificate_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a new system-wide x.509 certificate +/// Create new system-wide x.509 certificate /// /// This certificate is automatically used by the Oxide Control plane to serve /// external connections. @@ -2890,7 +2904,7 @@ struct CertificatePathParam { certificate: NameOrId, } -/// Fetch a certificate +/// Fetch certificate /// /// Returns the details of a specific certificate #[endpoint { @@ -2914,7 +2928,7 @@ async fn certificate_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a certificate +/// Delete certificate /// /// Permanently delete a certificate. This operation cannot be undone. #[endpoint { @@ -2942,7 +2956,7 @@ async fn certificate_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create an address lot +/// Create address lot #[endpoint { method = POST, path = "/v1/system/networking/address-lot", @@ -2968,7 +2982,7 @@ async fn networking_address_lot_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete an address lot +/// Delete address lot #[endpoint { method = DELETE, path = "/v1/system/networking/address-lot/{address_lot}", @@ -3025,7 +3039,7 @@ async fn networking_address_lot_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List the blocks in an address lot +/// List blocks in address lot #[endpoint { method = GET, path = "/v1/system/networking/address-lot/{address_lot}/blocks", @@ -3061,7 +3075,7 @@ async fn networking_address_lot_block_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a loopback address +/// Create loopback address #[endpoint { method = POST, path = "/v1/system/networking/loopback-address", @@ -3102,7 +3116,7 @@ pub struct LoopbackAddressPath { pub subnet_mask: u8, } -/// Delete a loopback address +/// Delete loopback address #[endpoint { method = DELETE, path = "/v1/system/networking/loopback-address/{rack_id}/{switch_location}/{address}/{subnet_mask}", @@ -3249,7 +3263,7 @@ async fn networking_switch_port_settings_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Get information about a switch port +/// Get information about switch port #[endpoint { method = GET, path = "/v1/system/networking/switch-port-settings/{port}", @@ -3352,7 +3366,7 @@ async fn networking_switch_port_clear_settings( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a new BGP configuration +/// Create new BGP configuration 
#[endpoint { method = POST, path = "/v1/system/networking/bgp", @@ -3449,7 +3463,7 @@ async fn networking_bgp_imported_routes_ipv4( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a BGP configuration +/// Delete BGP configuration #[endpoint { method = DELETE, path = "/v1/system/networking/bgp", @@ -3470,7 +3484,7 @@ async fn networking_bgp_config_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a new BGP announce set +/// Create new BGP announce set #[endpoint { method = POST, path = "/v1/system/networking/bgp-announce", @@ -3518,7 +3532,7 @@ async fn networking_bgp_announce_set_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a BGP announce set +/// Delete BGP announce set #[endpoint { method = DELETE, path = "/v1/system/networking/bgp-announce", @@ -3539,7 +3553,7 @@ async fn networking_bgp_announce_set_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Enable a BFD session. +/// Enable a BFD session #[endpoint { method = POST, path = "/v1/system/networking/bfd-enable", @@ -3560,7 +3574,7 @@ async fn networking_bfd_enable( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Disable a BFD session. +/// Disable a BFD session #[endpoint { method = POST, path = "/v1/system/networking/bfd-disable", @@ -3581,7 +3595,7 @@ async fn networking_bfd_disable( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Get BFD status. +/// Get BFD status #[endpoint { method = GET, path = "/v1/system/networking/bfd-status", @@ -3652,7 +3666,7 @@ async fn image_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create an image +/// Create image /// /// Create a new image in a project. #[endpoint { @@ -3690,7 +3704,7 @@ async fn image_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an image +/// Fetch image /// /// Fetch the details for a specific image in a project. #[endpoint { @@ -3733,7 +3747,7 @@ async fn image_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete an image +/// Delete image /// /// Permanently delete an image from a project. This operation cannot be undone. 
/// Any instances in the project using the image will continue to run, however @@ -3769,9 +3783,9 @@ async fn image_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Promote a project image +/// Promote project image /// -/// Promote a project image to be visible to all projects in the silo +/// Promote project image to be visible to all projects in the silo #[endpoint { method = POST, path = "/v1/images/{image}/promote", @@ -3803,9 +3817,9 @@ async fn image_promote( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Demote a silo image +/// Demote silo image /// -/// Demote a silo image to be visible only to a specified project +/// Demote silo image to be visible only to a specified project #[endpoint { method = POST, path = "/v1/images/{image}/demote", @@ -3877,7 +3891,7 @@ async fn instance_network_interface_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a network interface +/// Create network interface #[endpoint { method = POST, path = "/v1/network-interfaces", @@ -3906,7 +3920,7 @@ async fn instance_network_interface_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a network interface +/// Delete network interface /// /// Note that the primary interface for an instance cannot be deleted if there /// are any secondary interfaces. A new primary interface must be designated @@ -3943,7 +3957,7 @@ async fn instance_network_interface_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a network interface +/// Fetch network interface #[endpoint { method = GET, path = "/v1/network-interfaces/{interface}", @@ -3974,7 +3988,7 @@ async fn instance_network_interface_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a network interface +/// Update network interface #[endpoint { method = PUT, path = "/v1/network-interfaces/{interface}", @@ -4048,7 +4062,7 @@ async fn instance_external_ip_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Allocate and attach an ephemeral IP to an instance +/// Allocate and attach ephemeral IP to instance #[endpoint { method = POST, path = "/v1/instances/{instance}/external-ips/ephemeral", @@ -4086,7 +4100,7 @@ async fn instance_ephemeral_ip_attach( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Detach and deallocate an ephemeral IP from an instance +/// Detach and deallocate ephemeral IP from instance #[endpoint { method = DELETE, path = "/v1/instances/{instance}/external-ips/ephemeral", @@ -4158,7 +4172,7 @@ async fn snapshot_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a snapshot +/// Create snapshot /// /// Creates a point-in-time snapshot from a disk. 
#[endpoint { @@ -4186,7 +4200,7 @@ async fn snapshot_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a snapshot +/// Fetch snapshot #[endpoint { method = GET, path = "/v1/snapshots/{snapshot}", @@ -4214,7 +4228,7 @@ async fn snapshot_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a snapshot +/// Delete snapshot #[endpoint { method = DELETE, path = "/v1/snapshots/{snapshot}", @@ -4281,7 +4295,7 @@ async fn vpc_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a VPC +/// Create VPC #[endpoint { method = POST, path = "/v1/vpcs", @@ -4307,7 +4321,7 @@ async fn vpc_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a VPC +/// Fetch VPC #[endpoint { method = GET, path = "/v1/vpcs/{vpc}", @@ -4362,7 +4376,7 @@ async fn vpc_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a VPC +/// Delete VPC #[endpoint { method = DELETE, path = "/v1/vpcs/{vpc}", @@ -4423,7 +4437,7 @@ async fn vpc_subnet_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a subnet +/// Create subnet #[endpoint { method = POST, path = "/v1/vpc-subnets", @@ -4448,7 +4462,7 @@ async fn vpc_subnet_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a subnet +/// Fetch subnet #[endpoint { method = GET, path = "/v1/vpc-subnets/{subnet}", @@ -4477,7 +4491,7 @@ async fn vpc_subnet_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a subnet +/// Delete subnet #[endpoint { method = DELETE, path = "/v1/vpc-subnets/{subnet}", @@ -4506,7 +4520,7 @@ async fn vpc_subnet_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a subnet +/// Update subnet #[endpoint { method = PUT, path = "/v1/vpc-subnets/{subnet}", @@ -4686,7 +4700,7 @@ async fn vpc_router_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a router +/// Fetch router #[endpoint { method = GET, path = "/v1/vpc-routers/{router}", @@ -4716,7 +4730,7 @@ async fn vpc_router_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a VPC router +/// Create VPC router #[endpoint { method = POST, path = "/v1/vpc-routers", @@ -4748,7 +4762,7 @@ async fn vpc_router_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a router +/// Delete router #[endpoint { method = DELETE, path = "/v1/vpc-routers/{router}", @@ -4778,7 +4792,7 @@ async fn vpc_router_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a router +/// Update router #[endpoint { method = PUT, path = "/v1/vpc-routers/{router}", @@ -4852,7 +4866,7 @@ async fn vpc_router_route_list( // Vpc Router Routes -/// Fetch a route +/// Fetch route #[endpoint { method = GET, path = "/v1/vpc-router-routes/{route}", @@ -4885,7 +4899,7 @@ async fn vpc_router_route_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a router +/// Create router #[endpoint { method = POST, path = "/v1/vpc-router-routes", @@ -4917,7 +4931,7 @@ async fn vpc_router_route_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a route +/// Delete route #[endpoint { method = DELETE, path = 
"/v1/vpc-router-routes/{route}", @@ -4949,7 +4963,7 @@ async fn vpc_router_route_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a route +/// Update route #[endpoint { method = PUT, path = "/v1/vpc-router-routes/{route}", @@ -5024,7 +5038,7 @@ struct RackPathParam { rack_id: Uuid, } -/// Fetch a rack +/// Fetch rack #[endpoint { method = GET, path = "/v1/system/hardware/racks/{rack_id}", @@ -5045,7 +5059,7 @@ async fn rack_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List uninitialized sleds in a given rack +/// List uninitialized sleds #[endpoint { method = GET, path = "/v1/system/hardware/sleds-uninitialized", @@ -5072,7 +5086,7 @@ async fn sled_list_uninitialized( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Add a sled to an initialized rack +/// Add sled to initialized rack // // TODO: In the future this should really be a PUT request, once we resolve // https://github.com/oxidecomputer/omicron/issues/4494. It should also @@ -5129,7 +5143,7 @@ async fn sled_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a sled +/// Fetch sled #[endpoint { method = GET, path = "/v1/system/hardware/sleds/{sled_id}", @@ -5151,7 +5165,7 @@ async fn sled_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Set the sled's provision state +/// Set sled provision state #[endpoint { method = PUT, path = "/v1/system/hardware/sleds/{sled_id}/provision-state", @@ -5189,7 +5203,7 @@ async fn sled_set_provision_state( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List instances running on a given sled +/// List instances running on given sled #[endpoint { method = GET, path = "/v1/system/hardware/sleds/{sled_id}/instances", @@ -5290,7 +5304,7 @@ async fn switch_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a switch +/// Fetch switch #[endpoint { method = GET, path = "/v1/system/hardware/switches/{switch_id}", @@ -5473,7 +5487,7 @@ async fn silo_metric( // Updates -/// Upload a TUF repository +/// Upload TUF repository #[endpoint { method = PUT, path = "/v1/system/update/repository", @@ -5498,7 +5512,9 @@ async fn system_update_put_repository( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Get the description of a repository by system version. +/// Fetch TUF repository description +/// +/// Fetch description of TUF repository by system version. 
#[endpoint { method = GET, path = "/v1/system/update/repository/{system_version}", @@ -5653,7 +5669,7 @@ async fn user_builtin_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a built-in user +/// Fetch built-in user #[endpoint { method = GET, path = "/v1/system/users-builtin/{user}", @@ -5735,7 +5751,7 @@ async fn role_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a built-in role +/// Fetch built-in role #[endpoint { method = GET, path = "/v1/system/roles/{role_name}", @@ -5759,7 +5775,7 @@ async fn role_view( // Current user -/// Fetch the user associated with the current session +/// Fetch user for current session #[endpoint { method = GET, path = "/v1/me", @@ -5782,7 +5798,7 @@ pub(crate) async fn current_user_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch the silo groups the current user belongs to +/// Fetch current user's groups #[endpoint { method = GET, path = "/v1/me/groups", @@ -5856,7 +5872,7 @@ async fn current_user_ssh_key_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create an SSH public key +/// Create SSH public key /// /// Create an SSH public key for the currently authenticated user. #[endpoint { @@ -5884,9 +5900,9 @@ async fn current_user_ssh_key_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an SSH public key +/// Fetch SSH public key /// -/// Fetch an SSH public key associated with the currently authenticated user. +/// Fetch SSH public key associated with the currently authenticated user. #[endpoint { method = GET, path = "/v1/me/ssh-keys/{ssh_key}", @@ -5918,7 +5934,7 @@ async fn current_user_ssh_key_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete an SSH public key +/// Delete SSH public key /// /// Delete an SSH public key associated with the currently authenticated user. #[endpoint { diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 0122d9b439..eddc834a2a 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -57,6 +57,7 @@ type NexusApiDescription = ApiDescription<Arc<ServerContext>>; pub(crate) fn internal_api() -> NexusApiDescription { fn register_endpoints(api: &mut NexusApiDescription) -> Result<(), String> { api.register(sled_agent_put)?; + api.register(sled_firewall_rules_request)?; api.register(switch_put)?; api.register(rack_initialization_complete)?; api.register(physical_disk_put)?; @@ -126,6 +127,31 @@ async fn sled_agent_put( apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await } +/// Request a new set of firewall rules for a sled. +/// +/// This causes Nexus to read the latest set of rules for the sled, +/// and call a Sled endpoint which applies the rules to all OPTE ports +/// that happen to exist.
+#[endpoint { + method = POST, + path = "/sled-agents/{sled_id}/firewall-rules-update", + }] +async fn sled_firewall_rules_request( + rqctx: RequestContext<Arc<ServerContext>>, + path_params: Path<SledAgentPathParam>, +) -> Result<HttpResponseUpdatedNoContent, HttpError> { + let apictx = rqctx.context(); + let nexus = &apictx.nexus; + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + let path = path_params.into_inner(); + let sled_id = &path.sled_id; + let handler = async { + nexus.sled_request_firewall_rules(&opctx, *sled_id).await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + /// Path parameters for Rack requests. #[derive(Deserialize, JsonSchema)] struct RackPathParam { diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index e1392440a1..1b80bc4c3c 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -47,7 +47,7 @@ extern crate slog; /// to stdout. pub fn run_openapi_external() -> Result<(), String> { external_api() - .openapi("Oxide Region API", "0.0.1") + .openapi("Oxide Region API", "0.0.6") .description("API for interacting with the Oxide control plane") .contact_url("https://oxide.computer") .contact_email("api@oxide.computer") @@ -288,13 +288,15 @@ impl nexus_test_interface::NexusServer for Server { vec!["qsfp0".parse().unwrap()], )]), ), - rack_network_config: Some(RackNetworkConfig { - rack_subnet: "fd00:1122:3344:01::/56".parse().unwrap(), + rack_network_config: RackNetworkConfig { + rack_subnet: "fd00:1122:3344:0100::/56" + .parse() + .unwrap(), infra_ip_first: Ipv4Addr::UNSPECIFIED, infra_ip_last: Ipv4Addr::UNSPECIFIED, ports: Vec::new(), bgp: Vec::new(), - }), + }, }, ) .await diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index b0fb7691e6..2129219936 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -65,7 +65,7 @@ pub const RACK_UUID: &str = "c19a698f-c6f9-4a17-ae30-20d711b8f7dc"; pub const SWITCH_UUID: &str = "dae4e1f1-410e-4314-bff1-fec0504be07e"; pub const OXIMETER_UUID: &str = "39e6175b-4df2-4730-b11d-cbc1e60a2e78"; pub const PRODUCER_UUID: &str = "a6458b7d-87c3-4483-be96-854d814c20de"; -pub const RACK_SUBNET: &str = "fd00:1122:3344:01::/56"; +pub const RACK_SUBNET: &str = "fd00:1122:3344:0100::/56"; /// Password for the user created by the test suite /// diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 3571388747..8d37f9e3ef 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -103,6 +103,7 @@ phantom_disks.period_secs = 30 blueprints.period_secs_load = 100 blueprints.period_secs_execute = 600 sync_service_zone_nat.period_secs = 30 +region_replacement.period_secs = 30 [default_region_allocation_strategy] # we only have one sled in the test environment, so we need to use the diff --git a/nexus/tests/integration_tests/commands.rs b/nexus/tests/integration_tests/commands.rs index 02d938b2ac..bc79a7d5a2 100644 --- a/nexus/tests/integration_tests/commands.rs +++ b/nexus/tests/integration_tests/commands.rs @@ -109,7 +109,7 @@ fn test_nexus_openapi() { .expect("stdout was not valid OpenAPI"); assert_eq!(spec.openapi, "3.0.3"); assert_eq!(spec.info.title, "Oxide Region API"); - assert_eq!(spec.info.version, "0.0.1"); + assert_eq!(spec.info.version, "0.0.6"); // Spot check a couple of items.
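The handler added above is a thin wrapper: resolve the path parameter, then ask Nexus to recompute and push rules. A hypothetical caller, such as a one-off debugging tool, could exercise it with a bare POST and expect 204 on success. This sketch uses reqwest and anyhow (both already workspace dependencies); the helper itself is illustrative and not part of the change:

```rust
/// Hypothetical helper: ask Nexus (internal API at `base_url`) to recompute
/// and push firewall rules for one sled.
async fn trigger_firewall_rules_sync(
    client: &reqwest::Client,
    base_url: &str,
    sled_id: uuid::Uuid,
) -> anyhow::Result<()> {
    let url =
        format!("{base_url}/sled-agents/{sled_id}/firewall-rules-update");
    let response = client.post(&url).send().await?;
    // The endpoint returns 204 No Content when the update has been applied.
    anyhow::ensure!(
        response.status() == reqwest::StatusCode::NO_CONTENT,
        "expected 204 No Content, got {}",
        response.status()
    );
    Ok(())
}
```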
assert!(!spec.paths.paths.is_empty()); diff --git a/nexus/tests/integration_tests/rack.rs b/nexus/tests/integration_tests/rack.rs index a6fc93e92a..a58871ee71 100644 --- a/nexus/tests/integration_tests/rack.rs +++ b/nexus/tests/integration_tests/rack.rs @@ -110,7 +110,7 @@ async fn test_sled_list_uninitialized(cptestctx: &ControlPlaneTestContext) { let baseboard = uninitialized_sleds.pop().unwrap().baseboard; let sled_uuid = Uuid::new_v4(); let sa = SledAgentStartupInfo { - sa_address: "[fd00:1122:3344:01::1]:8080".parse().unwrap(), + sa_address: "[fd00:1122:3344:0100::1]:8080".parse().unwrap(), role: SledRole::Gimlet, baseboard: Baseboard { serial_number: baseboard.serial, diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 4c6e5e25e9..f2c2425172 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -11,13 +11,17 @@ //! nexus/deployment does not currently know about nexus/db-model and it's //! convenient to separate these concerns.) +use crate::external_api::views::SledProvisionState; use crate::inventory::Collection; +pub use crate::inventory::NetworkInterface; +pub use crate::inventory::NetworkInterfaceKind; pub use crate::inventory::OmicronZoneConfig; pub use crate::inventory::OmicronZoneDataset; pub use crate::inventory::OmicronZoneType; pub use crate::inventory::OmicronZonesConfig; pub use crate::inventory::SourceNatConfig; pub use crate::inventory::ZpoolName; +use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; use omicron_common::address::SLED_PREFIX; use omicron_common::api::external::Generation; @@ -43,14 +47,26 @@ use uuid::Uuid; /// /// The current policy is pretty limited. It's aimed primarily at supporting /// the add/remove sled use case. +#[derive(Debug, Clone)] pub struct Policy { /// set of sleds that are supposed to be part of the control plane, along /// with information about resources available to the planner pub sleds: BTreeMap<Uuid, SledResources>, + + /// ranges specified by the IP pool for externally-visible control plane + /// services (e.g., external DNS, Nexus, boundary NTP) + pub service_ip_pool_ranges: Vec<IpRange>, + + /// desired total number of deployed Nexus zones + pub target_nexus_zone_count: usize, } /// Describes the resources available on each sled for the planner +#[derive(Debug, Clone)] pub struct SledResources { + /// provision state of this sled + pub provision_state: SledProvisionState, + /// zpools on this sled /// /// (used to allocate storage for control plane zones with persistent @@ -476,10 +492,11 @@ impl<'a> OmicronZonesDiff<'a> { for z in &bbsledzones.zones { writeln!( f, - "{} zone {} type {} ({})", + "{} zone {} type {} underlay IP {} ({})", prefix, z.id, z.zone_type.label(), + z.underlay_address, label )?; } @@ -539,44 +556,65 @@ impl<'a> std::fmt::Display for OmicronZonesDiff<'a> { DiffZoneChangedHow::DetailsChanged => { writeln!( f, - "- zone {} type {} (changed)", - zone_id, zone_type, + "- zone {} type {} underlay IP {} \ + (changed)", + zone_id, + zone_type, + zone_changes.zone_before.underlay_address, )?; writeln!( f, - "+ zone {} type {} (changed)", - zone_id, zone2_type, + "+ zone {} type {} underlay IP {} \ + (changed)", + zone_id, + zone2_type, + zone_changes.zone_after.underlay_address, )?; } DiffZoneChangedHow::RemovedFromService => { writeln!( f, - "- zone {} type {} (in service)", - zone_id, zone_type, + "- zone {} type {} underlay IP {} \ + (in service)", + zone_id, + zone_type, + zone_changes.zone_before.underlay_address, )?; writeln!( f, - "+ zone {} type {} (removed from service)", - zone_id, zone2_type, + "+ zone {} type {} underlay IP {} \ + (removed from service)", + zone_id, + zone2_type, + zone_changes.zone_after.underlay_address, )?; } DiffZoneChangedHow::AddedToService => { writeln!( f, - "- zone {} type {} (not in service)", - zone_id, zone_type, + "- zone {} type {} underlay IP {} \ + (not in service)", + zone_id, + zone_type, + zone_changes.zone_before.underlay_address, )?; writeln!( f, - "+ zone {} type {} (added to service)", - zone_id, zone2_type, + "+ zone {} type {} underlay IP {} \ + (added to service)", + zone_id, + zone2_type, + zone_changes.zone_after.underlay_address, )?; } DiffZoneChangedHow::NoChanges => { writeln!( f, - " zone {} type {} (unchanged)", - zone_id, zone_type, + " zone {} type {} underlay IP {} \ + (unchanged)", + zone_id, + zone_type, + zone_changes.zone_before.underlay_address, )?; } } @@ -585,8 +623,9 @@ impl<'a> std::fmt::Display for OmicronZonesDiff<'a> { for zone in sled_changes.zones_added() { writeln!( f, - "+ zone {} type {} (added)", + "+ zone {} type {} underlay IP {} (added)", zone.id, zone.zone_type.label(), + zone.underlay_address, )?; } diff --git a/nexus/types/src/internal_api/params.rs b/nexus/types/src/internal_api/params.rs index bc25e8d4bd..ab15ec26b7 100644 --- a/nexus/types/src/internal_api/params.rs +++ b/nexus/types/src/internal_api/params.rs @@ -263,7 +263,7 @@ pub struct RackInitializationRequest { /// The external qsfp ports per sidecar pub external_port_count: ExternalPortDiscovery, /// Initial rack network configuration - pub rack_network_config: Option<RackNetworkConfig>, + pub rack_network_config: RackNetworkConfig, } pub type DnsConfigParams = dns_service_client::types::DnsConfigParams; diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs index 760d3918f4..71e8e64d97 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -48,7 +48,7 @@ use uuid::Uuid; /// database. /// /// See the documentation in the database schema for more background.
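A detail worth calling out in these display hunks: `writeln!` fills `{}` placeholders strictly in argument order, so every line must pass the zone id, then the zone type label, then the underlay address to match `zone {} type {} underlay IP {}`. A freestanding illustration of the line format (not Nexus code):

```rust
use std::net::Ipv6Addr;

// Placeholders are filled left to right: id, type label, underlay IP.
fn added_zone_line(id: uuid::Uuid, label: &str, underlay: Ipv6Addr) -> String {
    format!("+ zone {} type {} underlay IP {} (added)", id, label, underlay)
}

fn main() {
    let line = added_zone_line(
        uuid::Uuid::nil(),
        "nexus",
        Ipv6Addr::new(0xfd00, 0x1122, 0x3344, 0x0100, 0, 0, 0, 1),
    );
    println!("{line}");
}
```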
-#[derive(Debug, Eq, PartialEq)] +#[derive(Debug, Eq, PartialEq, Clone)] pub struct Collection { /// unique identifier for this collection pub id: Uuid, diff --git a/openapi/bootstrap-agent.json b/openapi/bootstrap-agent.json index 6fd83cef47..a55803eda9 100644 --- a/openapi/bootstrap-agent.json +++ b/openapi/bootstrap-agent.json @@ -651,7 +651,6 @@ } }, "rack_network_config": { - "nullable": true, "description": "Initial rack network configuration", "allOf": [ { @@ -659,10 +658,6 @@ } ] }, - "rack_subnet": { - "type": "string", - "format": "ipv6" - }, "recovery_silo": { "description": "Configuration of the Recovery Silo (the initial Silo)", "allOf": [ @@ -688,7 +683,7 @@ "external_dns_zone_name", "internal_services_ip_pool_ranges", "ntp_servers", - "rack_subnet", + "rack_network_config", "recovery_silo" ] }, diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 8907d2ada0..94f40fed1b 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -835,6 +835,35 @@ } } }, + "/sled-agents/{sled_id}/firewall-rules-update": { + "post": { + "summary": "Request a new set of firewall rules for a sled.", + "description": "This causes Nexus to read the latest set of rules for the sled, and call a Sled endpoint which applies the rules to all OPTE ports that happen to exist.", + "operationId": "sled_firewall_rules_request", + "parameters": [ + { + "in": "path", + "name": "sled_id", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/sled-agents/{sled_id}/zpools/{zpool_id}": { "put": { "summary": "Report that a pool for a specified sled has come online.", @@ -5636,7 +5665,6 @@ } }, "rack_network_config": { - "nullable": true, "description": "Initial rack network configuration", "allOf": [ { @@ -5667,6 +5695,7 @@ "external_port_count", "internal_dns_zone_config", "internal_services_ip_pool_ranges", + "rack_network_config", "recovery_silo", "services" ] diff --git a/openapi/nexus.json b/openapi/nexus.json index 7aedd1b523..f42841dcf6 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -7,7 +7,7 @@ "url": "https://oxide.computer", "email": "api@oxide.computer" }, - "version": "0.0.1" + "version": "0.0.6" }, "paths": { "/device/auth": { @@ -223,7 +223,7 @@ "tags": [ "silos" ], - "summary": "Create a new system-wide x.509 certificate", + "summary": "Create new system-wide x.509 certificate", "description": "This certificate is automatically used by the Oxide Control plane to serve external connections.", "operationId": "certificate_create", "requestBody": { @@ -261,7 +261,7 @@ "tags": [ "silos" ], - "summary": "Fetch a certificate", + "summary": "Fetch certificate", "description": "Returns the details of a specific certificate", "operationId": "certificate_view", "parameters": [ @@ -297,7 +297,7 @@ "tags": [ "silos" ], - "summary": "Delete a certificate", + "summary": "Delete certificate", "description": "Permanently delete a certificate. 
This operation cannot be undone.", "operationId": "certificate_delete", "parameters": [ @@ -443,7 +443,7 @@ "tags": [ "disks" ], - "summary": "Fetch a disk", + "summary": "Fetch disk", "operationId": "disk_view", "parameters": [ { @@ -487,7 +487,7 @@ "tags": [ "disks" ], - "summary": "Delete a disk", + "summary": "Delete disk", "operationId": "disk_delete", "parameters": [ { @@ -526,7 +526,7 @@ "tags": [ "disks" ], - "summary": "Import blocks into a disk", + "summary": "Import blocks into disk", "operationId": "disk_bulk_write_import", "parameters": [ { @@ -575,7 +575,7 @@ "tags": [ "disks" ], - "summary": "Start importing blocks into a disk", + "summary": "Start importing blocks into disk", "description": "Start the process of importing blocks into a disk", "operationId": "disk_bulk_write_import_start", "parameters": [ @@ -615,7 +615,7 @@ "tags": [ "disks" ], - "summary": "Stop importing blocks into a disk", + "summary": "Stop importing blocks into disk", "description": "Stop the process of importing blocks into a disk", "operationId": "disk_bulk_write_import_stop", "parameters": [ @@ -876,7 +876,7 @@ "tags": [ "floating-ips" ], - "summary": "Create a floating IP", + "summary": "Create floating IP", "operationId": "floating_ip_create", "parameters": [ { @@ -924,7 +924,7 @@ "tags": [ "floating-ips" ], - "summary": "Fetch a floating IP", + "summary": "Fetch floating IP", "operationId": "floating_ip_view", "parameters": [ { @@ -968,7 +968,7 @@ "tags": [ "floating-ips" ], - "summary": "Delete a floating IP", + "summary": "Delete floating IP", "operationId": "floating_ip_delete", "parameters": [ { @@ -1007,7 +1007,8 @@ "tags": [ "floating-ips" ], - "summary": "Attach a floating IP to an instance or other resource", + "summary": "Attach floating IP", + "description": "Attach floating IP to an instance or other resource.", "operationId": "floating_ip_attach", "parameters": [ { @@ -1063,7 +1064,7 @@ "tags": [ "floating-ips" ], - "summary": "Detach a floating IP from an instance or other resource", + "summary": "Detach floating IP", "operationId": "floating_ip_detach", "parameters": [ { @@ -1273,7 +1274,7 @@ "tags": [ "images" ], - "summary": "Create an image", + "summary": "Create image", "description": "Create a new image in a project.", "operationId": "image_create", "parameters": [ @@ -1321,7 +1322,7 @@ "tags": [ "images" ], - "summary": "Fetch an image", + "summary": "Fetch image", "description": "Fetch the details for a specific image in a project.", "operationId": "image_view", "parameters": [ @@ -1366,7 +1367,7 @@ "tags": [ "images" ], - "summary": "Delete an image", + "summary": "Delete image", "description": "Permanently delete an image from a project. This operation cannot be undone. 
Any instances in the project using the image will continue to run, however new instances can not be created with this image.", "operationId": "image_delete", "parameters": [ @@ -1406,8 +1407,8 @@ "tags": [ "images" ], - "summary": "Demote a silo image", - "description": "Demote a silo image to be visible only to a specified project", + "summary": "Demote silo image", + "description": "Demote silo image to be visible only to a specified project", "operationId": "image_demote", "parameters": [ { @@ -1454,8 +1455,8 @@ "tags": [ "images" ], - "summary": "Promote a project image", - "description": "Promote a project image to be visible to all projects in the silo", + "summary": "Promote project image", + "description": "Promote project image to be visible to all projects in the silo", "operationId": "image_promote", "parameters": [ { @@ -1568,7 +1569,7 @@ "tags": [ "instances" ], - "summary": "Create an instance", + "summary": "Create instance", "operationId": "instance_create", "parameters": [ { @@ -1616,7 +1617,7 @@ "tags": [ "instances" ], - "summary": "Fetch an instance", + "summary": "Fetch instance", "operationId": "instance_view", "parameters": [ { @@ -1660,7 +1661,7 @@ "tags": [ "instances" ], - "summary": "Delete an instance", + "summary": "Delete instance", "operationId": "instance_delete", "parameters": [ { @@ -1699,7 +1700,7 @@ "tags": [ "instances" ], - "summary": "List an instance's disks", + "summary": "List disks for instance", "operationId": "instance_disk_list", "parameters": [ { @@ -1775,7 +1776,7 @@ "tags": [ "instances" ], - "summary": "Attach a disk to an instance", + "summary": "Attach disk to instance", "operationId": "instance_disk_attach", "parameters": [ { @@ -1831,7 +1832,7 @@ "tags": [ "instances" ], - "summary": "Detach a disk from an instance", + "summary": "Detach disk from instance", "operationId": "instance_disk_detach", "parameters": [ { @@ -1933,7 +1934,7 @@ "tags": [ "instances" ], - "summary": "Allocate and attach an ephemeral IP to an instance", + "summary": "Allocate and attach ephemeral IP to instance", "operationId": "instance_ephemeral_ip_attach", "parameters": [ { @@ -1987,7 +1988,7 @@ "tags": [ "instances" ], - "summary": "Detach and deallocate an ephemeral IP from an instance", + "summary": "Detach and deallocate ephemeral IP from instance", "operationId": "instance_ephemeral_ip_detach", "parameters": [ { @@ -2128,7 +2129,7 @@ "tags": [ "instances" ], - "summary": "Fetch an instance's serial console", + "summary": "Fetch instance serial console", "operationId": "instance_serial_console", "parameters": [ { @@ -2207,7 +2208,7 @@ "tags": [ "instances" ], - "summary": "Stream an instance's serial console", + "summary": "Stream instance serial console", "operationId": "instance_serial_console_stream", "parameters": [ { @@ -2257,8 +2258,8 @@ "tags": [ "instances" ], - "summary": "List the SSH public keys added to the instance via cloud-init during instance creation", - "description": "Note that this list is a snapshot in time and will not reflect updates made after the instance is created.", + "summary": "List SSH public keys for instance", + "description": "List SSH public keys injected via cloud-init during instance creation. 
Note that this list is a snapshot in time and will not reflect updates made after the instance is created.", "operationId": "instance_ssh_public_key_list", "parameters": [ { @@ -2334,7 +2335,7 @@ "tags": [ "instances" ], - "summary": "Boot an instance", + "summary": "Boot instance", "operationId": "instance_start", "parameters": [ { @@ -2380,7 +2381,7 @@ "tags": [ "instances" ], - "summary": "Stop an instance", + "summary": "Stop instance", "operationId": "instance_stop", "parameters": [ { @@ -2485,7 +2486,7 @@ "tags": [ "projects" ], - "summary": "Fetch an IP pool", + "summary": "Fetch IP pool", "operationId": "project_ip_pool_view", "parameters": [ { @@ -2583,7 +2584,7 @@ "tags": [ "session" ], - "summary": "Fetch the user associated with the current session", + "summary": "Fetch user for current session", "operationId": "current_user_view", "responses": { "200": { @@ -2610,7 +2611,7 @@ "tags": [ "session" ], - "summary": "Fetch the silo groups the current user belongs to", + "summary": "Fetch current user's groups", "operationId": "current_user_groups", "parameters": [ { @@ -2727,7 +2728,7 @@ "tags": [ "session" ], - "summary": "Create an SSH public key", + "summary": "Create SSH public key", "description": "Create an SSH public key for the currently authenticated user.", "operationId": "current_user_ssh_key_create", "requestBody": { @@ -2765,8 +2766,8 @@ "tags": [ "session" ], - "summary": "Fetch an SSH public key", - "description": "Fetch an SSH public key associated with the currently authenticated user.", + "summary": "Fetch SSH public key", + "description": "Fetch SSH public key associated with the currently authenticated user.", "operationId": "current_user_ssh_key_view", "parameters": [ { @@ -2802,7 +2803,7 @@ "tags": [ "session" ], - "summary": "Delete an SSH public key", + "summary": "Delete SSH public key", "description": "Delete an SSH public key associated with the currently authenticated user.", "operationId": "current_user_ssh_key_delete", "parameters": [ @@ -3006,7 +3007,7 @@ "tags": [ "instances" ], - "summary": "Create a network interface", + "summary": "Create network interface", "operationId": "instance_network_interface_create", "parameters": [ { @@ -3062,7 +3063,7 @@ "tags": [ "instances" ], - "summary": "Fetch a network interface", + "summary": "Fetch network interface", "operationId": "instance_network_interface_view", "parameters": [ { @@ -3114,7 +3115,7 @@ "tags": [ "instances" ], - "summary": "Update a network interface", + "summary": "Update network interface", "operationId": "instance_network_interface_update", "parameters": [ { @@ -3176,7 +3177,7 @@ "tags": [ "instances" ], - "summary": "Delete a network interface", + "summary": "Delete network interface", "description": "Note that the primary interface for an instance cannot be deleted if there are any secondary interfaces. A new primary interface must be designated first. 
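The interface-deletion rule described here (and completed just below) is a small invariant: secondary interfaces are always deletable, while the primary may only be deleted once it is the last interface on the instance. Captured as a predicate over illustrative types; Nexus's real models live in nexus/db-model and are more involved:

```rust
// Illustrative stand-in for an instance's network interface.
struct Nic {
    primary: bool,
}

/// May `target` be deleted, given every interface on the same instance?
fn nic_deletable(target: &Nic, all_for_instance: &[Nic]) -> bool {
    if !target.primary {
        // Secondary interfaces can always be removed.
        return true;
    }
    // The primary can be removed only when no secondaries remain.
    !all_for_instance.iter().any(|n| !n.primary)
}
```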
The primary interface can be deleted if there are no secondary interfaces.", "operationId": "instance_network_interface_delete", "parameters": [ @@ -3252,7 +3253,7 @@ "tags": [ "silos" ], - "summary": "Fetch the current silo's IAM policy", + "summary": "Fetch current silo's IAM policy", "operationId": "policy_view", "responses": { "200": { @@ -3277,7 +3278,7 @@ "tags": [ "silos" ], - "summary": "Update the current silo's IAM policy", + "summary": "Update current silo's IAM policy", "operationId": "policy_update", "requestBody": { "content": { @@ -3371,7 +3372,7 @@ "tags": [ "projects" ], - "summary": "Create a project", + "summary": "Create project", "operationId": "project_create", "requestBody": { "content": { @@ -3408,7 +3409,7 @@ "tags": [ "projects" ], - "summary": "Fetch a project", + "summary": "Fetch project", "operationId": "project_view", "parameters": [ { @@ -3490,7 +3491,7 @@ "tags": [ "projects" ], - "summary": "Delete a project", + "summary": "Delete project", "operationId": "project_delete", "parameters": [ { @@ -3521,7 +3522,7 @@ "tags": [ "projects" ], - "summary": "Fetch a project's IAM policy", + "summary": "Fetch project's IAM policy", "operationId": "project_policy_view", "parameters": [ { @@ -3557,7 +3558,7 @@ "tags": [ "projects" ], - "summary": "Update a project's IAM policy", + "summary": "Update project's IAM policy", "operationId": "project_policy_update", "parameters": [ { @@ -3672,7 +3673,7 @@ "tags": [ "snapshots" ], - "summary": "Create a snapshot", + "summary": "Create snapshot", "description": "Creates a point-in-time snapshot from a disk.", "operationId": "snapshot_create", "parameters": [ @@ -3721,7 +3722,7 @@ "tags": [ "snapshots" ], - "summary": "Fetch a snapshot", + "summary": "Fetch snapshot", "operationId": "snapshot_view", "parameters": [ { @@ -3765,7 +3766,7 @@ "tags": [ "snapshots" ], - "summary": "Delete a snapshot", + "summary": "Delete snapshot", "operationId": "snapshot_delete", "parameters": [ { @@ -3922,7 +3923,7 @@ "tags": [ "system/hardware" ], - "summary": "Fetch a rack", + "summary": "Fetch rack", "operationId": "rack_view", "parameters": [ { @@ -4018,7 +4019,7 @@ "tags": [ "system/hardware" ], - "summary": "Add a sled to an initialized rack", + "summary": "Add sled to initialized rack", "operationId": "sled_add", "requestBody": { "content": { @@ -4048,7 +4049,7 @@ "tags": [ "system/hardware" ], - "summary": "Fetch a sled", + "summary": "Fetch sled", "operationId": "sled_view", "parameters": [ { @@ -4156,7 +4157,7 @@ "tags": [ "system/hardware" ], - "summary": "List instances running on a given sled", + "summary": "List instances running on given sled", "operationId": "sled_instance_list", "parameters": [ { @@ -4225,7 +4226,7 @@ "tags": [ "system/hardware" ], - "summary": "Set the sled's provision state", + "summary": "Set sled provision state", "operationId": "sled_set_provision_state", "parameters": [ { @@ -4274,7 +4275,7 @@ "tags": [ "system/hardware" ], - "summary": "List uninitialized sleds in a given rack", + "summary": "List uninitialized sleds", "operationId": "sled_list_uninitialized", "parameters": [ { @@ -4562,7 +4563,7 @@ "tags": [ "system/hardware" ], - "summary": "Fetch a switch", + "summary": "Fetch switch", "operationId": "switch_view", "parameters": [ { @@ -4670,7 +4671,7 @@ "tags": [ "system/silos" ], - "summary": "Create a user", + "summary": "Create user", "description": "Users can only be created in Silos with `provision_type` == `Fixed`. 
Otherwise, Silo users are just-in-time (JIT) provisioned when a user first logs in using an external Identity Provider.", "operationId": "local_idp_user_create", "parameters": [ @@ -4719,7 +4720,7 @@ "tags": [ "system/silos" ], - "summary": "Delete a user", + "summary": "Delete user", "operationId": "local_idp_user_delete", "parameters": [ { @@ -4760,7 +4761,7 @@ "tags": [ "system/silos" ], - "summary": "Set or invalidate a user's password", + "summary": "Set or invalidate user's password", "description": "Passwords can only be updated for users in Silos with identity mode `LocalOnly`.", "operationId": "local_idp_user_set_password", "parameters": [ @@ -4812,7 +4813,7 @@ "tags": [ "system/silos" ], - "summary": "Create a SAML IdP", + "summary": "Create SAML IdP", "operationId": "saml_identity_provider_create", "parameters": [ { @@ -4860,7 +4861,7 @@ "tags": [ "system/silos" ], - "summary": "Fetch a SAML IdP", + "summary": "Fetch SAML IdP", "operationId": "saml_identity_provider_view", "parameters": [ { @@ -4964,7 +4965,7 @@ "tags": [ "system/networking" ], - "summary": "Create an IP pool", + "summary": "Create IP pool", "operationId": "ip_pool_create", "requestBody": { "content": { @@ -5001,7 +5002,7 @@ "tags": [ "system/networking" ], - "summary": "Fetch an IP pool", + "summary": "Fetch IP pool", "operationId": "ip_pool_view", "parameters": [ { @@ -5037,7 +5038,7 @@ "tags": [ "system/networking" ], - "summary": "Update an IP pool", + "summary": "Update IP pool", "operationId": "ip_pool_update", "parameters": [ { @@ -5083,7 +5084,7 @@ "tags": [ "system/networking" ], - "summary": "Delete an IP pool", + "summary": "Delete IP pool", "operationId": "ip_pool_delete", "parameters": [ { @@ -5114,8 +5115,8 @@ "tags": [ "system/networking" ], - "summary": "List ranges for an IP pool", - "description": "List ranges for an IP pool. Ranges are ordered by their first address.", + "summary": "List ranges for IP pool", + "description": "Ranges are ordered by their first address.", "operationId": "ip_pool_range_list", "parameters": [ { @@ -5176,7 +5177,7 @@ "tags": [ "system/networking" ], - "summary": "Add a range to an IP pool", + "summary": "Add range to IP pool", "operationId": "ip_pool_range_add", "parameters": [ { @@ -5224,7 +5225,7 @@ "tags": [ "system/networking" ], - "summary": "Remove a range from an IP pool", + "summary": "Remove range from IP pool", "operationId": "ip_pool_range_remove", "parameters": [ { @@ -5265,7 +5266,7 @@ "tags": [ "system/networking" ], - "summary": "List an IP pool's linked silos", + "summary": "List IP pool's linked silos", "operationId": "ip_pool_silo_list", "parameters": [ { @@ -5331,7 +5332,8 @@ "tags": [ "system/networking" ], - "summary": "Make an IP pool available within a silo", + "summary": "Link IP pool to silo", + "description": "Users in linked silos can allocate external IPs from this pool for their instances. A silo can have at most one default pool. 
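The default-pool semantics spelled out in this description (continued below) reduce to a two-step lookup at allocation time: an explicitly requested pool must be linked to the silo, and absent a request the silo's single default pool, if any, is used. A sketch over illustrative types; the real logic consults the database in nexus/db-queries:

```rust
// Illustrative model of one of a silo's linked IP pools.
struct LinkedPool {
    name: String,
    is_default: bool,
}

fn pool_for_allocation<'a>(
    requested: Option<&str>,
    linked: &'a [LinkedPool],
) -> Option<&'a LinkedPool> {
    match requested {
        // An explicitly requested pool must be linked to the silo.
        Some(name) => linked.iter().find(|p| p.name == name),
        // Otherwise fall back to the silo's (at most one) default pool.
        None => linked.iter().find(|p| p.is_default),
    }
}
```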
IPs are allocated from the default pool when users ask for one without specifying a pool.", "operationId": "ip_pool_silo_link", "parameters": [ { @@ -5379,8 +5381,8 @@ "tags": [ "system/networking" ], - "summary": "Make an IP pool default or not-default for a silo", - "description": "When a pool is made default for a silo, any existing default will remain linked to the silo, but will no longer be the default.", + "summary": "Make IP pool default for silo", + "description": "When a user asks for an IP (e.g., at instance create time) without specifying a pool, the IP comes from the default pool if a default is configured. When a pool is made the default for a silo, any existing default will remain linked to the silo, but will no longer be the default.", "operationId": "ip_pool_silo_update", "parameters": [ { @@ -5433,7 +5435,7 @@ "tags": [ "system/networking" ], - "summary": "Unlink an IP pool from a silo", + "summary": "Unlink IP pool from silo", "description": "Will fail if there are any outstanding IPs allocated in the silo.", "operationId": "ip_pool_silo_unlink", "parameters": [ @@ -5472,7 +5474,7 @@ "tags": [ "system/networking" ], - "summary": "Fetch the IP pool used for Oxide services", + "summary": "Fetch Oxide service IP pool", "operationId": "ip_pool_service_view", "responses": { "200": { @@ -5499,8 +5501,8 @@ "tags": [ "system/networking" ], - "summary": "List ranges for the IP pool used for Oxide services", - "description": "List ranges for the IP pool used for Oxide services. Ranges are ordered by their first address.", + "summary": "List IP ranges for the Oxide service pool", + "description": "Ranges are ordered by their first address.", "operationId": "ip_pool_service_range_list", "parameters": [ { @@ -5552,7 +5554,7 @@ "tags": [ "system/networking" ], - "summary": "Add a range to an IP pool used for Oxide services", + "summary": "Add IP range to Oxide service pool", "operationId": "ip_pool_service_range_add", "requestBody": { "content": { @@ -5589,7 +5591,7 @@ "tags": [ "system/networking" ], - "summary": "Remove a range from an IP pool used for Oxide services", + "summary": "Remove IP range from Oxide service pool", "operationId": "ip_pool_service_range_remove", "requestBody": { "content": { @@ -5773,7 +5775,7 @@ "tags": [ "system/networking" ], - "summary": "Create an address lot", + "summary": "Create address lot", "operationId": "networking_address_lot_create", "requestBody": { "content": { @@ -5810,7 +5812,7 @@ "tags": [ "system/networking" ], - "summary": "Delete an address lot", + "summary": "Delete address lot", "operationId": "networking_address_lot_delete", "parameters": [ { @@ -5841,7 +5843,7 @@ "tags": [ "system/networking" ], - "summary": "List the blocks in an address lot", + "summary": "List blocks in address lot", "operationId": "networking_address_lot_block_list", "parameters": [ { @@ -5909,7 +5911,7 @@ "tags": [ "system/networking" ], - "summary": "Disable a BFD session.", + "summary": "Disable a BFD session", "operationId": "networking_bfd_disable", "requestBody": { "content": { @@ -5939,7 +5941,7 @@ "tags": [ "system/networking" ], - "summary": "Enable a BFD session.", + "summary": "Enable a BFD session", "operationId": "networking_bfd_enable", "requestBody": { "content": { @@ -5969,7 +5971,7 @@ "tags": [ "system/networking" ], - "summary": "Get BFD status.", + "summary": "Get BFD status", "operationId": "networking_bfd_status", "responses": { "200": { @@ -6065,7 +6067,7 @@ "tags": [ "system/networking" ], - "summary": "Create a new BGP configuration", + 
"summary": "Create new BGP configuration", "operationId": "networking_bgp_config_create", "requestBody": { "content": { @@ -6100,7 +6102,7 @@ "tags": [ "system/networking" ], - "summary": "Delete a BGP configuration", + "summary": "Delete BGP configuration", "operationId": "networking_bgp_config_delete", "parameters": [ { @@ -6171,7 +6173,7 @@ "tags": [ "system/networking" ], - "summary": "Create a new BGP announce set", + "summary": "Create new BGP announce set", "operationId": "networking_bgp_announce_set_create", "requestBody": { "content": { @@ -6206,7 +6208,7 @@ "tags": [ "system/networking" ], - "summary": "Delete a BGP announce set", + "summary": "Delete BGP announce set", "operationId": "networking_bgp_announce_set_delete", "parameters": [ { @@ -6369,7 +6371,7 @@ "tags": [ "system/networking" ], - "summary": "Create a loopback address", + "summary": "Create loopback address", "operationId": "networking_loopback_address_create", "requestBody": { "content": { @@ -6406,7 +6408,7 @@ "tags": [ "system/networking" ], - "summary": "Delete a loopback address", + "summary": "Delete loopback address", "operationId": "networking_loopback_address_delete", "parameters": [ { @@ -6598,7 +6600,7 @@ "tags": [ "system/networking" ], - "summary": "Get information about a switch port", + "summary": "Get information about switch port", "operationId": "networking_switch_port_settings_view", "parameters": [ { @@ -6636,7 +6638,7 @@ "tags": [ "policy" ], - "summary": "Fetch the top-level IAM policy", + "summary": "Fetch top-level IAM policy", "operationId": "system_policy_view", "responses": { "200": { @@ -6661,7 +6663,7 @@ "tags": [ "policy" ], - "summary": "Update the top-level IAM policy", + "summary": "Update top-level IAM policy", "operationId": "system_policy_update", "requestBody": { "content": { @@ -6750,7 +6752,7 @@ "tags": [ "roles" ], - "summary": "Fetch a built-in role", + "summary": "Fetch built-in role", "operationId": "role_view", "parameters": [ { @@ -6942,8 +6944,8 @@ "tags": [ "system/silos" ], - "summary": "Fetch a silo", - "description": "Fetch a silo by name or ID.", + "summary": "Fetch silo", + "description": "Fetch silo by name or ID.", "operationId": "silo_view", "parameters": [ { @@ -6980,7 +6982,7 @@ "system/silos" ], "summary": "Delete a silo", - "description": "Delete a silo by name.", + "description": "Delete a silo by name or ID.", "operationId": "silo_delete", "parameters": [ { @@ -7011,7 +7013,8 @@ "tags": [ "system/silos" ], - "summary": "List IP pools available within silo", + "summary": "List IP pools linked to silo", + "description": "Linked IP pools are available to users in the specified silo. A silo can have at most one default pool. 
IPs are allocated from the default pool when users ask for one without specifying a pool.", "operationId": "silo_ip_pool_list", "parameters": [ { @@ -7079,7 +7082,7 @@ "tags": [ "system/silos" ], - "summary": "Fetch a silo's IAM policy", + "summary": "Fetch silo IAM policy", "operationId": "silo_policy_view", "parameters": [ { @@ -7115,7 +7118,7 @@ "tags": [ "system/silos" ], - "summary": "Update a silo's IAM policy", + "summary": "Update silo IAM policy", "operationId": "silo_policy_update", "parameters": [ { @@ -7163,7 +7166,7 @@ "tags": [ "system/silos" ], - "summary": "View the resource quotas of a given silo", + "summary": "Fetch resource quotas for silo", "operationId": "silo_quotas_view", "parameters": [ { @@ -7199,7 +7202,7 @@ "tags": [ "system/silos" ], - "summary": "Update the resource quotas of a given silo", + "summary": "Update resource quotas for silo", "description": "If a quota value is not specified, it will remain unchanged.", "operationId": "silo_quotas_update", "parameters": [ @@ -7248,7 +7251,7 @@ "tags": [ "system/silos" ], - "summary": "List built-in (system) users in a silo", + "summary": "List built-in (system) users in silo", "operationId": "silo_user_list", "parameters": [ { @@ -7317,7 +7320,7 @@ "tags": [ "system/silos" ], - "summary": "Fetch a built-in (system) user", + "summary": "Fetch built-in (system) user", "operationId": "silo_user_view", "parameters": [ { @@ -7424,7 +7427,7 @@ "tags": [ "system/silos" ], - "summary": "Fetch a built-in user", + "summary": "Fetch built-in user", "operationId": "user_builtin_view", "parameters": [ { @@ -7520,7 +7523,7 @@ "tags": [ "system/silos" ], - "summary": "View the current utilization of a given silo", + "summary": "Fetch current utilization for given silo", "operationId": "silo_utilization_view", "parameters": [ { @@ -7626,7 +7629,7 @@ "tags": [ "silos" ], - "summary": "View the resource utilization of the user's current silo", + "summary": "Fetch resource utilization for user's current silo", "operationId": "utilization_view", "responses": { "200": { @@ -7828,7 +7831,7 @@ "tags": [ "vpcs" ], - "summary": "Create a subnet", + "summary": "Create subnet", "operationId": "vpc_subnet_create", "parameters": [ { @@ -7884,7 +7887,7 @@ "tags": [ "vpcs" ], - "summary": "Fetch a subnet", + "summary": "Fetch subnet", "operationId": "vpc_subnet_view", "parameters": [ { @@ -7936,7 +7939,7 @@ "tags": [ "vpcs" ], - "summary": "Update a subnet", + "summary": "Update subnet", "operationId": "vpc_subnet_update", "parameters": [ { @@ -7998,7 +8001,7 @@ "tags": [ "vpcs" ], - "summary": "Delete a subnet", + "summary": "Delete subnet", "operationId": "vpc_subnet_delete", "parameters": [ { @@ -8196,7 +8199,7 @@ "tags": [ "vpcs" ], - "summary": "Create a VPC", + "summary": "Create VPC", "operationId": "vpc_create", "parameters": [ { @@ -8244,7 +8247,7 @@ "tags": [ "vpcs" ], - "summary": "Fetch a VPC", + "summary": "Fetch VPC", "operationId": "vpc_view", "parameters": [ { @@ -8342,7 +8345,7 @@ "tags": [ "vpcs" ], - "summary": "Delete a VPC", + "summary": "Delete VPC", "operationId": "vpc_delete", "parameters": [ { diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 4b53397ffb..99156fffd4 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -136,6 +136,30 @@ } } }, + "/bootstore/status": { + "get": { + "summary": "Get the internal state of the local bootstore node", + "operationId": "bootstore_status", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { 
+ "schema": { + "$ref": "#/components/schemas/BootstoreStatus" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/cockroachdb": { "post": { "summary": "Initializes a CockroachDB cluster", @@ -1240,7 +1264,7 @@ "type": "object", "properties": { "sled_id": { - "$ref": "#/components/schemas/Baseboard" + "$ref": "#/components/schemas/BaseboardId" }, "start_request": { "$ref": "#/components/schemas/StartSledAgentRequest" @@ -1319,6 +1343,24 @@ } ] }, + "BaseboardId": { + "description": "A representation of a Baseboard ID as used in the inventory subsystem This type is essentially the same as a `Baseboard` except it doesn't have a revision or HW type (Gimlet, PC, Unknown).", + "type": "object", + "properties": { + "part_number": { + "description": "Oxide Part Number", + "type": "string" + }, + "serial_number": { + "description": "Serial number (unique for a given part number)", + "type": "string" + } + }, + "required": [ + "part_number", + "serial_number" + ] + }, "BgpConfig": { "type": "object", "properties": { @@ -2513,6 +2555,60 @@ } ] }, + "BootstoreStatus": { + "type": "object", + "properties": { + "accepted_connections": { + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + }, + "established_connections": { + "type": "array", + "items": { + "$ref": "#/components/schemas/EstablishedConnection" + } + }, + "fsm_ledger_generation": { + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "fsm_state": { + "type": "string" + }, + "negotiating_connections": { + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + }, + "network_config_ledger_generation": { + "nullable": true, + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "peers": { + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + } + }, + "required": [ + "accepted_connections", + "established_connections", + "fsm_ledger_generation", + "fsm_state", + "negotiating_connections", + "peers" + ] + }, "BundleUtilization": { "description": "The portion of a debug dataset used for zone bundles.", "type": "object", @@ -3829,6 +3925,21 @@ "request_id" ] }, + "EstablishedConnection": { + "type": "object", + "properties": { + "addr": { + "type": "string" + }, + "baseboard": { + "$ref": "#/components/schemas/Baseboard" + } + }, + "required": [ + "addr", + "baseboard" + ] + }, "Field": { "description": "A `Field` is a named aspect of a target or metric.", "type": "object", diff --git a/openapi/wicketd.json b/openapi/wicketd.json index 300e8412c3..b9645a174f 100644 --- a/openapi/wicketd.json +++ b/openapi/wicketd.json @@ -1132,7 +1132,7 @@ "nullable": true, "allOf": [ { - "$ref": "#/components/schemas/RackNetworkConfigV1" + "$ref": "#/components/schemas/UserSpecifiedRackNetworkConfig" } ] } @@ -2172,7 +2172,7 @@ } }, "rack_network_config": { - "$ref": "#/components/schemas/RackNetworkConfigV1" + "$ref": "#/components/schemas/UserSpecifiedRackNetworkConfig" } }, "required": [ @@ -2190,46 +2190,6 @@ "type": "string", "format": "uuid" }, - "RackNetworkConfigV1": { - "description": "Initial network configuration", - "type": "object", - "properties": { - "bgp": { - "description": "BGP configurations for connecting the rack to external networks", - "type": "array", - "items": { - "$ref": "#/components/schemas/BgpConfig" - } - }, - "infra_ip_first": { - "description": "First ip address to be used for configuring network infrastructure", - "type": "string", - 
"format": "ipv4" - }, - "infra_ip_last": { - "description": "Last ip address to be used for configuring network infrastructure", - "type": "string", - "format": "ipv4" - }, - "ports": { - "description": "Uplinks for connecting the rack to external networks", - "type": "array", - "items": { - "$ref": "#/components/schemas/PortConfigV1" - } - }, - "rack_subnet": { - "$ref": "#/components/schemas/Ipv6Network" - } - }, - "required": [ - "bgp", - "infra_ip_first", - "infra_ip_last", - "ports", - "rack_subnet" - ] - }, "RackOperationStatus": { "description": "Current status of any rack-level operation being performed by this bootstrap agent.\n\n
JSON schema\n\n```json { \"description\": \"Current status of any rack-level operation being performed by this bootstrap agent.\", \"oneOf\": [ { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackInitId\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initializing\" ] } } }, { \"description\": \"`id` will be none if the rack was already initialized on startup.\", \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"id\": { \"allOf\": [ { \"$ref\": \"#/components/schemas/RackInitId\" } ] }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialized\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"message\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackInitId\" }, \"message\": { \"type\": \"string\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialization_failed\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackInitId\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialization_panicked\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackResetId\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"resetting\" ] } } }, { \"description\": \"`reset_id` will be None if the rack is in an uninitialized-on-startup, or Some if it is in an uninitialized state due to a reset operation completing.\", \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"reset_id\": { \"allOf\": [ { \"$ref\": \"#/components/schemas/RackResetId\" } ] }, \"status\": { \"type\": \"string\", \"enum\": [ \"uninitialized\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"message\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackResetId\" }, \"message\": { \"type\": \"string\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"reset_failed\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackResetId\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"reset_panicked\" ] } } } ] } ```
", "oneOf": [ @@ -4698,6 +4658,38 @@ } ] }, + "UserSpecifiedRackNetworkConfig": { + "description": "User-specified parts of [`RackNetworkConfig`](omicron_common::api::internal::shared::RackNetworkConfig).", + "type": "object", + "properties": { + "bgp": { + "type": "array", + "items": { + "$ref": "#/components/schemas/BgpConfig" + } + }, + "infra_ip_first": { + "type": "string", + "format": "ipv4" + }, + "infra_ip_last": { + "type": "string", + "format": "ipv4" + }, + "ports": { + "type": "array", + "items": { + "$ref": "#/components/schemas/PortConfigV1" + } + } + }, + "required": [ + "bgp", + "infra_ip_first", + "infra_ip_last", + "ports" + ] + }, "IgnitionCommand": { "description": "Ignition command.\n\n
JSON schema\n\n```json { \"description\": \"Ignition command.\", \"type\": \"string\", \"enum\": [ \"power_on\", \"power_off\", \"power_reset\" ] } ```
", "type": "string", diff --git a/package-manifest.toml b/package-manifest.toml index 8944e59c37..9b72dd7d18 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -405,10 +405,10 @@ only_for_targets.image = "standard" # 3. Use source.type = "manual" instead of "prebuilt" source.type = "prebuilt" source.repo = "crucible" -source.commit = "2d4bc11232d53f177c286383926fa5f8c1b2a938" +source.commit = "796dce526dd7ed7b52a0429a486ccba4a9da1ce5" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/crucible/image//crucible.sha256.txt -source.sha256 = "88ec93657a644e8f10a32d1d22cc027db901aea81027f49ce7bee58fc4a35755" +source.sha256 = "8b654627a4250e8d444133cf3130838d224b13e53f3e48cf0d031314d6f05ee0" output.type = "zone" [package.crucible-pantry] @@ -416,10 +416,10 @@ service_name = "crucible_pantry" only_for_targets.image = "standard" source.type = "prebuilt" source.repo = "crucible" -source.commit = "2d4bc11232d53f177c286383926fa5f8c1b2a938" +source.commit = "796dce526dd7ed7b52a0429a486ccba4a9da1ce5" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/crucible/image//crucible-pantry.sha256.txt -source.sha256 = "e2c3ed2d4cd6b5da3d38dd52df6d4a259280be7d45c30a363e9c71b174ecc6f8" +source.sha256 = "8602b2d6e7beb0731ae2be481715c94795657306d6013cc6d81fd60c4784a6ed" output.type = "zone" # Refer to @@ -430,10 +430,10 @@ service_name = "propolis-server" only_for_targets.image = "standard" source.type = "prebuilt" source.repo = "propolis" -source.commit = "ff6c4df2e816eee6e7b2b0488777d30ef35ee217" +source.commit = "c7cdaf1875d259e29ca50a14b77b0bfd9dfe443d" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/propolis/image//propolis-server.sha256.txt -source.sha256 = "aa10aa245a92e657fc074bd588ef6bbddaad2d9c946a8e1b91c02dce7e057561" +source.sha256 = "0203b7f702377c877c4132851ca102d68cd8fd2c20e4fd5b59d950cbb07fd9ff" output.type = "zone" [package.mg-ddm-gz] @@ -446,10 +446,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "712b2487d9b141234af98b6578bc5f77420bdb03" +source.commit = "41a69a11db6cfa8fc0c8686dc2d725708e0586ce" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//maghemite.sha256.txt -source.sha256 = "36e976ae9b1517b358ec7eadd5fb03f5d40d54074ff830a79895f8fc3e643935" +source.sha256 = "19d5eaa744257c32ccdca52af79d718aeb88a0af188345d33a4564a69b259632" output.type = "tarball" [package.mg-ddm] @@ -462,10 +462,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). 
-source.commit = "712b2487d9b141234af98b6578bc5f77420bdb03" +source.commit = "41a69a11db6cfa8fc0c8686dc2d725708e0586ce" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm.sha256.txt -source.sha256 = "bc3137751db24d2e44eca7118f6ca825ed3e9df736480fc210392802cd063dd8" +source.sha256 = "ffb647b3297ec616d3d9ea93396ad9edd16ed146048a660b34e9b86e85d466b7" output.type = "zone" output.intermediate_only = true @@ -477,10 +477,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "712b2487d9b141234af98b6578bc5f77420bdb03" +source.commit = "41a69a11db6cfa8fc0c8686dc2d725708e0586ce" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm.sha256.txt -source.sha256 = "2c54146a133b5f12587d9fb89f85ef0a0ca6278efc8c6fe4859782e886e6c774" +source.sha256 = "26d34f61589f63be64eaa77a6e9e2db4c95d6675798386a1d61721c1ccc59d4d" output.type = "zone" output.intermediate_only = true @@ -497,8 +497,8 @@ only_for_targets.image = "standard" # 2. Copy dendrite.tar.gz from dendrite/out to omicron/out source.type = "prebuilt" source.repo = "dendrite" -source.commit = "fd159136c552d8b4ec4d49dd9bae7e38f6a636e6" -source.sha256 = "1e24598ba77dc00682cdf54fc370696ef5aa49ed510ab7f72fcc91d61d679e7b" +source.commit = "3618dd6017b363c5d34399273453cf50b9c9a43e" +source.sha256 = "eb98985871f321411f7875ef7751dba85ae0dd3034877b63ccb78cedcb96e6e7" output.type = "zone" output.intermediate_only = true @@ -522,8 +522,8 @@ only_for_targets.image = "standard" # 2. Copy the output zone image from dendrite/out to omicron/out source.type = "prebuilt" source.repo = "dendrite" -source.commit = "fd159136c552d8b4ec4d49dd9bae7e38f6a636e6" -source.sha256 = "720df8aff3aaa0f8a86ec606089ebf8b5068d7f3c243bd4c868b96ef72d13485" +source.commit = "3618dd6017b363c5d34399273453cf50b9c9a43e" +source.sha256 = "cc0429f0d9ce6df94e834cea89cabbdf4d1fbfe623369dd3eb84c5b2677414be" output.type = "zone" output.intermediate_only = true @@ -540,8 +540,8 @@ only_for_targets.image = "standard" # 2. 
Copy dendrite.tar.gz from dendrite/out to omicron/out/dendrite-softnpu.tar.gz source.type = "prebuilt" source.repo = "dendrite" -source.commit = "fd159136c552d8b4ec4d49dd9bae7e38f6a636e6" -source.sha256 = "5e34a10d9dca6c94f96075140d42b755dee1f5e6a3485fc239b12e12b89a30c5" +source.commit = "3618dd6017b363c5d34399273453cf50b9c9a43e" +source.sha256 = "fa25585fb3aa1a888b76133af3060b859cbea8e53287bb1cc64e70889db37679" output.type = "zone" output.intermediate_only = true diff --git a/schema/crdb/33.0.0/up01.sql b/schema/crdb/33.0.0/up01.sql new file mode 100644 index 0000000000..624aec4ea6 --- /dev/null +++ b/schema/crdb/33.0.0/up01.sql @@ -0,0 +1,42 @@ +/** + * A view of the ipv4 nat change history + * used to summarize changes for external viewing + */ +CREATE VIEW IF NOT EXISTS omicron.public.ipv4_nat_changes +AS +WITH interleaved_versions AS ( + SELECT + external_address, + first_port, + last_port, + sled_address, + vni, + mac, + version_added AS version, + (version_removed IS NOT NULL) as deleted + FROM ipv4_nat_entry + WHERE version_removed IS NULL + + UNION + + SELECT + external_address, + first_port, + last_port, + sled_address, + vni, + mac, + version_added AS version, + (version_removed IS NOT NULL) as deleted + FROM ipv4_nat_entry WHERE version_removed IS NOT NULL +) +SELECT + external_address, + first_port, + last_port, + sled_address, + vni, + mac, + version, + deleted +FROM interleaved_versions; diff --git a/schema/crdb/33.0.1/up01.sql b/schema/crdb/33.0.1/up01.sql new file mode 100644 index 0000000000..354480c0c9 --- /dev/null +++ b/schema/crdb/33.0.1/up01.sql @@ -0,0 +1 @@ +DROP VIEW IF EXISTS omicron.public.ipv4_nat_changes; diff --git a/schema/crdb/33.0.1/up02.sql b/schema/crdb/33.0.1/up02.sql new file mode 100644 index 0000000000..5a2a183f4c --- /dev/null +++ b/schema/crdb/33.0.1/up02.sql @@ -0,0 +1,60 @@ +/* + * A view of the ipv4 nat change history + * used to summarize changes for external viewing + */ +CREATE VIEW IF NOT EXISTS omicron.public.ipv4_nat_changes +AS +-- Subquery: +-- We need to be able to order partial changesets. ORDER BY on separate columns +-- will not accomplish this, so we'll do this by interleaving version_added +-- and version_removed (version_removed taking priority if NOT NULL) and then sorting +-- on the appropriate version numbers at call time. 
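+-- Illustrative usage, not part of this migration: because added and removed
+-- records are unified under a single `version` column, a consumer can replay
+-- the interleaved change history in a stable order with, e.g.,
+--   SELECT * FROM omicron.public.ipv4_nat_changes ORDER BY version;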
+WITH interleaved_versions AS ( + -- fetch all active NAT entries (entries that have not been soft deleted) + SELECT + external_address, + first_port, + last_port, + sled_address, + vni, + mac, + -- rename version_added to version + version_added AS version, + -- create a new virtual column, boolean value representing whether or not + -- the record has been soft deleted + (version_removed IS NOT NULL) as deleted + FROM omicron.public.ipv4_nat_entry + WHERE version_removed IS NULL + + -- combine the datasets, unifying the version_added and version_removed + -- columns to a single `version` column so we can interleave and sort the entries + UNION + + -- fetch all inactive NAT entries (entries that have been soft deleted) + SELECT + external_address, + first_port, + last_port, + sled_address, + vni, + mac, + -- rename version_removed to version + version_removed AS version, + -- create a new virtual column, boolean value representing whether or not + -- the record has been soft deleted + (version_removed IS NOT NULL) as deleted + FROM omicron.public.ipv4_nat_entry + WHERE version_removed IS NOT NULL +) +-- this is our new "table" +-- here we select the columns from the subquery defined above +SELECT + external_address, + first_port, + last_port, + sled_address, + vni, + mac, + version, + deleted +FROM interleaved_versions; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 22e58c9251..1bd261209b 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3441,6 +3441,67 @@ STORING ( time_deleted ); +/* + * A view of the ipv4 nat change history + * used to summarize changes for external viewing + */ +CREATE VIEW IF NOT EXISTS omicron.public.ipv4_nat_changes +AS +-- Subquery: +-- We need to be able to order partial changesets. ORDER BY on separate columns +-- will not accomplish this, so we'll do this by interleaving version_added +-- and version_removed (version_removed taking priority if NOT NULL) and then sorting +-- on the appropriate version numbers at call time. 
+WITH interleaved_versions AS ( + -- fetch all active NAT entries (entries that have not been soft deleted) + SELECT + external_address, + first_port, + last_port, + sled_address, + vni, + mac, + -- rename version_added to version + version_added AS version, + -- create a new virtual column, boolean value representing whether or not + -- the record has been soft deleted + (version_removed IS NOT NULL) as deleted + FROM omicron.public.ipv4_nat_entry + WHERE version_removed IS NULL + + -- combine the datasets, unifying the version_added and version_removed + -- columns to a single `version` column so we can interleave and sort the entries + UNION + + -- fetch all inactive NAT entries (entries that have been soft deleted) + SELECT + external_address, + first_port, + last_port, + sled_address, + vni, + mac, + -- rename version_removed to version + version_removed AS version, + -- create a new virtual column, boolean value representing whether or not + -- the record has been soft deleted + (version_removed IS NOT NULL) as deleted + FROM omicron.public.ipv4_nat_entry + WHERE version_removed IS NOT NULL +) +-- this is our new "table" +-- here we select the columns from the subquery defined above +SELECT + external_address, + first_port, + last_port, + sled_address, + vni, + mac, + version, + deleted +FROM interleaved_versions; + INSERT INTO omicron.public.db_metadata ( singleton, time_created, @@ -3448,7 +3509,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '32.0.0', NULL) + ( TRUE, NOW(), NOW(), '33.0.1', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/rss-sled-plan.json b/schema/rss-sled-plan.json index cbd73ed066..f5ac5bd0ff 100644 --- a/schema/rss-sled-plan.json +++ b/schema/rss-sled-plan.json @@ -466,7 +466,7 @@ "external_dns_zone_name", "internal_services_ip_pool_ranges", "ntp_servers", - "rack_subnet", + "rack_network_config", "recovery_silo" ], "properties": { @@ -521,19 +521,12 @@ }, "rack_network_config": { "description": "Initial rack network configuration", - "anyOf": [ + "allOf": [ { "$ref": "#/definitions/RackNetworkConfigV1" - }, - { - "type": "null" } ] }, - "rack_subnet": { - "type": "string", - "format": "ipv6" - }, "recovery_silo": { "description": "Configuration of the Recovery Silo (the initial Silo)", "allOf": [ diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index 79189e7f49..48444af8d4 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -14,7 +14,7 @@ use serde::{Deserialize, Serialize}; use sha3::{Digest, Sha3_256}; use sled_hardware::Baseboard; use std::borrow::Cow; -use std::collections::HashSet; +use std::collections::BTreeSet; use std::net::{IpAddr, Ipv6Addr, SocketAddrV6}; use uuid::Uuid; @@ -24,14 +24,13 @@ pub enum BootstrapAddressDiscovery { /// Ignore all bootstrap addresses except our own. OnlyOurs, /// Ignore all bootstrap addresses except the following. - OnlyThese { addrs: HashSet }, + OnlyThese { addrs: BTreeSet }, } // "Shadow" copy of `RackInitializeRequest` that does no validation on its // fields. 
#[derive(Clone, Deserialize)] struct UnvalidatedRackInitializeRequest { - rack_subnet: Ipv6Addr, trust_quorum_peers: Option<Vec<Baseboard>>, bootstrap_discovery: BootstrapAddressDiscovery, ntp_servers: Vec<String>, @@ -41,7 +40,7 @@ external_dns_zone_name: String, external_certificates: Vec<Certificate>, recovery_silo: RecoverySiloConfig, - rack_network_config: Option<RackNetworkConfig>, + rack_network_config: RackNetworkConfig, } /// Configuration for the "rack setup service". @@ -53,8 +52,6 @@ #[derive(Clone, Deserialize, Serialize, PartialEq, JsonSchema)] #[serde(try_from = "UnvalidatedRackInitializeRequest")] pub struct RackInitializeRequest { - pub rack_subnet: Ipv6Addr, - /// The set of peer_ids required to initialize trust quorum /// /// The value is `None` if we are not using trust quorum @@ -89,7 +86,7 @@ pub recovery_silo: RecoverySiloConfig, /// Initial rack network configuration - pub rack_network_config: Option<RackNetworkConfig>, + pub rack_network_config: RackNetworkConfig, } // This custom debug implementation hides the private keys. @@ -98,7 +95,6 @@ impl std::fmt::Debug for RackInitializeRequest { // If you find a compiler error here, and you just added a field to this // struct, be sure to add it to the Debug impl below! let RackInitializeRequest { - rack_subnet, trust_quorum_peers: trust_qurorum_peers, bootstrap_discovery, ntp_servers, @@ -112,7 +108,6 @@ } = &self; f.debug_struct("RackInitializeRequest") - .field("rack_subnet", rack_subnet) .field("trust_quorum_peers", trust_qurorum_peers) .field("bootstrap_discovery", bootstrap_discovery) .field("ntp_servers", ntp_servers) @@ -155,7 +150,6 @@ impl TryFrom<UnvalidatedRackInitializeRequest> for RackInitializeRequest { } Ok(RackInitializeRequest { - rack_subnet: value.rack_subnet, trust_quorum_peers: value.trust_quorum_peers, bootstrap_discovery: value.bootstrap_discovery, ntp_servers: value.ntp_servers, @@ -174,10 +168,21 @@ pub type Certificate = nexus_client::types::Certificate; pub type RecoverySiloConfig = nexus_client::types::RecoverySiloConfig; +/// A representation of a Baseboard ID as used in the inventory subsystem +/// This type is essentially the same as a `Baseboard` except it doesn't have a +/// revision or HW type (Gimlet, PC, Unknown). +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] +pub struct BaseboardId { + /// Oxide Part Number + pub part_number: String, + /// Serial number (unique for a given part number) + pub serial_number: String, +} + /// A request to Add a given sled after rack initialization has occurred #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] pub struct AddSledRequest { - pub sled_id: Baseboard, + pub sled_id: BaseboardId, pub start_request: StartSledAgentRequest, } @@ -255,9 +260,6 @@ pub struct StartSledAgentRequestBody { /// true. pub is_lrtq_learner: bool, - // Note: The order of these fields is load bearing, because we serialize - // `SledAgentRequest`s as toml. `subnet` serializes as a TOML table, so it - // must come after non-table fields. /// Portion of the IP space to be managed by the Sled Agent.
pub subnet: Ipv6Subnet<SLED_PREFIX>, } @@ -360,6 +362,7 @@ pub fn test_config() -> RackInitializeRequest { #[cfg(test)] mod tests { + use std::net::Ipv4Addr; use std::net::Ipv6Addr; use super::*; @@ -387,7 +390,6 @@ #[test] fn parse_rack_initialization_weak_hash() { let config = r#" - rack_subnet = "fd00:1122:3344:0100::" bootstrap_discovery.type = "only_ours" ntp_servers = [ "ntp.eng.oxide.computer" ] dns_servers = [ "1.1.1.1", "9.9.9.9" ] @@ -472,7 +474,6 @@ // Conjure up a config; we'll tweak the internal services pools and // external DNS IPs, but no other fields matter. let mut config = UnvalidatedRackInitializeRequest { - rack_subnet: Ipv6Addr::LOCALHOST, trust_quorum_peers: None, bootstrap_discovery: BootstrapAddressDiscovery::OnlyOurs, ntp_servers: Vec::new(), @@ -486,7 +487,13 @@ user_name: "recovery".parse().unwrap(), user_password_hash: "$argon2id$v=19$m=98304,t=13,p=1$RUlWc0ZxaHo0WFdrN0N6ZQ$S8p52j85GPvMhR/ek3GL0el/oProgTwWpHJZ8lsQQoY".parse().unwrap(), }, - rack_network_config: None, + rack_network_config: RackNetworkConfig { + rack_subnet: Ipv6Addr::LOCALHOST.into(), + infra_ip_first: Ipv4Addr::LOCALHOST, + infra_ip_last: Ipv4Addr::LOCALHOST, + ports: Vec::new(), + bgp: Vec::new(), + }, }; // Valid configs: all external DNS IPs are contained in the IP pool diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 0798aed664..5f888504db 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -8,7 +8,7 @@ use super::sled_agent::SledAgent; use crate::bootstrap::early_networking::EarlyNetworkConfig; use crate::bootstrap::params::AddSledRequest; use crate::params::{ - CleanupContextUpdate, DiskEnsureBody, InstanceEnsureBody, + BootstoreStatus, CleanupContextUpdate, DiskEnsureBody, InstanceEnsureBody, InstanceExternalIpBody, InstancePutMigrationIdsBody, InstancePutStateBody, InstancePutStateResponse, InstanceUnregisterResponse, Inventory, OmicronZonesConfig, SledRole, TimeSync, VpcFirewallRulesEnsureBody, @@ -85,6 +85,7 @@ pub fn api() -> SledApiDescription { api.register(host_os_write_status_get)?; api.register(host_os_write_status_delete)?; api.register(inventory)?; + api.register(bootstore_status)?; Ok(()) } @@ -972,3 +973,23 @@ async fn inventory( let sa = request_context.context(); Ok(HttpResponseOk(sa.inventory()?)) } + +/// Get the internal state of the local bootstore node +#[endpoint { + method = GET, + path = "/bootstore/status", +}] +async fn bootstore_status( + request_context: RequestContext<SledAgent>, +) -> Result<HttpResponseOk<BootstoreStatus>, HttpError> { + let sa = request_context.context(); + let bootstore = sa.bootstore(); + let status = bootstore + .get_status() + .await + .map_err(|e| { + HttpError::from(omicron_common::api::external::Error::from(e)) + })? + .into(); + Ok(HttpResponseOk(status)) +} diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index f14a13aa41..7ed1264d9c 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -25,6 +25,7 @@ use sled_hardware::Baseboard; pub use sled_hardware::DendriteAsic; use sled_storage::dataset::DatasetKind; use sled_storage::dataset::DatasetName; +use std::collections::BTreeSet; use std::fmt::{Debug, Display, Formatter, Result as FormatResult}; use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6}; use std::str::FromStr; @@ -664,6 +665,27 @@ impl OmicronZoneType { *address, )) } + + /// Does this zone require time synchronization before it is initialized?
+ /// + /// This function is somewhat conservative - the set of services + /// that can be launched before timesync has completed is intentionally kept + /// small, since it would be easy to add a service that expects time to be + /// reasonably synchronized. + pub fn requires_timesync(&self) -> bool { + match self { + // These zones can be initialized and started before time has been + // synchronized. For the NTP zones, this should be self-evident -- + // we need the NTP zone to actually perform time synchronization! + // + // The DNS zone is a bit of an exception here, since the NTP zone + // itself may rely on DNS lookups as a dependency. + OmicronZoneType::BoundaryNtp { .. } + | OmicronZoneType::InternalNtp { .. } + | OmicronZoneType::InternalDns { .. } => false, + _ => true, + } + } } impl crate::smf_helper::Service for OmicronZoneType { @@ -844,3 +866,45 @@ pub struct Inventory { pub usable_physical_ram: ByteCount, pub reservoir_size: ByteCount, } + +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct EstablishedConnection { + baseboard: Baseboard, + addr: SocketAddrV6, +} + +impl From<(Baseboard, SocketAddrV6)> for EstablishedConnection { + fn from(value: (Baseboard, SocketAddrV6)) -> Self { + EstablishedConnection { baseboard: value.0, addr: value.1 } + } +} + +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct BootstoreStatus { + pub fsm_ledger_generation: u64, + pub network_config_ledger_generation: Option<u64>, + pub fsm_state: String, + pub peers: BTreeSet<Baseboard>, + pub established_connections: Vec<EstablishedConnection>, + pub accepted_connections: BTreeSet<SocketAddrV6>, + pub negotiating_connections: BTreeSet<SocketAddrV6>, +} + +impl From<bootstore::schemes::v0::Status> for BootstoreStatus { + fn from(value: bootstore::schemes::v0::Status) -> Self { + BootstoreStatus { + fsm_ledger_generation: value.fsm_ledger_generation, + network_config_ledger_generation: value + .network_config_ledger_generation, + fsm_state: value.fsm_state.to_string(), + peers: value.peers, + established_connections: value + .connections + .into_iter() + .map(EstablishedConnection::from) + .collect(), + accepted_connections: value.accepted_connections, + negotiating_connections: value.negotiating_connections, + } + } +} diff --git a/sled-agent/src/rack_setup/config.rs b/sled-agent/src/rack_setup/config.rs index 33de7121d4..52bea295a5 100644 --- a/sled-agent/src/rack_setup/config.rs +++ b/sled-agent/src/rack_setup/config.rs @@ -70,12 +70,14 @@ impl SetupServiceConfig { } pub fn az_subnet(&self) -> Ipv6Subnet<AZ_PREFIX> { - Ipv6Subnet::<AZ_PREFIX>::new(self.rack_subnet) + Ipv6Subnet::<AZ_PREFIX>::new(self.rack_network_config.rack_subnet.ip()) } /// Returns the subnet for our rack. pub fn rack_subnet(&self) -> Ipv6Subnet<RACK_PREFIX> { - Ipv6Subnet::<RACK_PREFIX>::new(self.rack_subnet) + Ipv6Subnet::<RACK_PREFIX>::new( + self.rack_network_config.rack_subnet.ip(), + ) } /// Returns the subnet for the `index`-th sled in the rack.
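With `rack_subnet` gone from the top-level RSS config, both subnet helpers above derive their prefixes from the address carried in `rack_network_config.rack_subnet` (an IPv6 network, hence the `.ip()` call). A self-contained sketch of what that prefix truncation amounts to, assuming omicron's conventional /48 AZ and /56 rack prefix lengths; the `prefix_of` helper is hypothetical, standing in for `Ipv6Subnet::<N>::new`:

use std::net::Ipv6Addr;

// Keep the first `prefix` bits of `addr`, zeroing the rest.
fn prefix_of(addr: Ipv6Addr, prefix: u32) -> Ipv6Addr {
    let bits = u128::from_be_bytes(addr.octets());
    let mask = if prefix == 0 { 0 } else { u128::MAX << (128 - prefix) };
    Ipv6Addr::from((bits & mask).to_be_bytes())
}

fn main() {
    // The example rack subnet used by the tests in this change.
    let rack_subnet: Ipv6Addr = "fd00:1122:3344:0100::".parse().unwrap();
    assert_eq!(prefix_of(rack_subnet, 48).to_string(), "fd00:1122:3344::");
    assert_eq!(prefix_of(rack_subnet, 56).to_string(), "fd00:1122:3344:100::");
}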
@@ -92,12 +94,12 @@ mod test { use anyhow::Context; use camino::Utf8PathBuf; use omicron_common::address::IpRange; + use omicron_common::api::internal::shared::RackNetworkConfig; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; #[test] fn test_subnets() { let cfg = SetupServiceConfig { - rack_subnet: "fd00:1122:3344:0100::".parse().unwrap(), trust_quorum_peers: None, bootstrap_discovery: BootstrapAddressDiscovery::OnlyOurs, ntp_servers: vec![String::from("test.pool.example.com")], @@ -119,7 +121,13 @@ mod test { .parse() .unwrap(), }, - rack_network_config: None, + rack_network_config: RackNetworkConfig { + rack_subnet: "fd00:1122:3344:0100::".parse().unwrap(), + infra_ip_first: Ipv4Addr::LOCALHOST, + infra_ip_last: Ipv4Addr::LOCALHOST, + ports: Vec::new(), + bgp: Vec::new(), + }, }; assert_eq!( diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 8fab8a0b8d..77fd8a39de 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -15,8 +15,8 @@ use internal_dns::ServiceName; use omicron_common::address::{ get_sled_address, get_switch_zone_address, Ipv6Subnet, ReservedRackSubnet, DENDRITE_PORT, DNS_HTTP_PORT, DNS_PORT, DNS_REDUNDANCY, MAX_DNS_REDUNDANCY, - MGD_PORT, MGS_PORT, NTP_PORT, NUM_SOURCE_NAT_PORTS, RSS_RESERVED_ADDRESSES, - SLED_PREFIX, + MGD_PORT, MGS_PORT, NEXUS_REDUNDANCY, NTP_PORT, NUM_SOURCE_NAT_PORTS, + RSS_RESERVED_ADDRESSES, SLED_PREFIX, }; use omicron_common::api::external::{MacAddr, Vni}; use omicron_common::api::internal::shared::{ @@ -34,7 +34,7 @@ use sled_agent_client::{ use sled_storage::dataset::{DatasetKind, DatasetName, CONFIG_DATASET}; use sled_storage::manager::StorageHandle; use slog::Logger; -use std::collections::{BTreeSet, HashMap, HashSet}; +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV6}; use std::num::Wrapping; use thiserror::Error; @@ -43,9 +43,6 @@ use uuid::Uuid; // The number of boundary NTP servers to create from RSS. const BOUNDARY_NTP_COUNT: usize = 2; -// The number of Nexus instances to create from RSS. -const NEXUS_COUNT: usize = 3; - // The number of CRDB instances to create from RSS. const CRDB_COUNT: usize = 5; @@ -442,7 +439,7 @@ impl Plan { } // Provision Nexus zones, continuing to stripe across sleds. - for _ in 0..NEXUS_COUNT { + for _ in 0..NEXUS_REDUNDANCY { let sled = { let which_sled = sled_allocator.next().ok_or(PlanError::NotEnoughSleds)?; @@ -701,7 +698,7 @@ impl Plan { log: &Logger, config: &Config, storage_manager: &StorageHandle, - sleds: &HashMap, + sleds: &BTreeMap, ) -> Result { // Load the information we need about each Sled to be able to allocate // components on it. 
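The hunk above swaps the planner's file-local `NEXUS_COUNT` for the shared `NEXUS_REDUNDANCY` constant; the placement policy itself is unchanged and, per the comment in the diff, keeps striping replicas across sleds. A toy sketch of that loop shape, where the sled names are hypothetical and `cycle()` stands in for the real allocator, which returns `PlanError::NotEnoughSleds` rather than wrapping around:

const NEXUS_REDUNDANCY: usize = 3; // illustrative value only

fn main() {
    let sleds = ["sled-a", "sled-b", "sled-c", "sled-d"];
    let mut allocator = sleds.iter().cycle();
    for i in 0..NEXUS_REDUNDANCY {
        // Each successive replica lands on the next sled in order.
        println!("nexus replica {i} -> {}", allocator.next().unwrap());
    }
}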
@@ -1071,6 +1068,7 @@ mod tests { use crate::bootstrap::params::BootstrapAddressDiscovery; use crate::bootstrap::params::RecoverySiloConfig; use omicron_common::address::IpRange; + use omicron_common::api::internal::shared::RackNetworkConfig; const EXPECTED_RESERVED_ADDRESSES: u16 = 2; const EXPECTED_USABLE_ADDRESSES: u16 = @@ -1142,7 +1140,6 @@ mod tests { "fd01::103", ]; let config = Config { - rack_subnet: Ipv6Addr::LOCALHOST, trust_quorum_peers: None, bootstrap_discovery: BootstrapAddressDiscovery::OnlyOurs, ntp_servers: Vec::new(), @@ -1166,7 +1163,13 @@ mod tests { user_name: "recovery".parse().unwrap(), user_password_hash: "$argon2id$v=19$m=98304,t=13,p=1$RUlWc0ZxaHo0WFdrN0N6ZQ$S8p52j85GPvMhR/ek3GL0el/oProgTwWpHJZ8lsQQoY".parse().unwrap(), }, - rack_network_config: None, + rack_network_config: RackNetworkConfig { + rack_subnet: Ipv6Addr::LOCALHOST.into(), + infra_ip_first: Ipv4Addr::LOCALHOST, + infra_ip_last: Ipv4Addr::LOCALHOST, + ports: Vec::new(), + bgp: Vec::new(), + }, }; let mut svp = ServicePortBuilder::new(&config); diff --git a/sled-agent/src/rack_setup/plan/sled.rs b/sled-agent/src/rack_setup/plan/sled.rs index 07f33893fc..efdd86d2f9 100644 --- a/sled-agent/src/rack_setup/plan/sled.rs +++ b/sled-agent/src/rack_setup/plan/sled.rs @@ -16,7 +16,7 @@ use serde::{Deserialize, Serialize}; use sled_storage::dataset::CONFIG_DATASET; use sled_storage::manager::StorageHandle; use slog::Logger; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, BTreeSet}; use std::net::{Ipv6Addr, SocketAddrV6}; use thiserror::Error; use uuid::Uuid; @@ -46,7 +46,7 @@ const RSS_SLED_PLAN_FILENAME: &str = "rss-sled-plan.json"; #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] pub struct Plan { pub rack_id: Uuid, - pub sleds: HashMap, + pub sleds: BTreeMap, // Store the provided RSS configuration as part of the sled plan; if it // changes after reboot, we need to know. 
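A practical payoff of moving the sled plan from `HashMap`/`HashSet` to their BTree counterparts: BTree collections iterate, and therefore serialize, in sorted key order, so a ledgered plan re-serializes identically from run to run, which is what lets the expectorate-based test below compare plans against checked-in JSON. A minimal, std-only demonstration (the socket-address strings are illustrative):

use std::collections::BTreeMap;

fn main() {
    let mut sleds = BTreeMap::new();
    // Insertion order is scrambled on purpose...
    sleds.insert("[fdb0::3]:12346", "allocation-b");
    sleds.insert("[fdb0::1]:12346", "allocation-a");
    // ...but iteration (and serde serialization) always comes out
    // key-sorted, making the serialized plan byte-stable.
    let keys: Vec<_> = sleds.keys().copied().collect();
    assert_eq!(keys, ["[fdb0::1]:12346", "[fdb0::3]:12346"]);
}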
@@ -81,7 +81,7 @@ impl Plan { log: &Logger, config: &Config, storage_manager: &StorageHandle, - bootstrap_addrs: HashSet, + bootstrap_addrs: BTreeSet, use_trust_quorum: bool, ) -> Result { let rack_id = Uuid::new_v4(); @@ -117,7 +117,7 @@ impl Plan { info!(log, "Serializing plan"); - let mut sleds = std::collections::HashMap::new(); + let mut sleds = BTreeMap::new(); for (addr, allocation) in allocations { sleds.insert(addr, allocation); } @@ -152,4 +152,24 @@ mod tests { &serde_json::to_string_pretty(&schema).unwrap(), ); } + + #[test] + fn test_read_known_rss_sled_plans() { + let known_rss_sled_plans = &["madrid-rss-sled-plan.json"]; + + let path = Utf8PathBuf::from("tests/old-rss-sled-plans"); + let out_path = Utf8PathBuf::from("tests/output/new-rss-sled-plans"); + for sled_plan_basename in known_rss_sled_plans { + println!("checking {:?}", sled_plan_basename); + let contents = + std::fs::read_to_string(path.join(sled_plan_basename)) + .expect("failed to read file"); + let parsed: Plan = + serde_json::from_str(&contents).expect("failed to parse file"); + expectorate::assert_contents( + out_path.join(sled_plan_basename), + &serde_json::to_string_pretty(&parsed).unwrap(), + ); + } + } } diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index af81df52bb..2788e189cc 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -601,58 +601,55 @@ impl ServiceInner { .map(Into::into) .collect(); - let rack_network_config = match &config.rack_network_config { - Some(config) => { - let value = NexusTypes::RackNetworkConfigV1 { - rack_subnet: config.rack_subnet, - infra_ip_first: config.infra_ip_first, - infra_ip_last: config.infra_ip_last, - ports: config - .ports - .iter() - .map(|config| NexusTypes::PortConfigV1 { - port: config.port.clone(), - routes: config - .routes - .iter() - .map(|r| NexusTypes::RouteConfig { - destination: r.destination, - nexthop: r.nexthop, - }) - .collect(), - addresses: config.addresses.clone(), - switch: config.switch.into(), - uplink_port_speed: config.uplink_port_speed.into(), - uplink_port_fec: config.uplink_port_fec.into(), - autoneg: config.autoneg, - bgp_peers: config - .bgp_peers - .iter() - .map(|b| NexusTypes::BgpPeerConfig { - addr: b.addr, - asn: b.asn, - port: b.port.clone(), - hold_time: b.hold_time, - connect_retry: b.connect_retry, - delay_open: b.delay_open, - idle_hold_time: b.idle_hold_time, - keepalive: b.keepalive, - }) - .collect(), - }) - .collect(), - bgp: config - .bgp - .iter() - .map(|config| NexusTypes::BgpConfig { - asn: config.asn, - originate: config.originate.clone(), - }) - .collect(), - }; - Some(value) + let rack_network_config = { + let config = &config.rack_network_config; + NexusTypes::RackNetworkConfigV1 { + rack_subnet: config.rack_subnet, + infra_ip_first: config.infra_ip_first, + infra_ip_last: config.infra_ip_last, + ports: config + .ports + .iter() + .map(|config| NexusTypes::PortConfigV1 { + port: config.port.clone(), + routes: config + .routes + .iter() + .map(|r| NexusTypes::RouteConfig { + destination: r.destination, + nexthop: r.nexthop, + }) + .collect(), + addresses: config.addresses.clone(), + switch: config.switch.into(), + uplink_port_speed: config.uplink_port_speed.into(), + uplink_port_fec: config.uplink_port_fec.into(), + autoneg: config.autoneg, + bgp_peers: config + .bgp_peers + .iter() + .map(|b| NexusTypes::BgpPeerConfig { + addr: b.addr, + asn: b.asn, + port: b.port.clone(), + hold_time: b.hold_time, + connect_retry: 
b.connect_retry, + delay_open: b.delay_open, + idle_hold_time: b.idle_hold_time, + keepalive: b.keepalive, + }) + .collect(), + }) + .collect(), + bgp: config + .bgp + .iter() + .map(|config| NexusTypes::BgpConfig { + asn: config.asn, + originate: config.originate.clone(), + }) + .collect(), } - None => None, }; info!(self.log, "rack_network_config: {:#?}", rack_network_config); @@ -868,14 +865,14 @@ impl ServiceInner { // - Enough peers to create a new plan (if one does not exist) let bootstrap_addrs = match &config.bootstrap_discovery { BootstrapAddressDiscovery::OnlyOurs => { - HashSet::from([local_bootstrap_agent.our_address()]) + BTreeSet::from([local_bootstrap_agent.our_address()]) } BootstrapAddressDiscovery::OnlyThese { addrs } => addrs.clone(), }; let maybe_sled_plan = SledPlan::load(&self.log, storage_manager).await?; if let Some(plan) = &maybe_sled_plan { - let stored_peers: HashSet = + let stored_peers: BTreeSet = plan.sleds.keys().map(|a| *a.ip()).collect(); if stored_peers != bootstrap_addrs { let e = concat!( @@ -931,7 +928,7 @@ impl ServiceInner { schema_version: 1, body: EarlyNetworkConfigBody { ntp_servers: config.ntp_servers.clone(), - rack_network_config: config.rack_network_config.clone(), + rack_network_config: Some(config.rack_network_config.clone()), }, }; info!(self.log, "Writing Rack Network Configuration to bootstore"); diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 77b6bcbed4..bc40187b38 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -57,7 +57,7 @@ use illumos_utils::running_zone::{ }; use illumos_utils::zfs::ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT; use illumos_utils::zone::AddressRequest; -use illumos_utils::zone::Zones; +use illumos_utils::zpool::ZpoolName; use illumos_utils::{execute, PFEXEC}; use internal_dns::resolver::Resolver; use itertools::Itertools; @@ -80,8 +80,7 @@ use omicron_common::api::internal::shared::{ HostPortConfig, RackNetworkConfig, }; use omicron_common::backoff::{ - retry_notify, retry_policy_internal_service_aggressive, retry_policy_local, - BackoffError, + retry_notify, retry_policy_internal_service_aggressive, BackoffError, }; use omicron_common::ledger::{self, Ledger, Ledgerable}; use omicron_common::nexus_config::{ @@ -101,7 +100,6 @@ use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::BTreeMap; use std::collections::HashSet; -use std::iter::FromIterator; use std::net::{IpAddr, Ipv6Addr, SocketAddr}; use std::str::FromStr; use std::sync::atomic::{AtomicBool, Ordering}; @@ -112,6 +110,11 @@ use tokio::sync::{oneshot, MutexGuard}; use tokio::task::JoinHandle; use uuid::Uuid; +#[cfg(test)] +use illumos_utils::zone::MockZones as Zones; +#[cfg(not(test))] +use illumos_utils::zone::Zones; + const IPV6_UNSPECIFIED: IpAddr = IpAddr::V6(Ipv6Addr::UNSPECIFIED); #[derive(thiserror::Error, Debug)] @@ -160,6 +163,16 @@ pub enum Error { err: illumos_utils::running_zone::RunCommandError, }, + #[error("Cannot list zones")] + ZoneList(#[source] illumos_utils::zone::AdmError), + + #[error("Cannot remove zone")] + ZoneRemoval { + zone_name: String, + #[source] + err: illumos_utils::zone::AdmError, + }, + #[error("Failed to boot zone: {0}")] ZoneBoot(#[from] illumos_utils::running_zone::BootError), @@ -169,6 +182,9 @@ pub enum Error { #[error(transparent)] ZoneInstall(#[from] illumos_utils::running_zone::InstallZoneError), + #[error("Failed to initialize zones: {errors:?}")] + ZoneEnsure { errors: Vec<(String, Error)> }, + #[error("Error contacting ddmd: {0}")] 
DdmError(#[from] DdmError), @@ -267,17 +283,47 @@ impl Error { impl From for omicron_common::api::external::Error { fn from(err: Error) -> Self { match err { - err @ Error::RequestedConfigConflicts(_) => { + Error::RequestedConfigConflicts(_) => { omicron_common::api::external::Error::invalid_request( &err.to_string(), ) } - err @ Error::RequestedConfigOutdated { .. } => { + Error::RequestedConfigOutdated { .. } => { omicron_common::api::external::Error::conflict(&err.to_string()) } - err @ Error::TimeNotSynchronized => { + Error::TimeNotSynchronized => { omicron_common::api::external::Error::unavail(&err.to_string()) } + Error::ZoneEnsure { errors } => { + // As a special case, if any zones failed to timesync, + // prioritize that error. + // + // This conversion to a 503 error was requested in + // https://github.com/oxidecomputer/omicron/issues/4776 , + // and we preserve that behavior here, even though we may + // launch many zones at the same time. + if let Some(err) = errors.iter().find_map(|(_, err)| { + if matches!(err, Error::TimeNotSynchronized) { + Some(err) + } else { + None + } + }) { + omicron_common::api::external::Error::unavail( + &err.to_string(), + ) + } else { + let internal_message = errors + .iter() + .map(|(name, err)| { + format!("failed to start {name}: {err:?}") + }) + .join("\n"); + omicron_common::api::external::Error::InternalError { + internal_message, + } + } + } _ => omicron_common::api::external::Error::InternalError { internal_message: err.to_string(), }, @@ -300,27 +346,6 @@ fn display_zone_init_errors(errors: &[(String, Box)]) -> String { output } -// Does this zone require time synchronization before it is initialized?" -// -// This function is somewhat conservative - the set of services -// that can be launched before timesync has completed is intentionally kept -// small, since it would be easy to add a service that expects time to be -// reasonably synchronized. -fn zone_requires_timesync(zone_type: &OmicronZoneType) -> bool { - match zone_type { - // These zones can be initialized and started before time has been - // synchronized. For the NTP zones, this should be self-evident -- - // we need the NTP zone to actually perform time synchronization! - // - // The DNS zone is a bit of an exception here, since the NTP zone - // itself may rely on DNS lookups as a dependency. - OmicronZoneType::BoundaryNtp { .. } - | OmicronZoneType::InternalNtp { .. } - | OmicronZoneType::InternalDns { .. } => false, - _ => true, - } -} - /// Configuration parameters which modify the [`ServiceManager`]'s behavior. pub struct Config { /// Identifies the sled being configured @@ -343,7 +368,13 @@ const ZONES_LEDGER_FILENAME: &str = "omicron-zones.json"; /// wants for all of its zones) with the locally-determined configuration for /// these zones. 
#[derive( - Clone, Debug, serde::Serialize, serde::Deserialize, schemars::JsonSchema, + Clone, + Debug, + Eq, + PartialEq, + serde::Serialize, + serde::Deserialize, + schemars::JsonSchema, )] pub struct OmicronZonesConfigLocal { /// generation of the Omicron-provided part of the configuration @@ -404,7 +435,13 @@ impl OmicronZonesConfigLocal { /// wants for this zone) with any locally-determined configuration (like the /// path to the root filesystem) #[derive( - Clone, Debug, serde::Serialize, serde::Deserialize, schemars::JsonSchema, + Clone, + Debug, + Eq, + PartialEq, + serde::Serialize, + serde::Deserialize, + schemars::JsonSchema, )] pub struct OmicronZoneConfigLocal { pub zone: OmicronZoneConfig, @@ -551,7 +588,33 @@ enum SledLocalZone { }, } -type ZoneMap = BTreeMap; +// The return type for `start_omicron_zones`. +// +// When multiple zones are started concurrently, some can fail while others +// succeed. This structure allows the function to return this nuanced +// information. +#[must_use] +struct StartZonesResult { + // The set of zones which have successfully started. + new_zones: Vec, + + // The set of (zone name, error) of zones that failed to start. + errors: Vec<(String, Error)>, +} + +// A running zone and the configuration which started it. +struct OmicronZone { + runtime: RunningZone, + config: OmicronZoneConfigLocal, +} + +impl OmicronZone { + fn name(&self) -> &str { + self.runtime.name() + } +} + +type ZoneMap = BTreeMap; /// Manages miscellaneous Sled-local services. pub struct ServiceManagerInner { @@ -718,7 +781,7 @@ impl ServiceManager { &self, // This argument attempts to ensure that the caller holds the right // lock. - _map: &MutexGuard<'_, BTreeMap>, + _map: &MutexGuard<'_, ZoneMap>, ) -> Result>, Error> { // First, try to load the current software's zone ledger. If that // works, we're done. @@ -893,84 +956,9 @@ impl ServiceManager { let omicron_zones_config = zones_config.clone().to_omicron_zones_config(); - // Initialize internal DNS only first: we need it to look up the - // boundary switch addresses. This dependency is implicit: when we call - // `ensure_all_omicron_zones` below, we eventually land in - // `opte_ports_needed()`, which for some service types (including Ntp - // but _not_ including InternalDns), we perform internal DNS lookups. - let all_zones_request = self - .ensure_all_omicron_zones( - &mut existing_zones, - None, - omicron_zones_config.clone(), - |z: &OmicronZoneConfig| { - matches!(z.zone_type, OmicronZoneType::InternalDns { .. }) - }, - ) - .await?; - - // Initialize NTP services next as they are required for time - // synchronization, which is a pre-requisite for the other services. We - // keep `OmicronZoneType::InternalDns` because - // `ensure_all_omicron_zones` is additive. - let all_zones_request = self - .ensure_all_omicron_zones( - &mut existing_zones, - Some(&all_zones_request), - omicron_zones_config.clone(), - |z: &OmicronZoneConfig| { - matches!( - z.zone_type, - OmicronZoneType::InternalDns { .. } - | OmicronZoneType::BoundaryNtp { .. } - | OmicronZoneType::InternalNtp { .. } - ) - }, - ) - .await?; - - drop(existing_zones); - - info!(&self.inner.log, "Waiting for sled time synchronization"); - - retry_notify( - retry_policy_local(), - || async { - match self.timesync_get().await { - Ok(TimeSync { sync: true, .. 
}) => { - info!(&self.inner.log, "Time is synchronized"); - Ok(()) - } - Ok(ts) => Err(BackoffError::transient(format!( - "No sync {:?}", - ts - ))), - Err(e) => Err(BackoffError::transient(format!( - "Error checking for time synchronization: {}", - e - ))), - } - }, - |error, delay| { - warn!( - self.inner.log, - "Time not yet synchronised (retrying in {:?})", - delay; - "error" => ?error - ); - }, - ) - .await - .expect("Expected an infinite retry loop syncing time"); - - let mut existing_zones = self.inner.zones.lock().await; - - // Initialize all remaining services self.ensure_all_omicron_zones( &mut existing_zones, - Some(&all_zones_request), omicron_zones_config, - |_| true, ) .await?; Ok(()) @@ -2688,17 +2676,73 @@ impl ServiceManager { Ok(running_zone) } - // Populates `existing_zones` according to the requests in `services`. - async fn initialize_omicron_zones_locked( + // Ensures that a single Omicron zone is running. + // + // This method is NOT idempotent. + // + // - If the zone already exists, in any form, it is fully removed + // before being initialized. This is primarily intended to remove "partially + // stopped/started" zones with detritus from interfering with a new zone + // being launched. + // - If zones need time to be synchronized before they are initialized + // (e.g., this is a hard requirement for CockroachDb) they can check the + // `time_is_synchronized` argument. + // - `all_u2_pools` provides a snapshot into durable storage on this sled, + // which gives the storage manager an opportunity to validate the zone's + // storage configuration against the reality of the current sled. + async fn start_omicron_zone( &self, - existing_zones: &mut BTreeMap, - requests: &Vec, - ) -> Result<(), Error> { - if let Some(name) = requests - .iter() - .map(|request| request.zone.zone_name()) - .duplicates() - .next() + zone: &OmicronZoneConfig, + time_is_synchronized: bool, + all_u2_pools: &Vec, + ) -> Result { + // Ensure the zone has been fully removed before we try to boot it. + // + // This ensures that old "partially booted/stopped" zones do not + // interfere with our installation. + self.ensure_removed(&zone).await?; + + // If this zone requires timesync and we aren't ready, fail it early. + if zone.zone_type.requires_timesync() && !time_is_synchronized { + return Err(Error::TimeNotSynchronized); + } + + // Ensure that this zone's storage is ready. + let root = self + .validate_storage_and_pick_mountpoint(&zone, &all_u2_pools) + .await?; + + let config = OmicronZoneConfigLocal { zone: zone.clone(), root }; + + let runtime = self + .initialize_zone( + ZoneArgs::Omicron(&config), + // filesystems= + &[], + // data_links= + &[], + ) + .await?; + + Ok(OmicronZone { runtime, config }) + } + + // Concurrently attempts to start all zones identified by requests. + // + // This method is NOT idempotent. + // + // If we try to start ANY zones concurrently, the result is contained + // in the `StartZonesResult` value. This will contain the set of zones which + // were initialized successfully, as well as the set of zones which failed + // to start. 
+ async fn start_omicron_zones( + &self, + requests: impl Iterator<Item = &OmicronZoneConfig> + Clone, + time_is_synchronized: bool, + all_u2_pools: &Vec<ZpoolName>, + ) -> Result<StartZonesResult, Error> { + if let Some(name) = + requests.clone().map(|zone| zone.zone_name()).duplicates().next() { return Err(Error::BadServiceRequest { service: name, @@ -2706,38 +2750,29 @@ }); } - let futures = requests.iter().map(|request| { - async move { - self.initialize_zone( - ZoneArgs::Omicron(request), - // filesystems= - &[], - // data_links= - &[], - ) + let futures = requests.map(|zone| async move { + self.start_omicron_zone(&zone, time_is_synchronized, all_u2_pools) .await - .map_err(|error| (request.zone.zone_name(), error)) - } + .map_err(|err| (zone.zone_name().to_string(), err)) }); + let results = futures::future::join_all(futures).await; + let mut new_zones = Vec::new(); let mut errors = Vec::new(); for result in results { match result { Ok(zone) => { - existing_zones.insert(zone.name().to_string(), zone); + info!(self.inner.log, "Zone started"; "zone" => zone.name()); + new_zones.push(zone); } - Err((zone_name, error)) => { - errors.push((zone_name, Box::new(error))); + Err((name, error)) => { + warn!(self.inner.log, "Zone failed to start"; "zone" => &name); + errors.push((name, error)) } } } - - if !errors.is_empty() { - return Err(Error::ZoneInitialize(errors)); - } - - Ok(()) + Ok(StartZonesResult { new_zones, errors }) } /// Create a zone bundle for the provided zone. @@ -2761,7 +2796,7 @@ return self .inner .zone_bundler - .create(zone, ZoneBundleCause::ExplicitRequest) + .create(&zone.runtime, ZoneBundleCause::ExplicitRequest) .await; } Err(BundleError::NoSuchZone { name: name.to_string() }) @@ -2799,7 +2834,7 @@ /// boot. pub async fn ensure_all_omicron_zones_persistent( &self, - request: OmicronZonesConfig, + mut request: OmicronZonesConfig, ) -> Result<(), Error> { let log = &self.inner.log; @@ -2838,21 +2873,47 @@ // If the generation is the same as what we're running, but the contents // aren't, that's a problem, too. - if ledger_zone_config.omicron_generation == request.generation - && ledger_zone_config.clone().to_omicron_zones_config().zones - != request.zones - { - return Err(Error::RequestedConfigConflicts(request.generation)); + if ledger_zone_config.omicron_generation == request.generation { + // Nexus should send us consistent zone orderings; however, we may + // reorder the zone list inside `ensure_all_omicron_zones`. To avoid + // equality checks failing only because the two lists are ordered + // differently, sort them both here before comparing. + let mut ledger_zones = + ledger_zone_config.clone().to_omicron_zones_config().zones; + + // We sort by ID because we assume no two zones have the same ID. If + // that assumption is wrong, we may return an error here where the + // conflict is solely the list orders, but in such a case that's the + // least of our problems.
+ ledger_zones.sort_by_key(|z| z.id); + request.zones.sort_by_key(|z| z.id); + + if ledger_zones != request.zones { + return Err(Error::RequestedConfigConflicts( + request.generation, + )); + } } - let new_config = self - .ensure_all_omicron_zones( - &mut existing_zones, - Some(ledger_zone_config), - request, - |_| true, - ) - .await?; + let omicron_generation = request.generation; + let ledger_generation = ledger_zone_config.ledger_generation; + self.ensure_all_omicron_zones(&mut existing_zones, request).await?; + let zones = existing_zones + .values() + .map(|omicron_zone| omicron_zone.config.clone()) + .collect(); + + let new_config = OmicronZonesConfigLocal { + omicron_generation, + ledger_generation, + zones, + }; + + // If the contents of the ledger would be identical, we can avoid + // performing an update and commit. + if *ledger_zone_config == new_config { + return Ok(()); + } // Update the zones in the ledger and write it back to both M.2s *ledger_zone_config = new_config; @@ -2863,44 +2924,48 @@ impl ServiceManager { // Ensures that only the following Omicron zones are running. // - // Does not record any information such that these services are - // re-instantiated on boot. - async fn ensure_all_omicron_zones( + // This method strives to be idempotent. + // + // - Starting and stopping zones is not an atomic operation - it's possible + // that we cannot start a zone after a previous one has been successfully + // created (or destroyed) intentionally. As a result, even in error cases, + // it's possible that the set of `existing_zones` changes. However, this set + // will only change in the direction of `new_request`: zones will only be + // removed if they ARE NOT part of `new_request`, and zones will only be + // added if they ARE part of `new_request`. + // - Zones are not updated in-place: two zone configurations that differ + // in any way are treated as entirely distinct. + // - This method does not record any information such that these services + // are re-instantiated on boot. + async fn ensure_all_omicron_zones( &self, // The MutexGuard here attempts to ensure that the caller has the right // lock held when calling this function. - existing_zones: &mut MutexGuard<'_, BTreeMap>, - old_config: Option<&OmicronZonesConfigLocal>, + existing_zones: &mut MutexGuard<'_, ZoneMap>, new_request: OmicronZonesConfig, - filter: F, - ) -> Result - where - F: Fn(&OmicronZoneConfig) -> bool, - { - let log = &self.inner.log; - + ) -> Result<(), Error> { // Do some data-normalization to ensure we can compare the "requested // set" vs the "existing set" as HashSets. - let old_zones_set: HashSet = old_config - .map(|old_config| { - HashSet::from_iter( - old_config.zones.iter().map(|z| z.zone.clone()), - ) - }) - .unwrap_or_else(HashSet::new); - let requested_zones_set = - HashSet::from_iter(new_request.zones.into_iter().filter(filter)); + let old_zone_configs: HashSet = existing_zones + .values() + .map(|omicron_zone| omicron_zone.config.zone.clone()) + .collect(); + let requested_zones_set: HashSet = + new_request.zones.into_iter().collect(); let zones_to_be_removed = - old_zones_set.difference(&requested_zones_set); - let zones_to_be_added = requested_zones_set.difference(&old_zones_set); + old_zone_configs.difference(&requested_zones_set); + let zones_to_be_added = + requested_zones_set.difference(&old_zone_configs); - // For each new zone request, ensure that we've sufficiently - // synchronized time. 
- // - // NOTE: This imposes a constraint, during initial setup, cold boot, - // etc, that NTP and the internal DNS system it depends on MUST be - // initialized prior to other zones. + // Destroy zones that should not be running + for zone in zones_to_be_removed { + self.zone_bundle_and_try_remove(existing_zones, &zone).await; + } + + // Collect information that's necessary to start new zones + let storage = self.inner.storage.get_latest_resources().await; + let all_u2_pools = storage.all_u2_zpools(); let time_is_synchronized = match self.timesync_get_locked(&existing_zones).await { // Time is synchronized @@ -2908,166 +2973,179 @@ impl ServiceManager { // Time is not synchronized, or we can't check _ => false, }; - for zone in zones_to_be_added.clone() { - if zone_requires_timesync(&zone.zone_type) && !time_is_synchronized - { - return Err(Error::TimeNotSynchronized); - } + + // Concurrently boot all new zones + let StartZonesResult { new_zones, errors } = self + .start_omicron_zones( + zones_to_be_added, + time_is_synchronized, + &all_u2_pools, + ) + .await?; + + // Add the new zones to our tracked zone set + existing_zones.extend( + new_zones.into_iter().map(|zone| (zone.name().to_string(), zone)), + ); + + // If any zones failed to start, exit with an error + if !errors.is_empty() { + return Err(Error::ZoneEnsure { errors }); } + Ok(()) + } - // Destroy zones that should not be running - for zone in zones_to_be_removed { - let expected_zone_name = zone.zone_name(); - if let Some(mut zone) = existing_zones.remove(&expected_zone_name) { - debug!( - log, - "removing an existing zone"; - "zone_name" => &expected_zone_name, + // Attempts to take a zone bundle and remove a zone. + // + // Logs, but does not return an error on failure. + async fn zone_bundle_and_try_remove( + &self, + existing_zones: &mut MutexGuard<'_, ZoneMap>, + zone: &OmicronZoneConfig, + ) { + let log = &self.inner.log; + let expected_zone_name = zone.zone_name(); + let Some(mut zone) = existing_zones.remove(&expected_zone_name) else { + warn!( + log, + "Expected to remove zone, but could not find it"; + "zone_name" => &expected_zone_name, + ); + return; + }; + debug!( + log, + "removing an existing zone"; + "zone_name" => &expected_zone_name, + ); + if let Err(e) = self + .inner + .zone_bundler + .create(&zone.runtime, ZoneBundleCause::UnexpectedZone) + .await + { + error!( + log, + "Failed to take bundle of unexpected zone"; + "zone_name" => &expected_zone_name, + "reason" => ?e, + ); + } + if let Err(e) = zone.runtime.stop().await { + error!(log, "Failed to stop zone {}: {e}", zone.name()); + } + } + + // Ensures that if a zone is about to be installed, it does not exist. 
+ async fn ensure_removed( + &self, + zone: &OmicronZoneConfig, + ) -> Result<(), Error> { + let zone_name = zone.zone_name(); + match Zones::find(&zone_name).await { + Ok(Some(zone)) => { + warn!( + self.inner.log, + "removing zone"; + "zone" => &zone_name, + "state" => ?zone.state(), ); - if let Err(e) = self - .inner - .zone_bundler - .create(&zone, ZoneBundleCause::UnexpectedZone) - .await + if let Err(e) = + Zones::halt_and_remove_logged(&self.inner.log, &zone_name) + .await { error!( - log, - "Failed to take bundle of unexpected zone"; - "zone_name" => &expected_zone_name, - "reason" => ?e, + self.inner.log, + "Failed to remove zone"; + "zone" => &zone_name, + "error" => %e, ); + return Err(Error::ZoneRemoval { + zone_name: zone_name.to_string(), + err: e, + }); } - if let Err(e) = zone.stop().await { - error!(log, "Failed to stop zone {}: {e}", zone.name()); - } - } else { - warn!(log, "Expected to remove zone, but could not find it"); + return Ok(()); } + Ok(None) => return Ok(()), + Err(err) => return Err(Error::ZoneList(err)), } + } - // Create zones that should be running - let storage = self.inner.storage.get_latest_resources().await; - let all_u2_pools = storage.all_u2_zpools(); - - let mut new_zones = Vec::new(); - for zone in zones_to_be_added { - // Check if we think the zone should already be running - let name = zone.zone_name(); - if existing_zones.contains_key(&name) { - // Make sure the zone actually exists in the right state too - match Zones::find(&name).await { - Ok(Some(zone)) if zone.state() == zone::State::Running => { - info!(log, "skipping running zone"; "zone" => &name); - continue; - } - _ => { - // Mismatch between SA's view and reality, let's try to - // clean up any remanants and try initialize it again - warn!( - log, - "expected to find existing zone in running state"; - "zone" => &name, - ); - if let Err(e) = - existing_zones.remove(&name).unwrap().stop().await - { - error!( - log, - "Failed to stop zone"; - "zone" => &name, - "error" => %e, - ); - } - } - } - } + // Returns a zone filesystem mountpoint, after ensuring that U.2 storage + // is valid. + async fn validate_storage_and_pick_mountpoint( + &self, + zone: &OmicronZoneConfig, + all_u2_pools: &Vec, + ) -> Result { + let name = zone.zone_name(); + + // For each new zone request, we pick a U.2 to store the zone + // filesystem. Note: This isn't known to Nexus right now, so it's a + // local-to-sled decision. + // + // Currently, the zone filesystem should be destroyed between + // reboots, so it's fine to make this decision locally. + let root = if let Some(dataset) = zone.dataset_name() { + // Check that the dataset is actually ready to be used. + let [zoned, canmount, encryption] = + illumos_utils::zfs::Zfs::get_values( + &dataset.full_name(), + &["zoned", "canmount", "encryption"], + ) + .map_err(|err| Error::GetZfsValue { + zone: zone.zone_name(), + source: err, + })?; - // For each new zone request, we pick a U.2 to store the zone - // filesystem. Note: This isn't known to Nexus right now, so it's a - // local-to-sled decision. - // - // Currently, the zone filesystem should be destroyed between - // reboots, so it's fine to make this decision locally. - let root = if let Some(dataset) = zone.dataset_name() { - // Check that the dataset is actually ready to be used. 
- let [zoned, canmount, encryption] = - illumos_utils::zfs::Zfs::get_values( - &dataset.full_name(), - &["zoned", "canmount", "encryption"], - ) - .map_err(|err| Error::GetZfsValue { + let check_property = |name, actual, expected| { + if actual != expected { + return Err(Error::DatasetNotReady { zone: zone.zone_name(), - source: err, - })?; - - let check_property = |name, actual, expected| { - if actual != expected { - return Err(Error::DatasetNotReady { - zone: zone.zone_name(), - dataset: dataset.full_name(), - prop_name: String::from(name), - prop_value: actual, - prop_value_expected: String::from(expected), - }); - } - return Ok(()); - }; - check_property("zoned", zoned, "on")?; - check_property("canmount", canmount, "on")?; - if dataset.dataset().dataset_should_be_encrypted() { - check_property("encryption", encryption, "aes-256-gcm")?; - } - - // If the zone happens to already manage a dataset, then - // we co-locate the zone dataset on the same zpool. - // - // This slightly reduces the underlying fault domain for the - // service. - let data_pool = dataset.pool(); - if !all_u2_pools.contains(&data_pool) { - warn!( - log, - "zone dataset requested on a zpool which doesn't exist"; - "zone" => &name, - "zpool" => %data_pool - ); - return Err(Error::MissingDevice { - device: format!("zpool: {data_pool}"), + dataset: dataset.full_name(), + prop_name: String::from(name), + prop_value: actual, + prop_value_expected: String::from(expected), }); } - data_pool.dataset_mountpoint(ZONE_DATASET) - } else { - // If the zone it not coupled to other datsets, we pick one - // arbitrarily. - let mut rng = rand::thread_rng(); - all_u2_pools - .choose(&mut rng) - .map(|pool| pool.dataset_mountpoint(ZONE_DATASET)) - .ok_or_else(|| Error::U2NotFound)? - .clone() + return Ok(()); }; - - new_zones.push(OmicronZoneConfigLocal { zone: zone.clone(), root }); - } - - self.initialize_omicron_zones_locked(existing_zones, &new_zones) - .await?; - - if let Some(old_config) = old_config { - for old_zone in &old_config.zones { - if requested_zones_set.contains(&old_zone.zone) { - new_zones.push(old_zone.clone()); - } + check_property("zoned", zoned, "on")?; + check_property("canmount", canmount, "on")?; + if dataset.dataset().dataset_should_be_encrypted() { + check_property("encryption", encryption, "aes-256-gcm")?; } - } - Ok(OmicronZonesConfigLocal { - omicron_generation: new_request.generation, - ledger_generation: old_config - .map(|c| c.ledger_generation) - .unwrap_or_else(Generation::new), - zones: new_zones, - }) + // If the zone happens to already manage a dataset, then + // we co-locate the zone dataset on the same zpool. + // + // This slightly reduces the underlying fault domain for the + // service. + let data_pool = dataset.pool(); + if !all_u2_pools.contains(&data_pool) { + warn!( + self.inner.log, + "zone dataset requested on a zpool which doesn't exist"; + "zone" => &name, + "zpool" => %data_pool + ); + return Err(Error::MissingDevice { + device: format!("zpool: {data_pool}"), + }); + } + data_pool.dataset_mountpoint(ZONE_DATASET) + } else { + // If the zone is not coupled to other datasets, we pick one + // arbitrarily. + let mut rng = rand::thread_rng(); + all_u2_pools + .choose(&mut rng) + .map(|pool| pool.dataset_mountpoint(ZONE_DATASET)) + .ok_or_else(|| Error::U2NotFound)?
+ .clone() + }; + Ok(root) } pub async fn cockroachdb_initialize(&self) -> Result<(), Error> { @@ -3080,7 +3158,7 @@ impl ServiceManager { if zone.name().contains(&ZoneType::CockroachDb.to_string()) { let address = Zones::get_address( Some(zone.name()), - &zone.control_interface(), + &zone.runtime.control_interface(), )? .ip(); let host = &format!("[{address}]:{COCKROACH_PORT}"); @@ -3088,7 +3166,7 @@ impl ServiceManager { log, "Initializing CRDB Cluster - sending request to {host}" ); - if let Err(err) = zone.run_cmd(&[ + if let Err(err) = zone.runtime.run_cmd(&[ "/opt/oxide/cockroachdb/bin/cockroach", "init", "--insecure", @@ -3103,26 +3181,28 @@ impl ServiceManager { } }; info!(log, "Formatting CRDB"); - zone.run_cmd(&[ - "/opt/oxide/cockroachdb/bin/cockroach", - "sql", - "--insecure", - "--host", - host, - "--file", - "/opt/oxide/cockroachdb/sql/dbwipe.sql", - ]) - .map_err(|err| Error::CockroachInit { err })?; - zone.run_cmd(&[ - "/opt/oxide/cockroachdb/bin/cockroach", - "sql", - "--insecure", - "--host", - host, - "--file", - "/opt/oxide/cockroachdb/sql/dbinit.sql", - ]) - .map_err(|err| Error::CockroachInit { err })?; + zone.runtime + .run_cmd(&[ + "/opt/oxide/cockroachdb/bin/cockroach", + "sql", + "--insecure", + "--host", + host, + "--file", + "/opt/oxide/cockroachdb/sql/dbwipe.sql", + ]) + .map_err(|err| Error::CockroachInit { err })?; + zone.runtime + .run_cmd(&[ + "/opt/oxide/cockroachdb/bin/cockroach", + "sql", + "--insecure", + "--host", + host, + "--file", + "/opt/oxide/cockroachdb/sql/dbinit.sql", + ]) + .map_err(|err| Error::CockroachInit { err })?; info!(log, "Formatting CRDB - Completed"); // In the single-sled case, if there are multiple CRDB nodes on @@ -3203,7 +3283,8 @@ impl ServiceManager { // connect to the UNIX socket at // format!("{}/var/run/chrony/chronyd.sock", ntp_zone.root()) - match ntp_zone.run_cmd(&["/usr/bin/chronyc", "-c", "tracking"]) { + match ntp_zone.runtime.run_cmd(&["/usr/bin/chronyc", "-c", "tracking"]) + { Ok(stdout) => { let v: Vec<&str> = stdout.split(',').collect(); @@ -3793,6 +3874,15 @@ mod test { expected_zone_name_prefix: &str, ) -> Vec> { illumos_utils::USE_MOCKS.store(true, Ordering::SeqCst); + + // Ensure zone doesn't already exist + let find_zone_ctx = MockZones::find_context(); + let prefix = expected_zone_name_prefix.to_string(); + find_zone_ctx.expect().return_once(move |zone_name| { + assert!(zone_name.starts_with(&prefix)); + Ok(None) + }); + // Create a VNIC let create_vnic_ctx = MockDladm::create_vnic_context(); create_vnic_ctx.expect().return_once( @@ -3850,6 +3940,7 @@ mod test { }); vec![ + Box::new(find_zone_ctx), Box::new(create_vnic_ctx), Box::new(install_ctx), Box::new(boot_ctx), @@ -3867,6 +3958,11 @@ mod test { // because these functions may return any number of times. fn expect_new_services() -> Vec> { illumos_utils::USE_MOCKS.store(true, Ordering::SeqCst); + + // Ensure zones don't already exist + let find_zone_ctx = MockZones::find_context(); + find_zone_ctx.expect().returning(move |_zone_name| Ok(None)); + // Create a VNIC let create_vnic_ctx = MockDladm::create_vnic_context(); create_vnic_ctx.expect().returning( @@ -3925,6 +4021,7 @@ mod test { }); vec![ + Box::new(find_zone_ctx), Box::new(create_vnic_ctx), Box::new(install_ctx), Box::new(boot_ctx), @@ -4214,9 +4311,24 @@ mod test { OmicronZoneType::Oximeter { address }, ) .await; + + // First, ensure this is the right kind of error. 
+ let err = result.unwrap_err(); + let errors = match &err { + Error::ZoneEnsure { errors } => errors, + err => panic!("unexpected result: {err:?}"), + }; + assert_eq!(errors.len(), 1); assert_matches::assert_matches!( - result, - Err(Error::TimeNotSynchronized) + errors[0].1, + Error::TimeNotSynchronized + ); + + // Next, ensure this still converts to an "unavail" common error + let common_err = omicron_common::api::external::Error::from(err); + assert_matches::assert_matches!( + common_err, + omicron_common::api::external::Error::ServiceUnavailable { .. } ); // Should succeed: we don't care that time has not yet synchronized (for @@ -4521,88 +4633,6 @@ mod test { logctx.cleanup_successful(); } - #[tokio::test] - async fn test_old_ledger_migration_continue() { - // This test is just like "test_old_ledger_migration", except that we - // deploy a new zone after migration and before shutting down the - // service manager. This tests that new changes modify the new, - // migrated config. - let logctx = omicron_test_utils::dev::test_setup_log( - "test_old_ledger_migration_continue", - ); - let test_config = TestConfig::new().await; - - // Before we start the service manager, stuff one of our old-format - // service ledgers into place. - let contents = - include_str!("../tests/old-service-ledgers/rack2-sled10.json"); - std::fs::write( - test_config.config_dir.path().join(SERVICES_LEDGER_FILENAME), - contents, - ) - .expect("failed to copy example old-format services ledger into place"); - - // Now start the service manager. - let helper = - LedgerTestHelper::new(logctx.log.clone(), &test_config).await; - let mgr = helper.clone().new_service_manager(); - LedgerTestHelper::sled_agent_started(&logctx.log, &test_config, &mgr); - - // Trigger the migration code. - let unused = Mutex::new(BTreeMap::new()); - let migrated_ledger = mgr - .load_ledgered_zones(&unused.lock().await) - .await - .expect("failed to load ledgered zones") - .unwrap(); - - // The other test verified that migration has happened normally so let's - // assume it has. Now provision a new zone. - let vv = migrated_ledger.data().omicron_generation.next(); - let id = Uuid::new_v4(); - - let _expectations = expect_new_services(); - let address = - SocketAddrV6::new(Ipv6Addr::LOCALHOST, EXPECTED_PORT, 0, 0); - let mut zones = - migrated_ledger.data().clone().to_omicron_zones_config().zones; - zones.push(OmicronZoneConfig { - id, - underlay_address: Ipv6Addr::LOCALHOST, - zone_type: OmicronZoneType::InternalNtp { - address, - ntp_servers: vec![], - dns_servers: vec![], - domain: None, - }, - }); - mgr.ensure_all_omicron_zones_persistent(OmicronZonesConfig { - generation: vv, - zones, - }) - .await - .expect("failed to add new zone after migration"); - let found = - mgr.omicron_zones_list().await.expect("failed to list zones"); - assert_eq!(found.generation, vv); - assert_eq!(found.zones.len(), migrated_ledger.data().zones.len() + 1); - - // Just to be sure, shut down the manager and create a new one without - // triggering migration again. It should now report one more zone than - // was migrated earlier. 
- drop_service_manager(mgr); - - let mgr = helper.new_service_manager(); - LedgerTestHelper::sled_agent_started(&logctx.log, &test_config, &mgr); - let found = - mgr.omicron_zones_list().await.expect("failed to list zones"); - assert_eq!(found.generation, vv); - assert_eq!(found.zones.len(), migrated_ledger.data().zones.len() + 1); - - drop_service_manager(mgr); - logctx.cleanup_successful(); - } - #[tokio::test] async fn test_old_ledger_migration_bad() { let logctx = omicron_test_utils::dev::test_setup_log( diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs index b214667631..fd5995b8f1 100644 --- a/sled-agent/src/sim/server.rs +++ b/sled-agent/src/sim/server.rs @@ -26,6 +26,8 @@ use omicron_common::FileKv; use slog::{info, Drain, Logger}; use std::collections::HashMap; use std::net::IpAddr; +use std::net::Ipv4Addr; +use std::net::Ipv6Addr; use std::net::SocketAddr; use std::net::SocketAddrV6; use std::sync::Arc; @@ -455,7 +457,13 @@ pub async fn run_standalone_server( external_port_count: NexusTypes::ExternalPortDiscovery::Static( HashMap::new(), ), - rack_network_config: None, + rack_network_config: NexusTypes::RackNetworkConfigV1 { + rack_subnet: Ipv6Addr::LOCALHOST.into(), + infra_ip_first: Ipv4Addr::LOCALHOST, + infra_ip_last: Ipv4Addr::LOCALHOST, + ports: Vec::new(), + bgp: Vec::new(), + }, }; handoff_to_nexus(&log, &config, &rack_init_request).await?; diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index eaf354db26..1a634a6346 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -9,7 +9,7 @@ use crate::bootstrap::config::BOOTSTRAP_AGENT_RACK_INIT_PORT; use crate::bootstrap::early_networking::{ EarlyNetworkConfig, EarlyNetworkSetupError, }; -use crate::bootstrap::params::StartSledAgentRequest; +use crate::bootstrap::params::{BaseboardId, StartSledAgentRequest}; use crate::config::Config; use crate::instance_manager::{InstanceManager, ReservoirMode}; use crate::long_running_tasks::LongRunningTaskHandles; @@ -105,6 +105,9 @@ pub enum Error { #[error("Failed to operate on underlay device: {0}")] Underlay(#[from] underlay::Error), + #[error("Failed to request firewall rules")] + FirewallRequest(#[source] nexus_client::Error), + #[error(transparent)] Services(#[from] crate::services::Error), @@ -602,11 +605,24 @@ impl SledAgent { retry_notify( retry_policy_internal_service_aggressive(), || async { - self.inner - .services - .load_services() + // Load as many services as we can, and don't exit immediately + // upon failure... + let load_services_result = + self.inner.services.load_services().await.map_err(|err| { + BackoffError::transient(Error::from(err)) + }); + + // ... and request firewall rule updates for as many services as + // we can. Note that we still make this request even if we only + // partially load some services. + let firewall_result = self + .request_firewall_update() .await - .map_err(|err| BackoffError::transient(err)) + .map_err(|err| BackoffError::transient(err)); + + // Only complete if we have loaded all services and firewall + // rules successfully. + load_services_result.and(firewall_result) }, |err, delay| { warn!( @@ -618,10 +634,6 @@ impl SledAgent { ) .await .unwrap(); // we retry forever, so this can't fail - - // Now that we've initialized the sled services, notify nexus again - // at which point it'll plumb any necessary firewall rules back to us. 
- self.notify_nexus_about_self(&self.log); } pub(crate) fn switch_zone_underlay_info( @@ -642,7 +654,26 @@ impl SledAgent { &self.inner.start_request } - // Sends a request to Nexus informing it that the current sled exists. + /// Requests firewall rules from Nexus. + /// + /// Does not retry upon failure. + async fn request_firewall_update(&self) -> Result<(), Error> { + let sled_id = self.inner.id; + + self.inner + .nexus_client + .client() + .sled_firewall_rules_request(&sled_id) + .await + .map_err(|err| Error::FirewallRequest(err))?; + Ok(()) + } + + /// Sends a request to Nexus informing it that the current sled exists, + /// with information about the existing set of hardware. + /// + /// Does not block until Nexus is available -- the future created by this + /// function is retried in a queue that is polled in the background. pub(crate) fn notify_nexus_about_self(&self, log: &Logger) { let sled_id = self.inner.id; let nexus_client = self.inner.nexus_client.clone(); @@ -658,7 +689,7 @@ impl SledAgent { let log = log.clone(); let fut = async move { // Notify the control plane that we're up, and continue trying this - // until it succeeds. We retry with an randomized, capped + // until it succeeds. We retry with a randomized, capped // exponential backoff. // // TODO-robustness if this returns a 400 error, we probably want to @@ -1187,8 +1218,8 @@ pub enum AddSledError { }, #[error("Failed to connect to DDM")] DdmAdminClient(#[source] ddm_admin_client::DdmError), - #[error("Failed to learn bootstrap ip for {0}")] - NotFound(Baseboard), + #[error("Failed to learn bootstrap ip for {0:?}")] + NotFound(BaseboardId), #[error("Failed to initialize {sled_id}: {err}")] BootstrapTcpClient { sled_id: Baseboard, @@ -1199,7 +1230,7 @@ pub enum AddSledError { /// Add a sled to an initialized rack. pub async fn sled_add( log: Logger, - sled_id: Baseboard, + sled_id: BaseboardId, request: StartSledAgentRequest, ) -> Result<(), AddSledError> { // Get all known bootstrap addresses via DDM @@ -1227,16 +1258,20 @@ pub async fn sled_add( }) .collect::>(); - // Execute the futures until we find our matching sled or done searching + // Execute the futures until we find our matching sled or are done searching let mut target_ip = None; + let mut found_baseboard = None; while let Some((ip, result)) = addrs_to_sleds.next().await { match result { Ok(baseboard) => { // Convert from progenitor type back to `sled-hardware` // type. - let found = baseboard.into_inner().into(); - if sled_id == found { + let found: Baseboard = baseboard.into_inner().into(); + if sled_id.serial_number == found.identifier() + && sled_id.part_number == found.model() + { target_ip = Some(ip); + found_baseboard = Some(found); break; } } @@ -1259,10 +1294,14 @@ pub async fn sled_add( log.new(o!("BootstrapAgentClient" => bootstrap_addr.to_string())), ); + // Safe to unwrap, because we would have bailed when checking target_ip + // above otherwise. found_baseboard and target_ip are set together. 
+ let baseboard = found_baseboard.unwrap(); + client.start_sled_agent(&request).await.map_err(|err| { - AddSledError::BootstrapTcpClient { sled_id: sled_id.clone(), err } + AddSledError::BootstrapTcpClient { sled_id: baseboard.clone(), err } })?; - info!(log, "Peer agent initialized"; "peer_bootstrap_addr" => %bootstrap_addr, "peer_id" => %sled_id); + info!(log, "Peer agent initialized"; "peer_bootstrap_addr" => %bootstrap_addr, "peer_id" => %baseboard); Ok(()) } diff --git a/sled-agent/tests/old-rss-sled-plans/madrid-rss-sled-plan.json b/sled-agent/tests/old-rss-sled-plans/madrid-rss-sled-plan.json new file mode 100644 index 0000000000..5512247ee8 --- /dev/null +++ b/sled-agent/tests/old-rss-sled-plans/madrid-rss-sled-plan.json @@ -0,0 +1 @@ +{"rack_id":"ed6bcf59-9620-491d-8ebd-4a4eebf2e136","sleds":{"[fdb0:a840:2504:396::1]:12346":{"generation":0,"schema_version":1,"body":{"id":"b3e78a88-0f2e-476e-a8a9-2d8c90a169d6","rack_id":"ed6bcf59-9620-491d-8ebd-4a4eebf2e136","use_trust_quorum":true,"is_lrtq_learner":false,"subnet":{"net":"fd00:1122:3344:103::/64"}}},"[fdb0:a840:2504:157::1]:12346":{"generation":0,"schema_version":1,"body":{"id":"168e1ad6-1e4b-4f7a-b894-157974bd8bb8","rack_id":"ed6bcf59-9620-491d-8ebd-4a4eebf2e136","use_trust_quorum":true,"is_lrtq_learner":false,"subnet":{"net":"fd00:1122:3344:104::/64"}}},"[fdb0:a840:2504:355::1]:12346":{"generation":0,"schema_version":1,"body":{"id":"b9877212-212b-4588-b818-9c7b53c5b143","rack_id":"ed6bcf59-9620-491d-8ebd-4a4eebf2e136","use_trust_quorum":true,"is_lrtq_learner":false,"subnet":{"net":"fd00:1122:3344:102::/64"}}},"[fdb0:a840:2504:3d2::1]:12346":{"generation":0,"schema_version":1,"body":{"id":"c3a0f8be-5b05-4ee8-8c4e-2514de6501b6","rack_id":"ed6bcf59-9620-491d-8ebd-4a4eebf2e136","use_trust_quorum":true,"is_lrtq_learner":false,"subnet":{"net":"fd00:1122:3344:101::/64"}}}},"config":{"rack_subnet":"fd00:1122:3344:100::","trust_quorum_peers":[{"type":"gimlet","identifier":"BRM42220081","model":"913-0000019","revision":6},{"type":"gimlet","identifier":"BRM42220046","model":"913-0000019","revision":6},{"type":"gimlet","identifier":"BRM44220001","model":"913-0000019","revision":6},{"type":"gimlet","identifier":"BRM42220004","model":"913-0000019","revision":6}],"bootstrap_discovery":{"type":"only_these","addrs":["fdb0:a840:2504:3d2::1","fdb0:a840:2504:355::1","fdb0:a840:2504:396::1","fdb0:a840:2504:157::1"]},"ntp_servers":["ntp.eng.oxide.computer"],"dns_servers":["1.1.1.1","9.9.9.9"],"internal_services_ip_pool_ranges":[{"first":"172.20.28.1","last":"172.20.28.10"}],"external_dns_ips":["172.20.28.1"],"external_dns_zone_name":"madrid.eng.oxide.computer","external_certificates":[{"cert":"","key":""}],"recovery_silo":{"silo_name":"recovery","user_name":"recovery","user_password_hash":"$argon2id$v=19$m=98304,t=13,p=1$RUlWc0ZxaHo0WFdrN0N6ZQ$S8p52j85GPvMhR/ek3GL0el/oProgTwWpHJZ8lsQQoY"},"rack_network_config":{"rack_subnet":"fd00:1122:3344:1::/56","infra_ip_first":"172.20.15.37","infra_ip_last":"172.20.15.38","ports":[{"routes":[{"destination":"0.0.0.0/0","nexthop":"172.20.15.33"}],"addresses":["172.20.15.38/29"],"switch":"switch0","port":"qsfp0","uplink_port_speed":"speed40_g","uplink_port_fec":"none","bgp_peers":[],"autoneg":false},{"routes":[{"destination":"0.0.0.0/0","nexthop":"172.20.15.33"}],"addresses":["172.20.15.37/29"],"switch":"switch1","port":"qsfp0","uplink_port_speed":"speed40_g","uplink_port_fec":"none","bgp_peers":[],"autoneg":false}],"bgp":[]}}} diff --git a/sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json 
b/sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json new file mode 100644 index 0000000000..69f68c60ad --- /dev/null +++ b/sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json @@ -0,0 +1,164 @@ +{ + "rack_id": "ed6bcf59-9620-491d-8ebd-4a4eebf2e136", + "sleds": { + "[fdb0:a840:2504:157::1]:12346": { + "generation": 0, + "schema_version": 1, + "body": { + "id": "168e1ad6-1e4b-4f7a-b894-157974bd8bb8", + "rack_id": "ed6bcf59-9620-491d-8ebd-4a4eebf2e136", + "use_trust_quorum": true, + "is_lrtq_learner": false, + "subnet": { + "net": "fd00:1122:3344:104::/64" + } + } + }, + "[fdb0:a840:2504:355::1]:12346": { + "generation": 0, + "schema_version": 1, + "body": { + "id": "b9877212-212b-4588-b818-9c7b53c5b143", + "rack_id": "ed6bcf59-9620-491d-8ebd-4a4eebf2e136", + "use_trust_quorum": true, + "is_lrtq_learner": false, + "subnet": { + "net": "fd00:1122:3344:102::/64" + } + } + }, + "[fdb0:a840:2504:396::1]:12346": { + "generation": 0, + "schema_version": 1, + "body": { + "id": "b3e78a88-0f2e-476e-a8a9-2d8c90a169d6", + "rack_id": "ed6bcf59-9620-491d-8ebd-4a4eebf2e136", + "use_trust_quorum": true, + "is_lrtq_learner": false, + "subnet": { + "net": "fd00:1122:3344:103::/64" + } + } + }, + "[fdb0:a840:2504:3d2::1]:12346": { + "generation": 0, + "schema_version": 1, + "body": { + "id": "c3a0f8be-5b05-4ee8-8c4e-2514de6501b6", + "rack_id": "ed6bcf59-9620-491d-8ebd-4a4eebf2e136", + "use_trust_quorum": true, + "is_lrtq_learner": false, + "subnet": { + "net": "fd00:1122:3344:101::/64" + } + } + } + }, + "config": { + "trust_quorum_peers": [ + { + "type": "gimlet", + "identifier": "BRM42220081", + "model": "913-0000019", + "revision": 6 + }, + { + "type": "gimlet", + "identifier": "BRM42220046", + "model": "913-0000019", + "revision": 6 + }, + { + "type": "gimlet", + "identifier": "BRM44220001", + "model": "913-0000019", + "revision": 6 + }, + { + "type": "gimlet", + "identifier": "BRM42220004", + "model": "913-0000019", + "revision": 6 + } + ], + "bootstrap_discovery": { + "type": "only_these", + "addrs": [ + "fdb0:a840:2504:157::1", + "fdb0:a840:2504:355::1", + "fdb0:a840:2504:396::1", + "fdb0:a840:2504:3d2::1" + ] + }, + "ntp_servers": [ + "ntp.eng.oxide.computer" + ], + "dns_servers": [ + "1.1.1.1", + "9.9.9.9" + ], + "internal_services_ip_pool_ranges": [ + { + "first": "172.20.28.1", + "last": "172.20.28.10" + } + ], + "external_dns_ips": [ + "172.20.28.1" + ], + "external_dns_zone_name": "madrid.eng.oxide.computer", + "external_certificates": [ + { + "cert": "", + "key": "" + } + ], + "recovery_silo": { + "silo_name": "recovery", + "user_name": "recovery", + "user_password_hash": "$argon2id$v=19$m=98304,t=13,p=1$RUlWc0ZxaHo0WFdrN0N6ZQ$S8p52j85GPvMhR/ek3GL0el/oProgTwWpHJZ8lsQQoY" + }, + "rack_network_config": { + "rack_subnet": "fd00:1122:3344:1::/56", + "infra_ip_first": "172.20.15.37", + "infra_ip_last": "172.20.15.38", + "ports": [ + { + "routes": [ + { + "destination": "0.0.0.0/0", + "nexthop": "172.20.15.33" + } + ], + "addresses": [ + "172.20.15.38/29" + ], + "switch": "switch0", + "port": "qsfp0", + "uplink_port_speed": "speed40_g", + "uplink_port_fec": "none", + "bgp_peers": [], + "autoneg": false + }, + { + "routes": [ + { + "destination": "0.0.0.0/0", + "nexthop": "172.20.15.33" + } + ], + "addresses": [ + "172.20.15.37/29" + ], + "switch": "switch1", + "port": "qsfp0", + "uplink_port_speed": "speed40_g", + "uplink_port_fec": "none", + "bgp_peers": [], + "autoneg": false + } + ], + "bgp": [] + } + } +} \ No newline at end of file diff --git 
a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 8fc2429169..40ed41bfda 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -51,6 +51,7 @@ phantom_disks.period_secs = 30 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 sync_service_zone_nat.period_secs = 30 +region_replacement.period_secs = 30 [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index 15f0a4ebe1..2e259aa42f 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -51,6 +51,7 @@ phantom_disks.period_secs = 30 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 sync_service_zone_nat.period_secs = 30 +region_replacement.period_secs = 30 [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. diff --git a/smf/sled-agent/gimlet-standalone/config-rss.toml b/smf/sled-agent/gimlet-standalone/config-rss.toml index f7a93260e3..6c874d9a70 100644 --- a/smf/sled-agent/gimlet-standalone/config-rss.toml +++ b/smf/sled-agent/gimlet-standalone/config-rss.toml @@ -4,14 +4,6 @@ # Agent API. See the `RackInitializeRequest` type in bootstrap-agent or its # OpenAPI spec (in openapi/bootstrap-agent.json in the root of this workspace). -# The /56 subnet for this rack. This subnet is internal to the rack and fully -# managed by Omicron, so you can pick anything you want within the IPv6 Unique -# Local Address (ULA) range. The rack-specific /56 subnet also implies the -# parent /48 AZ subnet. -# |............| <- This /48 is the AZ Subnet -# |...............| <- This /56 is the Rack Subnet -rack_subnet = "fd00:1122:3344:0100::" - # Only include "our own sled" in the bootstrap network bootstrap_discovery.type = "only_ours" @@ -88,7 +80,14 @@ last = "192.168.1.29" # Configuration to bring up Boundary Services and make Nexus reachable from the # outside. See docs/how-to-run.adoc for more on what to put here. [rack_network_config] -rack_subnet = "fd00:1122:3344:01::/56" +# The /56 subnet for this rack. This subnet is internal to the rack and fully +# managed by Omicron, so you can pick anything you want within the IPv6 Unique +# Local Address (ULA) range. The rack-specific /56 subnet also implies the +# parent /48 AZ subnet. +# |............| <- This /48 is the AZ Subnet +# |...............| <- This /56 is the Rack Subnet +rack_subnet = "fd00:1122:3344:0100::/56" + # A range of IP addresses used by Boundary Services on the external network. In # a real system, these would be addresses of the uplink ports on the Sidecar. # With softnpu, only one address is used. diff --git a/smf/sled-agent/non-gimlet/config-rss.toml b/smf/sled-agent/non-gimlet/config-rss.toml index 12cb2afd24..d0b4f94d9f 100644 --- a/smf/sled-agent/non-gimlet/config-rss.toml +++ b/smf/sled-agent/non-gimlet/config-rss.toml @@ -4,14 +4,6 @@ # Agent API. See the `RackInitializeRequest` type in bootstrap-agent or its # OpenAPI spec (in openapi/bootstrap-agent.json in the root of this workspace). -# The /56 subnet for this rack. This subnet is internal to the rack and fully -# managed by Omicron, so you can pick anything you want within the IPv6 Unique -# Local Address (ULA) range. The rack-specific /56 subnet also implies the -# parent /48 AZ subnet. 
-# |............| <- This /48 is the AZ Subnet -# |...............| <- This /56 is the Rack Subnet -rack_subnet = "fd00:1122:3344:0100::" - # Only include "our own sled" in the bootstrap network bootstrap_discovery.type = "only_ours" @@ -88,7 +80,14 @@ last = "192.168.1.29" # Configuration to bring up Boundary Services and make Nexus reachable from the # outside. See docs/how-to-run.adoc for more on what to put here. [rack_network_config] -rack_subnet = "fd00:1122:3344:01::/56" +# The /56 subnet for this rack. This subnet is internal to the rack and fully +# managed by Omicron, so you can pick anything you want within the IPv6 Unique +# Local Address (ULA) range. The rack-specific /56 subnet also implies the +# parent /48 AZ subnet. +# |............| <- This /48 is the AZ Subnet +# |...............| <- This /56 is the Rack Subnet +rack_subnet = "fd00:1122:3344:0100::/56" + # A range of IP addresses used by Boundary Services on the external network. In # a real system, these would be addresses of the uplink ports on the Sidecar. # With softnpu, only one address is used. diff --git a/tools/ci_download_clickhouse b/tools/ci_download_clickhouse index 03a5bff24c..675566fad7 100755 --- a/tools/ci_download_clickhouse +++ b/tools/ci_download_clickhouse @@ -20,7 +20,7 @@ DOWNLOAD_DIR="$TARGET_DIR/downloads" DEST_DIR="./$TARGET_DIR/clickhouse" # If you change this, you must also update the md5sums below -CIDL_VERSION="v22.8.9.24" +CIDL_VERSION="$(cat "$SOURCE_DIR/clickhouse_version")" source "$SOURCE_DIR/clickhouse_checksums" # Download from manually-populated S3 bucket for now diff --git a/tools/ci_download_cockroachdb b/tools/ci_download_cockroachdb index ca484c000f..5755e7e665 100755 --- a/tools/ci_download_cockroachdb +++ b/tools/ci_download_cockroachdb @@ -13,7 +13,7 @@ set -o errexit SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" ARG0="$(basename "${BASH_SOURCE[0]}")" -# If you change this, you must also update the md5sums below +# If you change this, you must also update the sha256sums below CIDL_VERSION="$(cat "$SOURCE_DIR/cockroachdb_version")" source "$SOURCE_DIR/cockroachdb_checksums" @@ -49,6 +49,7 @@ function main # Configure this program configure_os "$CIDL_OS" CIDL_URL="$CIDL_URL_BASE/$TARBALL_FILENAME" + CIDL_SHA256FUNC="do_sha256sum" # Download the file. echo "URL: $CIDL_URL" @@ -60,9 +61,9 @@ function main local DO_DOWNLOAD="true" if [[ -f "$TARBALL_FILE" ]]; then # If the file exists with a valid checksum, we can skip downloading. - calculated_md5="$($CIDL_MD5FUNC "$TARBALL_FILE")" || \ - fail "failed to calculate md5sum" - if [[ "$calculated_md5" == "$CIDL_MD5" ]]; then + calculated_sha256="$($CIDL_SHA256FUNC "$TARBALL_FILE")" || \ + fail "failed to calculate sha256sum" + if [[ "$calculated_sha256" == "$CIDL_SHA256" ]]; then DO_DOWNLOAD="false" fi fi @@ -72,12 +73,12 @@ function main do_download_curl "$CIDL_URL" "$TARBALL_FILE" || \ fail "failed to download file" - # Verify the md5sum. - calculated_md5="$($CIDL_MD5FUNC "$TARBALL_FILE")" || \ - fail "failed to calculate md5sum" - if [[ "$calculated_md5" != "$CIDL_MD5" ]]; then - fail "md5sum mismatch \ - (expected $CIDL_MD5, found $calculated_md5)" + # Verify the sha256sum. 
+ calculated_sha256="$($CIDL_SHA256FUNC "$TARBALL_FILE")" || \ + fail "failed to calculate sha256sum" + if [[ "$calculated_sha256" != "$CIDL_SHA256" ]]; then + fail "sha256sum mismatch \ + (expected $CIDL_SHA256, found $calculated_sha256)" fi fi @@ -105,24 +106,21 @@ function configure_os darwin*) CIDL_BUILD="darwin-10.9-amd64" CIDL_SUFFIX="tgz" - CIDL_MD5="$CIDL_MD5_DARWIN" - CIDL_MD5FUNC="do_md5" + CIDL_SHA256="$CIDL_SHA256_DARWIN" CIDL_URL_BASE="$CIDL_URL_COCKROACH" CIDL_ASSEMBLE="do_assemble_official" ;; linux-gnu*) CIDL_BUILD="linux-amd64" CIDL_SUFFIX="tgz" - CIDL_MD5="$CIDL_MD5_LINUX" - CIDL_MD5FUNC="do_md5sum" + CIDL_SHA256="$CIDL_SHA256_LINUX" CIDL_URL_BASE="$CIDL_URL_COCKROACH" CIDL_ASSEMBLE="do_assemble_official" ;; solaris*) CIDL_BUILD="illumos" CIDL_SUFFIX="tar.gz" - CIDL_MD5="$CIDL_MD5_ILLUMOS" - CIDL_MD5FUNC="do_md5sum" + CIDL_SHA256="$CIDL_SHA256_ILLUMOS" CIDL_URL_BASE="$CIDL_URL_ILLUMOS" CIDL_ASSEMBLE="do_assemble_illumos" ;; @@ -143,14 +141,9 @@ function do_download_curl curl --silent --show-error --fail --location --output "$2" "$1" } -function do_md5 +function do_sha256sum { - md5 < "$1" -} - -function do_md5sum -{ - md5sum < "$1" | awk '{print $1}' + sha256sum < "$1" | awk '{print $1}' } function do_untar diff --git a/tools/clickhouse_version b/tools/clickhouse_version new file mode 100644 index 0000000000..93b98bf738 --- /dev/null +++ b/tools/clickhouse_version @@ -0,0 +1 @@ +v22.8.9.24 \ No newline at end of file diff --git a/tools/cockroachdb_checksums b/tools/cockroachdb_checksums index 50e873100f..20b6e237f8 100644 --- a/tools/cockroachdb_checksums +++ b/tools/cockroachdb_checksums @@ -1,3 +1,3 @@ -CIDL_MD5_DARWIN="2db972c254b4e3b599e12110520178b5" -CIDL_MD5_LINUX="8c3170883e0a0be1a34b44090c067a8c" -CIDL_MD5_ILLUMOS="d8999aff364e5d70f226e139fda724a3" +CIDL_SHA256_DARWIN="1ca69e0911af11a73305c3c6f4650b912d70754900b5bf7b80a1d361efe36561" +CIDL_SHA256_LINUX="24c321820e7ee45fa07fe91ac138befe13ad860e41c6ed595ce58823205ff4a9" +CIDL_SHA256_ILLUMOS="f151714ba3a6e02caaaa59727482c36085e60d6bd2fa963938e9a3d8c8a77088" diff --git a/tools/dendrite_openapi_version b/tools/dendrite_openapi_version index 56bcb2d9ff..6895170e02 100644 --- a/tools/dendrite_openapi_version +++ b/tools/dendrite_openapi_version @@ -1,2 +1,2 @@ -COMMIT="fd159136c552d8b4ec4d49dd9bae7e38f6a636e6" -SHA2="e8f73a83d5c62f7efce998f821acc80a91b7995c95bd9ec2c228372829310099" +COMMIT="3618dd6017b363c5d34399273453cf50b9c9a43e" +SHA2="aa670165e5b459fab4caba36ae4d382a09264ff5cf6a2dac0dae0a0db39a378e" diff --git a/tools/dendrite_stub_checksums b/tools/dendrite_stub_checksums index 497ce5c010..74b379f359 100644 --- a/tools/dendrite_stub_checksums +++ b/tools/dendrite_stub_checksums @@ -1,3 +1,3 @@ -CIDL_SHA256_ILLUMOS="1e24598ba77dc00682cdf54fc370696ef5aa49ed510ab7f72fcc91d61d679e7b" -CIDL_SHA256_LINUX_DPD="4fc43b53a048264664ede64805d4d179ec32d50cf9ab1aaa0fa4e17190e511a2" -CIDL_SHA256_LINUX_SWADM="0ab34a2063e68568aa064f7b71825a603d47b3e399f3e7f45976edb5d5283f0f" +CIDL_SHA256_ILLUMOS="eb98985871f321411f7875ef7751dba85ae0dd3034877b63ccb78cedcb96e6e7" +CIDL_SHA256_LINUX_DPD="cb9a1978d1fe3a3f2391757f80436d8cc87c0041161652ad2234e7cf83e9ae36" +CIDL_SHA256_LINUX_SWADM="b7e737be56a8a815a95624f0b5c42ce1e339b07feeae7b3d7b9b4bc17c204245" diff --git a/tools/maghemite_ddm_openapi_version b/tools/maghemite_ddm_openapi_version index 8ee3001179..6c58d83ea3 100644 --- a/tools/maghemite_ddm_openapi_version +++ b/tools/maghemite_ddm_openapi_version @@ -1,2 +1,2 @@ -COMMIT="712b2487d9b141234af98b6578bc5f77420bdb03" 
+COMMIT="41a69a11db6cfa8fc0c8686dc2d725708e0586ce" SHA2="0b0dbc2f8bbc5d2d9be92d64c4865f8f9335355aae62f7de9f67f81dfb3f1803" diff --git a/tools/maghemite_mg_openapi_version b/tools/maghemite_mg_openapi_version index 3fa53a9483..896be8d38c 100644 --- a/tools/maghemite_mg_openapi_version +++ b/tools/maghemite_mg_openapi_version @@ -1,2 +1,2 @@ -COMMIT="712b2487d9b141234af98b6578bc5f77420bdb03" +COMMIT="41a69a11db6cfa8fc0c8686dc2d725708e0586ce" SHA2="0ac038bbaa54d0ae0ac5ccaeff48f03070618372cca26c9d09b716b909bf9355" diff --git a/tools/maghemite_mgd_checksums b/tools/maghemite_mgd_checksums index 1dacea54dc..8fc4d083f8 100644 --- a/tools/maghemite_mgd_checksums +++ b/tools/maghemite_mgd_checksums @@ -1,2 +1,2 @@ -CIDL_SHA256="2c54146a133b5f12587d9fb89f85ef0a0ca6278efc8c6fe4859782e886e6c774" -MGD_LINUX_SHA256="248732202f5102bf0947f5f91871379b6c6945fe387d4272cebe6e08f1b58184" \ No newline at end of file +CIDL_SHA256="26d34f61589f63be64eaa77a6e9e2db4c95d6675798386a1d61721c1ccc59d4d" +MGD_LINUX_SHA256="b2c823dd714fad67546a0e0c0d4ae56f2fe2e7c43434469b38e13b78de9f6968" \ No newline at end of file diff --git a/wicket-common/src/rack_setup.rs b/wicket-common/src/rack_setup.rs index e3d5fad5fb..f28c0639a9 100644 --- a/wicket-common/src/rack_setup.rs +++ b/wicket-common/src/rack_setup.rs @@ -5,12 +5,24 @@ // Copyright 2023 Oxide Computer Company use omicron_common::address; -use omicron_common::api::internal::shared::RackNetworkConfig; +use omicron_common::api::internal::shared::BgpConfig; +use omicron_common::api::internal::shared::PortConfigV1; use schemars::JsonSchema; use serde::Deserialize; use serde::Serialize; use std::collections::BTreeSet; use std::net::IpAddr; +use std::net::Ipv4Addr; + +/// User-specified parts of +/// [`RackNetworkConfig`](omicron_common::api::internal::shared::RackNetworkConfig). +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] +pub struct UserSpecifiedRackNetworkConfig { + pub infra_ip_first: Ipv4Addr, + pub infra_ip_last: Ipv4Addr, + pub ports: Vec, + pub bgp: Vec, +} // The portion of `CurrentRssUserConfig` that can be posted in one shot; it is // provided by the wicket user uploading a TOML file, currently. 
@@ -27,5 +39,5 @@ pub struct PutRssUserConfigInsensitive { pub internal_services_ip_pool_ranges: Vec, pub external_dns_ips: Vec, pub external_dns_zone_name: String, - pub rack_network_config: RackNetworkConfig, + pub rack_network_config: UserSpecifiedRackNetworkConfig, } diff --git a/wicket/src/cli/rack_setup/config_template.toml b/wicket/src/cli/rack_setup/config_template.toml index 2886fa01d7..d091237b5f 100644 --- a/wicket/src/cli/rack_setup/config_template.toml +++ b/wicket/src/cli/rack_setup/config_template.toml @@ -40,7 +40,6 @@ bootstrap_sleds = [] # TODO: docs on network config [rack_network_config] -rack_subnet = "" infra_ip_first = "" infra_ip_last = "" diff --git a/wicket/src/cli/rack_setup/config_toml.rs b/wicket/src/cli/rack_setup/config_toml.rs index 5a8e8a560e..d050610c30 100644 --- a/wicket/src/cli/rack_setup/config_toml.rs +++ b/wicket/src/cli/rack_setup/config_toml.rs @@ -19,7 +19,7 @@ use wicket_common::rack_update::SpType; use wicketd_client::types::BootstrapSledDescription; use wicketd_client::types::CurrentRssUserConfigInsensitive; use wicketd_client::types::IpRange; -use wicketd_client::types::RackNetworkConfigV1; +use wicketd_client::types::UserSpecifiedRackNetworkConfig; static TEMPLATE: &str = include_str!("config_template.toml"); @@ -176,7 +176,7 @@ fn build_sleds_array(sleds: &[BootstrapSledDescription]) -> Array { fn populate_network_table( table: &mut Table, - config: Option<&RackNetworkConfigV1>, + config: Option<&UserSpecifiedRackNetworkConfig>, ) { // Helper function to serialize enums into their appropriate string // representations. @@ -195,7 +195,6 @@ fn populate_network_table( }; for (property, value) in [ - ("rack_subnet", config.rack_subnet.to_string()), ("infra_ip_first", config.infra_ip_first.to_string()), ("infra_ip_last", config.infra_ip_last.to_string()), ] { @@ -350,7 +349,6 @@ fn populate_network_table( #[cfg(test)] mod tests { use super::*; - use omicron_common::api::internal::shared::RackNetworkConfigV1 as InternalRackNetworkConfig; use std::net::Ipv6Addr; use wicket_common::rack_setup::PutRssUserConfigInsensitive; use wicket_common::rack_update::SpIdentifier; @@ -373,6 +371,7 @@ mod tests { use omicron_common::api::internal::shared::PortSpeed as InternalPortSpeed; use omicron_common::api::internal::shared::RouteConfig as InternalRouteConfig; use omicron_common::api::internal::shared::SwitchLocation as InternalSwitchLocation; + use wicket_common::rack_setup::UserSpecifiedRackNetworkConfig as InternalUserSpecifiedRackNetworkConfig; let rnc = value.rack_network_config.unwrap(); @@ -401,8 +400,7 @@ mod tests { .collect(), external_dns_ips: value.external_dns_ips, ntp_servers: value.ntp_servers, - rack_network_config: InternalRackNetworkConfig { - rack_subnet: rnc.rack_subnet, + rack_network_config: InternalUserSpecifiedRackNetworkConfig { infra_ip_first: rnc.infra_ip_first, infra_ip_last: rnc.infra_ip_last, ports: rnc @@ -514,8 +512,7 @@ mod tests { )], external_dns_ips: vec!["10.0.0.1".parse().unwrap()], ntp_servers: vec!["ntp1.com".into(), "ntp2.com".into()], - rack_network_config: Some(RackNetworkConfigV1 { - rack_subnet: "fd00:1122:3344:01::/56".parse().unwrap(), + rack_network_config: Some(UserSpecifiedRackNetworkConfig { infra_ip_first: "172.30.0.1".parse().unwrap(), infra_ip_last: "172.30.0.10".parse().unwrap(), ports: vec![PortConfigV1 { diff --git a/wicket/src/ui/panes/update.rs b/wicket/src/ui/panes/update.rs index be21984997..c009d597c8 100644 --- a/wicket/src/ui/panes/update.rs +++ b/wicket/src/ui/panes/update.rs @@ -1435,12 +1435,13 
@@ impl UpdatePane { Constraint::Length(cell_width), Constraint::Length(cell_width), ]; - let header_table = Table::new(std::iter::empty(), &width_constraints) - .header( - Row::new(vec!["COMPONENT", "VERSION", "TARGET", "STATUS"]) - .style(header_style), - ) - .block(block.clone().title("OVERVIEW (* = active)")); + let header_table = + Table::new(std::iter::empty::(), &width_constraints) + .header( + Row::new(vec!["COMPONENT", "VERSION", "TARGET", "STATUS"]) + .style(header_style), + ) + .block(block.clone().title("OVERVIEW (* = active)")); frame.render_widget(header_table, self.table_headers_rect); // For the selected item, draw the version table. diff --git a/wicket/src/ui/widgets/ignition.rs b/wicket/src/ui/widgets/ignition.rs index cef942d2c7..1e04c4d02b 100644 --- a/wicket/src/ui/widgets/ignition.rs +++ b/wicket/src/ui/widgets/ignition.rs @@ -61,29 +61,26 @@ impl IgnitionPopup { format!("IGNITION: {}", component.to_string_uppercase()), style::header(true), )]), - body: Text { - lines: vec![ - Line::from(vec![Span::styled( - "Power On", - style::line( - self.selected_command == IgnitionCommand::PowerOn, - ), - )]), - Line::from(vec![Span::styled( - "Power Off", - style::line( - self.selected_command == IgnitionCommand::PowerOff, - ), - )]), - Line::from(vec![Span::styled( - "Power Reset", - style::line( - self.selected_command - == IgnitionCommand::PowerReset, - ), - )]), - ], - }, + body: Text::from(vec![ + Line::from(vec![Span::styled( + "Power On", + style::line( + self.selected_command == IgnitionCommand::PowerOn, + ), + )]), + Line::from(vec![Span::styled( + "Power Off", + style::line( + self.selected_command == IgnitionCommand::PowerOff, + ), + )]), + Line::from(vec![Span::styled( + "Power Reset", + style::line( + self.selected_command == IgnitionCommand::PowerReset, + ), + )]), + ]), buttons: vec![ButtonText::new("Close", "Esc")], } } diff --git a/wicketd/src/http_entrypoints.rs b/wicketd/src/http_entrypoints.rs index 9c1740679f..9748a93bd5 100644 --- a/wicketd/src/http_entrypoints.rs +++ b/wicketd/src/http_entrypoints.rs @@ -32,7 +32,6 @@ use http::StatusCode; use internal_dns::resolver::Resolver; use omicron_common::address; use omicron_common::api::external::SemverVersion; -use omicron_common::api::internal::shared::RackNetworkConfig; use omicron_common::api::internal::shared::SwitchLocation; use omicron_common::update::ArtifactHashId; use omicron_common::update::ArtifactId; @@ -47,6 +46,7 @@ use std::net::IpAddr; use std::net::Ipv6Addr; use std::time::Duration; use wicket_common::rack_setup::PutRssUserConfigInsensitive; +use wicket_common::rack_setup::UserSpecifiedRackNetworkConfig; use wicket_common::update_events::EventReport; use wicket_common::WICKETD_TIMEOUT; @@ -172,7 +172,7 @@ pub struct CurrentRssUserConfigInsensitive { pub internal_services_ip_pool_ranges: Vec, pub external_dns_ips: Vec, pub external_dns_zone_name: String, - pub rack_network_config: Option, + pub rack_network_config: Option, } // This is a summary of the subset of `RackInitializeRequest` that is sensitive; @@ -1189,12 +1189,14 @@ async fn post_start_preflight_uplink_check( let (network_config, dns_servers, ntp_servers) = { let rss_config = rqctx.rss_config.lock().unwrap(); - let network_config = - rss_config.rack_network_config().cloned().ok_or_else(|| { + let network_config = rss_config + .user_specified_rack_network_config() + .cloned() + .ok_or_else(|| { HttpError::for_bad_request( None, "uplink preflight check requires setting \ - the uplink config for RSS" + the uplink config for RSS" 
.to_string(), ) })?; diff --git a/wicketd/src/preflight_check.rs b/wicketd/src/preflight_check.rs index 75cc5f6e09..4cd17604a0 100644 --- a/wicketd/src/preflight_check.rs +++ b/wicketd/src/preflight_check.rs @@ -2,7 +2,6 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use omicron_common::api::internal::shared::RackNetworkConfig; use omicron_common::api::internal::shared::SwitchLocation; use slog::o; use slog::Logger; @@ -12,6 +11,7 @@ use std::sync::Mutex; use tokio::sync::oneshot; use update_engine::events::EventReport; use update_engine::GenericSpec; +use wicket_common::rack_setup::UserSpecifiedRackNetworkConfig; mod uplink; @@ -44,7 +44,7 @@ impl PreflightCheckerHandler { pub(crate) async fn uplink_start( &self, - network_config: RackNetworkConfig, + network_config: UserSpecifiedRackNetworkConfig, dns_servers: Vec, ntp_servers: Vec, our_switch_location: SwitchLocation, @@ -94,7 +94,7 @@ pub(crate) struct PreflightCheckerBusy; #[derive(Debug)] enum PreflightCheck { Uplink { - network_config: RackNetworkConfig, + network_config: UserSpecifiedRackNetworkConfig, dns_servers: Vec, ntp_servers: Vec, our_switch_location: SwitchLocation, diff --git a/wicketd/src/preflight_check/uplink.rs b/wicketd/src/preflight_check/uplink.rs index 47995f0c10..31d479a5ed 100644 --- a/wicketd/src/preflight_check/uplink.rs +++ b/wicketd/src/preflight_check/uplink.rs @@ -22,7 +22,6 @@ use omicron_common::address::DENDRITE_PORT; use omicron_common::api::internal::shared::PortConfigV1; use omicron_common::api::internal::shared::PortFec as OmicronPortFec; use omicron_common::api::internal::shared::PortSpeed as OmicronPortSpeed; -use omicron_common::api::internal::shared::RackNetworkConfig; use omicron_common::api::internal::shared::SwitchLocation; use omicron_common::OMICRON_DPD_TAG; use schemars::JsonSchema; @@ -49,6 +48,7 @@ use trust_dns_resolver::error::ResolveError; use trust_dns_resolver::error::ResolveErrorKind; use trust_dns_resolver::TokioAsyncResolver; use update_engine::StepSpec; +use wicket_common::rack_setup::UserSpecifiedRackNetworkConfig; const DNS_PORT: u16 = 53; @@ -68,7 +68,7 @@ const IPADM: &str = "/usr/sbin/ipadm"; const ROUTE: &str = "/usr/sbin/route"; pub(super) async fn run_local_uplink_preflight_check( - network_config: RackNetworkConfig, + network_config: UserSpecifiedRackNetworkConfig, dns_servers: Vec, ntp_servers: Vec, our_switch_location: SwitchLocation, diff --git a/wicketd/src/rss_config.rs b/wicketd/src/rss_config.rs index f654597d81..4bc1a6b62b 100644 --- a/wicketd/src/rss_config.rs +++ b/wicketd/src/rss_config.rs @@ -26,7 +26,6 @@ use gateway_client::types::SpType; use omicron_certificates::CertificateError; use omicron_common::address; use omicron_common::address::Ipv4Range; -use omicron_common::api::internal::shared::RackNetworkConfig; use sled_hardware::Baseboard; use slog::warn; use std::collections::BTreeSet; @@ -34,6 +33,7 @@ use std::mem; use std::net::IpAddr; use std::net::Ipv6Addr; use wicket_common::rack_setup::PutRssUserConfigInsensitive; +use wicket_common::rack_setup::UserSpecifiedRackNetworkConfig; // TODO-correctness For now, we always use the same rack subnet when running // RSS. 
When we get to multirack, this will be wrong, but there are many other @@ -64,7 +64,7 @@ pub(crate) struct CurrentRssConfig { external_dns_zone_name: String, external_certificates: Vec, recovery_silo_password_hash: Option, - rack_network_config: Option, + rack_network_config: Option, // External certificates are uploaded in two separate actions (cert then // key, or vice versa). Here we store a partial certificate; once we have @@ -82,7 +82,9 @@ impl CurrentRssConfig { &self.ntp_servers } - pub(crate) fn rack_network_config(&self) -> Option<&RackNetworkConfig> { + pub(crate) fn user_specified_rack_network_config( + &self, + ) -> Option<&UserSpecifiedRackNetworkConfig> { self.rack_network_config.as_ref() } @@ -252,7 +254,6 @@ impl CurrentRssConfig { .collect(); let request = RackInitializeRequest { - rack_subnet: RACK_SUBNET, trust_quorum_peers, bootstrap_discovery: BootstrapAddressDiscovery::OnlyThese( bootstrap_ips, @@ -268,7 +269,7 @@ impl CurrentRssConfig { user_name: UserId(RECOVERY_SILO_USERNAME.into()), user_password_hash, }, - rack_network_config: Some(rack_network_config), + rack_network_config, }; Ok(request) @@ -452,7 +453,7 @@ impl From<&'_ CurrentRssConfig> for CurrentRssUserConfig { } fn validate_rack_network_config( - config: &RackNetworkConfig, + config: &UserSpecifiedRackNetworkConfig, ) -> Result { use bootstrap_agent_client::types::BgpConfig as BaBgpConfig; use bootstrap_agent_client::types::BgpPeerConfig as BaBgpPeerConfig; @@ -497,7 +498,7 @@ fn validate_rack_network_config( // TODO Add more client side checks on `rack_network_config` contents? Ok(bootstrap_agent_client::types::RackNetworkConfigV1 { - rack_subnet: config.rack_subnet, + rack_subnet: RACK_SUBNET.into(), infra_ip_first: config.infra_ip_first, infra_ip_last: config.infra_ip_last, ports: config diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 4e62ba13e3..7038f9c038 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -233,42 +233,42 @@ bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-f dof = { version = "0.3.0", default-features = false, features = ["des"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.x86_64-unknown-linux-gnu.build-dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } dof = { version = "0.3.0", default-features = false, features = ["des"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.x86_64-apple-darwin.dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } errno = { version = "0.3.8", default-features = false, features = ["std"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.x86_64-apple-darwin.build-dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } errno = { version = "0.3.8", default-features = false, features = ["std"] } mio = { version = 
"0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.aarch64-apple-darwin.dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } errno = { version = "0.3.8", default-features = false, features = ["std"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.aarch64-apple-darwin.build-dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } errno = { version = "0.3.8", default-features = false, features = ["std"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.x86_64-unknown-illumos.dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } @@ -276,7 +276,7 @@ dof = { version = "0.3.0", default-features = false, features = ["des"] } errno = { version = "0.3.8", default-features = false, features = ["std"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } toml_datetime = { version = "0.6.5", default-features = false, features = ["serde"] } toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", features = ["serde"] } @@ -286,7 +286,7 @@ dof = { version = "0.3.0", default-features = false, features = ["des"] } errno = { version = "0.3.8", default-features = false, features = ["std"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } toml_datetime = { version = "0.6.5", default-features = false, features = ["serde"] } toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", features = ["serde"] }