From 8f8cca54ace4d03229c05df48ae8aa3b4aa9ee9f Mon Sep 17 00:00:00 2001 From: Rain Date: Mon, 5 Feb 2024 18:07:44 -0800 Subject: [PATCH 01/27] [wicket] update to ratatui 0.26 (#4991) Just a couple small updates. --- Cargo.lock | 83 ++++++++++++++++++++++++------- Cargo.toml | 4 +- wicket/src/ui/panes/update.rs | 13 ++--- wicket/src/ui/widgets/ignition.rs | 43 ++++++++-------- 4 files changed, 93 insertions(+), 50 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c00f0c17f6..ad6502d26e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -466,7 +466,7 @@ source = "git+https://github.com/oxidecomputer/propolis?rev=ff6c4df2e816eee6e7b2 dependencies = [ "bhyve_api_sys", "libc", - "strum", + "strum 0.25.0", ] [[package]] @@ -475,7 +475,7 @@ version = "0.0.0" source = "git+https://github.com/oxidecomputer/propolis?rev=ff6c4df2e816eee6e7b2b0488777d30ef35ee217#ff6c4df2e816eee6e7b2b0488777d30ef35ee217" dependencies = [ "libc", - "strum", + "strum 0.25.0", ] [[package]] @@ -830,6 +830,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "castaway" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a17ed5635fc8536268e5d4de1e22e81ac34419e5f052d4d51f4e01dcc263fcc" +dependencies = [ + "rustversion", +] + [[package]] name = "cbc" version = "0.1.2" @@ -1049,6 +1058,19 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "compact_str" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f86b9c4c00838774a6d902ef931eff7470720c51d90c2e32cfe15dc304737b3f" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "ryu", + "static_assertions", +] + [[package]] name = "console" version = "0.15.8" @@ -4313,7 +4335,7 @@ dependencies = [ "serde_json", "sled-agent-client", "steno", - "strum", + "strum 0.25.0", "thiserror", "uuid", ] @@ -4385,7 +4407,7 @@ dependencies = [ "slog", "static_assertions", "steno", - "strum", + "strum 0.25.0", "subprocess", "swrite", "term", @@ -4448,7 +4470,7 @@ dependencies = [ "serde_json", "sled-agent-client", "slog", - "strum", + "strum 0.25.0", "thiserror", "tokio", "uuid", @@ -4537,7 +4559,7 @@ dependencies = [ "serde_json", "sled-agent-client", "steno", - "strum", + "strum 0.25.0", "thiserror", "uuid", ] @@ -4854,7 +4876,7 @@ dependencies = [ "serde_urlencoded", "serde_with", "slog", - "strum", + "strum 0.25.0", "test-strategy", "thiserror", "tokio", @@ -5057,7 +5079,7 @@ dependencies = [ "slog-term", "sp-sim", "steno", - "strum", + "strum 0.25.0", "subprocess", "tempfile", "term", @@ -5110,7 +5132,7 @@ dependencies = [ "sled-agent-client", "slog", "slog-error-chain", - "strum", + "strum 0.25.0", "subprocess", "tabled", "textwrap 0.16.0", @@ -5145,7 +5167,7 @@ dependencies = [ "slog-bunyan", "slog-term", "smf", - "strum", + "strum 0.25.0", "swrite", "tar", "thiserror", @@ -5669,7 +5691,7 @@ dependencies = [ "schemars", "serde", "serde_json", - "strum", + "strum 0.25.0", "thiserror", "trybuild", "uuid", @@ -5722,7 +5744,7 @@ dependencies = [ "slog-async", "slog-dtrace", "slog-term", - "strum", + "strum 0.25.0", "subprocess", "thiserror", "tokio", @@ -5764,7 +5786,7 @@ dependencies = [ "slog-term", "sqlformat", "sqlparser", - "strum", + "strum 0.25.0", "tabled", "tempfile", "thiserror", @@ -6850,19 +6872,20 @@ dependencies = [ [[package]] name = "ratatui" -version = "0.25.0" +version = "0.26.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5659e52e4ba6e07b2dad9f1158f578ef84a73762625ddb51536019f34d180eb" +checksum = "154b85ef15a5d1719bcaa193c3c81fe645cd120c156874cd660fe49fd21d1373" dependencies = [ "bitflags 2.4.0", "cassowary", + "compact_str", "crossterm", "indoc 2.0.3", "itertools 0.12.1", "lru", "paste", "stability", - "strum", + "strum 0.26.1", "unicode-segmentation", "unicode-width", ] @@ -6959,7 +6982,7 @@ dependencies = [ "nu-ansi-term", "serde", "strip-ansi-escapes", - "strum", + "strum 0.25.0", "strum_macros 0.25.2", "thiserror", "unicode-segmentation", @@ -8663,6 +8686,15 @@ dependencies = [ "strum_macros 0.25.2", ] +[[package]] +name = "strum" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "723b93e8addf9aa965ebe2d11da6d7540fa2283fcea14b3371ff055f7ba13f5f" +dependencies = [ + "strum_macros 0.26.1", +] + [[package]] name = "strum_macros" version = "0.24.3" @@ -8689,6 +8721,19 @@ dependencies = [ "syn 2.0.48", ] +[[package]] +name = "strum_macros" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a3417fc93d76740d974a01654a09777cb500428cc874ca9f45edfe0c4d4cd18" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.48", +] + [[package]] name = "subprocess" version = "0.2.9" @@ -9597,9 +9642,9 @@ dependencies = [ [[package]] name = "tui-tree-widget" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "136011b328c4f392499a02c4b5b78d509fb297bf9c10f2bda5d11d65cb946e4c" +checksum = "5c317bb061f42d943a2eb118b5de0ee98fc2443f0631e54b24a19de014a28810" dependencies = [ "ratatui", "unicode-width", diff --git a/Cargo.toml b/Cargo.toml index 6e4799d184..b368bd114b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -314,7 +314,7 @@ propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev proptest = "1.4.0" quote = "1.0" rand = "0.8.5" -ratatui = "0.25.0" +ratatui = "0.26.0" rayon = "1.8" rcgen = "0.12.1" reedline = "0.28.0" @@ -400,7 +400,7 @@ trust-dns-server = "0.22" trybuild = "1.0.89" tufaceous = { path = "tufaceous" } tufaceous-lib = { path = "tufaceous-lib" } -tui-tree-widget = "0.16.0" +tui-tree-widget = "0.17.0" unicode-width = "0.1.11" update-common = { path = "update-common" } update-engine = { path = "update-engine" } diff --git a/wicket/src/ui/panes/update.rs b/wicket/src/ui/panes/update.rs index be21984997..c009d597c8 100644 --- a/wicket/src/ui/panes/update.rs +++ b/wicket/src/ui/panes/update.rs @@ -1435,12 +1435,13 @@ impl UpdatePane { Constraint::Length(cell_width), Constraint::Length(cell_width), ]; - let header_table = Table::new(std::iter::empty(), &width_constraints) - .header( - Row::new(vec!["COMPONENT", "VERSION", "TARGET", "STATUS"]) - .style(header_style), - ) - .block(block.clone().title("OVERVIEW (* = active)")); + let header_table = + Table::new(std::iter::empty::(), &width_constraints) + .header( + Row::new(vec!["COMPONENT", "VERSION", "TARGET", "STATUS"]) + .style(header_style), + ) + .block(block.clone().title("OVERVIEW (* = active)")); frame.render_widget(header_table, self.table_headers_rect); // For the selected item, draw the version table. 
diff --git a/wicket/src/ui/widgets/ignition.rs b/wicket/src/ui/widgets/ignition.rs index cef942d2c7..1e04c4d02b 100644 --- a/wicket/src/ui/widgets/ignition.rs +++ b/wicket/src/ui/widgets/ignition.rs @@ -61,29 +61,26 @@ impl IgnitionPopup { format!("IGNITION: {}", component.to_string_uppercase()), style::header(true), )]), - body: Text { - lines: vec![ - Line::from(vec![Span::styled( - "Power On", - style::line( - self.selected_command == IgnitionCommand::PowerOn, - ), - )]), - Line::from(vec![Span::styled( - "Power Off", - style::line( - self.selected_command == IgnitionCommand::PowerOff, - ), - )]), - Line::from(vec![Span::styled( - "Power Reset", - style::line( - self.selected_command - == IgnitionCommand::PowerReset, - ), - )]), - ], - }, + body: Text::from(vec![ + Line::from(vec![Span::styled( + "Power On", + style::line( + self.selected_command == IgnitionCommand::PowerOn, + ), + )]), + Line::from(vec![Span::styled( + "Power Off", + style::line( + self.selected_command == IgnitionCommand::PowerOff, + ), + )]), + Line::from(vec![Span::styled( + "Power Reset", + style::line( + self.selected_command == IgnitionCommand::PowerReset, + ), + )]), + ]), buttons: vec![ButtonText::new("Close", "Esc")], } } From d4d8ca256d5d23c1d51494b16cfe3c519e2b05fd Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Mon, 5 Feb 2024 18:43:38 -0800 Subject: [PATCH 02/27] Update Rust crate parse-display to 0.9.0 (#4981) Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- Cargo.lock | 48 ++++++++++++++++++++++++++++++++---------------- Cargo.toml | 2 +- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ad6502d26e..325157f4bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5954,27 +5954,26 @@ dependencies = [ [[package]] name = "parse-display" -version = "0.8.2" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6509d08722b53e8dafe97f2027b22ccbe3a5db83cb352931e9716b0aa44bc5c" +checksum = "06af5f9333eb47bd9ba8462d612e37a8328a5cb80b13f0af4de4c3b89f52dee5" dependencies = [ - "once_cell", "parse-display-derive", "regex", + "regex-syntax 0.8.2", ] [[package]] name = "parse-display-derive" -version = "0.8.2" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68517892c8daf78da08c0db777fcc17e07f2f63ef70041718f8a7630ad84f341" +checksum = "dc9252f259500ee570c75adcc4e317fa6f57a1e47747d622e0bf838002a7b790" dependencies = [ - "once_cell", "proc-macro2", "quote", "regex", - "regex-syntax 0.7.5", - "structmeta", + "regex-syntax 0.8.2", + "structmeta 0.3.0", "syn 2.0.48", ] @@ -7050,12 +7049,6 @@ version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" -[[package]] -name = "regex-syntax" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" - [[package]] name = "regex-syntax" version = "0.8.2" @@ -8638,7 +8631,19 @@ checksum = "78ad9e09554f0456d67a69c1584c9798ba733a5b50349a6c0d0948710523922d" dependencies = [ "proc-macro2", "quote", - "structmeta-derive", + "structmeta-derive 0.2.0", + "syn 2.0.48", +] + +[[package]] +name = "structmeta" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2e1575d8d40908d70f6fd05537266b90ae71b15dbbe7a8b7dffa2b759306d329" +dependencies = [ + "proc-macro2", + "quote", + "structmeta-derive 0.3.0", "syn 2.0.48", ] @@ -8653,6 +8658,17 @@ dependencies = [ "syn 2.0.48", ] +[[package]] +name = "structmeta-derive" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = "structopt" version = "0.3.26" @@ -8962,7 +8978,7 @@ checksum = "b8361c808554228ad09bfed70f5c823caf8a3450b6881cc3a38eb57e8c08c1d9" dependencies = [ "proc-macro2", "quote", - "structmeta", + "structmeta 0.2.0", "syn 2.0.48", ] diff --git a/Cargo.toml b/Cargo.toml index b368bd114b..40425871f9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -293,7 +293,7 @@ oximeter-instruments = { path = "oximeter/instruments" } oximeter-macro-impl = { path = "oximeter/oximeter-macro-impl" } oximeter-producer = { path = "oximeter/producer" } p256 = "0.13" -parse-display = "0.8.2" +parse-display = "0.9.0" partial-io = { version = "0.5.4", features = ["proptest1", "tokio1"] } parse-size = "1.0.0" paste = "1.0.14" From a47252bb58d02c60a5acba18220af2c426454cb7 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Tue, 6 Feb 2024 05:10:17 +0000 Subject: [PATCH 03/27] Update taiki-e/install-action digest to 717ed1c (#4993) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [taiki-e/install-action](https://togithub.com/taiki-e/install-action) | action | digest | [`e17a4e2` -> `717ed1c`](https://togithub.com/taiki-e/install-action/compare/e17a4e2...717ed1c) | --- ### Configuration 📅 **Schedule**: Branch creation - "after 8pm,before 6am" in timezone America/Los_Angeles, Automerge - "after 8pm,before 6am" in timezone America/Los_Angeles. 🚦 **Automerge**: Enabled. ♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [Renovate Bot](https://togithub.com/renovatebot/renovate). 
Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- .github/workflows/hakari.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index 92f7af36d5..68d816fc2d 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -24,7 +24,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@e17a4e247d4a36441181d7758c499d97e1e006bd # v2 + uses: taiki-e/install-action@717ed1cb83959ef327137c2f806e1d8597bfca9f # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date From ab0f50c6093a0432ff0f07ccb5e2474330798a8f Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Tue, 6 Feb 2024 06:16:28 +0000 Subject: [PATCH 04/27] Update Rust crate tempfile to 3.10 (#4995) Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- Cargo.lock | 25 ++++++++++++------------- Cargo.toml | 2 +- workspace-hack/Cargo.toml | 16 ++++++++-------- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 325157f4bd..c5d849a23e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -332,7 +332,7 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc7b2dbe9169059af0f821e811180fddc971fc210c776c133c7819ccd6e478db" dependencies = [ - "rustix 0.38.30", + "rustix 0.38.31", "tempfile", "windows-sys 0.52.0", ] @@ -2246,9 +2246,9 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "2.0.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" [[package]] name = "fatfs" @@ -2269,7 +2269,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef033ed5e9bad94e55838ca0ca906db0e043f517adda0c8b79c7a8c66c93c1b5" dependencies = [ "cfg-if", - "rustix 0.38.30", + "rustix 0.38.31", "windows-sys 0.48.0", ] @@ -2280,7 +2280,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e5768da2206272c81ef0b5e951a41862938a6070da63bcea197899942d3b947" dependencies = [ "cfg-if", - "rustix 0.38.30", + "rustix 0.38.31", "windows-sys 0.52.0", ] @@ -3676,7 +3676,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi 0.3.2", - "rustix 0.38.30", + "rustix 0.38.31", "windows-sys 0.48.0", ] @@ -5405,7 +5405,7 @@ dependencies = [ "regex-syntax 0.8.2", "reqwest", "ring 0.17.7", - "rustix 0.38.30", + "rustix 0.38.31", "schemars", "semver 1.0.21", "serde", @@ -7414,9 +7414,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.30" +version = "0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "322394588aaf33c24007e8bb3238ee3e4c5c09c084ab32bc73890b99ff326bca" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ "bitflags 2.4.0", "errno", @@ -8914,14 +8914,13 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.9.0" +version = "3.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" 
+checksum = "a365e8cd18e44762ef95d87f284f4b5cd04107fec2ff3052bd6a3e6069669e67" dependencies = [ "cfg-if", "fastrand", - "redox_syscall 0.4.1", - "rustix 0.38.30", + "rustix 0.38.31", "windows-sys 0.52.0", ] diff --git a/Cargo.toml b/Cargo.toml index 40425871f9..3372034810 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -378,7 +378,7 @@ syn = { version = "2.0" } tabled = "0.15.0" tar = "0.4" tempdir = "0.3" -tempfile = "3.9" +tempfile = "3.10" term = "0.7" termios = "0.3" textwrap = "0.16.0" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 4e62ba13e3..7038f9c038 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -233,42 +233,42 @@ bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-f dof = { version = "0.3.0", default-features = false, features = ["des"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.x86_64-unknown-linux-gnu.build-dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } dof = { version = "0.3.0", default-features = false, features = ["des"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.x86_64-apple-darwin.dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } errno = { version = "0.3.8", default-features = false, features = ["std"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.x86_64-apple-darwin.build-dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } errno = { version = "0.3.8", default-features = false, features = ["std"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.aarch64-apple-darwin.dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } errno = { version = "0.3.8", default-features = false, features = ["std"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.aarch64-apple-darwin.build-dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } errno = { version = "0.3.8", default-features = false, features = ["std"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } [target.x86_64-unknown-illumos.dependencies] bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.4.0", default-features = false, features = ["std"] } @@ -276,7 +276,7 @@ dof = 
{ version = "0.3.0", default-features = false, features = ["des"] } errno = { version = "0.3.8", default-features = false, features = ["std"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } toml_datetime = { version = "0.6.5", default-features = false, features = ["serde"] } toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", features = ["serde"] } @@ -286,7 +286,7 @@ dof = { version = "0.3.0", default-features = false, features = ["des"] } errno = { version = "0.3.8", default-features = false, features = ["std"] } mio = { version = "0.8.9", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.30", features = ["fs", "termios"] } +rustix = { version = "0.38.31", features = ["fs", "termios"] } toml_datetime = { version = "0.6.5", default-features = false, features = ["serde"] } toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", features = ["serde"] } From 9576766ec798e8bbf2fa85e56c726b2f863f1183 Mon Sep 17 00:00:00 2001 From: Bryan Cantrill Date: Tue, 6 Feb 2024 00:00:32 -0800 Subject: [PATCH 05/27] omdb: add "mgs sensors" and "mgs dashboard" commands (#4973) --- Cargo.lock | 39 +- Cargo.toml | 3 + dev-tools/omdb/Cargo.toml | 5 + dev-tools/omdb/src/bin/omdb/mgs.rs | 39 +- dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs | 1113 ++++++++++++++++++ dev-tools/omdb/src/bin/omdb/mgs/sensors.rs | 950 +++++++++++++++ dev-tools/omdb/tests/usage_errors.out | 2 + 7 files changed, 2144 insertions(+), 7 deletions(-) create mode 100644 dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs create mode 100644 dev-tools/omdb/src/bin/omdb/mgs/sensors.rs diff --git a/Cargo.lock b/Cargo.lock index c5d849a23e..73ec3e4b4c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1419,6 +1419,27 @@ dependencies = [ "memchr", ] +[[package]] +name = "csv" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + [[package]] name = "ctr" version = "0.9.2" @@ -2042,9 +2063,9 @@ dependencies = [ [[package]] name = "dyn-clone" -version = "1.0.13" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbfc4744c1b8f2a09adc0e55242f60b1af195d88596bd8700be74418c056c555" +checksum = "545b22097d44f8a9581187cdf93de7a71e4722bf51200cfaba810865b49a495d" [[package]] name = "ecdsa" @@ -4202,6 +4223,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "multimap" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +dependencies = [ + "serde", +] + [[package]] name = "nanorand" version = "0.7.0" @@ -5102,9 +5132,12 @@ dependencies = [ "async-bb8-diesel", "chrono", "clap 4.4.3", + "crossterm", "crucible-agent-client", + "csv", "diesel", "dropshot", + "dyn-clone", "expectorate", "futures", "gateway-client", @@ -5113,6 +5146,7 @@ dependencies = [ "humantime", "internal-dns", "ipnetwork", + "multimap", "nexus-client", "nexus-db-model", 
"nexus-db-queries", @@ -5126,6 +5160,7 @@ dependencies = [ "omicron-workspace-hack", "oximeter-client", "pq-sys", + "ratatui", "regex", "serde", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index 3372034810..0c8e3245c9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -183,6 +183,7 @@ crossterm = { version = "0.27.0", features = ["event-stream"] } crucible-agent-client = { git = "https://github.com/oxidecomputer/crucible", rev = "2d4bc11232d53f177c286383926fa5f8c1b2a938" } crucible-pantry-client = { git = "https://github.com/oxidecomputer/crucible", rev = "2d4bc11232d53f177c286383926fa5f8c1b2a938" } crucible-smf = { git = "https://github.com/oxidecomputer/crucible", rev = "2d4bc11232d53f177c286383926fa5f8c1b2a938" } +csv = "1.3.0" curve25519-dalek = "4" datatest-stable = "0.2.3" display-error-chain = "0.2.0" @@ -197,6 +198,7 @@ dns-server = { path = "dns-server" } dns-service-client = { path = "clients/dns-service-client" } dpd-client = { path = "clients/dpd-client" } dropshot = { git = "https://github.com/oxidecomputer/dropshot", branch = "main", features = [ "usdt-probes" ] } +dyn-clone = "1.0.16" either = "1.9.0" expectorate = "1.1.0" fatfs = "0.3.6" @@ -248,6 +250,7 @@ mime_guess = "2.0.4" mockall = "0.12" newtype_derive = "0.1.6" mg-admin-client = { path = "clients/mg-admin-client" } +multimap = "0.8.1" nexus-blueprint-execution = { path = "nexus/blueprint-execution" } nexus-client = { path = "clients/nexus-client" } nexus-db-model = { path = "nexus/db-model" } diff --git a/dev-tools/omdb/Cargo.toml b/dev-tools/omdb/Cargo.toml index e08d5f9477..3f566f55ee 100644 --- a/dev-tools/omdb/Cargo.toml +++ b/dev-tools/omdb/Cargo.toml @@ -12,9 +12,12 @@ anyhow.workspace = true async-bb8-diesel.workspace = true chrono.workspace = true clap.workspace = true +crossterm.workspace = true crucible-agent-client.workspace = true +csv.workspace = true diesel.workspace = true dropshot.workspace = true +dyn-clone.workspace = true futures.workspace = true gateway-client.workspace = true gateway-messages.workspace = true @@ -29,6 +32,7 @@ omicron-common.workspace = true oximeter-client.workspace = true # See omicron-rpaths for more about the "pq-sys" dependency. 
pq-sys = "*" +ratatui.workspace = true serde.workspace = true serde_json.workspace = true sled-agent-client.workspace = true @@ -43,6 +47,7 @@ uuid.workspace = true ipnetwork.workspace = true omicron-workspace-hack.workspace = true nexus-test-utils.workspace = true +multimap.workspace = true [dev-dependencies] expectorate.workspace = true diff --git a/dev-tools/omdb/src/bin/omdb/mgs.rs b/dev-tools/omdb/src/bin/omdb/mgs.rs index 770cba9f62..ece4c4f109 100644 --- a/dev-tools/omdb/src/bin/omdb/mgs.rs +++ b/dev-tools/omdb/src/bin/omdb/mgs.rs @@ -22,6 +22,12 @@ use gateway_client::types::SpState; use gateway_client::types::SpType; use tabled::Tabled; +mod dashboard; +mod sensors; + +use dashboard::DashboardArgs; +use sensors::SensorsArgs; + /// Arguments to the "omdb mgs" subcommand #[derive(Debug, Args)] pub struct MgsArgs { @@ -35,19 +41,25 @@ pub struct MgsArgs { #[derive(Debug, Subcommand)] enum MgsCommands { + /// Dashboard of SPs + Dashboard(DashboardArgs), + /// Show information about devices and components visible to MGS Inventory(InventoryArgs), + + /// Show information about sensors, as gleaned by MGS + Sensors(SensorsArgs), } #[derive(Debug, Args)] struct InventoryArgs {} impl MgsArgs { - pub(crate) async fn run_cmd( + async fn mgs_client( &self, omdb: &Omdb, log: &slog::Logger, - ) -> Result<(), anyhow::Error> { + ) -> Result { let mgs_url = match &self.mgs_url { Some(cli_or_env_url) => cli_or_env_url.clone(), None => { @@ -68,11 +80,24 @@ impl MgsArgs { } }; eprintln!("note: using MGS URL {}", &mgs_url); - let mgs_client = gateway_client::Client::new(&mgs_url, log.clone()); + Ok(gateway_client::Client::new(&mgs_url, log.clone())) + } + pub(crate) async fn run_cmd( + &self, + omdb: &Omdb, + log: &slog::Logger, + ) -> Result<(), anyhow::Error> { match &self.command { - MgsCommands::Inventory(inventory_args) => { - cmd_mgs_inventory(&mgs_client, inventory_args).await + MgsCommands::Dashboard(args) => { + dashboard::cmd_mgs_dashboard(omdb, log, self, args).await + } + MgsCommands::Inventory(args) => { + let mgs_client = self.mgs_client(omdb, log).await?; + cmd_mgs_inventory(&mgs_client, args).await + } + MgsCommands::Sensors(args) => { + sensors::cmd_mgs_sensors(omdb, log, self, args).await } } } @@ -156,6 +181,10 @@ fn sp_type_to_str(s: &SpType) -> &'static str { } } +fn sp_to_string(s: &SpIdentifier) -> String { + format!("{} {}", sp_type_to_str(&s.type_), s.slot) +} + fn show_sp_ids(sp_ids: &[SpIdentifier]) -> Result<(), anyhow::Error> { #[derive(Tabled)] #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] diff --git a/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs b/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs new file mode 100644 index 0000000000..20d651bfdf --- /dev/null +++ b/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs @@ -0,0 +1,1113 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
diff --git a/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs b/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs
new file mode 100644
index 0000000000..20d651bfdf
--- /dev/null
+++ b/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs
@@ -0,0 +1,1113 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Code for the MGS dashboard subcommand
+
+use anyhow::{Context, Result};
+use chrono::{Local, Offset, TimeZone};
+use crossterm::{
+    event::{
+        self, DisableMouseCapture, EnableMouseCapture, Event, KeyCode,
+        KeyModifiers,
+    },
+    execute,
+    terminal::{
+        disable_raw_mode, enable_raw_mode, EnterAlternateScreen,
+        LeaveAlternateScreen,
+    },
+};
+use dyn_clone::DynClone;
+use ratatui::{
+    backend::{Backend, CrosstermBackend},
+    layout::{Alignment, Constraint, Direction, Layout, Rect},
+    style::{Color, Modifier, Style},
+    symbols,
+    text::{Line, Span},
+    widgets::{
+        Axis, Block, Borders, Chart, Dataset, List, ListItem, ListState,
+        Paragraph,
+    },
+    Frame, Terminal,
+};
+
+use crate::mgs::sensors::{
+    sensor_data, sensor_metadata, SensorId, SensorInput, SensorMetadata,
+    SensorValues, SensorsArgs,
+};
+use crate::mgs::sp_to_string;
+use clap::Args;
+use gateway_client::types::MeasurementKind;
+use gateway_client::types::SpIdentifier;
+use multimap::MultiMap;
+use std::collections::HashMap;
+use std::fs::File;
+use std::io;
+use std::time::{Duration, Instant, SystemTime};
+
+#[derive(Debug, Args)]
+pub(crate) struct DashboardArgs {
+    #[clap(flatten)]
+    sensors_args: SensorsArgs,
+
+    /// simulate real-time with input
+    #[clap(long)]
+    simulate_realtime: bool,
+}
+
+struct StatefulList {
+    state: ListState,
+    n: usize,
+}
+
+impl StatefulList {
+    fn next(&mut self) {
+        self.state.select(match self.state.selected() {
+            Some(ndx) => Some((ndx + 1) % self.n),
+            None => Some(0),
+        });
+    }
+
+    fn previous(&mut self) {
+        self.state.select(match self.state.selected() {
+            Some(0) => Some(self.n - 1),
+            Some(ndx) => Some(ndx - 1),
+            None => Some(0),
+        });
+    }
+
+    fn unselect(&mut self) {
+        self.state.select(None);
+    }
+
+    fn selected(&self) -> Option<usize> {
+        self.state.selected()
+    }
+}
+
+struct Series {
+    name: String,
+    color: Color,
+    data: Vec<(f64, f64)>,
+    raw: Vec<Option<f32>>,
+}
+
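+// The boxed `Attributes` trait object inside each `Graph` must be cloneable
+// (`Graph::flip()` clones it), but `Clone` itself can't be a supertrait of
+// an object-safe trait.  The `dyn_clone` crate bridges that gap: `DynClone`
+// as a supertrait plus the `clone_trait_object!` invocation below generate
+// an `impl Clone for Box<dyn Attributes>`.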
+trait Attributes: DynClone {
+    fn label(&self) -> String;
+    fn legend_label(&self) -> String;
+    fn x_axis_label(&self) -> String {
+        "Time".to_string()
+    }
+    fn y_axis_label(&self) -> String;
+    fn axis_value(&self, val: f64) -> String;
+    fn legend_value(&self, val: f64) -> String;
+
+    fn increase(&mut self, _ndx: usize) -> Option<u8> {
+        None
+    }
+
+    fn decrease(&mut self, _ndx: usize) -> Option<u8> {
+        None
+    }
+
+    fn clear(&mut self) {}
+}
+
+dyn_clone::clone_trait_object!(Attributes);
+
+#[derive(Clone)]
+struct TempGraph;
+
+impl Attributes for TempGraph {
+    fn label(&self) -> String {
+        "Temperature".to_string()
+    }
+    fn legend_label(&self) -> String {
+        "Sensors".to_string()
+    }
+
+    fn y_axis_label(&self) -> String {
+        "Degrees Celsius".to_string()
+    }
+
+    fn axis_value(&self, val: f64) -> String {
+        format!("{:2.0}°", val)
+    }
+
+    fn legend_value(&self, val: f64) -> String {
+        format!("{:4.2}°", val)
+    }
+}
+
+#[derive(Clone)]
+struct FanGraph;
+
+impl Attributes for FanGraph {
+    fn label(&self) -> String {
+        "Fan speed".to_string()
+    }
+    fn legend_label(&self) -> String {
+        "Fans".to_string()
+    }
+
+    fn y_axis_label(&self) -> String {
+        "RPM".to_string()
+    }
+
+    fn axis_value(&self, val: f64) -> String {
+        format!("{:3.1}K", val / 1000.0)
+    }
+
+    fn legend_value(&self, val: f64) -> String {
+        format!("{:.0}", val)
+    }
+}
+
+#[derive(Clone)]
+struct CurrentGraph;
+
+impl Attributes for CurrentGraph {
+    fn label(&self) -> String {
+        "Output current".to_string()
+    }
+
+    fn legend_label(&self) -> String {
+        "Regulators".to_string()
+    }
+
+    fn y_axis_label(&self) -> String {
+        "Rails".to_string()
+    }
+
+    fn axis_value(&self, val: f64) -> String {
+        format!("{:2.2}A", val)
+    }
+
+    fn legend_value(&self, val: f64) -> String {
+        format!("{:3.2}A", val)
+    }
+}
+
+#[derive(Clone)]
+struct VoltageGraph;
+
+impl Attributes for VoltageGraph {
+    fn label(&self) -> String {
+        "Voltage".to_string()
+    }
+
+    fn legend_label(&self) -> String {
+        "Rails".to_string()
+    }
+
+    fn y_axis_label(&self) -> String {
+        "Volts".to_string()
+    }
+
+    fn axis_value(&self, val: f64) -> String {
+        format!("{:2.2}V", val)
+    }
+
+    fn legend_value(&self, val: f64) -> String {
+        format!("{:3.2}V", val)
+    }
+}
+
+#[derive(Clone)]
+struct SensorGraph;
+
+impl Attributes for SensorGraph {
+    fn label(&self) -> String {
+        "Sensor output".to_string()
+    }
+
+    fn legend_label(&self) -> String {
+        "Sensors".to_string()
+    }
+
+    fn y_axis_label(&self) -> String {
+        "Units".to_string()
+    }
+
+    fn axis_value(&self, val: f64) -> String {
+        format!("{:2.2}", val)
+    }
+
+    fn legend_value(&self, val: f64) -> String {
+        format!("{:3.2}", val)
+    }
+}
+
+struct Graph {
+    series: Vec<Series>,
+    legend: StatefulList,
+    time: usize,
+    width: usize,
+    offs: usize,
+    interpolate: usize,
+    bounds: [f64; 2],
+    attributes: Box<dyn Attributes>,
+}
+
+impl Graph {
+    fn new(all: &[String], attr: Box<dyn Attributes>) -> Result<Self> {
+        let mut series = vec![];
+
+        let colors = [
+            Color::Yellow,
+            Color::Green,
+            Color::Magenta,
+            Color::White,
+            Color::Red,
+            Color::LightRed,
+            Color::Blue,
+            Color::LightMagenta,
+            Color::LightYellow,
+            Color::LightCyan,
+            Color::LightGreen,
+            Color::LightBlue,
+            Color::LightRed,
+        ];
+
+        for (ndx, s) in all.iter().enumerate() {
+            series.push(Series {
+                name: s.to_string(),
+                color: colors[ndx % colors.len()],
+                data: Vec::new(),
+                raw: Vec::new(),
+            })
+        }
+
+        Ok(Graph {
+            series,
+            legend: StatefulList { state: ListState::default(), n: all.len() },
+            time: 0,
+            width: 600,
+            offs: 0,
+            interpolate: 0,
+            bounds: [20.0, 120.0],
+            attributes: attr,
+        })
+    }
+
+    fn flip(from: &[(&Self, String)], series_ndx: usize) -> Self {
+        let rep = from[0].0;
+        let mut series = vec![];
+
+        let colors = [
+            Color::Yellow,
+            Color::Green,
+            Color::Magenta,
+            Color::White,
+            Color::Red,
+            Color::LightRed,
+            Color::Blue,
+            Color::LightMagenta,
+            Color::LightYellow,
+            Color::LightCyan,
+            Color::LightGreen,
+            Color::LightBlue,
+            Color::LightRed,
+        ];
+
+        for (ndx, (graph, name)) in from.iter().enumerate() {
+            series.push(Series {
+                name: name.clone(),
+                color: colors[ndx % colors.len()],
+                data: graph.series[series_ndx].data.clone(),
+                raw: graph.series[series_ndx].raw.clone(),
+            });
+        }
+
+        Graph {
+            series,
+            legend: StatefulList { state: ListState::default(), n: from.len() },
+            time: rep.time,
+            width: rep.width,
+            offs: rep.offs,
+            interpolate: rep.interpolate,
+            bounds: rep.bounds,
+            attributes: rep.attributes.clone(),
+        }
+    }
+
+    fn data(&mut self, data: &[Option<f32>]) {
+        for (ndx, s) in self.series.iter_mut().enumerate() {
+            s.raw.push(data[ndx]);
+        }
+
+        self.time += 1;
+
+        if self.offs > 0 {
+            self.offs += 1;
+        }
+    }
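+    // What follows recomputes the plotted points from the raw samples: the
+    // visible window is `width` samples wide, ending `offs` samples before
+    // the present.  When zoomed in, set_interpolate() below chooses
+    // (1000 - width) / width synthetic points to wedge between each pair of
+    // real samples (e.g., width = 200 yields 4), and update_data() fills
+    // them in by linear interpolation so the Braille plot stays continuous.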
+    fn update_data(&mut self) {
+        for s in &mut self.series {
+            s.data = Vec::new();
+        }
+
+        for i in 0..self.width {
+            if self.time < (self.width - i) + self.offs {
+                continue;
+            }
+
+            let offs = self.time - (self.width - i) - self.offs;
+
+            for (_ndx, s) in &mut self.series.iter_mut().enumerate() {
+                if let Some(datum) = s.raw[offs] {
+                    let point = (i as f64, datum as f64);
+
+                    if self.interpolate != 0 {
+                        if let Some(last) = s.data.last() {
+                            let x_delta = point.0 - last.0;
+                            let slope = (point.1 - last.1) / x_delta;
+                            let x_inc = x_delta / self.interpolate as f64;
+
+                            for x in 0..self.interpolate {
+                                s.data.push((
+                                    point.0 + x as f64 * x_inc,
+                                    point.1 + (slope * x_inc),
+                                ));
+                            }
+                        }
+                    }
+
+                    s.data.push((i as f64, datum as f64));
+                }
+            }
+        }
+
+        self.update_bounds();
+    }
+
+    fn update_bounds(&mut self) {
+        let selected = self.legend.state.selected();
+        let mut min = None;
+        let mut max = None;
+
+        for (ndx, s) in self.series.iter().enumerate() {
+            if let Some(selected) = selected {
+                if ndx != selected {
+                    continue;
+                }
+            }
+
+            for (_, datum) in &s.data {
+                min = match min {
+                    Some(min) if datum < min => Some(datum),
+                    None => Some(datum),
+                    _ => min,
+                };
+
+                max = match max {
+                    Some(max) if datum > max => Some(datum),
+                    None => Some(datum),
+                    _ => max,
+                };
+            }
+        }
+
+        if let Some(min) = min {
+            self.bounds[0] = ((min * 0.85) / 2.0) * 2.0;
+        }
+
+        if self.bounds[0] < 0.0 {
+            self.bounds[0] = 0.0;
+        }
+
+        if let Some(max) = max {
+            self.bounds[1] = ((max * 1.15) / 2.0) * 2.0;
+        }
+    }
+
+    fn previous(&mut self) {
+        self.legend.previous();
+    }
+
+    fn next(&mut self) {
+        self.legend.next();
+    }
+
+    fn unselect(&mut self) {
+        self.legend.unselect();
+    }
+
+    fn selected(&self) -> Option<usize> {
+        self.legend.selected()
+    }
+
+    fn set_interpolate(&mut self) {
+        let interpolate = (1000.0 - self.width as f64) / self.width as f64;
+
+        if interpolate >= 1.0 {
+            self.interpolate = interpolate as usize;
+        } else {
+            self.interpolate = 0;
+        }
+    }
+
+    fn zoom_in(&mut self) {
+        self.width = (self.width as f64 * 0.8) as usize;
+        self.set_interpolate();
+    }
+
+    fn zoom_out(&mut self) {
+        self.width = (self.width as f64 * 1.25) as usize;
+        self.set_interpolate();
+    }
+
+    fn time_right(&mut self) {
+        let delta = (self.width as f64 * 0.25) as usize;
+
+        if delta > self.offs {
+            self.offs = 0;
+        } else {
+            self.offs -= delta;
+        }
+    }
+
+    fn time_left(&mut self) {
+        self.offs += (self.width as f64 * 0.25) as usize;
+    }
+}
+
+struct Dashboard {
+    graphs: HashMap<(SpIdentifier, MeasurementKind), Graph>,
+    flipped: HashMap<MeasurementKind, Graph>,
+    sids: HashMap<(SpIdentifier, MeasurementKind), Vec<SensorId>>,
+    kinds: Vec<MeasurementKind>,
+    selected_kind: usize,
+    sps: Vec<SpIdentifier>,
+    selected_sp: usize,
+    status: String,
+    time: u64,
+}
+
+impl Dashboard {
+    fn new(metadata: &SensorMetadata) -> Result<Dashboard> {
+        let mut sps =
+            metadata.sensors_by_sp.keys().copied().collect::<Vec<_>>();
+        let mut graphs = HashMap::new();
+        let mut sids = HashMap::new();
+        sps.sort();
+
+        let kinds = vec![
+            MeasurementKind::Temperature,
+            MeasurementKind::Speed,
+            MeasurementKind::Current,
+        ];
+
+        for &sp in sps.iter() {
+            let sensors = metadata.sensors_by_sp.get_vec(&sp).unwrap();
+            let mut by_kind = MultiMap::new();
+
+            for sid in sensors {
+                let (_, s, _) = metadata.sensors_by_id.get(sid).unwrap();
+                by_kind.insert(s.kind, (s.name.clone(), *sid));
+            }
+
+            let keys = by_kind.keys().copied().collect::<Vec<_>>();
+
+            for k in keys {
+                let mut v = by_kind.remove(&k).unwrap();
+                v.sort();
+
+                let labels =
+                    v.iter().map(|(n, _)| n.clone()).collect::<Vec<_>>();
+
+                graphs.insert(
+                    (sp, k),
+                    Graph::new(
+                        labels.as_slice(),
+                        match k {
+                            MeasurementKind::Temperature => Box::new(TempGraph),
+                            MeasurementKind::Current => Box::new(CurrentGraph),
+                            MeasurementKind::Speed => Box::new(FanGraph),
+                            MeasurementKind::Voltage => Box::new(VoltageGraph),
+                            _ => Box::new(SensorGraph),
+                        },
+                    )?,
+                );
+
+                sids.insert(
+                    (sp, k),
+                    v.iter().map(|(_, sid)| *sid).collect::<Vec<_>>(),
+                );
+            }
+        }
+
+        let status = sp_to_string(&sps[0]);
+
+        Ok(Dashboard {
+            graphs,
+            flipped: HashMap::new(),
+            sids,
+            kinds,
+            selected_kind: 0,
+            sps,
+            selected_sp: 0,
+            status,
+            time: secs()?,
+        })
+    }
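+    // A "flipped" graph (toggled with '!') pivots the view: instead of one
+    // graph per (SP, measurement kind) showing all of that SP's sensors,
+    // it shows the currently selected sensor across every SP of the same
+    // type, which is handy for comparing one rail or fan across the rack.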
vec![("Status", &self.status)] + } + + fn update_data(&mut self) { + for graph in self.graphs.values_mut() { + graph.update_data(); + } + + for graph in self.flipped.values_mut() { + graph.update_data(); + } + } + + fn up(&mut self) { + let selected_kind = self.kinds[self.selected_kind]; + let type_ = self.sps[self.selected_sp].type_; + + if let Some(flipped) = self.flipped.get_mut(&selected_kind) { + flipped.previous(); + return; + } + + for sp in self.sps.iter().filter(|&s| s.type_ == type_) { + self.graphs.get_mut(&(*sp, selected_kind)).unwrap().previous(); + } + } + + fn down(&mut self) { + let selected_kind = self.kinds[self.selected_kind]; + let type_ = self.sps[self.selected_sp].type_; + + if let Some(flipped) = self.flipped.get_mut(&selected_kind) { + flipped.next(); + return; + } + + for sp in self.sps.iter().filter(|&s| s.type_ == type_) { + self.graphs.get_mut(&(*sp, selected_kind)).unwrap().next(); + } + } + + fn esc(&mut self) { + let selected_kind = self.kinds[self.selected_kind]; + let type_ = self.sps[self.selected_sp].type_; + + if let Some(flipped) = self.flipped.get_mut(&selected_kind) { + flipped.unselect(); + return; + } + + for sp in self.sps.iter().filter(|&s| s.type_ == type_) { + self.graphs.get_mut(&(*sp, selected_kind)).unwrap().unselect(); + } + } + + fn left(&mut self) { + if self.selected_sp == 0 { + self.selected_sp = self.sps.len() - 1; + } else { + self.selected_sp -= 1; + } + + self.status = sp_to_string(&self.sps[self.selected_sp]); + } + + fn right(&mut self) { + self.selected_sp = (self.selected_sp + 1) % self.sps.len(); + self.status = sp_to_string(&self.sps[self.selected_sp]); + } + + fn time_left(&mut self) { + for graph in self.graphs.values_mut() { + graph.time_left(); + } + + for graph in self.flipped.values_mut() { + graph.time_left(); + } + } + + fn time_right(&mut self) { + for graph in self.graphs.values_mut() { + graph.time_right(); + } + + for graph in self.flipped.values_mut() { + graph.time_right(); + } + } + + fn flip(&mut self) { + let selected_kind = self.kinds[self.selected_kind]; + let type_ = self.sps[self.selected_sp].type_; + + if self.flipped.remove(&selected_kind).is_some() { + return; + } + + let sp = self.sps[self.selected_sp]; + + let graph = self.graphs.get(&(sp, selected_kind)).unwrap(); + + if let Some(ndx) = graph.selected() { + let mut from = vec![]; + + for sp in self.sps.iter().filter(|&s| s.type_ == type_) { + from.push(( + self.graphs.get(&(*sp, selected_kind)).unwrap(), + sp_to_string(sp), + )); + } + + self.flipped + .insert(selected_kind, Graph::flip(from.as_slice(), ndx)); + } + } + + fn tab(&mut self) { + self.selected_kind = (self.selected_kind + 1) % self.kinds.len(); + } + + fn zoom_in(&mut self) { + for graph in self.graphs.values_mut() { + graph.zoom_in(); + } + + for graph in self.flipped.values_mut() { + graph.zoom_in(); + } + } + + fn zoom_out(&mut self) { + for graph in self.graphs.values_mut() { + graph.zoom_out(); + } + + for graph in self.flipped.values_mut() { + graph.zoom_out(); + } + } + + fn gap(&mut self, length: u64) { + let mut gap: Vec> = vec![]; + + for (graph, sids) in &self.sids { + while gap.len() < sids.len() { + gap.push(None); + } + + let graph = self.graphs.get_mut(graph).unwrap(); + + for _ in 0..length { + graph.data(&gap[0..sids.len()]); + } + } + } + + fn values(&mut self, values: &SensorValues) { + for (graph, sids) in &self.sids { + let mut data = vec![]; + + for sid in sids { + if let Some(value) = values.values.get(sid) { + data.push(*value); + } else { + data.push(None); + } + 
+    fn gap(&mut self, length: u64) {
+        let mut gap: Vec<Option<f32>> = vec![];
+
+        for (graph, sids) in &self.sids {
+            while gap.len() < sids.len() {
+                gap.push(None);
+            }
+
+            let graph = self.graphs.get_mut(graph).unwrap();
+
+            for _ in 0..length {
+                graph.data(&gap[0..sids.len()]);
+            }
+        }
+    }
+
+    fn values(&mut self, values: &SensorValues) {
+        for (graph, sids) in &self.sids {
+            let mut data = vec![];
+
+            for sid in sids {
+                if let Some(value) = values.values.get(sid) {
+                    data.push(*value);
+                } else {
+                    data.push(None);
+                }
+            }
+
+            let graph = self.graphs.get_mut(graph).unwrap();
+            graph.data(data.as_slice());
+        }
+
+        self.time = values.time;
+    }
+}
+
+fn run_dashboard<B: Backend>(
+    terminal: &mut Terminal<B>,
+    dashboard: &mut Dashboard,
+    force_update: bool,
+) -> Result<bool> {
+    let update = if crossterm::event::poll(Duration::from_secs(0))? {
+        if let Event::Key(key) = event::read()? {
+            match key.code {
+                KeyCode::Char('q') => return Ok(true),
+                KeyCode::Char('+') => dashboard.zoom_in(),
+                KeyCode::Char('-') => dashboard.zoom_out(),
+                KeyCode::Char('<') => dashboard.time_left(),
+                KeyCode::Char('>') => dashboard.time_right(),
+                KeyCode::Char('!') => dashboard.flip(),
+                KeyCode::Char('l') => {
+                    //
+                    // ^L -- form feed -- is historically used to clear and
+                    // redraw the screen.  And, notably, it is what dtach(1)
+                    // will send when attaching to a dashboard.  If we
+                    // see ^L, clear the terminal to force a total redraw.
+                    //
+                    if key.modifiers == KeyModifiers::CONTROL {
+                        terminal.clear()?;
+                    }
+                }
+                KeyCode::Up => dashboard.up(),
+                KeyCode::Down => dashboard.down(),
+                KeyCode::Right => dashboard.right(),
+                KeyCode::Left => dashboard.left(),
+                KeyCode::Esc => dashboard.esc(),
+                KeyCode::Tab => dashboard.tab(),
+                _ => {}
+            }
+        }
+        true
+    } else {
+        force_update
+    };
+
+    if update {
+        dashboard.update_data();
+        terminal.draw(|f| draw(f, dashboard))?;
+    }
+
+    Ok(false)
+}
+
+fn secs() -> Result<u64> {
+    let now = SystemTime::now().duration_since(SystemTime::UNIX_EPOCH)?;
+    Ok(now.as_secs())
+}
+
+///
+/// Runs `omdb mgs dashboard`
+///
+pub(crate) async fn cmd_mgs_dashboard(
+    omdb: &crate::Omdb,
+    log: &slog::Logger,
+    mgs_args: &crate::mgs::MgsArgs,
+    args: &DashboardArgs,
+) -> Result<(), anyhow::Error> {
+    let mut input = if let Some(ref input) = args.sensors_args.input {
+        let file = File::open(input)
+            .with_context(|| format!("failed to open {input}"))?;
+        SensorInput::CsvReader(
+            csv::Reader::from_reader(file),
+            csv::Position::new(),
+        )
+    } else {
+        SensorInput::MgsClient(mgs_args.mgs_client(omdb, log).await?)
+ }; + + let (metadata, values) = + sensor_metadata(&mut input, &args.sensors_args).await?; + + let mut dashboard = Dashboard::new(&metadata)?; + let mut last = values.time; + let mut force = true; + let mut update = true; + + dashboard.values(&values); + + if args.sensors_args.input.is_some() && !args.simulate_realtime { + loop { + let values = sensor_data(&mut input, &metadata).await?; + + if values.time == 0 { + break; + } + + if values.time != last + 1 { + dashboard.gap(values.time - last - 1); + } + + last = values.time; + dashboard.values(&values); + } + + update = false; + } + + // setup terminal + enable_raw_mode()?; + let mut stdout = io::stdout(); + execute!(stdout, EnterAlternateScreen, EnableMouseCapture)?; + let backend = CrosstermBackend::new(stdout); + let mut terminal = Terminal::new(backend)?; + + let res = 'outer: loop { + match run_dashboard(&mut terminal, &mut dashboard, force) { + Err(err) => break Err(err), + Ok(true) => break Ok(()), + _ => {} + } + + force = false; + + let now = match secs() { + Err(err) => break Err(err), + Ok(now) => now, + }; + + if update && now != last { + let kicked = Instant::now(); + let f = sensor_data(&mut input, &metadata); + last = now; + + while Instant::now().duration_since(kicked).as_millis() < 800 { + tokio::time::sleep(Duration::from_millis(10)).await; + + match run_dashboard(&mut terminal, &mut dashboard, force) { + Err(err) => break 'outer Err(err), + Ok(true) => break 'outer Ok(()), + _ => {} + } + } + + let values = match f.await { + Err(err) => break Err(err), + Ok(v) => v, + }; + + dashboard.values(&values); + force = true; + continue; + } + + tokio::time::sleep(Duration::from_millis(10)).await; + }; + + // restore terminal + disable_raw_mode()?; + execute!( + terminal.backend_mut(), + LeaveAlternateScreen, + DisableMouseCapture + )?; + terminal.show_cursor()?; + + if let Err(err) = res { + println!("{err:?}"); + } + + Ok(()) +} + +fn draw_graph(f: &mut Frame, parent: Rect, graph: &mut Graph, now: u64) { + // + // We want the right panel to be 31 characters wide (a left-justified 20 + // and a right justified 8 + margins), but we don't want it to consume + // more than 80%; calculate accordingly. + // + let r = std::cmp::min((31 * 100) / parent.width, 80); + + let chunks = Layout::default() + .direction(Direction::Horizontal) + .constraints( + [Constraint::Percentage(100 - r), Constraint::Percentage(r)] + .as_ref(), + ) + .split(parent); + + let latest = now as i64 - graph.offs as i64; + let earliest = Local.timestamp_opt(latest - graph.width as i64, 0).unwrap(); + let latest = Local.timestamp_opt(latest, 0).unwrap(); + + // + // We want a format that preserves horizontal real estate just a tad more + // than .to_rfc3339_opts()... 
+ // + let fmt = "%Y-%m-%d %H:%M:%S"; + + let tz_offset = earliest.offset().fix().local_minus_utc(); + let tz = if tz_offset != 0 { + let hours = tz_offset / 3600; + let minutes = (tz_offset % 3600) / 60; + + if minutes != 0 { + format!("Z{:+}:{:02}", hours, minutes.abs()) + } else { + format!("Z{:+}", hours) + } + } else { + "Z".to_string() + }; + + let x_labels = vec![ + Span::styled( + format!("{}{}", earliest.format(fmt), tz), + Style::default().add_modifier(Modifier::BOLD), + ), + Span::styled( + format!("{}{}", latest.format(fmt), tz), + Style::default().add_modifier(Modifier::BOLD), + ), + ]; + + let mut datasets = vec![]; + let selected = graph.legend.state.selected(); + + for (ndx, s) in graph.series.iter().enumerate() { + if let Some(selected) = selected { + if ndx != selected { + continue; + } + } + + datasets.push( + Dataset::default() + .name(&s.name) + .marker(symbols::Marker::Braille) + .style(Style::default().fg(s.color)) + .data(&s.data), + ); + } + + let chart = Chart::new(datasets) + .block( + Block::default() + .title(Span::styled( + graph.attributes.label(), + Style::default() + .fg(Color::Cyan) + .add_modifier(Modifier::BOLD), + )) + .borders(Borders::ALL), + ) + .x_axis( + Axis::default() + .title(graph.attributes.x_axis_label()) + .style(Style::default().fg(Color::Gray)) + .labels(x_labels) + .bounds([0.0, graph.width as f64]) + .labels_alignment(Alignment::Right), + ) + .y_axis( + Axis::default() + .title(graph.attributes.y_axis_label()) + .style(Style::default().fg(Color::Gray)) + .labels(vec![ + Span::styled( + graph.attributes.axis_value(graph.bounds[0]), + Style::default().add_modifier(Modifier::BOLD), + ), + Span::styled( + graph.attributes.axis_value(graph.bounds[1]), + Style::default().add_modifier(Modifier::BOLD), + ), + ]) + .bounds(graph.bounds), + ); + + f.render_widget(chart, chunks[0]); + + let mut rows = vec![]; + + for s in &graph.series { + let val = match s.raw.last() { + None | Some(None) => "-".to_string(), + Some(Some(val)) => graph.attributes.legend_value((*val).into()), + }; + + rows.push(ListItem::new(Line::from(vec![ + Span::styled( + format!("{:<20}", s.name), + Style::default().fg(s.color), + ), + Span::styled(format!("{:>8}", val), Style::default().fg(s.color)), + ]))); + } + + let list = List::new(rows) + .block( + Block::default() + .borders(Borders::ALL) + .title(graph.attributes.legend_label()), + ) + .highlight_style( + Style::default() + .bg(Color::LightGreen) + .fg(Color::Black) + .add_modifier(Modifier::BOLD), + ); + + // We can now render the item list + f.render_stateful_widget(list, chunks[1], &mut graph.legend.state); +} + +fn draw_graphs(f: &mut Frame, parent: Rect, dashboard: &mut Dashboard) { + let screen = Layout::default() + .direction(Direction::Vertical) + .constraints( + [ + Constraint::Ratio(1, 2), + Constraint::Ratio(1, 4), + Constraint::Ratio(1, 4), + ] + .as_ref(), + ) + .split(parent); + + let sp = dashboard.sps[dashboard.selected_sp]; + + for (i, k) in dashboard.kinds.iter().enumerate() { + if let Some(graph) = dashboard.flipped.get_mut(k) { + draw_graph(f, screen[i], graph, dashboard.time); + } else { + draw_graph( + f, + screen[i], + dashboard.graphs.get_mut(&(sp, *k)).unwrap(), + dashboard.time, + ); + } + } +} + +fn draw_status(f: &mut Frame, parent: Rect, status: &[(&str, &str)]) { + let mut bar = vec![]; + + for i in 0..status.len() { + let s = &status[i]; + + bar.push(Span::styled( + s.0, + Style::default().add_modifier(Modifier::BOLD), + )); + + bar.push(Span::styled( + ": ", + 
Style::default().add_modifier(Modifier::BOLD),
+        ));
+
+        bar.push(Span::raw(s.1));
+
+        if i < status.len() - 1 {
+            bar.push(Span::raw(" | "));
+        }
+    }
+
+    let text = vec![Line::from(bar)];
+
+    let para = Paragraph::new(text)
+        .alignment(Alignment::Right)
+        .style(Style::default().fg(Color::White).bg(Color::Black));
+
+    f.render_widget(para, parent);
+}
+
+fn draw(f: &mut Frame, dashboard: &mut Dashboard) {
+    let size = f.size();
+
+    let screen = Layout::default()
+        .direction(Direction::Vertical)
+        .constraints([Constraint::Min(1), Constraint::Length(1)].as_ref())
+        .split(size);
+
+    draw_graphs(f, screen[0], dashboard);
+    draw_status(f, screen[1], &dashboard.status());
+}
diff --git a/dev-tools/omdb/src/bin/omdb/mgs/sensors.rs b/dev-tools/omdb/src/bin/omdb/mgs/sensors.rs
new file mode 100644
index 0000000000..d00bebd96c
--- /dev/null
+++ b/dev-tools/omdb/src/bin/omdb/mgs/sensors.rs
@@ -0,0 +1,950 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Implementation of the "mgs sensors" subcommand
+
+use anyhow::{bail, Context};
+use clap::Args;
+use gateway_client::types::MeasurementErrorCode;
+use gateway_client::types::MeasurementKind;
+use gateway_client::types::SpComponentDetails;
+use gateway_client::types::SpIdentifier;
+use gateway_client::types::SpIgnition;
+use gateway_client::types::SpType;
+use multimap::MultiMap;
+use std::collections::{HashMap, HashSet};
+use std::fs::File;
+use std::sync::Arc;
+use std::time::{Duration, SystemTime, UNIX_EPOCH};
+
+#[derive(Debug, Args)]
+pub(crate) struct SensorsArgs {
+    /// verbose messages
+    #[clap(long, short)]
+    pub verbose: bool,
+
+    /// restrict to specified sled(s)
+    #[clap(long, use_value_delimiter = true)]
+    pub sled: Vec<u32>,
+
+    /// exclude sleds rather than include them
+    #[clap(long, short)]
+    pub exclude: bool,
+
+    /// include switches
+    #[clap(long)]
+    pub switches: bool,
+
+    /// include PSC
+    #[clap(long)]
+    pub psc: bool,
+
+    /// print sensors every second
+    #[clap(long, short)]
+    pub sleep: bool,
+
+    /// parseable output
+    #[clap(long, short)]
+    pub parseable: bool,
+
+    /// show latencies
+    #[clap(long)]
+    pub show_latencies: bool,
+
+    /// restrict sensors by type of sensor
+    #[clap(
+        long,
+        short,
+        value_name = "sensor type",
+        use_value_delimiter = true
+    )]
+    pub types: Option<Vec<String>>,
+
+    /// restrict sensors by name
+    #[clap(
+        long,
+        short,
+        value_name = "sensor name",
+        use_value_delimiter = true
+    )]
+    pub named: Option<Vec<String>>,
+
+    /// simulate using specified file as input
+    #[clap(long, short)]
+    pub input: Option<String>,
+
+    /// start time, if using an input file
+    #[clap(long, value_name = "time", requires = "input")]
+    pub start: Option<u64>,
+
+    /// end time, if using an input file
+    #[clap(long, value_name = "time", requires = "input")]
+    pub end: Option<u64>,
+
+    /// duration, if using an input file
+    #[clap(
+        long,
+        value_name = "seconds",
+        requires = "input",
+        conflicts_with = "end"
+    )]
+    pub duration: Option<u64>,
+}
+
+impl SensorsArgs {
+    fn matches_sp(&self, sp: &SpIdentifier) -> bool {
+        match sp.type_ {
+            SpType::Sled => {
+                let matched = if !self.sled.is_empty() {
+                    self.sled.contains(&sp.slot)
+                } else {
+                    true
+                };
+
+                matched != self.exclude
+            }
+            SpType::Switch => self.switches,
+            SpType::Power => self.psc,
+        }
+    }
+}
+
+#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+pub(crate) struct Sensor {
+    pub name: String,
+    pub kind: MeasurementKind,
+}
+
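+// Formatting examples (derived from format() below): temperature renders as
+// "52.31°C", voltage as "12.20V", current as "0.75A", and fan speed as
+// "5400 RPM"; with --parseable, the bare numeric value is printed instead.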
+impl Sensor {
+    fn units(&self) -> &str {
+        match self.kind {
+            MeasurementKind::Temperature => "°C",
+            MeasurementKind::Current | MeasurementKind::InputCurrent => "A",
+            MeasurementKind::Voltage | MeasurementKind::InputVoltage => "V",
+            MeasurementKind::Speed => "RPM",
+            MeasurementKind::Power => "W",
+        }
+    }
+
+    fn format(&self, value: f32, parseable: bool) -> String {
+        if parseable {
+            format!("{value}")
+        } else {
+            match self.kind {
+                MeasurementKind::Speed => {
+                    //
+                    // This space is deliberate: other units (°C, V, A) look
+                    // more natural when directly attached to their value --
+                    // but RPM looks decidedly unnatural without a space.
+                    //
+                    format!("{value:0} RPM")
+                }
+                _ => {
+                    format!("{value:.2}{}", self.units())
+                }
+            }
+        }
+    }
+
+    fn to_kind_string(&self) -> &str {
+        match self.kind {
+            MeasurementKind::Temperature => "temp",
+            MeasurementKind::Power => "power",
+            MeasurementKind::Current => "current",
+            MeasurementKind::Voltage => "voltage",
+            MeasurementKind::InputCurrent => "input-current",
+            MeasurementKind::InputVoltage => "input-voltage",
+            MeasurementKind::Speed => "speed",
+        }
+    }
+
+    fn from_string(name: &str, kind: &str) -> Option<Self> {
+        let k = match kind {
+            "temp" | "temperature" => Some(MeasurementKind::Temperature),
+            "power" => Some(MeasurementKind::Power),
+            "current" => Some(MeasurementKind::Current),
+            "voltage" => Some(MeasurementKind::Voltage),
+            "input-current" => Some(MeasurementKind::InputCurrent),
+            "input-voltage" => Some(MeasurementKind::InputVoltage),
+            "speed" => Some(MeasurementKind::Speed),
+            _ => None,
+        };
+
+        k.map(|kind| Sensor { name: name.to_string(), kind })
+    }
+}
+
+pub(crate) enum SensorInput {
+    MgsClient(gateway_client::Client),
+    CsvReader(csv::Reader<File>, csv::Position),
+}
+
+#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+pub struct SensorId(u32);
+
+#[derive(Debug)]
+pub(crate) struct SensorMetadata {
+    pub sensors_by_sensor: MultiMap<Sensor, SensorId>,
+    pub sensors_by_sensor_and_sp:
+        HashMap<Sensor, HashMap<SpIdentifier, SensorId>>,
+    pub sensors_by_id:
+        HashMap<SensorId, (SpIdentifier, Sensor, DeviceIdentifier)>,
+    pub sensors_by_sp: MultiMap<SpIdentifier, SensorId>,
+    pub work_by_sp:
+        HashMap<SpIdentifier, Vec<(DeviceIdentifier, Vec<SensorId>)>>,
+    #[allow(dead_code)]
+    pub start_time: Option<u64>,
+    pub end_time: Option<u64>,
+}
+
+struct SensorSpInfo {
+    info: Vec<(SpIdentifier, SpInfo)>,
+    time: u64,
+    latencies: Option<HashMap<SpIdentifier, Duration>>,
+}
+
+pub(crate) struct SensorValues {
+    pub values: HashMap<SensorId, Option<f32>>,
+    pub latencies: Option<HashMap<SpIdentifier, Duration>>,
+    pub time: u64,
+}
+
+///
+/// We identify a device as either a physical device (i.e., when connecting
+/// to MGS), or as a field in the CSV header (i.e., when processing data
+/// postmortem).  It's handy to have this as an enum to allow most of the
+/// code to be agnostic to the underlying source, but callers of ['device']
+/// and ['field'] are expected to know which of these they're dealing with.
+///
+#[derive(Clone, Debug, Hash, PartialEq, Eq)]
+pub(crate) enum DeviceIdentifier {
+    Field(usize),
+    Device(String),
+}
+
+impl DeviceIdentifier {
+    fn device(&self) -> &String {
+        match self {
+            Self::Device(ref device) => device,
+            _ => panic!(),
+        }
+    }
+
+    fn field(&self) -> usize {
+        match self {
+            Self::Field(field) => *field,
+            _ => panic!(),
+        }
+    }
+}
+
+struct SpInfo {
+    devices: MultiMap<DeviceIdentifier, (Sensor, Option<f32>)>,
+    timestamps: Vec<std::time::Instant>,
+}
+
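+// Fetching one SP's sensors takes two phases against MGS: a single
+// sp_component_list() call, then one sp_component_get() per component.
+// The three Instants pushed into `timestamps` bracket those two phases so
+// sp_info_mgs() can report the per-phase latencies under --verbose.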
+    //
+    let components = mgs_client.sp_component_list(type_, slot).await?;
+    timestamps.push(std::time::Instant::now());
+
+    //
+    // Now, for every component, we're going to get its details: for those
+    // that are sensors (and contain measurements), we will store the name
+    // of the sensor as well as the retrieved value.
+    //
+    for c in &components.components {
+        for s in mgs_client
+            .sp_component_get(type_, slot, &c.component)
+            .await?
+            .iter()
+            .filter_map(|detail| match detail {
+                SpComponentDetails::Measurement { kind, name, value } => Some(
+                    (Sensor { name: name.clone(), kind: *kind }, Some(*value)),
+                ),
+                SpComponentDetails::MeasurementError { kind, name, error } => {
+                    match error {
+                        MeasurementErrorCode::NoReading
+                        | MeasurementErrorCode::NotPresent => None,
+                        _ => Some((
+                            Sensor { name: name.clone(), kind: *kind },
+                            None,
+                        )),
+                    }
+                }
+                _ => None,
+            })
+        {
+            devices.insert(DeviceIdentifier::Device(c.component.clone()), s);
+        }
+    }
+
+    timestamps.push(std::time::Instant::now());
+
+    Ok(SpInfo { devices, timestamps })
+}
+
+async fn sp_info_mgs(
+    mgs_client: &gateway_client::Client,
+    args: &SensorsArgs,
+) -> Result<SensorSpInfo, anyhow::Error> {
+    let mut rval = vec![];
+    let mut latencies = HashMap::new();
+
+    //
+    // First, get all of the SPs that we can see via Ignition
+    //
+    let all_sp_list =
+        mgs_client.ignition_list().await.context("listing ignition")?;
+
+    let mut sp_list = all_sp_list
+        .iter()
+        .filter_map(|ignition| {
+            if matches!(ignition.details, SpIgnition::Yes { .. })
+                && ignition.id.type_ == SpType::Sled
+            {
+                if args.matches_sp(&ignition.id) {
+                    return Some(ignition.id);
+                }
+            }
+            None
+        })
+        .collect::<Vec<_>>();
+
+    if args.switches {
+        sp_list.push(SpIdentifier { type_: SpType::Switch, slot: 0 });
+        sp_list.push(SpIdentifier { type_: SpType::Switch, slot: 1 });
+    }
+
+    if args.psc {
+        sp_list.push(SpIdentifier { type_: SpType::Power, slot: 0 });
+    }
+
+    sp_list.sort();
+
+    let now = std::time::Instant::now();
+
+    let mut handles = vec![];
+    for sp_id in sp_list {
+        let handle =
+            tokio::spawn(sp_info(mgs_client.clone(), sp_id.type_, sp_id.slot));
+
+        handles.push((sp_id, handle));
+    }
+
+    for (sp_id, handle) in handles {
+        match handle.await.unwrap() {
+            Ok(info) => {
+                let l0 = info.timestamps[1].duration_since(info.timestamps[0]);
+                let l1 = info.timestamps[2].duration_since(info.timestamps[1]);
+
+                if args.verbose {
+                    eprintln!(
+                        "mgs: latencies for {sp_id:?}: {l1:.1?} {l0:.1?}",
+                    );
+                }
+
+                latencies.insert(sp_id, l0 + l1);
+                rval.push((sp_id, info));
+            }
+
+            Err(err) => {
+                eprintln!("failed to read devices for {:?}: {:?}", sp_id, err);
+            }
+        }
+    }
+
+    if args.verbose {
+        eprintln!("total discovery time {:?}", now.elapsed());
+    }
+
+    Ok(SensorSpInfo {
+        info: rval,
+        time: SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs(),
+        latencies: Some(latencies),
+    })
+}
+
+fn sp_info_csv(
+    reader: &mut csv::Reader<File>,
+    position: &mut csv::Position,
+    args: &SensorsArgs,
+) -> Result<SensorSpInfo, anyhow::Error> {
+    let mut sps = vec![];
+    let headers = reader.headers()?;
+
+    let expected = ["TIME", "SENSOR", "KIND"];
+    let len = expected.len();
+    let hlen = headers.len();
+
+    if hlen < len {
+        bail!("expected at least {len} fields (found {headers:?})");
+    }
+
+    for ndx in 0..len {
+        if &headers[ndx] != expected[ndx] {
+            bail!(
+                "malformed headers: expected {}, found {} ({headers:?})",
+                &expected[ndx],
+                &headers[ndx]
+            );
+        }
+    }
+
+    for ndx in len..hlen {
+        let field = &headers[ndx];
+        let parts: Vec<&str> = field.splitn(2, '-').collect();
+
+        if parts.len() != 2 {
+            bail!("malformed field
\"{field}\""); + } + + let type_ = match parts[0] { + "SLED" => SpType::Sled, + "SWITCH" => SpType::Switch, + "POWER" => SpType::Power, + _ => { + bail!("unknown type {}", parts[0]); + } + }; + + let slot = parts[1].parse::().or_else(|_| { + bail!("invalid slot in \"{field}\""); + })?; + + let sp = SpIdentifier { type_, slot }; + + if args.matches_sp(&sp) { + sps.push(Some(sp)); + } else { + sps.push(None); + } + } + + let mut iter = reader.records(); + let mut sensors = HashSet::new(); + let mut by_sp = MultiMap::new(); + let mut time = None; + + loop { + *position = iter.reader().position().clone(); + + if let Some(record) = iter.next() { + let record = record?; + + if record.len() != hlen { + bail!("bad record length at line {}", position.line()); + } + + if time.is_none() { + let t = record[0].parse::().or_else(|_| { + bail!("bad time at line {}", position.line()); + })?; + + if let Some(start) = args.start { + if t < start { + continue; + } + } + + if let Some(end) = args.end { + if let Some(start) = args.start { + if start > end { + bail!( + "specified start time is later than end time" + ); + } + } + + if t > end { + bail!( + "specified end time ({end}) is earlier \ + than time of earliest record ({t})" + ); + } + } + + time = Some(t); + } + + if let Some(sensor) = Sensor::from_string(&record[1], &record[2]) { + if sensors.get(&sensor).is_some() { + break; + } + + sensors.insert(sensor.clone()); + + for (ndx, sp) in sps.iter().enumerate() { + if let Some(sp) = sp { + let value = match record[ndx + len].parse::() { + Ok(value) => Some(value), + _ => { + // + // We want to distinguish between the device + // having an error ("X") and it being absent + // ("-"); if it's absent, we don't want to add + // it at all. + // + match &record[ndx + len] { + "X" => {} + "-" => continue, + _ => { + bail!( + "line {}: unrecognized value \ + \"{}\" in field {}", + position.line(), + record[ndx + len].to_string(), + ndx + len + ); + } + } + + None + } + }; + + by_sp.insert(sp, (sensor.clone(), value)); + } + } + } + } else { + break; + } + } + + if time.is_none() { + bail!("no data found"); + } + + let mut rval = vec![]; + + for (field, sp) in sps.iter().enumerate() { + let mut devices = MultiMap::new(); + + if let Some(sp) = sp { + if let Some(v) = by_sp.remove(sp) { + devices.insert_many(DeviceIdentifier::Field(field + len), v); + } + + rval.push((*sp, SpInfo { devices, timestamps: vec![] })); + } + } + + Ok(SensorSpInfo { info: rval, time: time.unwrap(), latencies: None }) +} + +pub(crate) async fn sensor_metadata( + input: &mut SensorInput, + args: &SensorsArgs, +) -> Result<(Arc, SensorValues), anyhow::Error> { + let by_kind = if let Some(types) = &args.types { + let mut h = HashSet::new(); + + for t in types { + h.insert(match Sensor::from_string("", t) { + None => bail!("invalid sensor kind {t}"), + Some(s) => s.kind, + }); + } + + Some(h) + } else { + None + }; + + let by_name = args + .named + .as_ref() + .map(|named| named.into_iter().collect::>()); + + let info = match input { + SensorInput::MgsClient(ref mgs_client) => { + sp_info_mgs(mgs_client, args).await? + } + SensorInput::CsvReader(reader, position) => { + sp_info_csv(reader, position, args)? 
+ } + }; + + let mut sensors_by_sensor = MultiMap::new(); + let mut sensors_by_sensor_and_sp = HashMap::new(); + let mut sensors_by_id = HashMap::new(); + let mut sensors_by_sp = MultiMap::new(); + let mut values = HashMap::new(); + let mut work_by_sp = HashMap::new(); + + let mut current = 0; + let time = info.time; + + for (sp_id, info) in info.info { + let mut sp_work = vec![]; + + for (device, sensors) in info.devices { + let mut device_work = vec![]; + + for (sensor, value) in sensors { + if let Some(ref by_kind) = by_kind { + if by_kind.get(&sensor.kind).is_none() { + continue; + } + } + + if let Some(ref by_name) = by_name { + if by_name.get(&sensor.name).is_none() { + continue; + } + } + + let id = SensorId(current); + current += 1; + + sensors_by_id + .insert(id, (sp_id, sensor.clone(), device.clone())); + + if value.is_none() && args.verbose { + eprintln!( + "mgs: error for {sp_id:?} on {sensor:?} ({device:?})" + ); + } + + sensors_by_sensor.insert(sensor.clone(), id); + + let by_sp = sensors_by_sensor_and_sp + .entry(sensor) + .or_insert_with(|| HashMap::new()); + by_sp.insert(sp_id, id); + sensors_by_sp.insert(sp_id, id); + values.insert(id, value); + + device_work.push(id); + } + + sp_work.push((device, device_work)); + } + + work_by_sp.insert(sp_id, sp_work); + } + + Ok(( + Arc::new(SensorMetadata { + sensors_by_sensor, + sensors_by_sensor_and_sp, + sensors_by_id, + sensors_by_sp, + work_by_sp, + start_time: args.start, + end_time: match args.end { + Some(end) => Some(end), + None => args.duration.map(|duration| time + duration), + }, + }), + SensorValues { values, time, latencies: info.latencies }, + )) +} + +async fn sp_read_sensors( + mgs_client: &gateway_client::Client, + id: &SpIdentifier, + metadata: &SensorMetadata, +) -> Result<(Vec<(SensorId, Option)>, Duration), anyhow::Error> { + let work = metadata.work_by_sp.get(id).unwrap(); + let mut rval = vec![]; + + let start = std::time::Instant::now(); + + for (component, ids) in work.iter() { + for (value, id) in mgs_client + .sp_component_get(id.type_, id.slot, component.device()) + .await? + .iter() + .filter_map(|detail| match detail { + SpComponentDetails::Measurement { kind: _, name: _, value } => { + Some(Some(*value)) + } + SpComponentDetails::MeasurementError { error, .. 
} => { + match error { + MeasurementErrorCode::NoReading + | MeasurementErrorCode::NotPresent => None, + _ => Some(None), + } + } + _ => None, + }) + .zip(ids.iter()) + { + rval.push((*id, value)); + } + } + + Ok((rval, start.elapsed())) +} + +async fn sp_data_mgs( + mgs_client: &gateway_client::Client, + metadata: &Arc, +) -> Result { + let mut values = HashMap::new(); + let mut latencies = HashMap::new(); + let mut handles = vec![]; + + for sp_id in metadata.sensors_by_sp.keys() { + let mgs_client = mgs_client.clone(); + let id = *sp_id; + let metadata = Arc::clone(&metadata); + + let handle = tokio::spawn(async move { + sp_read_sensors(&mgs_client, &id, &metadata).await + }); + + handles.push((id, handle)); + } + + for (id, handle) in handles { + let (rval, latency) = handle.await.unwrap()?; + + latencies.insert(id, latency); + + for (id, value) in rval { + values.insert(id, value); + } + } + + Ok(SensorValues { + values, + latencies: Some(latencies), + time: SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs(), + }) +} + +fn sp_data_csv( + reader: &mut csv::Reader, + position: &mut csv::Position, + metadata: &SensorMetadata, +) -> Result { + let headers = reader.headers()?; + let hlen = headers.len(); + let mut values = HashMap::new(); + + reader.seek(position.clone())?; + let mut iter = reader.records(); + + let mut time = None; + + loop { + *position = iter.reader().position().clone(); + + if let Some(record) = iter.next() { + let record = record?; + + if record.len() != hlen { + bail!("bad record length at line {}", position.line()); + } + + let now = record[0].parse::().or_else(|_| { + bail!("bad time at line {}", position.line()); + })?; + + if let Some(time) = time { + if now != time { + break; + } + } else { + if let Some(end) = metadata.end_time { + if now > end { + time = Some(0); + break; + } + } + + time = Some(now); + } + + if let Some(sensor) = Sensor::from_string(&record[1], &record[2]) { + if let Some(ids) = metadata.sensors_by_sensor.get_vec(&sensor) { + for id in ids { + let (_, _, d) = metadata.sensors_by_id.get(id).unwrap(); + let value = match record[d.field()].parse::() { + Ok(value) => Some(value), + _ => None, + }; + + values.insert(*id, value); + } + } + } else { + bail!("bad sensor at line {}", position.line()); + } + } else { + time = Some(0); + break; + } + } + + Ok(SensorValues { values, latencies: None, time: time.unwrap() }) +} + +pub(crate) async fn sensor_data( + input: &mut SensorInput, + metadata: &Arc, +) -> Result { + match input { + SensorInput::MgsClient(ref mgs_client) => { + sp_data_mgs(mgs_client, metadata).await + } + SensorInput::CsvReader(reader, position) => { + sp_data_csv(reader, position, &metadata) + } + } +} + +/// +/// Runs `omdb mgs sensors` +/// +pub(crate) async fn cmd_mgs_sensors( + omdb: &crate::Omdb, + log: &slog::Logger, + mgs_args: &crate::mgs::MgsArgs, + args: &SensorsArgs, +) -> Result<(), anyhow::Error> { + let mut input = if let Some(ref input) = args.input { + let file = File::open(input) + .with_context(|| format!("failed to open {input}"))?; + SensorInput::CsvReader( + csv::Reader::from_reader(file), + csv::Position::new(), + ) + } else { + SensorInput::MgsClient(mgs_args.mgs_client(omdb, log).await?) 
+ }; + + let (metadata, mut values) = sensor_metadata(&mut input, args).await?; + + let mut sensors = metadata.sensors_by_sensor.keys().collect::>(); + sensors.sort(); + + let mut sps = metadata.sensors_by_sp.keys().collect::>(); + sps.sort(); + + let print_value = |v| { + if args.parseable { + print!(",{v}"); + } else { + print!(" {v:>8}"); + } + }; + + let print_header = || { + if !args.parseable { + print!("{:20} ", "NAME"); + } else { + print!("TIME,SENSOR,KIND"); + } + + for sp in &sps { + print_value(format!( + "{}-{}", + crate::mgs::sp_type_to_str(&sp.type_).to_uppercase(), + sp.slot + )); + } + + println!(); + }; + + let print_name = |sensor: &Sensor, now: u64| { + if !args.parseable { + print!("{:20} ", sensor.name); + } else { + print!("{now},{},{}", sensor.name, sensor.to_kind_string()); + } + }; + + let print_latency = |now: u64| { + if !args.parseable { + print!("{:20} ", "LATENCY"); + } else { + print!("{now},{},{}", "LATENCY", "latency"); + } + }; + + let mut wakeup = + tokio::time::Instant::now() + tokio::time::Duration::from_millis(1000); + + print_header(); + + loop { + for sensor in &sensors { + print_name(sensor, values.time); + + let by_sp = metadata.sensors_by_sensor_and_sp.get(sensor).unwrap(); + + for sp in &sps { + print_value(if let Some(id) = by_sp.get(sp) { + if let Some(value) = values.values.get(id) { + match value { + Some(value) => { + sensor.format(*value, args.parseable) + } + None => "X".to_string(), + } + } else { + "?".to_string() + } + } else { + "-".to_string() + }); + } + + println!(); + } + + if args.show_latencies { + if let Some(latencies) = values.latencies { + print_latency(values.time); + + for sp in &sps { + print_value(if let Some(latency) = latencies.get(sp) { + format!("{}ms", latency.as_millis()) + } else { + "?".to_string() + }); + } + } + + println!(); + } + + if !args.sleep { + if args.input.is_none() { + break; + } + } else { + tokio::time::sleep_until(wakeup).await; + wakeup += tokio::time::Duration::from_millis(1000); + } + + values = sensor_data(&mut input, &metadata).await?; + + if args.input.is_some() && values.time == 0 { + break; + } + + if !args.parseable { + print_header(); + } + } + + Ok(()) +} diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 2790b0ef83..7688372984 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -270,7 +270,9 @@ Debug a specific Management Gateway Service instance Usage: omdb mgs [OPTIONS] Commands: + dashboard Dashboard of SPs inventory Show information about devices and components visible to MGS + sensors Show information about sensors, as gleaned by MGS help Print this message or the help of the given subcommand(s) Options: From 7e0ce9905ecf46e815f370b4cb5b1dfc05a36096 Mon Sep 17 00:00:00 2001 From: David Crespo Date: Tue, 6 Feb 2024 11:14:20 -0600 Subject: [PATCH 06/27] Tweak a few API endpoint summaries and descriptions (#4999) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR only changes descriptive text in the OpenAPI definition, so it doesn't really have to go into the RC, but it is also low risk to pull in. --- Saw these on the [integration branch docs](https://docs-git-integration-oxidecomputer.vercel.app/api/ip_pool_silo_update) and felt they could use some work. 
![Screenshot 2024-02-06 at 10 06 38 AM](https://github.com/oxidecomputer/omicron/assets/3612203/c2dbdf3d-da69-43f6-9568-85476af2d48e) Also got rid of a few stray periods: ![Screenshot 2024-02-06 at 10 06 50 AM](https://github.com/oxidecomputer/omicron/assets/3612203/42a70202-5cb2-49b2-9da5-5a57b49a7b99) --- nexus/src/external_api/http_entrypoints.rs | 39 +++++++++++++--------- openapi/nexus.json | 28 ++++++++-------- 2 files changed, 39 insertions(+), 28 deletions(-) diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index ccd8cebad6..28755e5959 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -759,7 +759,11 @@ async fn silo_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List IP pools available within silo +/// List IP pools linked to silo +/// +/// Linked IP pools are available to users in the specified silo. A silo can +/// have at most one default pool. IPs are allocated from the default pool when +/// users ask for one without specifying a pool. #[endpoint { method = GET, path = "/v1/system/silos/{silo}/ip-pools", @@ -803,7 +807,7 @@ async fn silo_ip_pool_list( /// Delete a silo /// -/// Delete a silo by name. +/// Delete a silo by name or ID. #[endpoint { method = DELETE, path = "/v1/system/silos/{silo}", @@ -1569,7 +1573,11 @@ async fn ip_pool_silo_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Make an IP pool available within a silo +/// Link an IP pool to a silo +/// +/// Users in linked silos can allocate external IPs from this pool for their +/// instances. A silo can have at most one default pool. IPs are allocated from +/// the default pool when users ask for one without specifying a pool. #[endpoint { method = POST, path = "/v1/system/ip-pools/{pool}/silos", @@ -1620,10 +1628,12 @@ async fn ip_pool_silo_unlink( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Make an IP pool default or not-default for a silo +/// Make IP pool default for silo /// -/// When a pool is made default for a silo, any existing default will remain -/// linked to the silo, but will no longer be the default. +/// When a user asks for an IP (e.g., at instance create time) without +/// specifying a pool, the IP comes from the default pool if a default is +/// configured. When a pool is made the default for a silo, any existing default +/// will remain linked to the silo, but will no longer be the default. #[endpoint { method = PUT, path = "/v1/system/ip-pools/{pool}/silos/{silo}", @@ -1650,7 +1660,7 @@ async fn ip_pool_silo_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch the IP pool used for Oxide services +/// Fetch the Oxide service IP pool #[endpoint { method = GET, path = "/v1/system/ip-pools-service", @@ -1765,10 +1775,9 @@ async fn ip_pool_range_remove( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List ranges for the IP pool used for Oxide services +/// List IP ranges for the Oxide service pool /// -/// List ranges for the IP pool used for Oxide services. Ranges are ordered by -/// their first address. +/// Ranges are ordered by their first address. 
#[endpoint { method = GET, path = "/v1/system/ip-pools-service/ranges", @@ -1809,7 +1818,7 @@ async fn ip_pool_service_range_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Add a range to an IP pool used for Oxide services +/// Add IP range to Oxide service pool #[endpoint { method = POST, path = "/v1/system/ip-pools-service/ranges/add", @@ -1830,7 +1839,7 @@ async fn ip_pool_service_range_add( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Remove a range from an IP pool used for Oxide services +/// Remove IP range from Oxide service pool #[endpoint { method = POST, path = "/v1/system/ip-pools-service/ranges/remove", @@ -3539,7 +3548,7 @@ async fn networking_bgp_announce_set_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Enable a BFD session. +/// Enable a BFD session #[endpoint { method = POST, path = "/v1/system/networking/bfd-enable", @@ -3560,7 +3569,7 @@ async fn networking_bfd_enable( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Disable a BFD session. +/// Disable a BFD session #[endpoint { method = POST, path = "/v1/system/networking/bfd-disable", @@ -3581,7 +3590,7 @@ async fn networking_bfd_disable( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Get BFD status. +/// Get BFD status #[endpoint { method = GET, path = "/v1/system/networking/bfd-status", diff --git a/openapi/nexus.json b/openapi/nexus.json index 7aedd1b523..8baf1a6316 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -5331,7 +5331,8 @@ "tags": [ "system/networking" ], - "summary": "Make an IP pool available within a silo", + "summary": "Link an IP pool to a silo", + "description": "Users in linked silos can allocate external IPs from this pool for their instances. A silo can have at most one default pool. IPs are allocated from the default pool when users ask for one without specifying a pool.", "operationId": "ip_pool_silo_link", "parameters": [ { @@ -5379,8 +5380,8 @@ "tags": [ "system/networking" ], - "summary": "Make an IP pool default or not-default for a silo", - "description": "When a pool is made default for a silo, any existing default will remain linked to the silo, but will no longer be the default.", + "summary": "Make IP pool default for silo", + "description": "When a user asks for an IP (e.g., at instance create time) without specifying a pool, the IP comes from the default pool if a default is configured. When a pool is made the default for a silo, any existing default will remain linked to the silo, but will no longer be the default.", "operationId": "ip_pool_silo_update", "parameters": [ { @@ -5472,7 +5473,7 @@ "tags": [ "system/networking" ], - "summary": "Fetch the IP pool used for Oxide services", + "summary": "Fetch the Oxide service IP pool", "operationId": "ip_pool_service_view", "responses": { "200": { @@ -5499,8 +5500,8 @@ "tags": [ "system/networking" ], - "summary": "List ranges for the IP pool used for Oxide services", - "description": "List ranges for the IP pool used for Oxide services. 
Ranges are ordered by their first address.", + "summary": "List IP ranges for the Oxide service pool", + "description": "Ranges are ordered by their first address.", "operationId": "ip_pool_service_range_list", "parameters": [ { @@ -5552,7 +5553,7 @@ "tags": [ "system/networking" ], - "summary": "Add a range to an IP pool used for Oxide services", + "summary": "Add IP range to Oxide service pool", "operationId": "ip_pool_service_range_add", "requestBody": { "content": { @@ -5589,7 +5590,7 @@ "tags": [ "system/networking" ], - "summary": "Remove a range from an IP pool used for Oxide services", + "summary": "Remove IP range from Oxide service pool", "operationId": "ip_pool_service_range_remove", "requestBody": { "content": { @@ -5909,7 +5910,7 @@ "tags": [ "system/networking" ], - "summary": "Disable a BFD session.", + "summary": "Disable a BFD session", "operationId": "networking_bfd_disable", "requestBody": { "content": { @@ -5939,7 +5940,7 @@ "tags": [ "system/networking" ], - "summary": "Enable a BFD session.", + "summary": "Enable a BFD session", "operationId": "networking_bfd_enable", "requestBody": { "content": { @@ -5969,7 +5970,7 @@ "tags": [ "system/networking" ], - "summary": "Get BFD status.", + "summary": "Get BFD status", "operationId": "networking_bfd_status", "responses": { "200": { @@ -6980,7 +6981,7 @@ "system/silos" ], "summary": "Delete a silo", - "description": "Delete a silo by name.", + "description": "Delete a silo by name or ID.", "operationId": "silo_delete", "parameters": [ { @@ -7011,7 +7012,8 @@ "tags": [ "system/silos" ], - "summary": "List IP pools available within silo", + "summary": "List IP pools linked to silo", + "description": "Linked IP pools are available to users in the specified silo. A silo can have at most one default pool. IPs are allocated from the default pool when users ask for one without specifying a pool.", "operationId": "silo_ip_pool_list", "parameters": [ { From 04d3b76a1f36d8a5386a5dfc3e169c3717089df0 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 6 Feb 2024 10:44:10 -0800 Subject: [PATCH 07/27] Unbreak the build (ratatui x omdb) (#5001) I suspect we had a race condition in our merge-queue-less world, where ratatui was updated to `0.26.0`, and needed slightly different parameters. This PR fixes that compatibility error. --- dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs b/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs index 20d651bfdf..153618b7c0 100644 --- a/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs +++ b/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs @@ -960,7 +960,7 @@ fn draw_graph(f: &mut Frame, parent: Rect, graph: &mut Graph, now: u64) { datasets.push( Dataset::default() - .name(&s.name) + .name(&*s.name) .marker(symbols::Marker::Braille) .style(Style::default().fg(s.color)) .data(&s.data), From 0df118ae8a93abf681c27d018abd4397acc54d15 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 6 Feb 2024 15:45:45 -0500 Subject: [PATCH 08/27] [sled-agent] Ignore zone order when checking for changes (#5006) This addresses one of two problems mentioned in #4990. sled-agent's `PUT /omicron-zones` endpoint is expected to be idempotent, and it checks that if it receives a zone config with the same generation as it currently has, that the contents are identical. 
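In sketch form, the fix described below amounts to an order-insensitive list comparison. Here is a minimal, self-contained Rust illustration of that idea; the `Zone` type and its fields are illustrative stand-ins, not the actual sled-agent types:

```rust
// A minimal sketch of an order-insensitive zone-list comparison.
// `Zone` is a hypothetical stand-in for the real zone config type.
#[derive(Debug, PartialEq, Eq)]
struct Zone {
    id: u32,      // stand-in for the zone's unique ID
    kind: String, // stand-in for the rest of the zone config
}

// Compare two zone lists while ignoring order, assuming zone IDs are unique.
fn zones_equal_ignoring_order(mut a: Vec<Zone>, mut b: Vec<Zone>) -> bool {
    a.sort_by_key(|z| z.id);
    b.sort_by_key(|z| z.id);
    a == b
}

fn main() {
    let ledger = vec![
        Zone { id: 2, kind: "crucible".into() },
        Zone { id: 1, kind: "ntp".into() },
    ];
    let request = vec![
        Zone { id: 1, kind: "ntp".into() },
        Zone { id: 2, kind: "crucible".into() },
    ];
    // A plain `ledger == request` would report a conflict here purely
    // because of ordering; the sorted comparison does not.
    assert_ne!(ledger, request);
    assert!(zones_equal_ignoring_order(ledger, request));
}
```

Sorting both lists by a unique key before comparing is what makes repeated `PUT`s of the same config idempotent regardless of list order.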
This check was just a straight equality check between two `Vec`s, because we
have been careful on the Nexus side to always sort the zone list consistently,
but on #4990 we see an error caused solely by the ordering of these `Vec`s.
When sled-agent receives a config with a new generation number, sled-agent
itself may reorder the zone list between receiving it from Nexus and storing
it in its ledger. This will cause the first `PUT` of the new generation to
succeed, but any subsequent `PUT`s with identical contents to fail the
equality check.

Instead of trying to ensure that sled-agent sorts the zone list in its ledger
consistently with how Nexus sorts them, this PR changes the comparison to
explicitly sort both the incoming list and the list loaded from the ledger
prior to comparing them.
---
 sled-agent/src/services.rs | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs
index 77b6bcbed4..d2f440024c 100644
--- a/sled-agent/src/services.rs
+++ b/sled-agent/src/services.rs
@@ -2799,7 +2799,7 @@ impl ServiceManager {
     /// boot.
     pub async fn ensure_all_omicron_zones_persistent(
         &self,
-        request: OmicronZonesConfig,
+        mut request: OmicronZonesConfig,
     ) -> Result<(), Error> {
         let log = &self.inner.log;
 
@@ -2838,11 +2838,26 @@ impl ServiceManager {
 
         // If the generation is the same as what we're running, but the contents
         // aren't, that's a problem, too.
-        if ledger_zone_config.omicron_generation == request.generation
-            && ledger_zone_config.clone().to_omicron_zones_config().zones
-                != request.zones
-        {
-            return Err(Error::RequestedConfigConflicts(request.generation));
+        if ledger_zone_config.omicron_generation == request.generation {
+            // Nexus should send us consistent zone orderings; however, we may
+            // reorder the zone list inside `ensure_all_omicron_zones`. To avoid
+            // equality checks failing only because the two lists are ordered
+            // differently, sort them both here before comparing.
+            let mut ledger_zones =
+                ledger_zone_config.clone().to_omicron_zones_config().zones;
+
+            // We sort by ID because we assume no two zones have the same ID. If
+            // that assumption is wrong, we may return an error here where the
+            // conflict is solely the list orders, but in such a case that's the
+            // least of our problems.
+            ledger_zones.sort_by_key(|z| z.id);
+            request.zones.sort_by_key(|z| z.id);
+
+            if ledger_zones != request.zones {
+                return Err(Error::RequestedConfigConflicts(
+                    request.generation,
+                ));
+            }
+        }
 
         let new_config = self

From 8f1e4191fb75592d130fb4b5ed7f9fbdba1dbd20 Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Tue, 6 Feb 2024 13:01:33 -0800
Subject: [PATCH 09/27] planner generated duplicate IPs (#5005)

---
 nexus/deployment/src/blueprint_builder.rs | 224 ++++++++++++++--------
 nexus/deployment/src/planner.rs           |   7 +
 nexus/types/src/deployment.rs             |  55 ++++--
 3 files changed, 188 insertions(+), 98 deletions(-)

diff --git a/nexus/deployment/src/blueprint_builder.rs b/nexus/deployment/src/blueprint_builder.rs
index ac2fe70e6b..904e768e1b 100644
--- a/nexus/deployment/src/blueprint_builder.rs
+++ b/nexus/deployment/src/blueprint_builder.rs
@@ -77,7 +77,7 @@ pub struct BlueprintBuilder<'a> {
 
     // These fields will become part of the final blueprint. See the
     // corresponding fields in `Blueprint`.
- omicron_zones: BTreeMap, + zones: BlueprintZones<'a>, zones_in_service: BTreeSet, creator: String, comments: Vec, @@ -151,7 +151,7 @@ impl<'a> BlueprintBuilder<'a> { parent_blueprint, policy, sled_ip_allocators: BTreeMap::new(), - omicron_zones: BTreeMap::new(), + zones: BlueprintZones::new(parent_blueprint), zones_in_service: parent_blueprint.zones_in_service.clone(), creator: creator.to_owned(), comments: Vec::new(), @@ -159,41 +159,10 @@ impl<'a> BlueprintBuilder<'a> { } /// Assemble a final [`Blueprint`] based on the contents of the builder - pub fn build(mut self) -> Blueprint { + pub fn build(self) -> Blueprint { // Collect the Omicron zones config for each in-service sled. - let omicron_zones = self - .policy - .sleds - .keys() - .map(|sled_id| { - // Start with self.omicron_zones, which contains entries for any - // sled whose zones config is changing in this blueprint. - let mut zones = self - .omicron_zones - .remove(sled_id) - // If it's not there, use the config from the parent - // blueprint. - .or_else(|| { - self.parent_blueprint - .omicron_zones - .get(sled_id) - .cloned() - }) - // If it's not there either, then this must be a new sled - // and we haven't added any zones to it yet. Use the - // standard initial config. - .unwrap_or_else(|| OmicronZonesConfig { - generation: Generation::new(), - zones: vec![], - }); - - // This is not strictly necessary. But for testing, it's - // helpful for things to be in sorted order. - zones.zones.sort_by_key(|zone| zone.id); - - (*sled_id, zones) - }) - .collect(); + let omicron_zones = + self.zones.into_omicron_zones(self.policy.sleds.keys().copied()); Blueprint { id: Uuid::new_v4(), omicron_zones, @@ -222,13 +191,9 @@ impl<'a> BlueprintBuilder<'a> { ) -> Result { // If there's already an NTP zone on this sled, do nothing. let has_ntp = self - .parent_blueprint - .omicron_zones - .get(&sled_id) - .map(|found_zones| { - found_zones.zones.iter().any(|z| z.zone_type.is_ntp()) - }) - .unwrap_or(false); + .zones + .current_sled_zones(sled_id) + .any(|z| z.zone_type.is_ntp()); if has_ntp { return Ok(Ensure::NotNeeded); } @@ -286,20 +251,14 @@ impl<'a> BlueprintBuilder<'a> { pool_name: ZpoolName, ) -> Result { // If this sled already has a Crucible zone on this pool, do nothing. - let has_crucible_on_this_pool = self - .parent_blueprint - .omicron_zones - .get(&sled_id) - .map(|found_zones| { - found_zones.zones.iter().any(|z| { - matches!( - &z.zone_type, - OmicronZoneType::Crucible { dataset, .. } - if dataset.pool_name == pool_name - ) - }) - }) - .unwrap_or(false); + let has_crucible_on_this_pool = + self.zones.current_sled_zones(sled_id).any(|z| { + matches!( + &z.zone_type, + OmicronZoneType::Crucible { dataset, .. } + if dataset.pool_name == pool_name + ) + }); if has_crucible_on_this_pool { return Ok(Ensure::NotNeeded); } @@ -344,27 +303,7 @@ impl<'a> BlueprintBuilder<'a> { ))); } - let sled_zones = - self.omicron_zones.entry(sled_id).or_insert_with(|| { - if let Some(old_sled_zones) = - self.parent_blueprint.omicron_zones.get(&sled_id) - { - OmicronZonesConfig { - generation: old_sled_zones.generation.next(), - zones: old_sled_zones.zones.clone(), - } - } else { - // The first generation is reserved to mean the one - // containing no zones. See - // OMICRON_ZONES_CONFIG_INITIAL_GENERATION. So we start - // with the next one. 
-                    OmicronZonesConfig {
-                        generation: Generation::new().next(),
-                        zones: vec![],
-                    }
-                }
-            });
-
+        let sled_zones = self.zones.change_sled_zones(sled_id);
         sled_zones.zones.push(zone);
         Ok(())
     }
@@ -398,10 +337,8 @@ impl<'a> BlueprintBuilder<'a> {
 
         // Record each of the sled's zones' underlay addresses as
        // allocated.
-        if let Some(sled_zones) = self.omicron_zones.get(&sled_id) {
-            for z in &sled_zones.zones {
-                allocator.reserve(z.underlay_address);
-            }
+        for z in self.zones.current_sled_zones(sled_id) {
+            allocator.reserve(z.underlay_address);
         }
 
         allocator
@@ -420,10 +357,109 @@ impl<'a> BlueprintBuilder<'a> {
     }
 }
 
+/// Helper for working with sets of zones on each sled
+///
+/// Tracking the set of zones is slightly non-trivial because we need to bump
+/// the per-sled generation number iff the zones are changed. So we need to
+/// keep track of whether we've changed the zones relative to the parent
+/// blueprint. We do this by keeping a copy of any `OmicronZonesConfig` that
+/// we've changed and a _reference_ to the parent blueprint's zones. This
+/// struct makes it easy for callers to iterate over the right set of zones.
+struct BlueprintZones<'a> {
+    changed_zones: BTreeMap<Uuid, OmicronZonesConfig>,
+    parent_zones: &'a BTreeMap<Uuid, OmicronZonesConfig>,
+}
+
+impl<'a> BlueprintZones<'a> {
+    pub fn new(parent_blueprint: &'a Blueprint) -> BlueprintZones {
+        BlueprintZones {
+            changed_zones: BTreeMap::new(),
+            parent_zones: &parent_blueprint.omicron_zones,
+        }
+    }
+
+    /// Returns a mutable reference to a sled's Omicron zones *because* we're
+    /// going to change them. It's essential that the caller _does_ change them
+    /// because we will have bumped the generation number and we don't want to
+    /// do that if no changes are being made.
+    pub fn change_sled_zones(
+        &mut self,
+        sled_id: Uuid,
+    ) -> &mut OmicronZonesConfig {
+        self.changed_zones.entry(sled_id).or_insert_with(|| {
+            if let Some(old_sled_zones) = self.parent_zones.get(&sled_id) {
+                OmicronZonesConfig {
+                    generation: old_sled_zones.generation.next(),
+                    zones: old_sled_zones.zones.clone(),
+                }
+            } else {
+                // The first generation is reserved to mean the one
+                // containing no zones. See
+                // OMICRON_ZONES_CONFIG_INITIAL_GENERATION. So we start
+                // with the next one.
+                OmicronZonesConfig {
+                    generation: Generation::new().next(),
+                    zones: vec![],
+                }
+            }
+        })
+    }
+
+    /// Iterates over the list of Omicron zones currently configured for this
+    /// sled in the blueprint that's being built
+    pub fn current_sled_zones(
+        &self,
+        sled_id: Uuid,
+    ) -> Box<dyn Iterator<Item = &OmicronZoneConfig> + '_> {
+        if let Some(sled_zones) = self
+            .changed_zones
+            .get(&sled_id)
+            .or_else(|| self.parent_zones.get(&sled_id))
+        {
+            Box::new(sled_zones.zones.iter())
+        } else {
+            Box::new(std::iter::empty())
+        }
+    }
+
+    /// Produces an owned map of zones for the requested sleds
+    pub fn into_omicron_zones(
+        mut self,
+        sled_ids: impl Iterator<Item = Uuid>,
+    ) -> BTreeMap<Uuid, OmicronZonesConfig> {
+        sled_ids
+            .map(|sled_id| {
+                // Start with self.changed_zones, which contains entries for any
+                // sled whose zones config is changing in this blueprint.
+                let mut zones = self
+                    .changed_zones
+                    .remove(&sled_id)
+                    // If it's not there, use the config from the parent
+                    // blueprint.
+                    .or_else(|| self.parent_zones.get(&sled_id).cloned())
+                    // If it's not there either, then this must be a new sled
+                    // and we haven't added any zones to it yet. Use the
+                    // standard initial config.
+                    .unwrap_or_else(|| OmicronZonesConfig {
+                        generation: Generation::new(),
+                        zones: vec![],
+                    });
+
+                // This is not strictly necessary.
But for testing, it's + // helpful for things to be in sorted order. + zones.zones.sort_by_key(|zone| zone.id); + + (sled_id, zones) + }) + .collect() + } +} + #[cfg(test)] pub mod test { use super::BlueprintBuilder; use ipnet::IpAdd; + use nexus_types::deployment::Blueprint; use nexus_types::deployment::Policy; use nexus_types::deployment::SledResources; use nexus_types::deployment::ZpoolName; @@ -544,6 +580,25 @@ pub mod test { sled_ip } + /// Checks various conditions that should be true for all blueprints + pub fn verify_blueprint(blueprint: &Blueprint) { + let mut underlay_ips: BTreeMap = + BTreeMap::new(); + for sled_zones in blueprint.omicron_zones.values() { + for zone in &sled_zones.zones { + if let Some(previous) = + underlay_ips.insert(zone.underlay_address, zone) + { + panic!( + "found duplicate underlay IP {} in zones {} and \ + {}\n\nblueprint: {:#?}", + zone.underlay_address, zone.id, previous.id, blueprint + ); + } + } + } + } + #[test] fn test_initial() { // Test creating a blueprint from a collection and verifying that it @@ -556,6 +611,7 @@ pub mod test { "the_test", ) .expect("failed to create initial blueprint"); + verify_blueprint(&blueprint_initial); // Since collections don't include what was in service, we have to // provide that ourselves. For our purposes though we don't care. @@ -577,6 +633,7 @@ pub mod test { "test_basic", ); let blueprint = builder.build(); + verify_blueprint(&blueprint); let diff = blueprint_initial.diff(&blueprint); println!( "initial blueprint -> next blueprint (expected no changes):\n{}", @@ -596,6 +653,7 @@ pub mod test { "the_test", ) .expect("failed to create initial blueprint"); + verify_blueprint(&blueprint1); let mut builder = BlueprintBuilder::new_based_on(&blueprint1, &policy, "test_basic"); @@ -613,6 +671,7 @@ pub mod test { } let blueprint2 = builder.build(); + verify_blueprint(&blueprint2); let diff = blueprint1.diff(&blueprint2); println!( "initial blueprint -> next blueprint (expected no changes):\n{}", @@ -636,6 +695,7 @@ pub mod test { } let blueprint3 = builder.build(); + verify_blueprint(&blueprint3); let diff = blueprint2.diff(&blueprint3); println!("expecting new NTP and Crucible zones:\n{}", diff); diff --git a/nexus/deployment/src/planner.rs b/nexus/deployment/src/planner.rs index 0a8e1f0b81..002a1dbe2e 100644 --- a/nexus/deployment/src/planner.rs +++ b/nexus/deployment/src/planner.rs @@ -154,6 +154,7 @@ mod test { use super::Planner; use crate::blueprint_builder::test::example; use crate::blueprint_builder::test::policy_add_sled; + use crate::blueprint_builder::test::verify_blueprint; use crate::blueprint_builder::BlueprintBuilder; use nexus_inventory::now_db_precision; use nexus_types::inventory::OmicronZoneType; @@ -177,6 +178,7 @@ mod test { "the_test", ) .expect("failed to create initial blueprint"); + verify_blueprint(&blueprint1); // Now run the planner. It should do nothing because our initial // system didn't have any issues that the planner currently knows how to @@ -196,6 +198,7 @@ mod test { assert_eq!(diff.sleds_added().count(), 0); assert_eq!(diff.sleds_removed().count(), 0); assert_eq!(diff.sleds_changed().count(), 0); + verify_blueprint(&blueprint2); // Now add a new sled. let new_sled_id = @@ -229,6 +232,7 @@ mod test { )); assert_eq!(diff.sleds_removed().count(), 0); assert_eq!(diff.sleds_changed().count(), 0); + verify_blueprint(&blueprint3); // Check that with no change in inventory, the planner makes no changes. 
// It needs to wait for inventory to reflect the new NTP zone before @@ -247,6 +251,7 @@ mod test { assert_eq!(diff.sleds_added().count(), 0); assert_eq!(diff.sleds_removed().count(), 0); assert_eq!(diff.sleds_changed().count(), 0); + verify_blueprint(&blueprint4); // Now update the inventory to have the requested NTP zone. assert!(collection @@ -298,6 +303,7 @@ mod test { panic!("unexpectedly added a non-Crucible zone"); }; } + verify_blueprint(&blueprint5); // Check that there are no more steps let blueprint6 = Planner::new_based_on( @@ -315,6 +321,7 @@ mod test { assert_eq!(diff.sleds_added().count(), 0); assert_eq!(diff.sleds_removed().count(), 0); assert_eq!(diff.sleds_changed().count(), 0); + verify_blueprint(&blueprint6); logctx.cleanup_successful(); } diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 3b4c3b3142..324768b9d8 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -466,10 +466,11 @@ impl<'a> OmicronZonesDiff<'a> { for z in &bbsledzones.zones { writeln!( f, - "{} zone {} type {} ({})", + "{} zone {} type {} underlay IP {} ({})", prefix, z.id, z.zone_type.label(), + z.underlay_address, label )?; } @@ -529,44 +530,65 @@ impl<'a> std::fmt::Display for OmicronZonesDiff<'a> { DiffZoneChangedHow::DetailsChanged => { writeln!( f, - "- zone {} type {} (changed)", - zone_id, zone_type, + "- zone {} type {} underlay IP {} \ + (changed)", + zone_id, + zone_type, + zone_changes.zone_before.underlay_address, )?; writeln!( f, - "+ zone {} type {} (changed)", - zone_id, zone2_type, + "+ zone {} type {} underlay IP {} \ + (changed)", + zone_id, + zone2_type, + zone_changes.zone_after.underlay_address, )?; } DiffZoneChangedHow::RemovedFromService => { writeln!( f, - "- zone {} type {} (in service)", - zone_id, zone_type, + "- zone {} type {} underlay IP {} \ + (in service)", + zone_id, + zone_type, + zone_changes.zone_before.underlay_address, )?; writeln!( f, - "+ zone {} type {} (removed from service)", - zone_id, zone2_type, + "+ zone {} type {} underlay IP {} \ + (removed from service)", + zone_id, + zone2_type, + zone_changes.zone_after.underlay_address, )?; } DiffZoneChangedHow::AddedToService => { writeln!( f, - "- zone {} type {} (not in service)", - zone_id, zone_type, + "- zone {} type {} underlay IP {} \ + (not in service)", + zone_id, + zone_type, + zone_changes.zone_before.underlay_address, )?; writeln!( f, - "+ zone {} type {} (added to service)", - zone_id, zone2_type, + "+ zone {} type {} underlay IP {} \ + (added to service)", + zone_id, + zone2_type, + zone_changes.zone_after.underlay_address, )?; } DiffZoneChangedHow::NoChanges => { writeln!( f, - " zone {} type {} (unchanged)", - zone_id, zone_type, + " zone {} type {} underlay IP {} \ + (unchanged)", + zone_id, + zone_type, + zone_changes.zone_before.underlay_address, )?; } } @@ -575,8 +597,9 @@ impl<'a> std::fmt::Display for OmicronZonesDiff<'a> { for zone in sled_changes.zones_added() { writeln!( f, - "+ zone {} type {} (added)", + "+ zone {} type {} underlay IP {} (added)", zone.id, + zone.underlay_address, zone.zone_type.label(), )?; } From dd8d1aa1fb2df3f81292d48742f009a5064f93d2 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Tue, 6 Feb 2024 18:14:20 -0500 Subject: [PATCH 10/27] Don't constrain baseboard when adding sled (#4987) We were artificially limiting how we added sleds to a rack by forcing them to be of type `Baseboard::Gimlet`. 
Instead of constructing a `Baseboard` inside nexus, we instead send down the serial and part numbers and use those to uniquely identify the sled. We ignore whether it's a PC or Gimlet as long as the ids match. This is similar to how the inventory works and allows adding a sled to a rack on the falcon a4x2 testbed. --- nexus/src/app/rack.rs | 32 +++++------------------------- openapi/sled-agent.json | 20 ++++++++++++++++++- sled-agent/src/bootstrap/params.rs | 16 +++++++++++---- sled-agent/src/sled_agent.rs | 26 +++++++++++++++--------- 4 files changed, 53 insertions(+), 41 deletions(-) diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 569153f23e..2b38c62b23 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -52,7 +52,6 @@ use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::Name; use omicron_common::api::external::NameOrId; -use omicron_common::api::external::ResourceType; use omicron_common::api::internal::shared::ExternalPortDiscovery; use sled_agent_client::types::AddSledRequest; use sled_agent_client::types::EarlyNetworkConfigBody; @@ -871,36 +870,15 @@ impl super::Nexus { ) .await?; - // Grab the SPs from the last collection - let collection = - self.db_datastore.inventory_get_latest_collection(opctx).await?; - - // If there isn't a collection, we don't know about the sled - let Some(collection) = collection else { - return Err(Error::unavail("no inventory data available")); - }; - - // Find the revision - let Some(sp) = collection.sps.get(&baseboard_id) else { - return Err(Error::ObjectNotFound { - type_name: ResourceType::Sled, - lookup_type: - omicron_common::api::external::LookupType::ByCompositeId( - format!("{sled:?}"), - ), - }); - }; - - // Convert the baseboard as necessary - let baseboard = sled_agent_client::types::Baseboard::Gimlet { - identifier: sled.serial.clone(), - model: sled.part.clone(), - revision: sp.baseboard_revision.into(), + // Convert `UninitializedSledId` to the sled-agent type + let baseboard_id = sled_agent_client::types::BaseboardId { + serial_number: sled.serial.clone(), + part_number: sled.part.clone(), }; // Make the call to sled-agent let req = AddSledRequest { - sled_id: baseboard, + sled_id: baseboard_id, start_request: StartSledAgentRequest { generation: 0, schema_version: 1, diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 4b53397ffb..395394defb 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -1240,7 +1240,7 @@ "type": "object", "properties": { "sled_id": { - "$ref": "#/components/schemas/Baseboard" + "$ref": "#/components/schemas/BaseboardId" }, "start_request": { "$ref": "#/components/schemas/StartSledAgentRequest" @@ -1319,6 +1319,24 @@ } ] }, + "BaseboardId": { + "description": "A representation of a Baseboard ID as used in the inventory subsystem This type is essentially the same as a `Baseboard` except it doesn't have a revision or HW type (Gimlet, PC, Unknown).", + "type": "object", + "properties": { + "part_number": { + "description": "Oxide Part Number", + "type": "string" + }, + "serial_number": { + "description": "Serial number (unique for a given part number)", + "type": "string" + } + }, + "required": [ + "part_number", + "serial_number" + ] + }, "BgpConfig": { "type": "object", "properties": { diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index 79189e7f49..b684d96763 100644 --- a/sled-agent/src/bootstrap/params.rs +++ 
b/sled-agent/src/bootstrap/params.rs @@ -174,10 +174,21 @@ impl TryFrom for RackInitializeRequest { pub type Certificate = nexus_client::types::Certificate; pub type RecoverySiloConfig = nexus_client::types::RecoverySiloConfig; +/// A representation of a Baseboard ID as used in the inventory subsystem +/// This type is essentially the same as a `Baseboard` except it doesn't have a +/// revision or HW type (Gimlet, PC, Unknown). +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] +pub struct BaseboardId { + /// Oxide Part Number + pub part_number: String, + /// Serial number (unique for a given part number) + pub serial_number: String, +} + /// A request to Add a given sled after rack initialization has occurred #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] pub struct AddSledRequest { - pub sled_id: Baseboard, + pub sled_id: BaseboardId, pub start_request: StartSledAgentRequest, } @@ -255,9 +266,6 @@ pub struct StartSledAgentRequestBody { /// true. pub is_lrtq_learner: bool, - // Note: The order of these fields is load bearing, because we serialize - // `SledAgentRequest`s as toml. `subnet` serializes as a TOML table, so it - // must come after non-table fields. /// Portion of the IP space to be managed by the Sled Agent. pub subnet: Ipv6Subnet, } diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index eaf354db26..bcc354232e 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -9,7 +9,7 @@ use crate::bootstrap::config::BOOTSTRAP_AGENT_RACK_INIT_PORT; use crate::bootstrap::early_networking::{ EarlyNetworkConfig, EarlyNetworkSetupError, }; -use crate::bootstrap::params::StartSledAgentRequest; +use crate::bootstrap::params::{BaseboardId, StartSledAgentRequest}; use crate::config::Config; use crate::instance_manager::{InstanceManager, ReservoirMode}; use crate::long_running_tasks::LongRunningTaskHandles; @@ -1187,8 +1187,8 @@ pub enum AddSledError { }, #[error("Failed to connect to DDM")] DdmAdminClient(#[source] ddm_admin_client::DdmError), - #[error("Failed to learn bootstrap ip for {0}")] - NotFound(Baseboard), + #[error("Failed to learn bootstrap ip for {0:?}")] + NotFound(BaseboardId), #[error("Failed to initialize {sled_id}: {err}")] BootstrapTcpClient { sled_id: Baseboard, @@ -1199,7 +1199,7 @@ pub enum AddSledError { /// Add a sled to an initialized rack. pub async fn sled_add( log: Logger, - sled_id: Baseboard, + sled_id: BaseboardId, request: StartSledAgentRequest, ) -> Result<(), AddSledError> { // Get all known bootstrap addresses via DDM @@ -1227,16 +1227,20 @@ pub async fn sled_add( }) .collect::>(); - // Execute the futures until we find our matching sled or done searching + // Execute the futures until we find our matching sled or are done searching let mut target_ip = None; + let mut found_baseboard = None; while let Some((ip, result)) = addrs_to_sleds.next().await { match result { Ok(baseboard) => { // Convert from progenitor type back to `sled-hardware` // type. - let found = baseboard.into_inner().into(); - if sled_id == found { + let found: Baseboard = baseboard.into_inner().into(); + if sled_id.serial_number == found.identifier() + && sled_id.part_number == found.model() + { target_ip = Some(ip); + found_baseboard = Some(found); break; } } @@ -1259,10 +1263,14 @@ pub async fn sled_add( log.new(o!("BootstrapAgentClient" => bootstrap_addr.to_string())), ); + // Safe to unwrap, because we would have bailed when checking target_ip + // above otherwise. 
baseboard and target_ip are set together. + let baseboard = found_baseboard.unwrap(); + client.start_sled_agent(&request).await.map_err(|err| { - AddSledError::BootstrapTcpClient { sled_id: sled_id.clone(), err } + AddSledError::BootstrapTcpClient { sled_id: baseboard.clone(), err } })?; - info!(log, "Peer agent initialized"; "peer_bootstrap_addr" => %bootstrap_addr, "peer_id" => %sled_id); + info!(log, "Peer agent initialized"; "peer_bootstrap_addr" => %bootstrap_addr, "peer_id" => %baseboard); Ok(()) } From 73d7989c3a903625acd7078df5f0e958a17d7a6b Mon Sep 17 00:00:00 2001 From: Rain Date: Tue, 6 Feb 2024 17:40:39 -0800 Subject: [PATCH 11/27] [direnv] make nix flake opt-in (#5011) On a non-NixOS platform I'm seeing issues building both on main and on #4961, of the form shown at https://github.com/oxidecomputer/omicron/pull/4961#issuecomment-1930996559. Fix this by making use of the flake opt-in for now. --- .envrc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.envrc b/.envrc index 036459a4a9..48df8e3c63 100644 --- a/.envrc +++ b/.envrc @@ -6,7 +6,7 @@ PATH_add out/clickhouse PATH_add out/dendrite-stub/bin PATH_add out/mgd/root/opt/oxide/mgd/bin -if nix flake show &> /dev/null +if [ "$OMICRON_USE_FLAKE" = 1 ] && nix flake show &> /dev/null then use flake; -fi \ No newline at end of file +fi From 131e00c2eb0f1beb22afdec639fde149ad888805 Mon Sep 17 00:00:00 2001 From: Ryan Goodfellow Date: Wed, 7 Feb 2024 00:00:08 -0800 Subject: [PATCH 12/27] bump maghemite (#5013) --- package-manifest.toml | 12 ++++++------ tools/maghemite_ddm_openapi_version | 2 +- tools/maghemite_mg_openapi_version | 2 +- tools/maghemite_mgd_checksums | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/package-manifest.toml b/package-manifest.toml index 8944e59c37..ee20bfd307 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -446,10 +446,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "712b2487d9b141234af98b6578bc5f77420bdb03" +source.commit = "c5401ad7153bb6d28c7960d811fa3d8a1aa19c6a" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//maghemite.sha256.txt -source.sha256 = "36e976ae9b1517b358ec7eadd5fb03f5d40d54074ff830a79895f8fc3e643935" +source.sha256 = "097553ad7c8cb50f23852e9d6332d9c4e58050fddaa7137bfd5e2859354c2f25" output.type = "tarball" [package.mg-ddm] @@ -462,10 +462,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "712b2487d9b141234af98b6578bc5f77420bdb03" +source.commit = "c5401ad7153bb6d28c7960d811fa3d8a1aa19c6a" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm.sha256.txt -source.sha256 = "bc3137751db24d2e44eca7118f6ca825ed3e9df736480fc210392802cd063dd8" +source.sha256 = "cf42b987a81dc1ff102f8f603ff90d8fe9d8a3db890a19810a3ddbb04ab1b526" output.type = "zone" output.intermediate_only = true @@ -477,10 +477,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). 
-source.commit = "712b2487d9b141234af98b6578bc5f77420bdb03" +source.commit = "c5401ad7153bb6d28c7960d811fa3d8a1aa19c6a" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm.sha256.txt -source.sha256 = "2c54146a133b5f12587d9fb89f85ef0a0ca6278efc8c6fe4859782e886e6c774" +source.sha256 = "92e96984663d2d57d01c200685a47d998a1fd75ea89777e79c00095ebc8de9aa" output.type = "zone" output.intermediate_only = true diff --git a/tools/maghemite_ddm_openapi_version b/tools/maghemite_ddm_openapi_version index 8ee3001179..f300b40aa1 100644 --- a/tools/maghemite_ddm_openapi_version +++ b/tools/maghemite_ddm_openapi_version @@ -1,2 +1,2 @@ -COMMIT="712b2487d9b141234af98b6578bc5f77420bdb03" +COMMIT="c5401ad7153bb6d28c7960d811fa3d8a1aa19c6a" SHA2="0b0dbc2f8bbc5d2d9be92d64c4865f8f9335355aae62f7de9f67f81dfb3f1803" diff --git a/tools/maghemite_mg_openapi_version b/tools/maghemite_mg_openapi_version index 3fa53a9483..7a650e00a3 100644 --- a/tools/maghemite_mg_openapi_version +++ b/tools/maghemite_mg_openapi_version @@ -1,2 +1,2 @@ -COMMIT="712b2487d9b141234af98b6578bc5f77420bdb03" +COMMIT="c5401ad7153bb6d28c7960d811fa3d8a1aa19c6a" SHA2="0ac038bbaa54d0ae0ac5ccaeff48f03070618372cca26c9d09b716b909bf9355" diff --git a/tools/maghemite_mgd_checksums b/tools/maghemite_mgd_checksums index 1dacea54dc..0c2898f954 100644 --- a/tools/maghemite_mgd_checksums +++ b/tools/maghemite_mgd_checksums @@ -1,2 +1,2 @@ -CIDL_SHA256="2c54146a133b5f12587d9fb89f85ef0a0ca6278efc8c6fe4859782e886e6c774" -MGD_LINUX_SHA256="248732202f5102bf0947f5f91871379b6c6945fe387d4272cebe6e08f1b58184" \ No newline at end of file +CIDL_SHA256="92e96984663d2d57d01c200685a47d998a1fd75ea89777e79c00095ebc8de9aa" +MGD_LINUX_SHA256="2d8f090161cbabddafa677954a3e6a69eff77ad9a73c686452884528260f0616" \ No newline at end of file From f2f2bfae145f91486ac0444fa17aac51278fdecc Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Wed, 7 Feb 2024 08:57:19 -0800 Subject: [PATCH 13/27] pin libxmlsec1 1.3.2 in CI on macOS (#5007) See also #4920 A [temporary](https://xkcd.com/2730/) hack to keep the macOS job mildly happy while I muster the energy to fix this upstream in samael. 
---
 .github/workflows/rust.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index fa99017b0d..05ff2ed879 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -30,7 +30,16 @@ jobs:
     steps:
       # This repo is unstable and unnecessary: https://github.com/microsoft/linux-package-repositories/issues/34
       - name: Disable packages.microsoft.com repo
+        if: ${{ startsWith(matrix.os, 'ubuntu') }}
         run: sudo rm -f /etc/apt/sources.list.d/microsoft-prod.list
+      # https://github.com/oxidecomputer/omicron/issues/4920
+      - name: Pin libxmlsec1 to 1.3.2
+        if: ${{ startsWith(matrix.os, 'macos') }}
+        run: |
+          curl -fLOsS --retry 5 https://raw.githubusercontent.com/Homebrew/homebrew-core/081149b0d2720c2759b6ac8253e33b27f6d6c1cd/Formula/lib/libxmlsec1.rb
+          brew install ./libxmlsec1.rb
+          brew pin libxmlsec1
+          rm -f libxmlsec1.rb
       - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
         with:
           ref: ${{ github.event.pull_request.head.sha }} # see omicron#4461

From b7a00f39fc26d0799f77f3f27eb5e12bbe6e99f2 Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Wed, 7 Feb 2024 16:20:11 -0500
Subject: [PATCH 14/27] Provide mechanism for RPWs to execute sagas (#4790)

This commit adds a `SagaRequest` channel that can be shared with reliable
persistent workflow (RPW) background tasks so that a saga can be executed as
part of those tasks' activation.

This functionality will be used as part of downstairs replacement: when a
physical disk is marked as no longer in use, a series of RPWs and sagas will
ensure that any Crucible resources that were allocated to that physical disk
are moved, and make any necessary volume changes.
---
 common/src/nexus_config.rs                    | 21 ++++-
 dev-tools/omdb/src/bin/omdb/nexus.rs          | 26 ++++++
 dev-tools/omdb/tests/env.out                  | 12 +++
 dev-tools/omdb/tests/successes.out            | 12 +++
 nexus/examples/config.toml                    |  1 +
 nexus/src/app/background/common.rs            | 80 +++++++++++++++++++
 nexus/src/app/background/init.rs              | 30 +++++++
 nexus/src/app/background/mod.rs               |  1 +
 .../src/app/background/region_replacement.rs  | 57 +++++++++++++
 nexus/src/app/mod.rs                          | 38 +++++++++
 nexus/src/app/sagas/mod.rs                    | 21 +++++
 nexus/tests/config.test.toml                  |  1 +
 smf/nexus/multi-sled/config-partial.toml      |  1 +
 smf/nexus/single-sled/config-partial.toml     |  1 +
 14 files changed, 299 insertions(+), 3 deletions(-)
 create mode 100644 nexus/src/app/background/region_replacement.rs

diff --git a/common/src/nexus_config.rs b/common/src/nexus_config.rs
index 24f4c34797..2545d4cb91 100644
--- a/common/src/nexus_config.rs
+++ b/common/src/nexus_config.rs
@@ -340,6 +340,8 @@ pub struct BackgroundTaskConfig {
     pub sync_service_zone_nat: SyncServiceZoneNatConfig,
     /// configuration for the bfd manager task
     pub bfd_manager: BfdManagerConfig,
+    /// configuration for region replacement task
+    pub region_replacement: RegionReplacementConfig,
 }
 
 #[serde_as]
@@ -444,6 +446,14 @@ pub struct BlueprintTasksConfig {
     pub period_secs_execute: Duration,
 }
 
+#[serde_as]
+#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
+pub struct RegionReplacementConfig {
+    /// period (in seconds) for periodic activations of this background task
+    #[serde_as(as = "DurationSeconds<u64>")]
+    pub period_secs: Duration,
+}
+
 /// Configuration for a nexus server
 #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
 pub struct PackageConfig {
@@ -548,8 +558,8 @@ mod test {
         ConfigDropshotWithTls, ConsoleConfig, Database, DeploymentConfig,
         DnsTasksConfig, DpdConfig, ExternalEndpointsConfig,
InternalDns, InventoryConfig, LoadError, LoadErrorKind, MgdConfig, NatCleanupConfig, - PackageConfig, PhantomDiskConfig, SchemeName, TimeseriesDbConfig, - Tunables, UpdatesConfig, + PackageConfig, PhantomDiskConfig, RegionReplacementConfig, SchemeName, + TimeseriesDbConfig, Tunables, UpdatesConfig, }; use crate::address::{Ipv6Subnet, RACK_PREFIX}; use crate::api::internal::shared::SwitchLocation; @@ -706,6 +716,7 @@ mod test { blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 sync_service_zone_nat.period_secs = 30 + region_replacement.period_secs = 30 [default_region_allocation_strategy] type = "random" seed = 0 @@ -819,7 +830,10 @@ mod test { }, sync_service_zone_nat: SyncServiceZoneNatConfig { period_secs: Duration::from_secs(30) - } + }, + region_replacement: RegionReplacementConfig { + period_secs: Duration::from_secs(30), + }, }, default_region_allocation_strategy: crate::nexus_config::RegionAllocationStrategy::Random { @@ -882,6 +896,7 @@ mod test { blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 sync_service_zone_nat.period_secs = 30 + region_replacement.period_secs = 30 [default_region_allocation_strategy] type = "random" "##, diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index f00c05f1ec..9904263067 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -639,6 +639,32 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { ); } }; + } else if name == "region_replacement" { + #[derive(Deserialize)] + struct TaskSuccess { + /// how many region replacements were started ok + region_replacement_started_ok: usize, + + /// how many region replacements could not be started + region_replacement_started_err: usize, + } + + match serde_json::from_value::(details.clone()) { + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ), + Ok(success) => { + println!( + " number of region replacements started ok: {}", + success.region_replacement_started_ok + ); + println!( + " number of region replacement start errors: {}", + success.region_replacement_started_err + ); + } + }; } else { println!( "warning: unknown background task: {:?} \ diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 72e9d2e8fc..0600945194 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -83,6 +83,10 @@ task: "phantom_disks" detects and un-deletes phantom disks +task: "region_replacement" + detects if a region requires replacing and begins the process + + task: "service_zone_nat_tracker" ensures service zone nat records are recorded in NAT RPW table @@ -169,6 +173,10 @@ task: "phantom_disks" detects and un-deletes phantom disks +task: "region_replacement" + detects if a region requires replacing and begins the process + + task: "service_zone_nat_tracker" ensures service zone nat records are recorded in NAT RPW table @@ -242,6 +250,10 @@ task: "phantom_disks" detects and un-deletes phantom disks +task: "region_replacement" + detects if a region requires replacing and begins the process + + task: "service_zone_nat_tracker" ensures service zone nat records are recorded in NAT RPW table diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 416b669068..1cd85262f6 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -277,6 +277,10 @@ task: "phantom_disks" detects and un-deletes phantom disks +task: 
"region_replacement" + detects if a region requires replacing and begins the process + + task: "service_zone_nat_tracker" ensures service zone nat records are recorded in NAT RPW table @@ -407,6 +411,14 @@ task: "phantom_disks" number of phantom disks deleted: 0 number of phantom disk delete errors: 0 +task: "region_replacement" + configured period: every 30s + currently executing: no + last completed activation: iter 2, triggered by an explicit signal + started at (s ago) and ran for ms + number of region replacements started ok: 0 + number of region replacement start errors: 0 + task: "service_zone_nat_tracker" configured period: every 30s currently executing: no diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index 4263c34f3d..ac9d894050 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -109,6 +109,7 @@ phantom_disks.period_secs = 30 blueprints.period_secs_load = 10 blueprints.period_secs_execute = 60 sync_service_zone_nat.period_secs = 30 +region_replacement.period_secs = 30 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/src/app/background/common.rs b/nexus/src/app/background/common.rs index f954a35639..e0d8f32316 100644 --- a/nexus/src/app/background/common.rs +++ b/nexus/src/app/background/common.rs @@ -467,6 +467,7 @@ mod test { use super::BackgroundTask; use super::Driver; use crate::app::background::common::ActivationReason; + use crate::app::sagas::SagaRequest; use assert_matches::assert_matches; use chrono::Utc; use futures::future::BoxFuture; @@ -477,6 +478,7 @@ mod test { use std::time::Instant; use tokio::sync::mpsc; use tokio::sync::mpsc::error::TryRecvError; + use tokio::sync::mpsc::Sender; use tokio::sync::watch; type ControlPlaneTestContext = @@ -814,4 +816,82 @@ mod test { // such a task that would allow us to reliably distinguish between these // two without also spending a lot of wall-clock time on this test. } + + /// Simple BackgroundTask impl that sends a test-only SagaRequest + struct SagaRequestTask { + saga_request: Sender, + } + + impl SagaRequestTask { + fn new(saga_request: Sender) -> SagaRequestTask { + SagaRequestTask { saga_request } + } + } + + impl BackgroundTask for SagaRequestTask { + fn activate<'a>( + &'a mut self, + _: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async { + let _ = self.saga_request.send(SagaRequest::TestOnly).await; + serde_json::Value::Null + } + .boxed() + } + } + + #[nexus_test(server = crate::Server)] + async fn test_saga_request_flow(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.apictx().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let (saga_request, mut saga_request_recv) = SagaRequest::channel(); + let t1 = SagaRequestTask::new(saga_request); + + let mut driver = Driver::new(); + let (_dep_tx1, dep_rx1) = watch::channel(0); + + let h1 = driver.register( + "t1".to_string(), + "test saga request flow task".to_string(), + Duration::from_secs(300), // should not fire in this test + Box::new(t1), + opctx.child(std::collections::BTreeMap::new()), + vec![Box::new(dep_rx1.clone())], + ); + + assert!(matches!( + saga_request_recv.try_recv(), + Err(mpsc::error::TryRecvError::Empty), + )); + + driver.activate(&h1); + + // wait 1 second for the saga request to arrive + tokio::select! 
{ + _ = tokio::time::sleep(tokio::time::Duration::from_secs(1)) => { + assert!(false); + } + + saga_request = saga_request_recv.recv() => { + match saga_request { + None => { + assert!(false); + } + + Some(saga_request) => { + assert!(matches!( + saga_request, + SagaRequest::TestOnly, + )); + } + } + } + } + } } diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 95fe5c933e..9d078f10d0 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -15,7 +15,9 @@ use super::external_endpoints; use super::inventory_collection; use super::nat_cleanup; use super::phantom_disks; +use super::region_replacement; use super::sync_service_zone_nat::ServiceZoneNatTracker; +use crate::app::sagas::SagaRequest; use nexus_db_model::DnsGroup; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; @@ -25,6 +27,7 @@ use omicron_common::nexus_config::DnsTasksConfig; use std::collections::BTreeMap; use std::collections::HashMap; use std::sync::Arc; +use tokio::sync::mpsc::Sender; use uuid::Uuid; /// Describes ongoing background tasks and provides interfaces for working with @@ -72,10 +75,15 @@ pub struct BackgroundTasks { /// task handle for the service zone nat tracker pub task_service_zone_nat_tracker: common::TaskHandle, + + /// task handle for the task that detects if regions need replacement and + /// begins the process + pub task_region_replacement: common::TaskHandle, } impl BackgroundTasks { /// Kick off all background tasks + #[allow(clippy::too_many_arguments)] pub fn start( opctx: &OpContext, datastore: Arc, @@ -84,6 +92,7 @@ impl BackgroundTasks { mgd_clients: &HashMap>, nexus_id: Uuid, resolver: internal_dns::resolver::Resolver, + saga_request: Sender, ) -> BackgroundTasks { let mut driver = common::Driver::new(); @@ -243,6 +252,26 @@ impl BackgroundTasks { ) }; + // Background task: detect if a region needs replacement and begin the + // process + let task_region_replacement = { + let detector = region_replacement::RegionReplacementDetector::new( + datastore, + saga_request.clone(), + ); + + let task = driver.register( + String::from("region_replacement"), + String::from("detects if a region requires replacing and begins the process"), + config.region_replacement.period_secs, + Box::new(detector), + opctx.child(BTreeMap::new()), + vec![], + ); + + task + }; + BackgroundTasks { driver, task_internal_dns_config, @@ -258,6 +287,7 @@ impl BackgroundTasks { task_blueprint_loader, task_blueprint_executor, task_service_zone_nat_tracker, + task_region_replacement, } } diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs index 2c5fa0ab3c..27cdddfe15 100644 --- a/nexus/src/app/background/mod.rs +++ b/nexus/src/app/background/mod.rs @@ -16,6 +16,7 @@ mod init; mod inventory_collection; mod nat_cleanup; mod phantom_disks; +mod region_replacement; mod status; mod sync_service_zone_nat; diff --git a/nexus/src/app/background/region_replacement.rs b/nexus/src/app/background/region_replacement.rs new file mode 100644 index 0000000000..fc92f888b9 --- /dev/null +++ b/nexus/src/app/background/region_replacement.rs @@ -0,0 +1,57 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background task for detecting regions that need replacing and beginning that +//! process +//! +//! 
TODO this is currently a placeholder for a future PR
+
+use super::common::BackgroundTask;
+use crate::app::sagas::SagaRequest;
+use futures::future::BoxFuture;
+use futures::FutureExt;
+use nexus_db_queries::context::OpContext;
+use nexus_db_queries::db::DataStore;
+use serde_json::json;
+use std::sync::Arc;
+use tokio::sync::mpsc::Sender;
+
+pub struct RegionReplacementDetector {
+    _datastore: Arc<DataStore>,
+    _saga_request: Sender<SagaRequest>,
+}
+
+impl RegionReplacementDetector {
+    pub fn new(
+        datastore: Arc<DataStore>,
+        saga_request: Sender<SagaRequest>,
+    ) -> Self {
+        RegionReplacementDetector {
+            _datastore: datastore,
+            _saga_request: saga_request,
+        }
+    }
+}
+
+impl BackgroundTask for RegionReplacementDetector {
+    fn activate<'a>(
+        &'a mut self,
+        opctx: &'a OpContext,
+    ) -> BoxFuture<'a, serde_json::Value> {
+        async {
+            let log = &opctx.log;
+            warn!(&log, "region replacement task started");
+
+            // TODO
+
+            warn!(&log, "region replacement task done");
+
+            json!({
+                "region_replacement_started_ok": 0,
+                "region_replacement_started_err": 0,
+            })
+        }
+        .boxed()
+    }
+}
diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs
index c9ca4db73e..7a9a26b05f 100644
--- a/nexus/src/app/mod.rs
+++ b/nexus/src/app/mod.rs
@@ -6,6 +6,7 @@
 
 use self::external_endpoints::NexusCertResolver;
 use crate::app::oximeter::LazyTimeseriesClient;
+use crate::app::sagas::SagaRequest;
 use crate::config;
 use crate::populate::populate_start;
 use crate::populate::PopulateArgs;
@@ -362,6 +363,8 @@ impl Nexus {
             Arc::clone(&db_datastore),
         );
 
+        let (saga_request, mut saga_request_recv) = SagaRequest::channel();
+
         let background_tasks = background::BackgroundTasks::start(
             &background_ctx,
             Arc::clone(&db_datastore),
@@ -370,6 +373,7 @@ impl Nexus {
             &mg_clients,
             config.deployment.id,
             resolver.clone(),
+            saga_request,
         );
 
         let external_resolver = {
@@ -484,6 +488,29 @@ impl Nexus {
             }
         });
 
+        // Spawn a task to receive SagaRequests from RPWs, and execute them
+        {
+            let nexus = nexus.clone();
+            tokio::spawn(async move {
+                loop {
+                    match saga_request_recv.recv().await {
+                        None => {
+                            // If this channel is closed, then RPWs will not be
+                            // able to request that sagas be run. This will
+                            // likely only occur when Nexus itself is shutting
+                            // down, so emit an error and exit the task.
+                            error!(&nexus.log, "saga request channel closed!");
+                            break;
+                        }
+
+                        Some(saga_request) => {
+                            nexus.handle_saga_request(saga_request).await;
+                        }
+                    }
+                }
+            });
+        }
+
         Ok(nexus)
     }
 
@@ -828,6 +855,17 @@ impl Nexus {
     pub(crate) async fn resolver(&self) -> internal_dns::resolver::Resolver {
         self.internal_resolver.clone()
     }
+
+    /// Reliable persistent workflows can request that sagas be executed by
+    /// sending a SagaRequest to a supplied channel. Execute those here.
+    pub(crate) async fn handle_saga_request(&self, saga_request: SagaRequest) {
+        match saga_request {
+            #[cfg(test)]
+            SagaRequest::TestOnly => {
+                unimplemented!();
+            }
+        }
+    }
 }
 
 /// For unimplemented endpoints, indicates whether the resource identified
diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs
index 1bd85ecf32..e9f800c61b 100644
--- a/nexus/src/app/sagas/mod.rs
+++ b/nexus/src/app/sagas/mod.rs
@@ -17,6 +17,7 @@ use steno::ActionContext;
 use steno::ActionError;
 use steno::SagaType;
 use thiserror::Error;
+use tokio::sync::mpsc;
 use uuid::Uuid;
 
 pub mod disk_create;
@@ -408,3 +409,23 @@ where
     )
     .await
 }
+
+/// Reliable persistent workflows can request that sagas be run as part of their
+/// activation by sending a SagaRequest through a supplied channel to Nexus.
+pub enum SagaRequest {
+    #[cfg(test)]
+    TestOnly,
+}
+
+impl SagaRequest {
+    pub fn channel() -> (mpsc::Sender<SagaRequest>, mpsc::Receiver<SagaRequest>)
+    {
+        // Limit the maximum number of saga requests that background tasks can
+        // queue for Nexus to run.
+        //
+        // Note this value was chosen arbitrarily!
+        const MAX_QUEUED_SAGA_REQUESTS: usize = 128;
+
+        mpsc::channel(MAX_QUEUED_SAGA_REQUESTS)
+    }
+}
diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml
index 3571388747..8d37f9e3ef 100644
--- a/nexus/tests/config.test.toml
+++ b/nexus/tests/config.test.toml
@@ -103,6 +103,7 @@ phantom_disks.period_secs = 30
 blueprints.period_secs_load = 100
 blueprints.period_secs_execute = 600
 sync_service_zone_nat.period_secs = 30
+region_replacement.period_secs = 30
 
 [default_region_allocation_strategy]
 # we only have one sled in the test environment, so we need to use the
diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml
index 8fc2429169..40ed41bfda 100644
--- a/smf/nexus/multi-sled/config-partial.toml
+++ b/smf/nexus/multi-sled/config-partial.toml
@@ -51,6 +51,7 @@ phantom_disks.period_secs = 30
 blueprints.period_secs_load = 10
 blueprints.period_secs_execute = 60
 sync_service_zone_nat.period_secs = 30
+region_replacement.period_secs = 30
 
 [default_region_allocation_strategy]
 # by default, allocate across 3 distinct sleds
diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml
index 15f0a4ebe1..2e259aa42f 100644
--- a/smf/nexus/single-sled/config-partial.toml
+++ b/smf/nexus/single-sled/config-partial.toml
@@ -51,6 +51,7 @@ phantom_disks.period_secs = 30
 blueprints.period_secs_load = 10
 blueprints.period_secs_execute = 60
 sync_service_zone_nat.period_secs = 30
+region_replacement.period_secs = 30
 
 [default_region_allocation_strategy]
 # by default, allocate without requirement for distinct sleds.

From a3d9d3716d4ca6c78428b2993212249f6c1b6233 Mon Sep 17 00:00:00 2001
From: Eliza Weisman
Date: Wed, 7 Feb 2024 14:14:22 -0800
Subject: [PATCH 15/27] nix flake: Manage clickhouse, Cockroachdb, Dendrite-Stub, and Maghemite deps (#4961)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, running Omicron's test suite requires Clickhouse, Cockroachdb,
Dendrite-stub, and Maghemite binaries to be present in `./out`. These are
downloaded by a set of scripts in `tools/`, such as
`tools/ci_download_clickhouse`, et cetera.

This branch updates the Nix flake to handle downloading these dependencies
as part of the Nix configuration. This way, Nix users will not need to
re-run these scripts when the dependencies on these binaries change, as the
flake is automatically updated by `direnv`.

The flake wraps the `clickhouse`, `cockroach`, `mgd`, `dpd`, and `swadm`
binaries using `autoPatchelfHook`, to ensure that dynamically loaded
libraries will work on NixOS. Then, the binaries are symlinked into the
`out` directory identically to how the `tools/ci_download_${foo}` scripts
work.

After this change, I can now successfully use `omicron-dev run-all` on my
NixOS machine!
The contents of my `./out` directory looks like this: ```console $ tree ./out out ├── clickhouse │ ├── clickhouse ⇒ /nix/store/p0a6hx0l27v6d4n1x1ddy6fp48q2gw39-clickhouse/bin/clickhouse │ └── config.xml ⇒ /nix/store/01qhirxkns7x8rwh0d87hw47a3gc7hkg-clickhouse/etc/config.xml ├── cockroachdb │ └── bin ⇒ /nix/store/adzvkw7d373md98msa2p4ri2miirndbv-cockroachdb/bin ├── dendrite-stub ⇒ /nix/store/84fs1l2njmc2v1hkhjjk7kyspncriqnh-dendrite-stub └── mgd ⇒ /nix/store/rr23ga4m8yckqvmccc7p9jwqy326a8hb-mgd ``` --- .gitignore | 2 - clients/ddm-admin-client/build.rs | 4 +- clients/dpd-client/build.rs | 5 +- clients/mg-admin-client/build.rs | 5 +- flake.lock | 5 +- flake.nix | 451 +++++++++++++++++++++++++++--- tools/ci_download_clickhouse | 2 +- tools/clickhouse_version | 1 + 8 files changed, 420 insertions(+), 55 deletions(-) create mode 100644 tools/clickhouse_version diff --git a/.gitignore b/.gitignore index fc3cb4133a..fc5fd5f297 100644 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,6 @@ README.html TODO.html logs out -tools/clickhouse* -tools/cockroach* /clickhouse/ /cockroachdb/ smf/nexus/root.json diff --git a/clients/ddm-admin-client/build.rs b/clients/ddm-admin-client/build.rs index da74ee9962..c51ec05faa 100644 --- a/clients/ddm-admin-client/build.rs +++ b/clients/ddm-admin-client/build.rs @@ -33,7 +33,9 @@ fn main() -> Result<()> { // Report a relatively verbose error if we haven't downloaded the requisite // openapi spec. let local_path = - format!("../../out/downloads/ddm-admin-{commit}.json"); + env::var("DDM_OPENAPI_PATH").unwrap_or_else(|_| { + format!("../../out/downloads/ddm-admin-{commit}.json") + }); if !Path::new(&local_path).exists() { bail!("{local_path} doesn't exist; rerun `tools/ci_download_maghemite_openapi` (after updating `tools/maghemite_ddm_openapi_version` if the maghemite commit in package-manifest.toml has changed)"); } diff --git a/clients/dpd-client/build.rs b/clients/dpd-client/build.rs index 6a65ab9495..536869b4a2 100644 --- a/clients/dpd-client/build.rs +++ b/clients/dpd-client/build.rs @@ -38,7 +38,10 @@ fn main() -> Result<()> { PackageSource::Prebuilt { commit, .. } => { // Report a relatively verbose error if we haven't downloaded the // requisite openapi spec. - let local_path = format!("../../out/downloads/dpd-{commit}.json"); + let local_path = + env::var("DPD_OPENAPI_PATH").unwrap_or_else(|_| { + format!("../../out/downloads/dpd-{commit}.json") + }); if !Path::new(&local_path).exists() { bail!("{local_path} doesn't exist; rerun `tools/ci_download_dendrite_openapi` (after updating `tools/dendrite_openapi_version` if the dendrite commit in package-manifest.toml has changed)"); } diff --git a/clients/mg-admin-client/build.rs b/clients/mg-admin-client/build.rs index dcc7ae61cb..d9886d0ece 100644 --- a/clients/mg-admin-client/build.rs +++ b/clients/mg-admin-client/build.rs @@ -31,8 +31,9 @@ fn main() -> Result<()> { PackageSource::Prebuilt { commit, .. } => { // Report a relatively verbose error if we haven't downloaded the requisite // openapi spec. 
- let local_path = - format!("../../out/downloads/mg-admin-{commit}.json"); + let local_path = env::var("MG_OPENAPI_PATH").unwrap_or_else(|_| { + format!("../../out/downloads/mg-admin-{commit}.json") + }); if !Path::new(&local_path).exists() { bail!("{local_path} doesn't exist; rerun `tools/ci_download_maghemite_openapi` (after updating `tools/maghemite_mg_openapi_version` if the maghemite commit in package-manifest.toml has changed)"); } diff --git a/flake.lock b/flake.lock index 2c24a13714..f2dfc1b532 100644 --- a/flake.lock +++ b/flake.lock @@ -36,16 +36,13 @@ }, "root": { "inputs": { - "flake-utils": "flake-utils", "nixpkgs": "nixpkgs", "rust-overlay": "rust-overlay" } }, "rust-overlay": { "inputs": { - "flake-utils": [ - "flake-utils" - ], + "flake-utils": "flake-utils", "nixpkgs": [ "nixpkgs" ] diff --git a/flake.nix b/flake.nix index 65329cbbf7..1f9a992274 100644 --- a/flake.nix +++ b/flake.nix @@ -3,62 +3,425 @@ inputs = { nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; - flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { url = "github:oxalica/rust-overlay"; - inputs = { - nixpkgs.follows = "nixpkgs"; - flake-utils.follows = "flake-utils"; - }; + inputs.nixpkgs.follows = "nixpkgs"; }; }; - outputs = { self, nixpkgs, flake-utils, rust-overlay }: - flake-utils.lib.eachDefaultSystem - (system: + outputs = { self, nixpkgs, rust-overlay, ... }: + let + overlays = [ (import rust-overlay) ]; + pkgs = import nixpkgs { + inherit overlays; + system = "x86_64-linux"; + }; + # use the Rust toolchain defined in the `rust-toolchain.toml` file. + rustToolchain = pkgs.pkgsBuildHost.rust-bin.fromRustupToolchainFile ./rust-toolchain.toml; + + buildInputs = with pkgs; [ + # libs + openssl + postgresql + xmlsec + sqlite + libclang + libxml2 + libtool + ]; + + nativeBuildInputs = with pkgs; [ + rustToolchain + cmake + stdenv + pkg-config + ]; + + openAPIVersion = with pkgs.lib; path: + let + file = strings.fileContents path; + parts = strings.splitString "\n" file; + extractHash = prefix: (line: trivial.pipe line [ + (elemAt parts) + (strings.removeSuffix "\"") + (strings.removePrefix "${prefix}=\"") + ]); + in + { + commit = extractHash "COMMIT" 0; + sha = extractHash "SHA2" 1; + }; + + downloadBuildomat = + let baseURL = "https://buildomat.eng.oxide.computer/public/file/oxidecomputer"; + in { kind, repo, file, commit, sha }: + builtins.fetchurl { + url = "${baseURL}/${repo}/${kind}/${commit}/${file}"; + sha256 = sha; + }; + + downloadOpenAPI = { repo, file, version }: + downloadBuildomat + { + inherit repo file; + kind = "openapi"; + commit = pkgs.lib.debug.traceValFn + (v: "${file}: commit=${v}") + version.commit; + sha = version.sha; + }; + + dendriteVersion = openAPIVersion + ./tools/dendrite_openapi_version; + mgVersion = openAPIVersion + ./tools/maghemite_mg_openapi_version; + + + dendriteOpenAPI = downloadOpenAPI + { + repo = "dendrite"; + file = "dpd.json"; + version = dendriteVersion; + }; + + ddmOpenAPI = downloadOpenAPI + { + repo = "maghemite"; + file = "ddm-admin.json"; + version = openAPIVersion ./tools/maghemite_ddm_openapi_version; + }; + + mgOpenAPI = downloadOpenAPI + { + repo = "maghemite"; + file = "mg-admin.json"; + version = mgVersion; + }; + + findSha = with pkgs.lib; + shas: (name: + let + upperName = strings.toUpper name; + prefix = "${upperName}=\""; + in + trivial.pipe shas [ + (lists.findFirst (strings.hasPrefix prefix) "") + (strings.removePrefix prefix) + (strings.removeSuffix "\"") + ]); + + dendrite-stub = with pkgs.lib; let - overlays = [ (import 
rust-overlay) ]; - pkgs = import nixpkgs { - inherit system overlays; + commit = dendriteVersion.commit; + repo = "dendrite"; + stubShas = + let + file = builtins.readFile + ./tools/dendrite_stub_checksums; + in + strings.splitString + "\n" + file; + findStubSha = name: findSha stubShas "CIDL_SHA256_${name}"; + fetchLinuxBin = file: + downloadBuildomat { + inherit commit file repo; + sha = findStubSha "linux_${file}"; + kind = "linux-bin"; + }; + + # get stuff + tarball = downloadBuildomat + { + inherit commit repo; + sha = findStubSha "illumos"; + kind = "image"; + file = "dendrite-stub.tar.gz"; + }; + swadm = fetchLinuxBin + "swadm"; + dpd = fetchLinuxBin + "dpd"; + in + with pkgs; stdenv.mkDerivation + { + name = "dendrite-stub"; + version = commit; + src = tarball; + nativeBuildInputs = [ + # patch the binary to use the right dynamic library paths. + autoPatchelfHook + ]; + + buildInputs = [ + glibc + gcc-unwrapped + openssl + ]; + + installPhase = + let + binPath = "root/opt/oxide/dendrite/bin"; + in + '' + mkdir -p $out/${binPath} + cp -r . $out/root + cp ${swadm} $out/${binPath}/swadm + chmod +x $out/${binPath}/swadm + cp ${dpd} $out/${binPath}/dpd + chmod +x $out/${binPath}/dpd + + mkdir -p $out/bin + ln -s $out/${binPath}/swadm $out/bin/swadm + ln -s $out/${binPath}/dpd $out/bin/dpd + ''; }; - # use the Rust toolchain defined in the `rust-toolchain.toml` file. - rustToolchain = pkgs.pkgsBuildHost.rust-bin.fromRustupToolchainFile ./rust-toolchain.toml; - nativeBuildInputs = with pkgs; [ - rustToolchain - cmake - stdenv - pkg-config - ]; - buildInputs = with pkgs; [ - # libs - openssl - postgresql - xmlsec - sqlite - libclang - libxml2 - ]; + + mgd = with pkgs.lib; + let + commit = mgVersion.commit; + repo = "maghemite"; + shas = + let + file = builtins.readFile + ./tools/maghemite_mgd_checksums; + in + strings.splitString + "\n" + file; + # get stuff + tarball = downloadBuildomat + { + inherit commit repo; + sha = findSha shas "CIDL_SHA256"; + kind = "image"; + file = "mgd.tar.gz"; + }; + linuxBin = + downloadBuildomat + { + inherit commit repo; + sha = findSha shas "MGD_LINUX_SHA256"; + kind = "linux"; + file = "mgd"; + }; in with pkgs; - { - devShells.default = mkShell.override + stdenv.mkDerivation + { + name = "mgd"; + src = tarball; + version = commit; + nativeBuildInputs = [ + # patch the binary to use the right dynamic library paths. + autoPatchelfHook + ]; + + buildInputs = [ + glibc + gcc-unwrapped + ]; + + installPhase = + let + binPath = "root/opt/oxide/mgd/bin"; + in + '' + mkdir -p $out/${binPath} + cp -r . $out/root + cp ${linuxBin} $out/${binPath}/mgd + chmod +x $out/${binPath}/mgd + + mkdir -p $out/bin + ln -s $out/${binPath}/mgd $out/bin/mgd + ''; + }; + + # reads the version for Clickhouse or Cockroachdb from the + # `tools/clickhouse_version` and `tools/cockroachdb_version` files. + readVersionFile = with pkgs.lib; file: trivial.pipe ./tools/${file} [ + (builtins.readFile) + (strings.removeSuffix "\n") + (strings.removePrefix "v") + (debug.traceValFn (v: "${file}: ${v}")) + ]; + + clickhouse = with pkgs; + let + name = "clickhouse"; + version = readVersionFile "${name}_version"; + # N.B. that unlike maghemite and dendrite, the Clickhouse hashes + # in `tools/clickhouse_checksums` are MD5 rather than SHA256, so we + # can't give Nix those hashes and must instead determine it ourselves. + # this means that we will have to update this SHA if the clickhouse + # version changes. 
+ sha256 = "1lgxwh67apgl386ilpg0iy5xkyz12q4lgnz08zswjbxv88ra0qxj"; + src = builtins.fetchurl { - # use Clang as the C compiler for all C libraries - stdenv = clangStdenv; - } + inherit sha256; + url = "https://oxide-clickhouse-build.s3.us-west-2.amazonaws.com/${name}-v${version}.linux.tar.gz"; + }; + in + stdenv.mkDerivation + { + inherit src name version; + sourceRoot = "."; + nativeBuildInputs = [ + # patch the binary to use the right dynamic library paths. + autoPatchelfHook + ]; + + buildInputs = [ + glibc + gcc-unwrapped + ]; + installPhase = '' + mkdir -p $out/bin + mkdir -p $out/etc + cp ./${name} $out/bin/${name} + cp ./._config.xml $out/bin/config.xml + ''; + }; + + cockroachdb = with pkgs; + let + name = "cockroachdb"; + binName = "cockroach"; + version = readVersionFile "${name}_version"; + src = builtins.fetchurl { - inherit buildInputs nativeBuildInputs; + url = "https://binaries.cockroachdb.com/${binName}-v${version}.linux-amd64.tgz"; + sha256 = "1aglbwh27275bicyvij11s3as4zypqwc26p9gyh5zr3y1s123hr4"; + }; + in + stdenv.mkDerivation + { + inherit name src version; + nativeBuildInputs = [ + # patch the binary to use the right dynamic library paths. + autoPatchelfHook + ]; - name = "omicron"; - DEP_PQ_LIBDIRS = " ${postgresql.lib}/lib"; - LIBCLANG_PATH = "${libclang.lib}/lib"; - OPENSSL_DIR = "${openssl.dev}"; - OPENSSL_LIB_DIR = "${openssl.out}/lib"; + buildInputs = [ + glibc + # gcc-unwrapped + ]; + installPhase = '' + mkdir -p $out/bin + cp ./${binName} $out/bin/${binName} + ''; + }; + in + { + packages.x86_64-linux = { + inherit dendrite-stub mgd cockroachdb clickhouse; + }; - # Needed by rustfmt-wrapper, see: - # https://github.com/oxidecomputer/rustfmt-wrapper/blob/main/src/lib.rs - RUSTFMT = "${rustToolchain}/bin/rustfmt"; + checks.x86_64-linux = with pkgs; + let + # produces a check derivation that ensures a package's executable has + # the expected version. + mkVersionCheck = { pkg, cmd }: runCommand "check-${pkg.name}-version" + { + PATH = "${pkg.out}"; + } '' + actualVersion=$(${pkg.out}/bin/${cmd}) + if [ "$actualVersion" != "${pkg.version}" ]; then + echo "expected ${pkg.name} version \"${pkg.version}\", got \"$actualVersion\"" + exit 1 + fi + + # the check derivation must have an output. + touch $out + ''; + # produces a check derivation that ensures a package's executable + # runs. + mkExecCheck = { pkg, cmd }: runCommand "check-${pkg.name}-${cmd}-exec" + { } '' + ${pkg.out}/bin/${cmd} && touch $out + ''; + in + { + clickhouseVersion = mkVersionCheck + { + pkg = clickhouse; + cmd = "clickhouse server --version | cut -d ' ' -f 4"; }; - } - ); + + cockroachdbVersion = mkVersionCheck + { + pkg = cockroachdb; + cmd = "cockroach version --build-tag | tr -d 'v'"; + }; + + mgdCanExec = mkExecCheck { + pkg = mgd; + cmd = "mgd help"; + }; + + dpdCanExec = mkExecCheck { + pkg = dendrite-stub; + cmd = "dpd help"; + }; + + swadmCanExec = mkExecCheck { + pkg = dendrite-stub; + cmd = "swadm help"; + }; + }; + + devShells.x86_64-linux.default = + pkgs.mkShell.override + { + # use Clang as the C compiler for all C libraries + stdenv = pkgs.clangStdenv; + } + { + inherit buildInputs; + nativeBuildInputs = nativeBuildInputs ++ [ + # Dendrite and maghemite, for running tests. 
+ dendrite-stub + mgd + clickhouse + cockroachdb + ]; + + name = "omicron"; + DEP_PQ_LIBDIRS = "${pkgs.postgresql.lib}/lib"; + LIBCLANG_PATH = "${pkgs.libclang.lib}/lib"; + OPENSSL_DIR = "${pkgs.openssl.dev}"; + OPENSSL_LIB_DIR = "${pkgs.openssl.out}/lib"; + + MG_OPENAPI_PATH = mgOpenAPI; + DDM_OPENAPI_PATH = ddmOpenAPI; + DPD_OPENAPI_PATH = dendriteOpenAPI; + + # Needed by rustfmt-wrapper, see: + # https://github.com/oxidecomputer/rustfmt-wrapper/blob/main/src/lib.rs + RUSTFMT = "${rustToolchain}/bin/rustfmt"; + + shellHook = '' + rm out/mgd + rm out/dendrite-stub + rm -r out/clickhouse + rm -r out/cockroachdb + + mkdir -p out/clickhouse + mkdir -p out/cockroachdb/ + + ln -s ${mgd.out} -T out/mgd + ln -s ${dendrite-stub.out} -T out/dendrite-stub + ln -s ${clickhouse.out}/bin/clickhouse out/clickhouse/clickhouse + ln -s ${clickhouse.out}/etc/config.xml out/clickhouse + ln -s ${cockroachdb.out}/bin out/cockroachdb/bin + ''; + }; + }; } + + + + + + + diff --git a/tools/ci_download_clickhouse b/tools/ci_download_clickhouse index 03a5bff24c..675566fad7 100755 --- a/tools/ci_download_clickhouse +++ b/tools/ci_download_clickhouse @@ -20,7 +20,7 @@ DOWNLOAD_DIR="$TARGET_DIR/downloads" DEST_DIR="./$TARGET_DIR/clickhouse" # If you change this, you must also update the md5sums below -CIDL_VERSION="v22.8.9.24" +CIDL_VERSION="$(cat "$SOURCE_DIR/clickhouse_version")" source "$SOURCE_DIR/clickhouse_checksums" # Download from manually-populated S3 bucket for now diff --git a/tools/clickhouse_version b/tools/clickhouse_version new file mode 100644 index 0000000000..93b98bf738 --- /dev/null +++ b/tools/clickhouse_version @@ -0,0 +1 @@ +v22.8.9.24 \ No newline at end of file From 00d802453d79053318e41a77c8056b7c014f18c2 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 7 Feb 2024 16:03:55 -0800 Subject: [PATCH 16/27] tools: use SHA256 in `ci_download_cockroachdb` (#5017) Depends on #4961. Currently, the `tools/ci_download_cockroachdb` script uses MD5 as the checksum for the Cockroachdb tarball. This is unfortunate for two reasons: 1. Upstream Cockroachdb (and the corresponding Illumos build) publish SHA256 digests for these tarballs, rather than MD5s (see https://www.cockroachlabs.com/docs/releases/ and https://illumos.org/downloads/). Using SHA256 rather than MD5 digests should make updating to a new version easier. 2. Nix requires SHA256 checksums for files downloaded as build inputs. Currently, the Nix flake can use the SHA256 checksums for Maghemite and Dendrite from `tools/maghemite_mgd_checksums` and `tools/dendrite_stub_checksums`, meaning that updating these versions does not require manually changing the Nix flake. However, because we use MD5 rather than SHA256 checksums for Cockroachdb, updating the Cockroachdb version requires manually changing the version in the Nix flake, which is a shame, especially if someone unfamiliar with Nix has to do it... This commit changes `tools/cockroachdb_checksums` and the corresponding `tools/ci_download_cockroachdb` script to use SHA256 rather than MD5. I've changed the Nix flake to read the hash from this file rather than hard-coding it, so now, the cockroachdb version can be updated without touching the flake. 
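The verification pattern itself is unchanged; only the digest function and the recorded checksums differ. A minimal sketch of the check, reusing the script's own names (`TARBALL_FILE` is the downloaded tarball; the digest shown is the Linux entry from `tools/cockroachdb_checksums`):

```sh
# Compute the SHA256 of the downloaded tarball and compare it against
# the expected digest, failing loudly on a mismatch.
CIDL_SHA256="24c321820e7ee45fa07fe91ac138befe13ad860e41c6ed595ce58823205ff4a9"
calculated_sha256="$(sha256sum < "$TARBALL_FILE" | awk '{print $1}')"
if [[ "$calculated_sha256" != "$CIDL_SHA256" ]]; then
    echo "sha256sum mismatch (expected $CIDL_SHA256, found $calculated_sha256)" >&2
    exit 1
fi
```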
--- flake.nix | 14 +++++++++---- tools/ci_download_cockroachdb | 39 ++++++++++++++--------------------- tools/cockroachdb_checksums | 6 +++--- 3 files changed, 29 insertions(+), 30 deletions(-) diff --git a/flake.nix b/flake.nix index 1f9a992274..408dff5706 100644 --- a/flake.nix +++ b/flake.nix @@ -98,6 +98,8 @@ version = mgVersion; }; + # given a list of strings of the form `PREFIX="SHA256"`, finds the string + # starting with the provided `name` and returns the hash for that prefix. findSha = with pkgs.lib; shas: (name: let @@ -119,9 +121,7 @@ file = builtins.readFile ./tools/dendrite_stub_checksums; in - strings.splitString - "\n" - file; + strings.splitString "\n" file; findStubSha = name: findSha stubShas "CIDL_SHA256_${name}"; fetchLinuxBin = file: downloadBuildomat { @@ -288,10 +288,16 @@ name = "cockroachdb"; binName = "cockroach"; version = readVersionFile "${name}_version"; + sha256 = + let + shaFile = builtins.readFile ./tools/${name}_checksums; + shas = lib.strings.splitString "\n" shaFile; + in + findSha shas "CIDL_SHA256_LINUX"; src = builtins.fetchurl { + inherit sha256; url = "https://binaries.cockroachdb.com/${binName}-v${version}.linux-amd64.tgz"; - sha256 = "1aglbwh27275bicyvij11s3as4zypqwc26p9gyh5zr3y1s123hr4"; }; in stdenv.mkDerivation diff --git a/tools/ci_download_cockroachdb b/tools/ci_download_cockroachdb index ca484c000f..5755e7e665 100755 --- a/tools/ci_download_cockroachdb +++ b/tools/ci_download_cockroachdb @@ -13,7 +13,7 @@ set -o errexit SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" ARG0="$(basename "${BASH_SOURCE[0]}")" -# If you change this, you must also update the md5sums below +# If you change this, you must also update the sha256sums below CIDL_VERSION="$(cat "$SOURCE_DIR/cockroachdb_version")" source "$SOURCE_DIR/cockroachdb_checksums" @@ -49,6 +49,7 @@ function main # Configure this program configure_os "$CIDL_OS" CIDL_URL="$CIDL_URL_BASE/$TARBALL_FILENAME" + CIDL_SHA256FUNC="do_sha256sum" # Download the file. echo "URL: $CIDL_URL" @@ -60,9 +61,9 @@ function main local DO_DOWNLOAD="true" if [[ -f "$TARBALL_FILE" ]]; then # If the file exists with a valid checksum, we can skip downloading. - calculated_md5="$($CIDL_MD5FUNC "$TARBALL_FILE")" || \ - fail "failed to calculate md5sum" - if [[ "$calculated_md5" == "$CIDL_MD5" ]]; then + calculated_sha256="$($CIDL_SHA256FUNC "$TARBALL_FILE")" || \ + fail "failed to calculate sha256sum" + if [[ "$calculated_sha256" == "$CIDL_SHA256" ]]; then DO_DOWNLOAD="false" fi fi @@ -72,12 +73,12 @@ function main do_download_curl "$CIDL_URL" "$TARBALL_FILE" || \ fail "failed to download file" - # Verify the md5sum. - calculated_md5="$($CIDL_MD5FUNC "$TARBALL_FILE")" || \ - fail "failed to calculate md5sum" - if [[ "$calculated_md5" != "$CIDL_MD5" ]]; then - fail "md5sum mismatch \ - (expected $CIDL_MD5, found $calculated_md5)" + # Verify the sha256sum. 
+ calculated_sha256="$($CIDL_SHA256FUNC "$TARBALL_FILE")" || \ + fail "failed to calculate sha256sum" + if [[ "$calculated_sha256" != "$CIDL_SHA256" ]]; then + fail "sha256sum mismatch \ + (expected $CIDL_SHA256, found $calculated_sha256)" fi fi @@ -105,24 +106,21 @@ function configure_os darwin*) CIDL_BUILD="darwin-10.9-amd64" CIDL_SUFFIX="tgz" - CIDL_MD5="$CIDL_MD5_DARWIN" - CIDL_MD5FUNC="do_md5" + CIDL_SHA256="$CIDL_SHA256_DARWIN" CIDL_URL_BASE="$CIDL_URL_COCKROACH" CIDL_ASSEMBLE="do_assemble_official" ;; linux-gnu*) CIDL_BUILD="linux-amd64" CIDL_SUFFIX="tgz" - CIDL_MD5="$CIDL_MD5_LINUX" - CIDL_MD5FUNC="do_md5sum" + CIDL_SHA256="$CIDL_SHA256_LINUX" CIDL_URL_BASE="$CIDL_URL_COCKROACH" CIDL_ASSEMBLE="do_assemble_official" ;; solaris*) CIDL_BUILD="illumos" CIDL_SUFFIX="tar.gz" - CIDL_MD5="$CIDL_MD5_ILLUMOS" - CIDL_MD5FUNC="do_md5sum" + CIDL_SHA256="$CIDL_SHA256_ILLUMOS" CIDL_URL_BASE="$CIDL_URL_ILLUMOS" CIDL_ASSEMBLE="do_assemble_illumos" ;; @@ -143,14 +141,9 @@ function do_download_curl curl --silent --show-error --fail --location --output "$2" "$1" } -function do_md5 +function do_sha256sum { - md5 < "$1" -} - -function do_md5sum -{ - md5sum < "$1" | awk '{print $1}' + sha256sum < "$1" | awk '{print $1}' } function do_untar diff --git a/tools/cockroachdb_checksums b/tools/cockroachdb_checksums index 50e873100f..20b6e237f8 100644 --- a/tools/cockroachdb_checksums +++ b/tools/cockroachdb_checksums @@ -1,3 +1,3 @@ -CIDL_MD5_DARWIN="2db972c254b4e3b599e12110520178b5" -CIDL_MD5_LINUX="8c3170883e0a0be1a34b44090c067a8c" -CIDL_MD5_ILLUMOS="d8999aff364e5d70f226e139fda724a3" +CIDL_SHA256_DARWIN="1ca69e0911af11a73305c3c6f4650b912d70754900b5bf7b80a1d361efe36561" +CIDL_SHA256_LINUX="24c321820e7ee45fa07fe91ac138befe13ad860e41c6ed595ce58823205ff4a9" +CIDL_SHA256_ILLUMOS="f151714ba3a6e02caaaa59727482c36085e60d6bd2fa963938e9a3d8c8a77088" From bdf9a224acc44994a274ee6c7228892f29ebe827 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Wed, 7 Feb 2024 16:39:26 -0800 Subject: [PATCH 17/27] patch samael crate to include njaremko/samael#41 (#5023) Points cargo at https://github.com/oxidecomputer/samael/tree/oxide/omicron, which has njaremko/samael#41 merged in. Fixes #4920, reverts #5007, supersedes #5019. 
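For anyone unfamiliar with Cargo's `[patch]` mechanism: the `[patch.crates-io]` entry below replaces every crates.io occurrence of `samael` in the dependency graph with the fork, and `Cargo.lock` records the git source. A hedged sketch of how one might confirm the override took effect (assuming a checkout of this workspace):

```sh
# Re-resolve samael now that the [patch.crates-io] entry is in place,
# then confirm the lockfile records the git fork as its source.
cargo update -p samael
grep -A 2 'name = "samael"' Cargo.lock
```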
--- .github/workflows/rust.yml | 8 -------- Cargo.lock | 3 +-- Cargo.toml | 5 +++++ 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 05ff2ed879..6ec296c3a9 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -32,14 +32,6 @@ jobs: - name: Disable packages.microsoft.com repo if: ${{ startsWith(matrix.os, 'ubuntu') }} run: sudo rm -f /etc/apt/sources.list.d/microsoft-prod.list - # https://github.com/oxidecomputer/omicron/issues/4920 - - name: Pin libxmlsec1 to 1.3.2 - if: ${{ startsWith(matrix.os, 'macos') }} - run: | - curl -fLOsS --retry 5 https://raw.githubusercontent.com/Homebrew/homebrew-core/081149b0d2720c2759b6ac8253e33b27f6d6c1cd/Formula/lib/libxmlsec1.rb - brew install ./libxmlsec1.rb - brew pin libxmlsec1 - rm -f libxmlsec1.rb - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: ref: ${{ github.event.pull_request.head.sha }} # see omicron#4461 diff --git a/Cargo.lock b/Cargo.lock index 73ec3e4b4c..234fbc5e45 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7623,8 +7623,7 @@ dependencies = [ [[package]] name = "samael" version = "0.0.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b75583aad4a51c50fc0af69c230d18078c9d5a69a98d0f6013d01053acf744f4" +source = "git+https://github.com/oxidecomputer/samael?branch=oxide/omicron#9e609a8f6fa0dd84e3bb8f579f46bd780c8be62b" dependencies = [ "base64", "bindgen", diff --git a/Cargo.toml b/Cargo.toml index 0c8e3245c9..65197da650 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -614,3 +614,8 @@ branch = "oxide/omicron" # to it. [patch.crates-io.omicron-workspace-hack] path = "workspace-hack" + +# Pulls in https://github.com/njaremko/samael/pull/41 +[patch.crates-io.samael] +git = "https://github.com/oxidecomputer/samael" +branch = "oxide/omicron" From e88e06bf8e5eed16b42ac78ca536b06fdd0dc183 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 7 Feb 2024 17:12:07 -0800 Subject: [PATCH 18/27] [sled-agent] Initialize zones more independently of each other (#5012) Resolves https://github.com/oxidecomputer/omicron/issues/5002 This change attempts to make zone setup simpler and more independent. Namely: If any particular zone cannot start, due to NTP timesync, internal DNS lookup, or a missing disk, it should be able to fail without necessarily preventing all other zones from initializing. --- sled-agent/src/params.rs | 21 + sled-agent/src/services.rs | 843 ++++++++++++++++++------------------- 2 files changed, 441 insertions(+), 423 deletions(-) diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index f14a13aa41..52bfb20e5d 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -664,6 +664,27 @@ impl OmicronZoneType { *address, )) } + + /// Does this zone require time synchronization before it is initialized?" + /// + /// This function is somewhat conservative - the set of services + /// that can be launched before timesync has completed is intentionally kept + /// small, since it would be easy to add a service that expects time to be + /// reasonably synchronized. + pub fn requires_timesync(&self) -> bool { + match self { + // These zones can be initialized and started before time has been + // synchronized. For the NTP zones, this should be self-evident -- + // we need the NTP zone to actually perform time synchronization! + // + // The DNS zone is a bit of an exception here, since the NTP zone + // itself may rely on DNS lookups as a dependency. 
+ OmicronZoneType::BoundaryNtp { .. } + | OmicronZoneType::InternalNtp { .. } + | OmicronZoneType::InternalDns { .. } => false, + _ => true, + } + } } impl crate::smf_helper::Service for OmicronZoneType { diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index d2f440024c..6e99d2c8ef 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -57,7 +57,7 @@ use illumos_utils::running_zone::{ }; use illumos_utils::zfs::ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT; use illumos_utils::zone::AddressRequest; -use illumos_utils::zone::Zones; +use illumos_utils::zpool::ZpoolName; use illumos_utils::{execute, PFEXEC}; use internal_dns::resolver::Resolver; use itertools::Itertools; @@ -80,8 +80,7 @@ use omicron_common::api::internal::shared::{ HostPortConfig, RackNetworkConfig, }; use omicron_common::backoff::{ - retry_notify, retry_policy_internal_service_aggressive, retry_policy_local, - BackoffError, + retry_notify, retry_policy_internal_service_aggressive, BackoffError, }; use omicron_common::ledger::{self, Ledger, Ledgerable}; use omicron_common::nexus_config::{ @@ -101,7 +100,6 @@ use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::BTreeMap; use std::collections::HashSet; -use std::iter::FromIterator; use std::net::{IpAddr, Ipv6Addr, SocketAddr}; use std::str::FromStr; use std::sync::atomic::{AtomicBool, Ordering}; @@ -112,6 +110,11 @@ use tokio::sync::{oneshot, MutexGuard}; use tokio::task::JoinHandle; use uuid::Uuid; +#[cfg(test)] +use illumos_utils::zone::MockZones as Zones; +#[cfg(not(test))] +use illumos_utils::zone::Zones; + const IPV6_UNSPECIFIED: IpAddr = IpAddr::V6(Ipv6Addr::UNSPECIFIED); #[derive(thiserror::Error, Debug)] @@ -160,6 +163,16 @@ pub enum Error { err: illumos_utils::running_zone::RunCommandError, }, + #[error("Cannot list zones")] + ZoneList(#[source] illumos_utils::zone::AdmError), + + #[error("Cannot remove zone")] + ZoneRemoval { + zone_name: String, + #[source] + err: illumos_utils::zone::AdmError, + }, + #[error("Failed to boot zone: {0}")] ZoneBoot(#[from] illumos_utils::running_zone::BootError), @@ -169,6 +182,9 @@ pub enum Error { #[error(transparent)] ZoneInstall(#[from] illumos_utils::running_zone::InstallZoneError), + #[error("Failed to initialize zones: {errors:?}")] + ZoneEnsure { errors: Vec<(String, Error)> }, + #[error("Error contacting ddmd: {0}")] DdmError(#[from] DdmError), @@ -267,17 +283,47 @@ impl Error { impl From for omicron_common::api::external::Error { fn from(err: Error) -> Self { match err { - err @ Error::RequestedConfigConflicts(_) => { + Error::RequestedConfigConflicts(_) => { omicron_common::api::external::Error::invalid_request( &err.to_string(), ) } - err @ Error::RequestedConfigOutdated { .. } => { + Error::RequestedConfigOutdated { .. } => { omicron_common::api::external::Error::conflict(&err.to_string()) } - err @ Error::TimeNotSynchronized => { + Error::TimeNotSynchronized => { omicron_common::api::external::Error::unavail(&err.to_string()) } + Error::ZoneEnsure { errors } => { + // As a special case, if any zones failed to timesync, + // prioritize that error. + // + // This conversion to a 503 error was requested in + // https://github.com/oxidecomputer/omicron/issues/4776 , + // and we preserve that behavior here, even though we may + // launch many zones at the same time. 
+ if let Some(err) = errors.iter().find_map(|(_, err)| { + if matches!(err, Error::TimeNotSynchronized) { + Some(err) + } else { + None + } + }) { + omicron_common::api::external::Error::unavail( + &err.to_string(), + ) + } else { + let internal_message = errors + .iter() + .map(|(name, err)| { + format!("failed to start {name}: {err:?}") + }) + .join("\n"); + omicron_common::api::external::Error::InternalError { + internal_message, + } + } + } _ => omicron_common::api::external::Error::InternalError { internal_message: err.to_string(), }, @@ -300,27 +346,6 @@ fn display_zone_init_errors(errors: &[(String, Box)]) -> String { output } -// Does this zone require time synchronization before it is initialized?" -// -// This function is somewhat conservative - the set of services -// that can be launched before timesync has completed is intentionally kept -// small, since it would be easy to add a service that expects time to be -// reasonably synchronized. -fn zone_requires_timesync(zone_type: &OmicronZoneType) -> bool { - match zone_type { - // These zones can be initialized and started before time has been - // synchronized. For the NTP zones, this should be self-evident -- - // we need the NTP zone to actually perform time synchronization! - // - // The DNS zone is a bit of an exception here, since the NTP zone - // itself may rely on DNS lookups as a dependency. - OmicronZoneType::BoundaryNtp { .. } - | OmicronZoneType::InternalNtp { .. } - | OmicronZoneType::InternalDns { .. } => false, - _ => true, - } -} - /// Configuration parameters which modify the [`ServiceManager`]'s behavior. pub struct Config { /// Identifies the sled being configured @@ -551,7 +576,33 @@ enum SledLocalZone { }, } -type ZoneMap = BTreeMap; +// The return type for `start_omicron_zones`. +// +// When multiple zones are started concurrently, some can fail while others +// succeed. This structure allows the function to return this nuanced +// information. +#[must_use] +struct StartZonesResult { + // The set of zones which have successfully started. + new_zones: Vec, + + // The set of (zone name, error) of zones that failed to start. + errors: Vec<(String, Error)>, +} + +// A running zone and the configuration which started it. +struct OmicronZone { + runtime: RunningZone, + config: OmicronZoneConfigLocal, +} + +impl OmicronZone { + fn name(&self) -> &str { + self.runtime.name() + } +} + +type ZoneMap = BTreeMap; /// Manages miscellaneous Sled-local services. pub struct ServiceManagerInner { @@ -718,7 +769,7 @@ impl ServiceManager { &self, // This argument attempts to ensure that the caller holds the right // lock. - _map: &MutexGuard<'_, BTreeMap>, + _map: &MutexGuard<'_, ZoneMap>, ) -> Result>, Error> { // First, try to load the current software's zone ledger. If that // works, we're done. @@ -893,84 +944,9 @@ impl ServiceManager { let omicron_zones_config = zones_config.clone().to_omicron_zones_config(); - // Initialize internal DNS only first: we need it to look up the - // boundary switch addresses. This dependency is implicit: when we call - // `ensure_all_omicron_zones` below, we eventually land in - // `opte_ports_needed()`, which for some service types (including Ntp - // but _not_ including InternalDns), we perform internal DNS lookups. - let all_zones_request = self - .ensure_all_omicron_zones( - &mut existing_zones, - None, - omicron_zones_config.clone(), - |z: &OmicronZoneConfig| { - matches!(z.zone_type, OmicronZoneType::InternalDns { .. 
}) - }, - ) - .await?; - - // Initialize NTP services next as they are required for time - // synchronization, which is a pre-requisite for the other services. We - // keep `OmicronZoneType::InternalDns` because - // `ensure_all_omicron_zones` is additive. - let all_zones_request = self - .ensure_all_omicron_zones( - &mut existing_zones, - Some(&all_zones_request), - omicron_zones_config.clone(), - |z: &OmicronZoneConfig| { - matches!( - z.zone_type, - OmicronZoneType::InternalDns { .. } - | OmicronZoneType::BoundaryNtp { .. } - | OmicronZoneType::InternalNtp { .. } - ) - }, - ) - .await?; - - drop(existing_zones); - - info!(&self.inner.log, "Waiting for sled time synchronization"); - - retry_notify( - retry_policy_local(), - || async { - match self.timesync_get().await { - Ok(TimeSync { sync: true, .. }) => { - info!(&self.inner.log, "Time is synchronized"); - Ok(()) - } - Ok(ts) => Err(BackoffError::transient(format!( - "No sync {:?}", - ts - ))), - Err(e) => Err(BackoffError::transient(format!( - "Error checking for time synchronization: {}", - e - ))), - } - }, - |error, delay| { - warn!( - self.inner.log, - "Time not yet synchronised (retrying in {:?})", - delay; - "error" => ?error - ); - }, - ) - .await - .expect("Expected an infinite retry loop syncing time"); - - let mut existing_zones = self.inner.zones.lock().await; - - // Initialize all remaining services self.ensure_all_omicron_zones( &mut existing_zones, - Some(&all_zones_request), omicron_zones_config, - |_| true, ) .await?; Ok(()) @@ -2688,17 +2664,73 @@ impl ServiceManager { Ok(running_zone) } - // Populates `existing_zones` according to the requests in `services`. - async fn initialize_omicron_zones_locked( + // Ensures that a single Omicron zone is running. + // + // This method is NOT idempotent. + // + // - If the zone already exists, in any form, it is fully removed + // before being initialized. This is primarily intended to remove "partially + // stopped/started" zones with detritus from interfering with a new zone + // being launched. + // - If zones need time to be synchronized before they are initialized + // (e.g., this is a hard requirement for CockroachDb) they can check the + // `time_is_synchronized` argument. + // - `all_u2_pools` provides a snapshot into durable storage on this sled, + // which gives the storage manager an opportunity to validate the zone's + // storage configuration against the reality of the current sled. + async fn start_omicron_zone( &self, - existing_zones: &mut BTreeMap, - requests: &Vec, - ) -> Result<(), Error> { - if let Some(name) = requests - .iter() - .map(|request| request.zone.zone_name()) - .duplicates() - .next() + zone: &OmicronZoneConfig, + time_is_synchronized: bool, + all_u2_pools: &Vec, + ) -> Result { + // Ensure the zone has been fully removed before we try to boot it. + // + // This ensures that old "partially booted/stopped" zones do not + // interfere with our installation. + self.ensure_removed(&zone).await?; + + // If this zone requires timesync and we aren't ready, fail it early. + if zone.zone_type.requires_timesync() && !time_is_synchronized { + return Err(Error::TimeNotSynchronized); + } + + // Ensure that this zone's storage is ready. 
+ let root = self + .validate_storage_and_pick_mountpoint(&zone, &all_u2_pools) + .await?; + + let config = OmicronZoneConfigLocal { zone: zone.clone(), root }; + + let runtime = self + .initialize_zone( + ZoneArgs::Omicron(&config), + // filesystems= + &[], + // data_links= + &[], + ) + .await?; + + Ok(OmicronZone { runtime, config }) + } + + // Concurrently attempts to start all zones identified by requests. + // + // This method is NOT idempotent. + // + // If we try to start ANY zones concurrently, the result is contained + // in the `StartZonesResult` value. This will contain the set of zones which + // were initialized successfully, as well as the set of zones which failed + // to start. + async fn start_omicron_zones( + &self, + requests: impl Iterator + Clone, + time_is_synchronized: bool, + all_u2_pools: &Vec, + ) -> Result { + if let Some(name) = + requests.clone().map(|zone| zone.zone_name()).duplicates().next() { return Err(Error::BadServiceRequest { service: name, @@ -2706,38 +2738,29 @@ impl ServiceManager { }); } - let futures = requests.iter().map(|request| { - async move { - self.initialize_zone( - ZoneArgs::Omicron(request), - // filesystems= - &[], - // data_links= - &[], - ) + let futures = requests.map(|zone| async move { + self.start_omicron_zone(&zone, time_is_synchronized, all_u2_pools) .await - .map_err(|error| (request.zone.zone_name(), error)) - } + .map_err(|err| (zone.zone_name().to_string(), err)) }); + let results = futures::future::join_all(futures).await; + let mut new_zones = Vec::new(); let mut errors = Vec::new(); for result in results { match result { Ok(zone) => { - existing_zones.insert(zone.name().to_string(), zone); + info!(self.inner.log, "Zone started"; "zone" => zone.name()); + new_zones.push(zone); } - Err((zone_name, error)) => { - errors.push((zone_name, Box::new(error))); + Err((name, error)) => { + warn!(self.inner.log, "Zone failed to start"; "zone" => &name); + errors.push((name, error)) } } } - - if !errors.is_empty() { - return Err(Error::ZoneInitialize(errors)); - } - - Ok(()) + Ok(StartZonesResult { new_zones, errors }) } /// Create a zone bundle for the provided zone. @@ -2761,7 +2784,7 @@ impl ServiceManager { return self .inner .zone_bundler - .create(zone, ZoneBundleCause::ExplicitRequest) + .create(&zone.runtime, ZoneBundleCause::ExplicitRequest) .await; } Err(BundleError::NoSuchZone { name: name.to_string() }) @@ -2860,14 +2883,19 @@ impl ServiceManager { } } - let new_config = self - .ensure_all_omicron_zones( - &mut existing_zones, - Some(ledger_zone_config), - request, - |_| true, - ) - .await?; + let omicron_generation = request.generation; + let ledger_generation = ledger_zone_config.ledger_generation; + self.ensure_all_omicron_zones(&mut existing_zones, request).await?; + let zones = existing_zones + .values() + .map(|omicron_zone| omicron_zone.config.clone()) + .collect(); + + let new_config = OmicronZonesConfigLocal { + omicron_generation, + ledger_generation, + zones, + }; // Update the zones in the ledger and write it back to both M.2s *ledger_zone_config = new_config; @@ -2878,44 +2906,48 @@ impl ServiceManager { // Ensures that only the following Omicron zones are running. // - // Does not record any information such that these services are - // re-instantiated on boot. - async fn ensure_all_omicron_zones( + // This method strives to be idempotent. 
+ // + // - Starting and stopping zones is not an atomic operation - it's possible + // that we cannot start a zone after a previous one has been successfully + // created (or destroyed) intentionally. As a result, even in error cases, + // it's possible that the set of `existing_zones` changes. However, this set + // will only change in the direction of `new_request`: zones will only be + // removed if they ARE NOT part of `new_request`, and zones will only be + // added if they ARE part of `new_request`. + // - Zones are not updated in-place: two zone configurations that differ + // in any way are treated as entirely distinct. + // - This method does not record any information such that these services + // are re-instantiated on boot. + async fn ensure_all_omicron_zones( &self, // The MutexGuard here attempts to ensure that the caller has the right // lock held when calling this function. - existing_zones: &mut MutexGuard<'_, BTreeMap>, - old_config: Option<&OmicronZonesConfigLocal>, + existing_zones: &mut MutexGuard<'_, ZoneMap>, new_request: OmicronZonesConfig, - filter: F, - ) -> Result - where - F: Fn(&OmicronZoneConfig) -> bool, - { - let log = &self.inner.log; - + ) -> Result<(), Error> { // Do some data-normalization to ensure we can compare the "requested // set" vs the "existing set" as HashSets. - let old_zones_set: HashSet = old_config - .map(|old_config| { - HashSet::from_iter( - old_config.zones.iter().map(|z| z.zone.clone()), - ) - }) - .unwrap_or_else(HashSet::new); - let requested_zones_set = - HashSet::from_iter(new_request.zones.into_iter().filter(filter)); + let old_zone_configs: HashSet = existing_zones + .values() + .map(|omicron_zone| omicron_zone.config.zone.clone()) + .collect(); + let requested_zones_set: HashSet = + new_request.zones.into_iter().collect(); let zones_to_be_removed = - old_zones_set.difference(&requested_zones_set); - let zones_to_be_added = requested_zones_set.difference(&old_zones_set); + old_zone_configs.difference(&requested_zones_set); + let zones_to_be_added = + requested_zones_set.difference(&old_zone_configs); - // For each new zone request, ensure that we've sufficiently - // synchronized time. - // - // NOTE: This imposes a constraint, during initial setup, cold boot, - // etc, that NTP and the internal DNS system it depends on MUST be - // initialized prior to other zones. 
+ // Destroy zones that should not be running + for zone in zones_to_be_removed { + self.zone_bundle_and_try_remove(existing_zones, &zone).await; + } + + // Collect information that's necessary to start new zones + let storage = self.inner.storage.get_latest_resources().await; + let all_u2_pools = storage.all_u2_zpools(); let time_is_synchronized = match self.timesync_get_locked(&existing_zones).await { // Time is synchronized @@ -2923,166 +2955,179 @@ impl ServiceManager { // Time is not synchronized, or we can't check _ => false, }; - for zone in zones_to_be_added.clone() { - if zone_requires_timesync(&zone.zone_type) && !time_is_synchronized - { - return Err(Error::TimeNotSynchronized); - } + + // Concurrently boot all new zones + let StartZonesResult { new_zones, errors } = self + .start_omicron_zones( + zones_to_be_added, + time_is_synchronized, + &all_u2_pools, + ) + .await?; + + // Add the new zones to our tracked zone set + existing_zones.extend( + new_zones.into_iter().map(|zone| (zone.name().to_string(), zone)), + ); + + // If any zones failed to start, exit with an error + if !errors.is_empty() { + return Err(Error::ZoneEnsure { errors }); } + Ok(()) + } - // Destroy zones that should not be running - for zone in zones_to_be_removed { - let expected_zone_name = zone.zone_name(); - if let Some(mut zone) = existing_zones.remove(&expected_zone_name) { - debug!( - log, - "removing an existing zone"; - "zone_name" => &expected_zone_name, + // Attempts to take a zone bundle and remove a zone. + // + // Logs, but does not return an error on failure. + async fn zone_bundle_and_try_remove( + &self, + existing_zones: &mut MutexGuard<'_, ZoneMap>, + zone: &OmicronZoneConfig, + ) { + let log = &self.inner.log; + let expected_zone_name = zone.zone_name(); + let Some(mut zone) = existing_zones.remove(&expected_zone_name) else { + warn!( + log, + "Expected to remove zone, but could not find it"; + "zone_name" => &expected_zone_name, + ); + return; + }; + debug!( + log, + "removing an existing zone"; + "zone_name" => &expected_zone_name, + ); + if let Err(e) = self + .inner + .zone_bundler + .create(&zone.runtime, ZoneBundleCause::UnexpectedZone) + .await + { + error!( + log, + "Failed to take bundle of unexpected zone"; + "zone_name" => &expected_zone_name, + "reason" => ?e, + ); + } + if let Err(e) = zone.runtime.stop().await { + error!(log, "Failed to stop zone {}: {e}", zone.name()); + } + } + + // Ensures that if a zone is about to be installed, it does not exist. 
+    async fn ensure_removed(
+        &self,
+        zone: &OmicronZoneConfig,
+    ) -> Result<(), Error> {
+        let zone_name = zone.zone_name();
+        match Zones::find(&zone_name).await {
+            Ok(Some(zone)) => {
+                warn!(
+                    self.inner.log,
+                    "removing zone";
+                    "zone" => &zone_name,
+                    "state" => ?zone.state(),
                );
-                if let Err(e) = self
-                    .inner
-                    .zone_bundler
-                    .create(&zone, ZoneBundleCause::UnexpectedZone)
-                    .await
+                if let Err(e) =
+                    Zones::halt_and_remove_logged(&self.inner.log, &zone_name)
+                        .await
                {
                    error!(
-                        log,
-                        "Failed to take bundle of unexpected zone";
-                        "zone_name" => &expected_zone_name,
-                        "reason" => ?e,
+                        self.inner.log,
+                        "Failed to remove zone";
+                        "zone" => &zone_name,
+                        "error" => %e,
                    );
+                    return Err(Error::ZoneRemoval {
+                        zone_name: zone_name.to_string(),
+                        err: e,
+                    });
                }
-                if let Err(e) = zone.stop().await {
-                    error!(log, "Failed to stop zone {}: {e}", zone.name());
-                }
-            } else {
-                warn!(log, "Expected to remove zone, but could not find it");
+                return Ok(());
            }
+            Ok(None) => return Ok(()),
+            Err(err) => return Err(Error::ZoneList(err)),
        }
+    }

-        // Create zones that should be running
-        let storage = self.inner.storage.get_latest_resources().await;
-        let all_u2_pools = storage.all_u2_zpools();
-
-        let mut new_zones = Vec::new();
-        for zone in zones_to_be_added {
-            // Check if we think the zone should already be running
-            let name = zone.zone_name();
-            if existing_zones.contains_key(&name) {
-                // Make sure the zone actually exists in the right state too
-                match Zones::find(&name).await {
-                    Ok(Some(zone)) if zone.state() == zone::State::Running => {
-                        info!(log, "skipping running zone"; "zone" => &name);
-                        continue;
-                    }
-                    _ => {
-                        // Mismatch between SA's view and reality, let's try to
-                        // clean up any remanants and try initialize it again
-                        warn!(
-                            log,
-                            "expected to find existing zone in running state";
-                            "zone" => &name,
-                        );
-                        if let Err(e) =
-                            existing_zones.remove(&name).unwrap().stop().await
-                        {
-                            error!(
-                                log,
-                                "Failed to stop zone";
-                                "zone" => &name,
-                                "error" => %e,
-                            );
-                        }
-                    }
-                }
-            }
+    // Returns a zone filesystem mountpoint, after ensuring that U.2 storage
+    // is valid.
+    async fn validate_storage_and_pick_mountpoint(
+        &self,
+        zone: &OmicronZoneConfig,
+        all_u2_pools: &Vec<ZpoolName>,
+    ) -> Result<Utf8PathBuf, Error> {
+        let name = zone.zone_name();
+
+        // For each new zone request, we pick a U.2 to store the zone
+        // filesystem. Note: This isn't known to Nexus right now, so it's a
+        // local-to-sled decision.
+        //
+        // Currently, the zone filesystem should be destroyed between
+        // reboots, so it's fine to make this decision locally.
+        let root = if let Some(dataset) = zone.dataset_name() {
+            // Check that the dataset is actually ready to be used.
- let [zoned, canmount, encryption] = - illumos_utils::zfs::Zfs::get_values( - &dataset.full_name(), - &["zoned", "canmount", "encryption"], - ) - .map_err(|err| Error::GetZfsValue { + let check_property = |name, actual, expected| { + if actual != expected { + return Err(Error::DatasetNotReady { zone: zone.zone_name(), - source: err, - })?; - - let check_property = |name, actual, expected| { - if actual != expected { - return Err(Error::DatasetNotReady { - zone: zone.zone_name(), - dataset: dataset.full_name(), - prop_name: String::from(name), - prop_value: actual, - prop_value_expected: String::from(expected), - }); - } - return Ok(()); - }; - check_property("zoned", zoned, "on")?; - check_property("canmount", canmount, "on")?; - if dataset.dataset().dataset_should_be_encrypted() { - check_property("encryption", encryption, "aes-256-gcm")?; - } - - // If the zone happens to already manage a dataset, then - // we co-locate the zone dataset on the same zpool. - // - // This slightly reduces the underlying fault domain for the - // service. - let data_pool = dataset.pool(); - if !all_u2_pools.contains(&data_pool) { - warn!( - log, - "zone dataset requested on a zpool which doesn't exist"; - "zone" => &name, - "zpool" => %data_pool - ); - return Err(Error::MissingDevice { - device: format!("zpool: {data_pool}"), + dataset: dataset.full_name(), + prop_name: String::from(name), + prop_value: actual, + prop_value_expected: String::from(expected), }); } - data_pool.dataset_mountpoint(ZONE_DATASET) - } else { - // If the zone it not coupled to other datsets, we pick one - // arbitrarily. - let mut rng = rand::thread_rng(); - all_u2_pools - .choose(&mut rng) - .map(|pool| pool.dataset_mountpoint(ZONE_DATASET)) - .ok_or_else(|| Error::U2NotFound)? - .clone() + return Ok(()); }; - - new_zones.push(OmicronZoneConfigLocal { zone: zone.clone(), root }); - } - - self.initialize_omicron_zones_locked(existing_zones, &new_zones) - .await?; - - if let Some(old_config) = old_config { - for old_zone in &old_config.zones { - if requested_zones_set.contains(&old_zone.zone) { - new_zones.push(old_zone.clone()); - } + check_property("zoned", zoned, "on")?; + check_property("canmount", canmount, "on")?; + if dataset.dataset().dataset_should_be_encrypted() { + check_property("encryption", encryption, "aes-256-gcm")?; } - } - Ok(OmicronZonesConfigLocal { - omicron_generation: new_request.generation, - ledger_generation: old_config - .map(|c| c.ledger_generation) - .unwrap_or_else(Generation::new), - zones: new_zones, - }) + // If the zone happens to already manage a dataset, then + // we co-locate the zone dataset on the same zpool. + // + // This slightly reduces the underlying fault domain for the + // service. + let data_pool = dataset.pool(); + if !all_u2_pools.contains(&data_pool) { + warn!( + self.inner.log, + "zone dataset requested on a zpool which doesn't exist"; + "zone" => &name, + "zpool" => %data_pool + ); + return Err(Error::MissingDevice { + device: format!("zpool: {data_pool}"), + }); + } + data_pool.dataset_mountpoint(ZONE_DATASET) + } else { + // If the zone it not coupled to other datsets, we pick one + // arbitrarily. + let mut rng = rand::thread_rng(); + all_u2_pools + .choose(&mut rng) + .map(|pool| pool.dataset_mountpoint(ZONE_DATASET)) + .ok_or_else(|| Error::U2NotFound)? 
+                .clone()
+        };
+        Ok(root)
     }
 
     pub async fn cockroachdb_initialize(&self) -> Result<(), Error> {
@@ -3095,7 +3140,7 @@ impl ServiceManager {
             if zone.name().contains(&ZoneType::CockroachDb.to_string()) {
                 let address = Zones::get_address(
                     Some(zone.name()),
-                    &zone.control_interface(),
+                    &zone.runtime.control_interface(),
                 )?
                 .ip();
                 let host = &format!("[{address}]:{COCKROACH_PORT}");
@@ -3103,7 +3148,7 @@ impl ServiceManager {
                     log,
                     "Initializing CRDB Cluster - sending request to {host}"
                 );
-                if let Err(err) = zone.run_cmd(&[
+                if let Err(err) = zone.runtime.run_cmd(&[
                     "/opt/oxide/cockroachdb/bin/cockroach",
                     "init",
                     "--insecure",
@@ -3118,26 +3163,28 @@ impl ServiceManager {
                 }
             };
             info!(log, "Formatting CRDB");
-            zone.run_cmd(&[
-                "/opt/oxide/cockroachdb/bin/cockroach",
-                "sql",
-                "--insecure",
-                "--host",
-                host,
-                "--file",
-                "/opt/oxide/cockroachdb/sql/dbwipe.sql",
-            ])
-            .map_err(|err| Error::CockroachInit { err })?;
-            zone.run_cmd(&[
-                "/opt/oxide/cockroachdb/bin/cockroach",
-                "sql",
-                "--insecure",
-                "--host",
-                host,
-                "--file",
-                "/opt/oxide/cockroachdb/sql/dbinit.sql",
-            ])
-            .map_err(|err| Error::CockroachInit { err })?;
+            zone.runtime
+                .run_cmd(&[
+                    "/opt/oxide/cockroachdb/bin/cockroach",
+                    "sql",
+                    "--insecure",
+                    "--host",
+                    host,
+                    "--file",
+                    "/opt/oxide/cockroachdb/sql/dbwipe.sql",
+                ])
+                .map_err(|err| Error::CockroachInit { err })?;
+            zone.runtime
+                .run_cmd(&[
+                    "/opt/oxide/cockroachdb/bin/cockroach",
+                    "sql",
+                    "--insecure",
+                    "--host",
+                    host,
+                    "--file",
+                    "/opt/oxide/cockroachdb/sql/dbinit.sql",
+                ])
+                .map_err(|err| Error::CockroachInit { err })?;
             info!(log, "Formatting CRDB - Completed");
 
             // In the single-sled case, if there are multiple CRDB nodes on
@@ -3218,7 +3265,8 @@ impl ServiceManager {
         // connect to the UNIX socket at
         // format!("{}/var/run/chrony/chronyd.sock", ntp_zone.root())
 
-        match ntp_zone.run_cmd(&["/usr/bin/chronyc", "-c", "tracking"]) {
+        match ntp_zone.runtime.run_cmd(&["/usr/bin/chronyc", "-c", "tracking"])
+        {
             Ok(stdout) => {
                 let v: Vec<&str> = stdout.split(',').collect();
 
@@ -3808,6 +3856,15 @@ mod test {
         expected_zone_name_prefix: &str,
     ) -> Vec<Box<dyn std::any::Any>> {
         illumos_utils::USE_MOCKS.store(true, Ordering::SeqCst);
+
+        // Ensure zone doesn't already exist
+        let find_zone_ctx = MockZones::find_context();
+        let prefix = expected_zone_name_prefix.to_string();
+        find_zone_ctx.expect().return_once(move |zone_name| {
+            assert!(zone_name.starts_with(&prefix));
+            Ok(None)
+        });
+
         // Create a VNIC
         let create_vnic_ctx = MockDladm::create_vnic_context();
         create_vnic_ctx.expect().return_once(
@@ -3865,6 +3922,7 @@ mod test {
         });
 
         vec![
+            Box::new(find_zone_ctx),
             Box::new(create_vnic_ctx),
             Box::new(install_ctx),
             Box::new(boot_ctx),
@@ -3882,6 +3940,11 @@ mod test {
     // because these functions may return any number of times.
     fn expect_new_services() -> Vec<Box<dyn std::any::Any>> {
         illumos_utils::USE_MOCKS.store(true, Ordering::SeqCst);
+
+        // Ensure zones don't already exist
+        let find_zone_ctx = MockZones::find_context();
+        find_zone_ctx.expect().returning(move |_zone_name| Ok(None));
+
         // Create a VNIC
         let create_vnic_ctx = MockDladm::create_vnic_context();
         create_vnic_ctx.expect().returning(
@@ -3940,6 +4003,7 @@ mod test {
         });
 
         vec![
+            Box::new(find_zone_ctx),
             Box::new(create_vnic_ctx),
             Box::new(install_ctx),
             Box::new(boot_ctx),
@@ -4229,9 +4293,24 @@ mod test {
             OmicronZoneType::Oximeter { address },
         )
         .await;
+
+        // First, ensure this is the right kind of error.
+ let err = result.unwrap_err(); + let errors = match &err { + Error::ZoneEnsure { errors } => errors, + err => panic!("unexpected result: {err:?}"), + }; + assert_eq!(errors.len(), 1); assert_matches::assert_matches!( - result, - Err(Error::TimeNotSynchronized) + errors[0].1, + Error::TimeNotSynchronized + ); + + // Next, ensure this still converts to an "unavail" common error + let common_err = omicron_common::api::external::Error::from(err); + assert_matches::assert_matches!( + common_err, + omicron_common::api::external::Error::ServiceUnavailable { .. } ); // Should succeed: we don't care that time has not yet synchronized (for @@ -4536,88 +4615,6 @@ mod test { logctx.cleanup_successful(); } - #[tokio::test] - async fn test_old_ledger_migration_continue() { - // This test is just like "test_old_ledger_migration", except that we - // deploy a new zone after migration and before shutting down the - // service manager. This tests that new changes modify the new, - // migrated config. - let logctx = omicron_test_utils::dev::test_setup_log( - "test_old_ledger_migration_continue", - ); - let test_config = TestConfig::new().await; - - // Before we start the service manager, stuff one of our old-format - // service ledgers into place. - let contents = - include_str!("../tests/old-service-ledgers/rack2-sled10.json"); - std::fs::write( - test_config.config_dir.path().join(SERVICES_LEDGER_FILENAME), - contents, - ) - .expect("failed to copy example old-format services ledger into place"); - - // Now start the service manager. - let helper = - LedgerTestHelper::new(logctx.log.clone(), &test_config).await; - let mgr = helper.clone().new_service_manager(); - LedgerTestHelper::sled_agent_started(&logctx.log, &test_config, &mgr); - - // Trigger the migration code. - let unused = Mutex::new(BTreeMap::new()); - let migrated_ledger = mgr - .load_ledgered_zones(&unused.lock().await) - .await - .expect("failed to load ledgered zones") - .unwrap(); - - // The other test verified that migration has happened normally so let's - // assume it has. Now provision a new zone. - let vv = migrated_ledger.data().omicron_generation.next(); - let id = Uuid::new_v4(); - - let _expectations = expect_new_services(); - let address = - SocketAddrV6::new(Ipv6Addr::LOCALHOST, EXPECTED_PORT, 0, 0); - let mut zones = - migrated_ledger.data().clone().to_omicron_zones_config().zones; - zones.push(OmicronZoneConfig { - id, - underlay_address: Ipv6Addr::LOCALHOST, - zone_type: OmicronZoneType::InternalNtp { - address, - ntp_servers: vec![], - dns_servers: vec![], - domain: None, - }, - }); - mgr.ensure_all_omicron_zones_persistent(OmicronZonesConfig { - generation: vv, - zones, - }) - .await - .expect("failed to add new zone after migration"); - let found = - mgr.omicron_zones_list().await.expect("failed to list zones"); - assert_eq!(found.generation, vv); - assert_eq!(found.zones.len(), migrated_ledger.data().zones.len() + 1); - - // Just to be sure, shut down the manager and create a new one without - // triggering migration again. It should now report one more zone than - // was migrated earlier. 
-        drop_service_manager(mgr);
-
-        let mgr = helper.new_service_manager();
-        LedgerTestHelper::sled_agent_started(&logctx.log, &test_config, &mgr);
-        let found =
-            mgr.omicron_zones_list().await.expect("failed to list zones");
-        assert_eq!(found.generation, vv);
-        assert_eq!(found.zones.len(), migrated_ledger.data().zones.len() + 1);
-
-        drop_service_manager(mgr);
-        logctx.cleanup_successful();
-    }
-
     #[tokio::test]
     async fn test_old_ledger_migration_bad() {
         let logctx = omicron_test_utils::dev::test_setup_log(

From f7ee170f267ca8c1ffbc9ed664bff834aa03b605 Mon Sep 17 00:00:00 2001
From: John Gallagher
Date: Wed, 7 Feb 2024 21:14:03 -0500
Subject: [PATCH 19/27] [omdb] Add `db rack list` (#5021)

In the context of #5009, it would be good to be able to quickly confirm
whether a rack has the correct or incorrect subnet recorded in CRDB.

Madrid is incorrect (as expected):

```
root@oxz_switch1:~# /var/tmp/john/omdb db rack list
ID                                   INITIALIZED TUF_BASE_URL RACK_SUBNET
ed6bcf59-9620-491d-8ebd-4a4eebf2e136 true        -            fd00:1122:3344:1::/56
```

Rack2 is correct (not surprising, since this would've been set after RSS
ran, which means it wouldn't hit #5009, although I don't know _exactly_
how this did get set in practice):

```
root@oxz_switch1:~# /var/tmp/john/omdb db rack list
ID                                   INITIALIZED TUF_BASE_URL RACK_SUBNET
de608e01-b8e4-4d93-b972-a7dbed36dd22 true        -            fd00:1122:3344:100::/56
```
---
 dev-tools/omdb/src/bin/omdb/db.rs     | 61 +++++++++++++++++++++++++++
 dev-tools/omdb/tests/usage_errors.out |  2 +
 2 files changed, 63 insertions(+)

diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs
index c2a4250595..989655dfed 100644
--- a/dev-tools/omdb/src/bin/omdb/db.rs
+++ b/dev-tools/omdb/src/bin/omdb/db.rs
@@ -160,6 +160,8 @@ pub struct DbFetchOptions {
 /// Subcommands that query or update the database
 #[derive(Debug, Subcommand)]
 enum DbCommands {
+    /// Print information about the rack
+    Rack(RackArgs),
     /// Print information about disks
     Disks(DiskArgs),
     /// Print information about internal and external DNS
@@ -180,6 +182,18 @@ enum DbCommands {
     Validate(ValidateArgs),
 }
 
+#[derive(Debug, Args)]
+struct RackArgs {
+    #[command(subcommand)]
+    command: RackCommands,
+}
+
+#[derive(Debug, Subcommand)]
+enum RackCommands {
+    /// Summarize current racks
+    List,
+}
+
 #[derive(Debug, Args)]
 struct DiskArgs {
     #[command(subcommand)]
@@ -406,6 +420,9 @@ impl DbArgs {
             let opctx = OpContext::for_tests(log.clone(), datastore.clone());
 
             match &self.command {
+                DbCommands::Rack(RackArgs { command: RackCommands::List }) => {
+                    cmd_db_rack_list(&opctx, &datastore, &self.fetch_opts).await
+                }
                 DbCommands::Disks(DiskArgs {
                     command: DiskCommands::Info(uuid),
                 }) => cmd_db_disk_info(&opctx, &datastore, uuid).await,
@@ -619,6 +636,50 @@ async fn cmd_db_disk_list(
     Ok(())
 }
 
+/// Run `omdb db rack list`.
+async fn cmd_db_rack_list( + opctx: &OpContext, + datastore: &DataStore, + fetch_opts: &DbFetchOptions, +) -> Result<(), anyhow::Error> { + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct RackRow { + id: String, + initialized: bool, + tuf_base_url: String, + rack_subnet: String, + } + + let ctx = || "listing racks".to_string(); + + let limit = fetch_opts.fetch_limit; + let rack_list = datastore + .rack_list(opctx, &first_page(limit)) + .await + .context("listing racks")?; + check_limit(&rack_list, limit, ctx); + + let rows = rack_list.into_iter().map(|rack| RackRow { + id: rack.id().to_string(), + initialized: rack.initialized, + tuf_base_url: rack.tuf_base_url.unwrap_or_else(|| "-".to_string()), + rack_subnet: rack + .rack_subnet + .map(|subnet| subnet.to_string()) + .unwrap_or_else(|| "-".to_string()), + }); + + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + + println!("{}", table); + + Ok(()) +} + /// Run `omdb db disk info `. async fn cmd_db_disk_info( opctx: &OpContext, diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 7688372984..6f9b539371 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -90,6 +90,7 @@ Query the control plane database (CockroachDB) Usage: omdb db [OPTIONS] Commands: + rack Print information about the rack disks Print information about disks dns Print information about internal and external DNS inventory Print information about collected hardware/software inventory @@ -118,6 +119,7 @@ Query the control plane database (CockroachDB) Usage: omdb db [OPTIONS] Commands: + rack Print information about the rack disks Print information about disks dns Print information about internal and external DNS inventory Print information about collected hardware/software inventory From 06833fbc21d1faab6408dfba6eed0137579f06dd Mon Sep 17 00:00:00 2001 From: David Crespo Date: Wed, 7 Feb 2024 20:59:26 -0600 Subject: [PATCH 20/27] Use macos-14 (M1) runner (#5020) https://github.blog/changelog/2024-01-30-github-actions-introducing-the-new-m1-macos-runner-available-to-open-source/ --------- Co-authored-by: iliana etaoin --- .github/workflows/rust.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 6ec296c3a9..724f88e7a3 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -26,7 +26,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-22.04, macos-12 ] + os: [ ubuntu-22.04, macos-14 ] steps: # This repo is unstable and unnecessary: https://github.com/microsoft/linux-package-repositories/issues/34 - name: Disable packages.microsoft.com repo From a6f5733ca3979e42a00be27b521222837c46b9c7 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 7 Feb 2024 19:46:57 -0800 Subject: [PATCH 21/27] [sled-agent] Avoid unnecessary ledger commit (#5025) Fixes https://github.com/oxidecomputer/omicron/issues/5014 --- sled-agent/src/services.rs | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 6e99d2c8ef..bc40187b38 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -368,7 +368,13 @@ const ZONES_LEDGER_FILENAME: &str = "omicron-zones.json"; /// wants for all of its zones) with the locally-determined configuration for /// these zones. 
#[derive( - Clone, Debug, serde::Serialize, serde::Deserialize, schemars::JsonSchema, + Clone, + Debug, + Eq, + PartialEq, + serde::Serialize, + serde::Deserialize, + schemars::JsonSchema, )] pub struct OmicronZonesConfigLocal { /// generation of the Omicron-provided part of the configuration @@ -429,7 +435,13 @@ impl OmicronZonesConfigLocal { /// wants for this zone) with any locally-determined configuration (like the /// path to the root filesystem) #[derive( - Clone, Debug, serde::Serialize, serde::Deserialize, schemars::JsonSchema, + Clone, + Debug, + Eq, + PartialEq, + serde::Serialize, + serde::Deserialize, + schemars::JsonSchema, )] pub struct OmicronZoneConfigLocal { pub zone: OmicronZoneConfig, @@ -2897,6 +2909,12 @@ impl ServiceManager { zones, }; + // If the contents of the ledger would be identical, we can avoid + // performing an update and commit. + if *ledger_zone_config == new_config { + return Ok(()); + } + // Update the zones in the ledger and write it back to both M.2s *ledger_zone_config = new_config; ledger.commit().await?; From dd9df92ea2294855399831a8c5d216d00c6d5aa8 Mon Sep 17 00:00:00 2001 From: David Crespo Date: Thu, 8 Feb 2024 08:03:14 -0600 Subject: [PATCH 22/27] Remove articles from API endpoint summaries (#5015) Closes #4741 The process here was pretty cool. I fed all the lines to GPT-4 and got it to give me a giant `sed` command to update all the summaries. It had a couple of issues as written, but I fixed them. Details [here](https://gist.github.com/david-crespo/b5cd941045dc518a61e1eab1a5d1f50e). Then I manually went through and tweaked everything. --- nexus/src/external_api/http_entrypoints.rs | 247 +++++++++++---------- openapi/nexus.json | 221 +++++++++--------- 2 files changed, 238 insertions(+), 230 deletions(-) diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 28755e5959..3a9e957328 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -411,7 +411,7 @@ async fn ping( Ok(HttpResponseOk(views::Ping { status: views::PingStatus::Ok })) } -/// Fetch the top-level IAM policy +/// Fetch top-level IAM policy #[endpoint { method = GET, path = "/v1/system/policy", @@ -430,7 +430,7 @@ async fn system_policy_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update the top-level IAM policy +/// Update top-level IAM policy #[endpoint { method = PUT, path = "/v1/system/policy", @@ -454,7 +454,7 @@ async fn system_policy_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch the current silo's IAM policy +/// Fetch current silo's IAM policy #[endpoint { method = GET, path = "/v1/policy", @@ -481,7 +481,7 @@ pub(crate) async fn policy_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update the current silo's IAM policy +/// Update current silo's IAM policy #[endpoint { method = PUT, path = "/v1/policy", @@ -513,7 +513,7 @@ async fn policy_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// View the resource utilization of the user's current silo +/// Fetch resource utilization for user's current silo #[endpoint { method = GET, path = "/v1/utilization", @@ -535,7 +535,7 @@ async fn utilization_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// View the current utilization of a given silo +/// Fetch current utilization for given silo #[endpoint { method = GET, 
path = "/v1/system/utilization/silos/{silo}", @@ -628,7 +628,7 @@ async fn system_quotas_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// View the resource quotas of a given silo +/// Fetch resource quotas for silo #[endpoint { method = GET, path = "/v1/system/silos/{silo}/quotas", @@ -651,7 +651,7 @@ async fn silo_quotas_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update the resource quotas of a given silo +/// Update resource quotas for silo /// /// If a quota value is not specified, it will remain unchanged. #[endpoint { @@ -735,9 +735,9 @@ async fn silo_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a silo +/// Fetch silo /// -/// Fetch a silo by name or ID. +/// Fetch silo by name or ID. #[endpoint { method = GET, path = "/v1/system/silos/{silo}", @@ -829,7 +829,7 @@ async fn silo_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a silo's IAM policy +/// Fetch silo IAM policy #[endpoint { method = GET, path = "/v1/system/silos/{silo}/policy", @@ -851,7 +851,7 @@ async fn silo_policy_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a silo's IAM policy +/// Update silo IAM policy #[endpoint { method = PUT, path = "/v1/system/silos/{silo}/policy", @@ -881,7 +881,7 @@ async fn silo_policy_update( // Silo-specific user endpoints -/// List built-in (system) users in a silo +/// List built-in (system) users in silo #[endpoint { method = GET, path = "/v1/system/users", @@ -922,7 +922,7 @@ struct UserParam { user_id: Uuid, } -/// Fetch a built-in (system) user +/// Fetch built-in (system) user #[endpoint { method = GET, path = "/v1/system/users/{user_id}", @@ -986,7 +986,7 @@ async fn silo_identity_provider_list( // Silo SAML identity providers -/// Create a SAML IdP +/// Create SAML IdP #[endpoint { method = POST, path = "/v1/system/identity-providers/saml", @@ -1015,7 +1015,7 @@ async fn saml_identity_provider_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a SAML IdP +/// Fetch SAML IdP #[endpoint { method = GET, path = "/v1/system/identity-providers/saml/{provider}", @@ -1053,7 +1053,7 @@ async fn saml_identity_provider_view( // "Local" Identity Provider -/// Create a user +/// Create user /// /// Users can only be created in Silos with `provision_type` == `Fixed`. /// Otherwise, Silo users are just-in-time (JIT) provisioned when a user first @@ -1086,7 +1086,7 @@ async fn local_idp_user_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a user +/// Delete user #[endpoint { method = DELETE, path = "/v1/system/identity-providers/local/users/{user_id}", @@ -1110,7 +1110,7 @@ async fn local_idp_user_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Set or invalidate a user's password +/// Set or invalidate user's password /// /// Passwords can only be updated for users in Silos with identity mode /// `LocalOnly`. 
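[Editor's note: the quota-update behavior documented above ("if a quota value is not specified, it will remain unchanged") is the conventional Option-per-field partial update. A minimal sketch of that pattern; the field names here are invented for illustration and are not the actual Nexus types:]

```rust
/// A silo's effective resource quotas (illustrative fields only).
#[derive(Debug)]
struct SiloQuotas {
    cpus: i64,
    memory_gib: i64,
    storage_gib: i64,
}

/// The update request mirrors the quota struct, but every field is optional.
#[derive(Debug, Default)]
struct SiloQuotasUpdate {
    cpus: Option<i64>,
    memory_gib: Option<i64>,
    storage_gib: Option<i64>,
}

impl SiloQuotas {
    /// Apply a partial update: absent fields keep their current values.
    fn apply(&mut self, update: SiloQuotasUpdate) {
        if let Some(cpus) = update.cpus {
            self.cpus = cpus;
        }
        if let Some(memory) = update.memory_gib {
            self.memory_gib = memory;
        }
        if let Some(storage) = update.storage_gib {
            self.storage_gib = storage;
        }
    }
}

fn main() {
    let mut quotas = SiloQuotas { cpus: 16, memory_gib: 64, storage_gib: 500 };
    // Only raise the CPU quota; memory and storage remain unchanged.
    quotas.apply(SiloQuotasUpdate { cpus: Some(32), ..Default::default() });
    assert_eq!((quotas.cpus, quotas.memory_gib), (32, 64));
}
```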
@@ -1178,7 +1178,7 @@ async fn project_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a project +/// Create project #[endpoint { method = POST, path = "/v1/projects", @@ -1199,7 +1199,7 @@ async fn project_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a project +/// Fetch project #[endpoint { method = GET, path = "/v1/projects/{project}", @@ -1223,7 +1223,7 @@ async fn project_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a project +/// Delete project #[endpoint { method = DELETE, path = "/v1/projects/{project}", @@ -1280,7 +1280,7 @@ async fn project_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a project's IAM policy +/// Fetch project's IAM policy #[endpoint { method = GET, path = "/v1/projects/{project}/policy", @@ -1305,7 +1305,7 @@ async fn project_policy_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a project's IAM policy +/// Update project's IAM policy #[endpoint { method = PUT, path = "/v1/projects/{project}/policy", @@ -1371,7 +1371,7 @@ async fn project_ip_pool_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an IP pool +/// Fetch IP pool #[endpoint { method = GET, path = "/v1/ip-pools/{pool}", @@ -1434,7 +1434,7 @@ pub struct IpPoolPathParam { pub pool_name: Name, } -/// Create an IP pool +/// Create IP pool #[endpoint { method = POST, path = "/v1/system/ip-pools", @@ -1455,7 +1455,7 @@ async fn ip_pool_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an IP pool +/// Fetch IP pool #[endpoint { method = GET, path = "/v1/system/ip-pools/{pool}", @@ -1479,7 +1479,7 @@ async fn ip_pool_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete an IP pool +/// Delete IP pool #[endpoint { method = DELETE, path = "/v1/system/ip-pools/{pool}", @@ -1501,7 +1501,7 @@ async fn ip_pool_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update an IP pool +/// Update IP pool #[endpoint { method = PUT, path = "/v1/system/ip-pools/{pool}", @@ -1525,7 +1525,7 @@ async fn ip_pool_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List an IP pool's linked silos +/// List IP pool's linked silos #[endpoint { method = GET, path = "/v1/system/ip-pools/{pool}/silos", @@ -1573,7 +1573,7 @@ async fn ip_pool_silo_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Link an IP pool to a silo +/// Link IP pool to silo /// /// Users in linked silos can allocate external IPs from this pool for their /// instances. A silo can have at most one default pool. IPs are allocated from @@ -1603,7 +1603,7 @@ async fn ip_pool_silo_link( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Unlink an IP pool from a silo +/// Unlink IP pool from silo /// /// Will fail if there are any outstanding IPs allocated in the silo. 
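[Editor's note: the link/unlink semantics described above imply a simple allocation rule — an explicitly requested pool is used if given, otherwise the silo's default pool, of which there can be at most one. A rough sketch under that reading; the types and names are invented for illustration and are not the real Nexus datastore logic:]

```rust
/// An IP pool as linked to a silo (illustrative type only).
#[derive(Clone, Debug, PartialEq)]
struct IpPool {
    name: String,
    is_default: bool,
}

/// Pick the pool to allocate from: an explicitly named pool wins;
/// otherwise fall back to the silo's default pool, if one is linked.
fn select_pool<'a>(
    linked_pools: &'a [IpPool],
    requested: Option<&str>,
) -> Option<&'a IpPool> {
    match requested {
        Some(name) => linked_pools.iter().find(|p| p.name == name),
        None => linked_pools.iter().find(|p| p.is_default),
    }
}

fn main() {
    let pools = vec![
        IpPool { name: "customer".into(), is_default: true },
        IpPool { name: "staging".into(), is_default: false },
    ];
    // No pool specified: the silo's default is used.
    assert_eq!(select_pool(&pools, None).unwrap().name, "customer");
    // An explicit pool overrides the default.
    assert_eq!(select_pool(&pools, Some("staging")).unwrap().name, "staging");
}
```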
#[endpoint { @@ -1660,7 +1660,7 @@ async fn ip_pool_silo_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch the Oxide service IP pool +/// Fetch Oxide service IP pool #[endpoint { method = GET, path = "/v1/system/ip-pools-service", @@ -1681,9 +1681,9 @@ async fn ip_pool_service_view( type IpPoolRangePaginationParams = PaginationParams; -/// List ranges for an IP pool +/// List ranges for IP pool /// -/// List ranges for an IP pool. Ranges are ordered by their first address. +/// Ranges are ordered by their first address. #[endpoint { method = GET, path = "/v1/system/ip-pools/{pool}/ranges", @@ -1727,7 +1727,7 @@ async fn ip_pool_range_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Add a range to an IP pool +/// Add range to IP pool #[endpoint { method = POST, path = "/v1/system/ip-pools/{pool}/ranges/add", @@ -1751,7 +1751,7 @@ async fn ip_pool_range_add( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Remove a range from an IP pool +/// Remove range from IP pool #[endpoint { method = POST, path = "/v1/system/ip-pools/{pool}/ranges/remove", @@ -1894,7 +1894,7 @@ async fn floating_ip_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a floating IP +/// Create floating IP #[endpoint { method = POST, path = "/v1/floating-ips", @@ -1920,7 +1920,7 @@ async fn floating_ip_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a floating IP +/// Delete floating IP #[endpoint { method = DELETE, path = "/v1/floating-ips/{floating_ip}", @@ -1950,7 +1950,7 @@ async fn floating_ip_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a floating IP +/// Fetch floating IP #[endpoint { method = GET, path = "/v1/floating-ips/{floating_ip}", @@ -1980,7 +1980,9 @@ async fn floating_ip_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Attach a floating IP to an instance or other resource +/// Attach floating IP +/// +/// Attach floating IP to an instance or other resource. #[endpoint { method = POST, path = "/v1/floating-ips/{floating_ip}/attach", @@ -2014,7 +2016,9 @@ async fn floating_ip_attach( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Detach a floating IP from an instance or other resource +/// Detach floating IP +/// +// Detach floating IP from instance or other resource. 
#[endpoint { method = POST, path = "/v1/floating-ips/{floating_ip}/detach", @@ -2106,7 +2110,7 @@ async fn disk_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a disk +/// Fetch disk #[endpoint { method = GET, path = "/v1/disks/{disk}", @@ -2132,7 +2136,7 @@ async fn disk_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a disk +/// Delete disk #[endpoint { method = DELETE, path = "/v1/disks/{disk}", @@ -2220,7 +2224,7 @@ async fn disk_metrics_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Start importing blocks into a disk +/// Start importing blocks into disk /// /// Start the process of importing blocks into a disk #[endpoint { @@ -2251,7 +2255,7 @@ async fn disk_bulk_write_import_start( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Import blocks into a disk +/// Import blocks into disk #[endpoint { method = POST, path = "/v1/disks/{disk}/bulk-write", @@ -2282,7 +2286,7 @@ async fn disk_bulk_write_import( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Stop importing blocks into a disk +/// Stop importing blocks into disk /// /// Stop the process of importing blocks into a disk #[endpoint { @@ -2380,7 +2384,7 @@ async fn instance_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create an instance +/// Create instance #[endpoint { method = POST, path = "/v1/instances", @@ -2410,7 +2414,7 @@ async fn instance_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an instance +/// Fetch instance #[endpoint { method = GET, path = "/v1/instances/{instance}", @@ -2444,7 +2448,7 @@ async fn instance_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete an instance +/// Delete instance #[endpoint { method = DELETE, path = "/v1/instances/{instance}", @@ -2540,7 +2544,7 @@ async fn instance_reboot( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Boot an instance +/// Boot instance #[endpoint { method = POST, path = "/v1/instances/{instance}/start", @@ -2569,7 +2573,7 @@ async fn instance_start( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Stop an instance +/// Stop instance #[endpoint { method = POST, path = "/v1/instances/{instance}/stop", @@ -2598,7 +2602,7 @@ async fn instance_stop( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an instance's serial console +/// Fetch instance serial console #[endpoint { method = GET, path = "/v1/instances/{instance}/serial-console", @@ -2629,7 +2633,7 @@ async fn instance_serial_console( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Stream an instance's serial console +/// Stream instance serial console #[channel { protocol = WEBSOCKETS, path = "/v1/instances/{instance}/serial-console/stream", @@ -2681,9 +2685,10 @@ async fn instance_serial_console_stream( } } -/// List the SSH public keys added to the instance via cloud-init during instance creation +/// List SSH public keys for instance /// -/// Note that this list is a snapshot in time and will not reflect updates made after +/// List SSH public keys injected via cloud-init during instance creation. Note +/// that this list is a snapshot in time and will not reflect updates made after /// the instance is created. 
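[Editor's note: the disk import endpoints above form a three-step protocol — start an import, upload blocks with repeated bulk writes, then stop. A hypothetical client-side sketch of driving that sequence; the `DiskImportClient` trait and its method names are invented for illustration and are not the real Oxide SDK surface:]

```rust
/// Hypothetical client abstraction over the three bulk-import endpoints.
trait DiskImportClient {
    fn import_start(&self, disk: &str) -> Result<(), String>;
    fn bulk_write(&self, disk: &str, offset: u64, data: &[u8]) -> Result<(), String>;
    fn import_stop(&self, disk: &str) -> Result<(), String>;
}

/// Drive the start / write / stop sequence, uploading a buffer in
/// fixed-size chunks at increasing offsets.
fn import_blocks(
    client: &dyn DiskImportClient,
    disk: &str,
    data: &[u8],
    chunk_size: usize,
) -> Result<(), String> {
    client.import_start(disk)?;
    for (i, chunk) in data.chunks(chunk_size).enumerate() {
        let offset = (i * chunk_size) as u64;
        client.bulk_write(disk, offset, chunk)?;
    }
    client.import_stop(disk)
}

/// A do-nothing client so the sketch is runnable end to end.
struct NoopClient;
impl DiskImportClient for NoopClient {
    fn import_start(&self, _: &str) -> Result<(), String> { Ok(()) }
    fn bulk_write(&self, _: &str, _: u64, _: &[u8]) -> Result<(), String> { Ok(()) }
    fn import_stop(&self, _: &str) -> Result<(), String> { Ok(()) }
}

fn main() {
    let data = vec![0u8; 1 << 20];
    import_blocks(&NoopClient, "my-disk", &data, 512 * 1024).unwrap();
}
```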
#[endpoint { method = GET, @@ -2725,7 +2730,7 @@ async fn instance_ssh_public_key_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List an instance's disks +/// List disks for instance #[endpoint { method = GET, path = "/v1/instances/{instance}/disks", @@ -2766,7 +2771,7 @@ async fn instance_disk_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Attach a disk to an instance +/// Attach disk to instance #[endpoint { method = POST, path = "/v1/instances/{instance}/disks/attach", @@ -2798,7 +2803,7 @@ async fn instance_disk_attach( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Detach a disk from an instance +/// Detach disk from instance #[endpoint { method = POST, path = "/v1/instances/{instance}/disks/detach", @@ -2869,7 +2874,7 @@ async fn certificate_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a new system-wide x.509 certificate +/// Create new system-wide x.509 certificate /// /// This certificate is automatically used by the Oxide Control plane to serve /// external connections. @@ -2899,7 +2904,7 @@ struct CertificatePathParam { certificate: NameOrId, } -/// Fetch a certificate +/// Fetch certificate /// /// Returns the details of a specific certificate #[endpoint { @@ -2923,7 +2928,7 @@ async fn certificate_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a certificate +/// Delete certificate /// /// Permanently delete a certificate. This operation cannot be undone. #[endpoint { @@ -2951,7 +2956,7 @@ async fn certificate_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create an address lot +/// Create address lot #[endpoint { method = POST, path = "/v1/system/networking/address-lot", @@ -2977,7 +2982,7 @@ async fn networking_address_lot_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete an address lot +/// Delete address lot #[endpoint { method = DELETE, path = "/v1/system/networking/address-lot/{address_lot}", @@ -3034,7 +3039,7 @@ async fn networking_address_lot_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List the blocks in an address lot +/// List blocks in address lot #[endpoint { method = GET, path = "/v1/system/networking/address-lot/{address_lot}/blocks", @@ -3070,7 +3075,7 @@ async fn networking_address_lot_block_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a loopback address +/// Create loopback address #[endpoint { method = POST, path = "/v1/system/networking/loopback-address", @@ -3111,7 +3116,7 @@ pub struct LoopbackAddressPath { pub subnet_mask: u8, } -/// Delete a loopback address +/// Delete loopback address #[endpoint { method = DELETE, path = "/v1/system/networking/loopback-address/{rack_id}/{switch_location}/{address}/{subnet_mask}", @@ -3258,7 +3263,7 @@ async fn networking_switch_port_settings_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Get information about a switch port +/// Get information about switch port #[endpoint { method = GET, path = "/v1/system/networking/switch-port-settings/{port}", @@ -3361,7 +3366,7 @@ async fn networking_switch_port_clear_settings( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a new BGP configuration +/// Create new BGP configuration 
+/// Create new BGP configuration
#[endpoint { method = POST, path = "/v1/system/networking/bgp", @@ -3458,7 +3463,7 @@ async fn networking_bgp_imported_routes_ipv4( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a BGP configuration +/// Delete BGP configuration #[endpoint { method = DELETE, path = "/v1/system/networking/bgp", @@ -3479,7 +3484,7 @@ async fn networking_bgp_config_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a new BGP announce set +/// Create new BGP announce set #[endpoint { method = POST, path = "/v1/system/networking/bgp-announce", @@ -3527,7 +3532,7 @@ async fn networking_bgp_announce_set_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a BGP announce set +/// Delete BGP announce set #[endpoint { method = DELETE, path = "/v1/system/networking/bgp-announce", @@ -3661,7 +3666,7 @@ async fn image_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create an image +/// Create image /// /// Create a new image in a project. #[endpoint { @@ -3699,7 +3704,7 @@ async fn image_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an image +/// Fetch image /// /// Fetch the details for a specific image in a project. #[endpoint { @@ -3742,7 +3747,7 @@ async fn image_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete an image +/// Delete image /// /// Permanently delete an image from a project. This operation cannot be undone. /// Any instances in the project using the image will continue to run, however @@ -3778,9 +3783,9 @@ async fn image_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Promote a project image +/// Promote project image /// -/// Promote a project image to be visible to all projects in the silo +/// Promote project image to be visible to all projects in the silo #[endpoint { method = POST, path = "/v1/images/{image}/promote", @@ -3812,9 +3817,9 @@ async fn image_promote( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Demote a silo image +/// Demote silo image /// -/// Demote a silo image to be visible only to a specified project +/// Demote silo image to be visible only to a specified project #[endpoint { method = POST, path = "/v1/images/{image}/demote", @@ -3886,7 +3891,7 @@ async fn instance_network_interface_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a network interface +/// Create network interface #[endpoint { method = POST, path = "/v1/network-interfaces", @@ -3915,7 +3920,7 @@ async fn instance_network_interface_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a network interface +/// Delete network interface /// /// Note that the primary interface for an instance cannot be deleted if there /// are any secondary interfaces. 
A new primary interface must be designated @@ -3952,7 +3957,7 @@ async fn instance_network_interface_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a network interface +/// Fetch network interface #[endpoint { method = GET, path = "/v1/network-interfaces/{interface}", @@ -3983,7 +3988,7 @@ async fn instance_network_interface_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a network interface +/// Update network interface #[endpoint { method = PUT, path = "/v1/network-interfaces/{interface}", @@ -4057,7 +4062,7 @@ async fn instance_external_ip_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Allocate and attach an ephemeral IP to an instance +/// Allocate and attach ephemeral IP to instance #[endpoint { method = POST, path = "/v1/instances/{instance}/external-ips/ephemeral", @@ -4095,7 +4100,7 @@ async fn instance_ephemeral_ip_attach( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Detach and deallocate an ephemeral IP from an instance +/// Detach and deallocate ephemeral IP from instance #[endpoint { method = DELETE, path = "/v1/instances/{instance}/external-ips/ephemeral", @@ -4167,7 +4172,7 @@ async fn snapshot_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a snapshot +/// Create snapshot /// /// Creates a point-in-time snapshot from a disk. #[endpoint { @@ -4195,7 +4200,7 @@ async fn snapshot_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a snapshot +/// Fetch snapshot #[endpoint { method = GET, path = "/v1/snapshots/{snapshot}", @@ -4223,7 +4228,7 @@ async fn snapshot_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a snapshot +/// Delete snapshot #[endpoint { method = DELETE, path = "/v1/snapshots/{snapshot}", @@ -4290,7 +4295,7 @@ async fn vpc_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a VPC +/// Create VPC #[endpoint { method = POST, path = "/v1/vpcs", @@ -4316,7 +4321,7 @@ async fn vpc_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a VPC +/// Fetch VPC #[endpoint { method = GET, path = "/v1/vpcs/{vpc}", @@ -4371,7 +4376,7 @@ async fn vpc_update( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a VPC +/// Delete VPC #[endpoint { method = DELETE, path = "/v1/vpcs/{vpc}", @@ -4432,7 +4437,7 @@ async fn vpc_subnet_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a subnet +/// Create subnet #[endpoint { method = POST, path = "/v1/vpc-subnets", @@ -4457,7 +4462,7 @@ async fn vpc_subnet_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a subnet +/// Fetch subnet #[endpoint { method = GET, path = "/v1/vpc-subnets/{subnet}", @@ -4486,7 +4491,7 @@ async fn vpc_subnet_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a subnet +/// Delete subnet #[endpoint { method = DELETE, path = "/v1/vpc-subnets/{subnet}", @@ -4515,7 +4520,7 @@ async fn vpc_subnet_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a subnet +/// Update subnet #[endpoint { method = PUT, path = "/v1/vpc-subnets/{subnet}", @@ -4695,7 +4700,7 @@ async fn vpc_router_list( 
apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a router +/// Fetch router #[endpoint { method = GET, path = "/v1/vpc-routers/{router}", @@ -4725,7 +4730,7 @@ async fn vpc_router_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a VPC router +/// Create VPC router #[endpoint { method = POST, path = "/v1/vpc-routers", @@ -4757,7 +4762,7 @@ async fn vpc_router_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a router +/// Delete router #[endpoint { method = DELETE, path = "/v1/vpc-routers/{router}", @@ -4787,7 +4792,7 @@ async fn vpc_router_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a router +/// Update router #[endpoint { method = PUT, path = "/v1/vpc-routers/{router}", @@ -4861,7 +4866,7 @@ async fn vpc_router_route_list( // Vpc Router Routes -/// Fetch a route +/// Fetch route #[endpoint { method = GET, path = "/v1/vpc-router-routes/{route}", @@ -4894,7 +4899,7 @@ async fn vpc_router_route_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create a router +/// Create router #[endpoint { method = POST, path = "/v1/vpc-router-routes", @@ -4926,7 +4931,7 @@ async fn vpc_router_route_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete a route +/// Delete route #[endpoint { method = DELETE, path = "/v1/vpc-router-routes/{route}", @@ -4958,7 +4963,7 @@ async fn vpc_router_route_delete( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Update a route +/// Update route #[endpoint { method = PUT, path = "/v1/vpc-router-routes/{route}", @@ -5033,7 +5038,7 @@ struct RackPathParam { rack_id: Uuid, } -/// Fetch a rack +/// Fetch rack #[endpoint { method = GET, path = "/v1/system/hardware/racks/{rack_id}", @@ -5054,7 +5059,7 @@ async fn rack_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List uninitialized sleds in a given rack +/// List uninitialized sleds #[endpoint { method = GET, path = "/v1/system/hardware/sleds-uninitialized", @@ -5081,7 +5086,7 @@ async fn sled_list_uninitialized( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Add a sled to an initialized rack +/// Add sled to initialized rack // // TODO: In the future this should really be a PUT request, once we resolve // https://github.com/oxidecomputer/omicron/issues/4494. 
It should also @@ -5138,7 +5143,7 @@ async fn sled_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a sled +/// Fetch sled #[endpoint { method = GET, path = "/v1/system/hardware/sleds/{sled_id}", @@ -5160,7 +5165,7 @@ async fn sled_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Set the sled's provision state +/// Set sled provision state #[endpoint { method = PUT, path = "/v1/system/hardware/sleds/{sled_id}/provision-state", @@ -5198,7 +5203,7 @@ async fn sled_set_provision_state( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// List instances running on a given sled +/// List instances running on given sled #[endpoint { method = GET, path = "/v1/system/hardware/sleds/{sled_id}/instances", @@ -5299,7 +5304,7 @@ async fn switch_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a switch +/// Fetch switch #[endpoint { method = GET, path = "/v1/system/hardware/switches/{switch_id}", @@ -5482,7 +5487,7 @@ async fn silo_metric( // Updates -/// Upload a TUF repository +/// Upload TUF repository #[endpoint { method = PUT, path = "/v1/system/update/repository", @@ -5507,7 +5512,9 @@ async fn system_update_put_repository( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Get the description of a repository by system version. +/// Fetch TUF repository description +/// +/// Fetch description of TUF repository by system version. #[endpoint { method = GET, path = "/v1/system/update/repository/{system_version}", @@ -5662,7 +5669,7 @@ async fn user_builtin_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a built-in user +/// Fetch built-in user #[endpoint { method = GET, path = "/v1/system/users-builtin/{user}", @@ -5744,7 +5751,7 @@ async fn role_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch a built-in role +/// Fetch built-in role #[endpoint { method = GET, path = "/v1/system/roles/{role_name}", @@ -5768,7 +5775,7 @@ async fn role_view( // Current user -/// Fetch the user associated with the current session +/// Fetch user for current session #[endpoint { method = GET, path = "/v1/me", @@ -5791,7 +5798,7 @@ pub(crate) async fn current_user_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch the silo groups the current user belongs to +/// Fetch current user's groups #[endpoint { method = GET, path = "/v1/me/groups", @@ -5865,7 +5872,7 @@ async fn current_user_ssh_key_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Create an SSH public key +/// Create SSH public key /// /// Create an SSH public key for the currently authenticated user. #[endpoint { @@ -5893,9 +5900,9 @@ async fn current_user_ssh_key_create( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Fetch an SSH public key +/// Fetch SSH public key /// -/// Fetch an SSH public key associated with the currently authenticated user. +/// Fetch SSH public key associated with the currently authenticated user. 
#[endpoint { method = GET, path = "/v1/me/ssh-keys/{ssh_key}", @@ -5927,7 +5934,7 @@ async fn current_user_ssh_key_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Delete an SSH public key +/// Delete SSH public key /// /// Delete an SSH public key associated with the currently authenticated user. #[endpoint { diff --git a/openapi/nexus.json b/openapi/nexus.json index 8baf1a6316..d27261b179 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -223,7 +223,7 @@ "tags": [ "silos" ], - "summary": "Create a new system-wide x.509 certificate", + "summary": "Create new system-wide x.509 certificate", "description": "This certificate is automatically used by the Oxide Control plane to serve external connections.", "operationId": "certificate_create", "requestBody": { @@ -261,7 +261,7 @@ "tags": [ "silos" ], - "summary": "Fetch a certificate", + "summary": "Fetch certificate", "description": "Returns the details of a specific certificate", "operationId": "certificate_view", "parameters": [ @@ -297,7 +297,7 @@ "tags": [ "silos" ], - "summary": "Delete a certificate", + "summary": "Delete certificate", "description": "Permanently delete a certificate. This operation cannot be undone.", "operationId": "certificate_delete", "parameters": [ @@ -443,7 +443,7 @@ "tags": [ "disks" ], - "summary": "Fetch a disk", + "summary": "Fetch disk", "operationId": "disk_view", "parameters": [ { @@ -487,7 +487,7 @@ "tags": [ "disks" ], - "summary": "Delete a disk", + "summary": "Delete disk", "operationId": "disk_delete", "parameters": [ { @@ -526,7 +526,7 @@ "tags": [ "disks" ], - "summary": "Import blocks into a disk", + "summary": "Import blocks into disk", "operationId": "disk_bulk_write_import", "parameters": [ { @@ -575,7 +575,7 @@ "tags": [ "disks" ], - "summary": "Start importing blocks into a disk", + "summary": "Start importing blocks into disk", "description": "Start the process of importing blocks into a disk", "operationId": "disk_bulk_write_import_start", "parameters": [ @@ -615,7 +615,7 @@ "tags": [ "disks" ], - "summary": "Stop importing blocks into a disk", + "summary": "Stop importing blocks into disk", "description": "Stop the process of importing blocks into a disk", "operationId": "disk_bulk_write_import_stop", "parameters": [ @@ -876,7 +876,7 @@ "tags": [ "floating-ips" ], - "summary": "Create a floating IP", + "summary": "Create floating IP", "operationId": "floating_ip_create", "parameters": [ { @@ -924,7 +924,7 @@ "tags": [ "floating-ips" ], - "summary": "Fetch a floating IP", + "summary": "Fetch floating IP", "operationId": "floating_ip_view", "parameters": [ { @@ -968,7 +968,7 @@ "tags": [ "floating-ips" ], - "summary": "Delete a floating IP", + "summary": "Delete floating IP", "operationId": "floating_ip_delete", "parameters": [ { @@ -1007,7 +1007,8 @@ "tags": [ "floating-ips" ], - "summary": "Attach a floating IP to an instance or other resource", + "summary": "Attach floating IP", + "description": "Attach floating IP to an instance or other resource.", "operationId": "floating_ip_attach", "parameters": [ { @@ -1063,7 +1064,7 @@ "tags": [ "floating-ips" ], - "summary": "Detach a floating IP from an instance or other resource", + "summary": "Detach floating IP", "operationId": "floating_ip_detach", "parameters": [ { @@ -1273,7 +1274,7 @@ "tags": [ "images" ], - "summary": "Create an image", + "summary": "Create image", "description": "Create a new image in a project.", "operationId": "image_create", "parameters": [ @@ -1321,7 +1322,7 @@ 
"tags": [ "images" ], - "summary": "Fetch an image", + "summary": "Fetch image", "description": "Fetch the details for a specific image in a project.", "operationId": "image_view", "parameters": [ @@ -1366,7 +1367,7 @@ "tags": [ "images" ], - "summary": "Delete an image", + "summary": "Delete image", "description": "Permanently delete an image from a project. This operation cannot be undone. Any instances in the project using the image will continue to run, however new instances can not be created with this image.", "operationId": "image_delete", "parameters": [ @@ -1406,8 +1407,8 @@ "tags": [ "images" ], - "summary": "Demote a silo image", - "description": "Demote a silo image to be visible only to a specified project", + "summary": "Demote silo image", + "description": "Demote silo image to be visible only to a specified project", "operationId": "image_demote", "parameters": [ { @@ -1454,8 +1455,8 @@ "tags": [ "images" ], - "summary": "Promote a project image", - "description": "Promote a project image to be visible to all projects in the silo", + "summary": "Promote project image", + "description": "Promote project image to be visible to all projects in the silo", "operationId": "image_promote", "parameters": [ { @@ -1568,7 +1569,7 @@ "tags": [ "instances" ], - "summary": "Create an instance", + "summary": "Create instance", "operationId": "instance_create", "parameters": [ { @@ -1616,7 +1617,7 @@ "tags": [ "instances" ], - "summary": "Fetch an instance", + "summary": "Fetch instance", "operationId": "instance_view", "parameters": [ { @@ -1660,7 +1661,7 @@ "tags": [ "instances" ], - "summary": "Delete an instance", + "summary": "Delete instance", "operationId": "instance_delete", "parameters": [ { @@ -1699,7 +1700,7 @@ "tags": [ "instances" ], - "summary": "List an instance's disks", + "summary": "List disks for instance", "operationId": "instance_disk_list", "parameters": [ { @@ -1775,7 +1776,7 @@ "tags": [ "instances" ], - "summary": "Attach a disk to an instance", + "summary": "Attach disk to instance", "operationId": "instance_disk_attach", "parameters": [ { @@ -1831,7 +1832,7 @@ "tags": [ "instances" ], - "summary": "Detach a disk from an instance", + "summary": "Detach disk from instance", "operationId": "instance_disk_detach", "parameters": [ { @@ -1933,7 +1934,7 @@ "tags": [ "instances" ], - "summary": "Allocate and attach an ephemeral IP to an instance", + "summary": "Allocate and attach ephemeral IP to instance", "operationId": "instance_ephemeral_ip_attach", "parameters": [ { @@ -1987,7 +1988,7 @@ "tags": [ "instances" ], - "summary": "Detach and deallocate an ephemeral IP from an instance", + "summary": "Detach and deallocate ephemeral IP from instance", "operationId": "instance_ephemeral_ip_detach", "parameters": [ { @@ -2128,7 +2129,7 @@ "tags": [ "instances" ], - "summary": "Fetch an instance's serial console", + "summary": "Fetch instance serial console", "operationId": "instance_serial_console", "parameters": [ { @@ -2207,7 +2208,7 @@ "tags": [ "instances" ], - "summary": "Stream an instance's serial console", + "summary": "Stream instance serial console", "operationId": "instance_serial_console_stream", "parameters": [ { @@ -2257,8 +2258,8 @@ "tags": [ "instances" ], - "summary": "List the SSH public keys added to the instance via cloud-init during instance creation", - "description": "Note that this list is a snapshot in time and will not reflect updates made after the instance is created.", + "summary": "List SSH public keys for instance", + "description": "List SSH 
public keys injected via cloud-init during instance creation. Note that this list is a snapshot in time and will not reflect updates made after the instance is created.", "operationId": "instance_ssh_public_key_list", "parameters": [ { @@ -2334,7 +2335,7 @@ "tags": [ "instances" ], - "summary": "Boot an instance", + "summary": "Boot instance", "operationId": "instance_start", "parameters": [ { @@ -2380,7 +2381,7 @@ "tags": [ "instances" ], - "summary": "Stop an instance", + "summary": "Stop instance", "operationId": "instance_stop", "parameters": [ { @@ -2485,7 +2486,7 @@ "tags": [ "projects" ], - "summary": "Fetch an IP pool", + "summary": "Fetch IP pool", "operationId": "project_ip_pool_view", "parameters": [ { @@ -2583,7 +2584,7 @@ "tags": [ "session" ], - "summary": "Fetch the user associated with the current session", + "summary": "Fetch user for current session", "operationId": "current_user_view", "responses": { "200": { @@ -2610,7 +2611,7 @@ "tags": [ "session" ], - "summary": "Fetch the silo groups the current user belongs to", + "summary": "Fetch current user's groups", "operationId": "current_user_groups", "parameters": [ { @@ -2727,7 +2728,7 @@ "tags": [ "session" ], - "summary": "Create an SSH public key", + "summary": "Create SSH public key", "description": "Create an SSH public key for the currently authenticated user.", "operationId": "current_user_ssh_key_create", "requestBody": { @@ -2765,8 +2766,8 @@ "tags": [ "session" ], - "summary": "Fetch an SSH public key", - "description": "Fetch an SSH public key associated with the currently authenticated user.", + "summary": "Fetch SSH public key", + "description": "Fetch SSH public key associated with the currently authenticated user.", "operationId": "current_user_ssh_key_view", "parameters": [ { @@ -2802,7 +2803,7 @@ "tags": [ "session" ], - "summary": "Delete an SSH public key", + "summary": "Delete SSH public key", "description": "Delete an SSH public key associated with the currently authenticated user.", "operationId": "current_user_ssh_key_delete", "parameters": [ @@ -3006,7 +3007,7 @@ "tags": [ "instances" ], - "summary": "Create a network interface", + "summary": "Create network interface", "operationId": "instance_network_interface_create", "parameters": [ { @@ -3062,7 +3063,7 @@ "tags": [ "instances" ], - "summary": "Fetch a network interface", + "summary": "Fetch network interface", "operationId": "instance_network_interface_view", "parameters": [ { @@ -3114,7 +3115,7 @@ "tags": [ "instances" ], - "summary": "Update a network interface", + "summary": "Update network interface", "operationId": "instance_network_interface_update", "parameters": [ { @@ -3176,7 +3177,7 @@ "tags": [ "instances" ], - "summary": "Delete a network interface", + "summary": "Delete network interface", "description": "Note that the primary interface for an instance cannot be deleted if there are any secondary interfaces. A new primary interface must be designated first. 
The primary interface can be deleted if there are no secondary interfaces.", "operationId": "instance_network_interface_delete", "parameters": [ @@ -3252,7 +3253,7 @@ "tags": [ "silos" ], - "summary": "Fetch the current silo's IAM policy", + "summary": "Fetch current silo's IAM policy", "operationId": "policy_view", "responses": { "200": { @@ -3277,7 +3278,7 @@ "tags": [ "silos" ], - "summary": "Update the current silo's IAM policy", + "summary": "Update current silo's IAM policy", "operationId": "policy_update", "requestBody": { "content": { @@ -3371,7 +3372,7 @@ "tags": [ "projects" ], - "summary": "Create a project", + "summary": "Create project", "operationId": "project_create", "requestBody": { "content": { @@ -3408,7 +3409,7 @@ "tags": [ "projects" ], - "summary": "Fetch a project", + "summary": "Fetch project", "operationId": "project_view", "parameters": [ { @@ -3490,7 +3491,7 @@ "tags": [ "projects" ], - "summary": "Delete a project", + "summary": "Delete project", "operationId": "project_delete", "parameters": [ { @@ -3521,7 +3522,7 @@ "tags": [ "projects" ], - "summary": "Fetch a project's IAM policy", + "summary": "Fetch project's IAM policy", "operationId": "project_policy_view", "parameters": [ { @@ -3557,7 +3558,7 @@ "tags": [ "projects" ], - "summary": "Update a project's IAM policy", + "summary": "Update project's IAM policy", "operationId": "project_policy_update", "parameters": [ { @@ -3672,7 +3673,7 @@ "tags": [ "snapshots" ], - "summary": "Create a snapshot", + "summary": "Create snapshot", "description": "Creates a point-in-time snapshot from a disk.", "operationId": "snapshot_create", "parameters": [ @@ -3721,7 +3722,7 @@ "tags": [ "snapshots" ], - "summary": "Fetch a snapshot", + "summary": "Fetch snapshot", "operationId": "snapshot_view", "parameters": [ { @@ -3765,7 +3766,7 @@ "tags": [ "snapshots" ], - "summary": "Delete a snapshot", + "summary": "Delete snapshot", "operationId": "snapshot_delete", "parameters": [ { @@ -3922,7 +3923,7 @@ "tags": [ "system/hardware" ], - "summary": "Fetch a rack", + "summary": "Fetch rack", "operationId": "rack_view", "parameters": [ { @@ -4018,7 +4019,7 @@ "tags": [ "system/hardware" ], - "summary": "Add a sled to an initialized rack", + "summary": "Add sled to initialized rack", "operationId": "sled_add", "requestBody": { "content": { @@ -4048,7 +4049,7 @@ "tags": [ "system/hardware" ], - "summary": "Fetch a sled", + "summary": "Fetch sled", "operationId": "sled_view", "parameters": [ { @@ -4156,7 +4157,7 @@ "tags": [ "system/hardware" ], - "summary": "List instances running on a given sled", + "summary": "List instances running on given sled", "operationId": "sled_instance_list", "parameters": [ { @@ -4225,7 +4226,7 @@ "tags": [ "system/hardware" ], - "summary": "Set the sled's provision state", + "summary": "Set sled provision state", "operationId": "sled_set_provision_state", "parameters": [ { @@ -4274,7 +4275,7 @@ "tags": [ "system/hardware" ], - "summary": "List uninitialized sleds in a given rack", + "summary": "List uninitialized sleds", "operationId": "sled_list_uninitialized", "parameters": [ { @@ -4562,7 +4563,7 @@ "tags": [ "system/hardware" ], - "summary": "Fetch a switch", + "summary": "Fetch switch", "operationId": "switch_view", "parameters": [ { @@ -4670,7 +4671,7 @@ "tags": [ "system/silos" ], - "summary": "Create a user", + "summary": "Create user", "description": "Users can only be created in Silos with `provision_type` == `Fixed`. 
Otherwise, Silo users are just-in-time (JIT) provisioned when a user first logs in using an external Identity Provider.", "operationId": "local_idp_user_create", "parameters": [ @@ -4719,7 +4720,7 @@ "tags": [ "system/silos" ], - "summary": "Delete a user", + "summary": "Delete user", "operationId": "local_idp_user_delete", "parameters": [ { @@ -4760,7 +4761,7 @@ "tags": [ "system/silos" ], - "summary": "Set or invalidate a user's password", + "summary": "Set or invalidate user's password", "description": "Passwords can only be updated for users in Silos with identity mode `LocalOnly`.", "operationId": "local_idp_user_set_password", "parameters": [ @@ -4812,7 +4813,7 @@ "tags": [ "system/silos" ], - "summary": "Create a SAML IdP", + "summary": "Create SAML IdP", "operationId": "saml_identity_provider_create", "parameters": [ { @@ -4860,7 +4861,7 @@ "tags": [ "system/silos" ], - "summary": "Fetch a SAML IdP", + "summary": "Fetch SAML IdP", "operationId": "saml_identity_provider_view", "parameters": [ { @@ -4964,7 +4965,7 @@ "tags": [ "system/networking" ], - "summary": "Create an IP pool", + "summary": "Create IP pool", "operationId": "ip_pool_create", "requestBody": { "content": { @@ -5001,7 +5002,7 @@ "tags": [ "system/networking" ], - "summary": "Fetch an IP pool", + "summary": "Fetch IP pool", "operationId": "ip_pool_view", "parameters": [ { @@ -5037,7 +5038,7 @@ "tags": [ "system/networking" ], - "summary": "Update an IP pool", + "summary": "Update IP pool", "operationId": "ip_pool_update", "parameters": [ { @@ -5083,7 +5084,7 @@ "tags": [ "system/networking" ], - "summary": "Delete an IP pool", + "summary": "Delete IP pool", "operationId": "ip_pool_delete", "parameters": [ { @@ -5114,8 +5115,8 @@ "tags": [ "system/networking" ], - "summary": "List ranges for an IP pool", - "description": "List ranges for an IP pool. Ranges are ordered by their first address.", + "summary": "List ranges for IP pool", + "description": "Ranges are ordered by their first address.", "operationId": "ip_pool_range_list", "parameters": [ { @@ -5176,7 +5177,7 @@ "tags": [ "system/networking" ], - "summary": "Add a range to an IP pool", + "summary": "Add range to IP pool", "operationId": "ip_pool_range_add", "parameters": [ { @@ -5224,7 +5225,7 @@ "tags": [ "system/networking" ], - "summary": "Remove a range from an IP pool", + "summary": "Remove range from IP pool", "operationId": "ip_pool_range_remove", "parameters": [ { @@ -5265,7 +5266,7 @@ "tags": [ "system/networking" ], - "summary": "List an IP pool's linked silos", + "summary": "List IP pool's linked silos", "operationId": "ip_pool_silo_list", "parameters": [ { @@ -5331,7 +5332,7 @@ "tags": [ "system/networking" ], - "summary": "Link an IP pool to a silo", + "summary": "Link IP pool to silo", "description": "Users in linked silos can allocate external IPs from this pool for their instances. A silo can have at most one default pool. 
IPs are allocated from the default pool when users ask for one without specifying a pool.", "operationId": "ip_pool_silo_link", "parameters": [ @@ -5434,7 +5435,7 @@ "tags": [ "system/networking" ], - "summary": "Unlink an IP pool from a silo", + "summary": "Unlink IP pool from silo", "description": "Will fail if there are any outstanding IPs allocated in the silo.", "operationId": "ip_pool_silo_unlink", "parameters": [ @@ -5473,7 +5474,7 @@ "tags": [ "system/networking" ], - "summary": "Fetch the Oxide service IP pool", + "summary": "Fetch Oxide service IP pool", "operationId": "ip_pool_service_view", "responses": { "200": { @@ -5774,7 +5775,7 @@ "tags": [ "system/networking" ], - "summary": "Create an address lot", + "summary": "Create address lot", "operationId": "networking_address_lot_create", "requestBody": { "content": { @@ -5811,7 +5812,7 @@ "tags": [ "system/networking" ], - "summary": "Delete an address lot", + "summary": "Delete address lot", "operationId": "networking_address_lot_delete", "parameters": [ { @@ -5842,7 +5843,7 @@ "tags": [ "system/networking" ], - "summary": "List the blocks in an address lot", + "summary": "List blocks in address lot", "operationId": "networking_address_lot_block_list", "parameters": [ { @@ -6066,7 +6067,7 @@ "tags": [ "system/networking" ], - "summary": "Create a new BGP configuration", + "summary": "Create new BGP configuration", "operationId": "networking_bgp_config_create", "requestBody": { "content": { @@ -6101,7 +6102,7 @@ "tags": [ "system/networking" ], - "summary": "Delete a BGP configuration", + "summary": "Delete BGP configuration", "operationId": "networking_bgp_config_delete", "parameters": [ { @@ -6172,7 +6173,7 @@ "tags": [ "system/networking" ], - "summary": "Create a new BGP announce set", + "summary": "Create new BGP announce set", "operationId": "networking_bgp_announce_set_create", "requestBody": { "content": { @@ -6207,7 +6208,7 @@ "tags": [ "system/networking" ], - "summary": "Delete a BGP announce set", + "summary": "Delete BGP announce set", "operationId": "networking_bgp_announce_set_delete", "parameters": [ { @@ -6370,7 +6371,7 @@ "tags": [ "system/networking" ], - "summary": "Create a loopback address", + "summary": "Create loopback address", "operationId": "networking_loopback_address_create", "requestBody": { "content": { @@ -6407,7 +6408,7 @@ "tags": [ "system/networking" ], - "summary": "Delete a loopback address", + "summary": "Delete loopback address", "operationId": "networking_loopback_address_delete", "parameters": [ { @@ -6599,7 +6600,7 @@ "tags": [ "system/networking" ], - "summary": "Get information about a switch port", + "summary": "Get information about switch port", "operationId": "networking_switch_port_settings_view", "parameters": [ { @@ -6637,7 +6638,7 @@ "tags": [ "policy" ], - "summary": "Fetch the top-level IAM policy", + "summary": "Fetch top-level IAM policy", "operationId": "system_policy_view", "responses": { "200": { @@ -6662,7 +6663,7 @@ "tags": [ "policy" ], - "summary": "Update the top-level IAM policy", + "summary": "Update top-level IAM policy", "operationId": "system_policy_update", "requestBody": { "content": { @@ -6751,7 +6752,7 @@ "tags": [ "roles" ], - "summary": "Fetch a built-in role", + "summary": "Fetch built-in role", "operationId": "role_view", "parameters": [ { @@ -6943,8 +6944,8 @@ "tags": [ "system/silos" ], - "summary": "Fetch a silo", - "description": "Fetch a silo by name or ID.", + "summary": "Fetch silo", + "description": "Fetch silo by name or ID.", "operationId": 
"silo_view", "parameters": [ { @@ -7081,7 +7082,7 @@ "tags": [ "system/silos" ], - "summary": "Fetch a silo's IAM policy", + "summary": "Fetch silo IAM policy", "operationId": "silo_policy_view", "parameters": [ { @@ -7117,7 +7118,7 @@ "tags": [ "system/silos" ], - "summary": "Update a silo's IAM policy", + "summary": "Update silo IAM policy", "operationId": "silo_policy_update", "parameters": [ { @@ -7165,7 +7166,7 @@ "tags": [ "system/silos" ], - "summary": "View the resource quotas of a given silo", + "summary": "Fetch resource quotas for silo", "operationId": "silo_quotas_view", "parameters": [ { @@ -7201,7 +7202,7 @@ "tags": [ "system/silos" ], - "summary": "Update the resource quotas of a given silo", + "summary": "Update resource quotas for silo", "description": "If a quota value is not specified, it will remain unchanged.", "operationId": "silo_quotas_update", "parameters": [ @@ -7250,7 +7251,7 @@ "tags": [ "system/silos" ], - "summary": "List built-in (system) users in a silo", + "summary": "List built-in (system) users in silo", "operationId": "silo_user_list", "parameters": [ { @@ -7319,7 +7320,7 @@ "tags": [ "system/silos" ], - "summary": "Fetch a built-in (system) user", + "summary": "Fetch built-in (system) user", "operationId": "silo_user_view", "parameters": [ { @@ -7426,7 +7427,7 @@ "tags": [ "system/silos" ], - "summary": "Fetch a built-in user", + "summary": "Fetch built-in user", "operationId": "user_builtin_view", "parameters": [ { @@ -7522,7 +7523,7 @@ "tags": [ "system/silos" ], - "summary": "View the current utilization of a given silo", + "summary": "Fetch current utilization for given silo", "operationId": "silo_utilization_view", "parameters": [ { @@ -7628,7 +7629,7 @@ "tags": [ "silos" ], - "summary": "View the resource utilization of the user's current silo", + "summary": "Fetch resource utilization for user's current silo", "operationId": "utilization_view", "responses": { "200": { @@ -7830,7 +7831,7 @@ "tags": [ "vpcs" ], - "summary": "Create a subnet", + "summary": "Create subnet", "operationId": "vpc_subnet_create", "parameters": [ { @@ -7886,7 +7887,7 @@ "tags": [ "vpcs" ], - "summary": "Fetch a subnet", + "summary": "Fetch subnet", "operationId": "vpc_subnet_view", "parameters": [ { @@ -7938,7 +7939,7 @@ "tags": [ "vpcs" ], - "summary": "Update a subnet", + "summary": "Update subnet", "operationId": "vpc_subnet_update", "parameters": [ { @@ -8000,7 +8001,7 @@ "tags": [ "vpcs" ], - "summary": "Delete a subnet", + "summary": "Delete subnet", "operationId": "vpc_subnet_delete", "parameters": [ { @@ -8198,7 +8199,7 @@ "tags": [ "vpcs" ], - "summary": "Create a VPC", + "summary": "Create VPC", "operationId": "vpc_create", "parameters": [ { @@ -8246,7 +8247,7 @@ "tags": [ "vpcs" ], - "summary": "Fetch a VPC", + "summary": "Fetch VPC", "operationId": "vpc_view", "parameters": [ { @@ -8344,7 +8345,7 @@ "tags": [ "vpcs" ], - "summary": "Delete a VPC", + "summary": "Delete VPC", "operationId": "vpc_delete", "parameters": [ { From 425bca85530c3884141e3fa919c040210c4abd9e Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 8 Feb 2024 11:25:49 -0500 Subject: [PATCH 23/27] [RSS] fix split-brain rack subnet (#5018) Prior to this commit, RSS accepted a set of parameters that included a `rack_subnet` and an optional (but not really) `rack_network_config`; the `rack_network_config` _also_ contained a `rack_subnet` property. The first `rack_subnet` was used by RSS to pick sled addresses, but the second is what it handed off to Nexus to record in the database. 
This PR makes three changes to parameters (the bulk of the diff is the expected fallout from them):

* Removes the top-level `rack_subnet` field; only `rack_network_config.rack_subnet` remains.
* Makes `rack_network_config` non-optional. The handoff to Nexus would fail if this was `None`, so now it's always required. (This was only a little annoying in a few tests where we now have to cons up a fake network config.)
* Changes wicket/wicketd to accept a subset of `rack_network_config` that does _not_ include a `rack_subnet`; this is a value the control plane should choose on its own.

One potentially dangerous change is that the RSS parameters changed here are not just passed in when RSS is run; they're also serialized to disk as `rss-sled-plan.json`. We have a test to ensure changes don't affect the schema of this plan, but I believe the changes here are backwards compatible (an old plan that has a no-longer-present `rack_subnet` is fine, and the JSON representation of the optional `RackNetworkConfig` is that it can be either `null` or an object; we'll fail to read any plans with `null`, but those would have failed to hand off to Nexus anyway, as noted above). To check that this is right, I pulled the `rss-sled-plan.json` off of madrid, censored the certs, replaced the password hash with one of our test hashes, and added a test that we can still read it.

---

Changes that might make sense but I didn't attempt:

* Changing the `rack_network_config` in the early networking bootstore to be non-optional. I think this would be correct, but it is probably more trouble than it's worth to migrate. We might consider this the next time we make other, unrelated changes here, though.
* Removing the `rack_subnet` field not just from user -> wicket, but also from {wicket,developer} -> RSS. We could make RSS pick its own rack subnet, maybe? This seemed dubious enough that I stopped. This does mean the TOML files used to automatically launch RSS still have a `rack_subnet` value, but now it's only one (under the rack network config) instead of two.
* Changing the rack subnet choice to be random. wicket continues to use the hardcoded value we've been using.

---

I also fixed a handful of places where we define the rack subnet as `fd00:1122:3344:01::/56`; I believe this is just wrong / a typo. The `:01` at the end is equivalent to `:0001`, which is equivalent to the /56 `fd00:1122:3344:0000::/56`. Every place we had this, we meant to use `fd00:1122:3344:0100::/56`, so I changed all of them (I think!); see the sketch at the end of this message.

Fixes #5009, but only for any racks that run RSS after this change. I am not attempting to retroactively correct any racks that had the wrong `rack_subnet` recorded in the database, as I believe all such deployed racks are dev systems that are frequently wiped and reinstalled.
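For the curious, here is a minimal standalone sketch of the subnet arithmetic above — this is not code from the PR, and the `prefix56` helper is hypothetical, written only for illustration. Masking both spellings down to their first 56 bits shows that `fd00:1122:3344:01::` falls in `fd00:1122:3344::/56`, while the intended `fd00:1122:3344:0100::` falls in `fd00:1122:3344:100::/56`:

```rust
use std::net::Ipv6Addr;

/// Zero out everything after the first 56 bits (7 bytes) of `addr`,
/// yielding the network address of its /56 prefix.
fn prefix56(addr: Ipv6Addr) -> Ipv6Addr {
    let mut octets = addr.octets();
    for byte in &mut octets[7..] {
        *byte = 0;
    }
    Ipv6Addr::from(octets)
}

fn main() {
    // `:01` is shorthand for `:0001`, so this lands in the "zero" /56...
    let typo: Ipv6Addr = "fd00:1122:3344:01::".parse().unwrap();
    // ...while `:0100` is a different /56 entirely.
    let intended: Ipv6Addr = "fd00:1122:3344:0100::".parse().unwrap();

    assert_eq!(prefix56(typo), "fd00:1122:3344::".parse::<Ipv6Addr>().unwrap());
    assert_eq!(
        prefix56(intended),
        "fd00:1122:3344:100::".parse::<Ipv6Addr>().unwrap()
    );
    println!("{}/56 vs {}/56", prefix56(typo), prefix56(intended));
}
```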
--- clients/wicketd-client/src/lib.rs | 1 + docs/how-to-run.adoc | 2 +- nexus/src/app/rack.rs | 501 +++++++++--------- nexus/src/lib.rs | 8 +- nexus/test-utils/src/lib.rs | 2 +- nexus/tests/integration_tests/rack.rs | 2 +- nexus/types/src/internal_api/params.rs | 2 +- openapi/bootstrap-agent.json | 7 +- openapi/nexus-internal.json | 2 +- openapi/wicketd.json | 76 ++- schema/rss-sled-plan.json | 11 +- sled-agent/src/bootstrap/params.rs | 25 +- sled-agent/src/rack_setup/config.rs | 16 +- sled-agent/src/rack_setup/plan/service.rs | 14 +- sled-agent/src/rack_setup/plan/sled.rs | 28 +- sled-agent/src/rack_setup/service.rs | 105 ++-- sled-agent/src/sim/server.rs | 10 +- .../madrid-rss-sled-plan.json | 1 + .../madrid-rss-sled-plan.json | 164 ++++++ .../gimlet-standalone/config-rss.toml | 17 +- smf/sled-agent/non-gimlet/config-rss.toml | 17 +- wicket-common/src/rack_setup.rs | 16 +- .../src/cli/rack_setup/config_template.toml | 1 - wicket/src/cli/rack_setup/config_toml.rs | 13 +- wicketd/src/http_entrypoints.rs | 12 +- wicketd/src/preflight_check.rs | 6 +- wicketd/src/preflight_check/uplink.rs | 4 +- wicketd/src/rss_config.rs | 15 +- 28 files changed, 629 insertions(+), 449 deletions(-) create mode 100644 sled-agent/tests/old-rss-sled-plans/madrid-rss-sled-plan.json create mode 100644 sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json diff --git a/clients/wicketd-client/src/lib.rs b/clients/wicketd-client/src/lib.rs index 01c3b04f87..09f9ca1418 100644 --- a/clients/wicketd-client/src/lib.rs +++ b/clients/wicketd-client/src/lib.rs @@ -51,6 +51,7 @@ progenitor::generate_api!( CurrentRssUserConfigInsensitive = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, CurrentRssUserConfigSensitive = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, CurrentRssUserConfig = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, + UserSpecifiedRackNetworkConfig = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, GetLocationResponse = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, }, replace = { diff --git a/docs/how-to-run.adoc b/docs/how-to-run.adoc index 6a0b8f79d5..e286fe3730 100644 --- a/docs/how-to-run.adoc +++ b/docs/how-to-run.adoc @@ -277,7 +277,7 @@ The below example demonstrates a single static gateway route; in-depth explanati [rack_network_config] # An internal-only IPv6 address block which contains AZ-wide services. # This does not need to be changed. -rack_subnet = "fd00:1122:3344:01::/56" +rack_subnet = "fd00:1122:3344:0100::/56" # A range of IP addresses used by Boundary Services on the network. In a real # system, these would be addresses of the uplink ports on the Sidecar. With # softnpu, only one address is used. diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 2b38c62b23..a4d559f823 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -212,11 +212,7 @@ impl super::Nexus { mapped_fleet_roles, }; - let rack_network_config = request.rack_network_config.as_ref().ok_or( - Error::invalid_request( - "cannot initialize a rack without a network config", - ), - )?; + let rack_network_config = &request.rack_network_config; self.db_datastore .rack_set_initialized( @@ -336,289 +332,278 @@ impl super::Nexus { // Currently calling some of the apis directly, but should we be using sagas // going forward via self.run_saga()? Note that self.create_runnable_saga and // self.execute_saga are currently not available within this scope. 
- info!(self.log, "Checking for Rack Network Configuration"); - if let Some(rack_network_config) = &request.rack_network_config { - info!(self.log, "Recording Rack Network Configuration"); - let address_lot_name = - Name::from_str("initial-infra").map_err(|e| { - Error::internal_error(&format!( - "unable to use `initial-infra` as `Name`: {e}" - )) - })?; - let identity = IdentityMetadataCreateParams { - name: address_lot_name.clone(), - description: "initial infrastructure ip address lot" - .to_string(), - }; + info!(self.log, "Recording Rack Network Configuration"); + let address_lot_name = + Name::from_str("initial-infra").map_err(|e| { + Error::internal_error(&format!( + "unable to use `initial-infra` as `Name`: {e}" + )) + })?; + let identity = IdentityMetadataCreateParams { + name: address_lot_name.clone(), + description: "initial infrastructure ip address lot".to_string(), + }; - let kind = AddressLotKind::Infra; + let kind = AddressLotKind::Infra; - let first_address = IpAddr::V4(rack_network_config.infra_ip_first); - let last_address = IpAddr::V4(rack_network_config.infra_ip_last); - let ipv4_block = - AddressLotBlockCreate { first_address, last_address }; + let first_address = IpAddr::V4(rack_network_config.infra_ip_first); + let last_address = IpAddr::V4(rack_network_config.infra_ip_last); + let ipv4_block = AddressLotBlockCreate { first_address, last_address }; - let blocks = vec![ipv4_block]; + let blocks = vec![ipv4_block]; - let address_lot_params = - AddressLotCreate { identity, kind, blocks }; + let address_lot_params = AddressLotCreate { identity, kind, blocks }; - match self - .db_datastore - .address_lot_create(opctx, &address_lot_params) - .await - { - Ok(_) => Ok(()), - Err(e) => match e { - Error::ObjectAlreadyExists { - type_name: _, - object_name: _, - } => Ok(()), - _ => Err(e), - }, - }?; + match self + .db_datastore + .address_lot_create(opctx, &address_lot_params) + .await + { + Ok(_) => Ok(()), + Err(e) => match e { + Error::ObjectAlreadyExists { type_name: _, object_name: _ } => { + Ok(()) + } + _ => Err(e), + }, + }?; - let mut bgp_configs = HashMap::new(); + let mut bgp_configs = HashMap::new(); - for bgp_config in &rack_network_config.bgp { - bgp_configs.insert(bgp_config.asn, bgp_config.clone()); + for bgp_config in &rack_network_config.bgp { + bgp_configs.insert(bgp_config.asn, bgp_config.clone()); - let bgp_config_name: Name = - format!("as{}", bgp_config.asn).parse().unwrap(); + let bgp_config_name: Name = + format!("as{}", bgp_config.asn).parse().unwrap(); - let announce_set_name: Name = - format!("as{}-announce", bgp_config.asn).parse().unwrap(); + let announce_set_name: Name = + format!("as{}-announce", bgp_config.asn).parse().unwrap(); - let address_lot_name: Name = - format!("as{}-lot", bgp_config.asn).parse().unwrap(); + let address_lot_name: Name = + format!("as{}-lot", bgp_config.asn).parse().unwrap(); - self.db_datastore - .address_lot_create( - &opctx, - &AddressLotCreate { - identity: IdentityMetadataCreateParams { - name: address_lot_name, - description: format!( - "Address lot for announce set in as {}", - bgp_config.asn - ), - }, - kind: AddressLotKind::Infra, - blocks: bgp_config - .originate - .iter() - .map(|o| AddressLotBlockCreate { - first_address: o.network().into(), - last_address: o.broadcast().into(), - }) - .collect(), + self.db_datastore + .address_lot_create( + &opctx, + &AddressLotCreate { + identity: IdentityMetadataCreateParams { + name: address_lot_name, + description: format!( + "Address lot for announce set in as {}", + 
bgp_config.asn + ), }, - ) - .await - .map_err(|e| { - Error::internal_error(&format!( - "unable to create address lot for BGP as {}: {}", - bgp_config.asn, e - )) - })?; - - self.db_datastore - .bgp_create_announce_set( - &opctx, - &BgpAnnounceSetCreate { - identity: IdentityMetadataCreateParams { - name: announce_set_name.clone(), - description: format!( - "Announce set for AS {}", - bgp_config.asn - ), - }, - announcement: bgp_config - .originate - .iter() - .map(|x| BgpAnnouncementCreate { - address_lot_block: NameOrId::Name( - format!("as{}", bgp_config.asn) - .parse() - .unwrap(), - ), - network: IpNetwork::from(*x).into(), - }) - .collect(), + kind: AddressLotKind::Infra, + blocks: bgp_config + .originate + .iter() + .map(|o| AddressLotBlockCreate { + first_address: o.network().into(), + last_address: o.broadcast().into(), + }) + .collect(), + }, + ) + .await + .map_err(|e| { + Error::internal_error(&format!( + "unable to create address lot for BGP as {}: {}", + bgp_config.asn, e + )) + })?; + + self.db_datastore + .bgp_create_announce_set( + &opctx, + &BgpAnnounceSetCreate { + identity: IdentityMetadataCreateParams { + name: announce_set_name.clone(), + description: format!( + "Announce set for AS {}", + bgp_config.asn + ), }, - ) - .await - .map_err(|e| { - Error::internal_error(&format!( - "unable to create bgp announce set for as {}: {}", - bgp_config.asn, e - )) - })?; - - self.db_datastore - .bgp_config_set( - &opctx, - &BgpConfigCreate { - identity: IdentityMetadataCreateParams { - name: bgp_config_name, - description: format!( - "BGP config for AS {}", - bgp_config.asn + announcement: bgp_config + .originate + .iter() + .map(|x| BgpAnnouncementCreate { + address_lot_block: NameOrId::Name( + format!("as{}", bgp_config.asn) + .parse() + .unwrap(), ), - }, - asn: bgp_config.asn, - bgp_announce_set_id: announce_set_name.into(), - vrf: None, - }, - ) - .await - .map_err(|e| { - Error::internal_error(&format!( - "unable to set bgp config for as {}: {}", - bgp_config.asn, e - )) - })?; - } + network: IpNetwork::from(*x).into(), + }) + .collect(), + }, + ) + .await + .map_err(|e| { + Error::internal_error(&format!( + "unable to create bgp announce set for as {}: {}", + bgp_config.asn, e + )) + })?; - for (idx, uplink_config) in - rack_network_config.ports.iter().enumerate() - { - let switch = uplink_config.switch.to_string(); - let switch_location = Name::from_str(&switch).map_err(|e| { + self.db_datastore + .bgp_config_set( + &opctx, + &BgpConfigCreate { + identity: IdentityMetadataCreateParams { + name: bgp_config_name, + description: format!( + "BGP config for AS {}", + bgp_config.asn + ), + }, + asn: bgp_config.asn, + bgp_announce_set_id: announce_set_name.into(), + vrf: None, + }, + ) + .await + .map_err(|e| { Error::internal_error(&format!( - "unable to use {switch} as Name: {e}" + "unable to set bgp config for as {}: {}", + bgp_config.asn, e )) })?; + } - let uplink_name = format!("default-uplink{idx}"); - let name = Name::from_str(&uplink_name).unwrap(); + for (idx, uplink_config) in rack_network_config.ports.iter().enumerate() + { + let switch = uplink_config.switch.to_string(); + let switch_location = Name::from_str(&switch).map_err(|e| { + Error::internal_error(&format!( + "unable to use {switch} as Name: {e}" + )) + })?; - let identity = IdentityMetadataCreateParams { - name: name.clone(), - description: "initial uplink configuration".to_string(), - }; + let uplink_name = format!("default-uplink{idx}"); + let name = Name::from_str(&uplink_name).unwrap(); - let 
port_config = SwitchPortConfigCreate { - geometry: nexus_types::external_api::params::SwitchPortGeometry::Qsfp28x1, - }; + let identity = IdentityMetadataCreateParams { + name: name.clone(), + description: "initial uplink configuration".to_string(), + }; - let mut port_settings_params = SwitchPortSettingsCreate { - identity, - port_config, - groups: vec![], - links: HashMap::new(), - interfaces: HashMap::new(), - routes: HashMap::new(), - bgp_peers: HashMap::new(), - addresses: HashMap::new(), + let port_config = SwitchPortConfigCreate { + geometry: nexus_types::external_api::params::SwitchPortGeometry::Qsfp28x1, }; - let addresses: Vec
= uplink_config - .addresses - .iter() - .map(|a| Address { - address_lot: NameOrId::Name(address_lot_name.clone()), - address: (*a).into(), - }) - .collect(); - - port_settings_params - .addresses - .insert("phy0".to_string(), AddressConfig { addresses }); - - let routes: Vec = uplink_config - .routes - .iter() - .map(|r| Route { - dst: r.destination.into(), - gw: r.nexthop, - vid: None, - }) - .collect(); - - port_settings_params - .routes - .insert("phy0".to_string(), RouteConfig { routes }); - - let peers: Vec = uplink_config - .bgp_peers - .iter() - .map(|r| BgpPeer { - bgp_announce_set: NameOrId::Name( - format!("as{}-announce", r.asn).parse().unwrap(), - ), - bgp_config: NameOrId::Name( - format!("as{}", r.asn).parse().unwrap(), - ), - interface_name: "phy0".into(), - addr: r.addr.into(), - hold_time: r.hold_time.unwrap_or(6) as u32, - idle_hold_time: r.idle_hold_time.unwrap_or(3) as u32, - delay_open: r.delay_open.unwrap_or(0) as u32, - connect_retry: r.connect_retry.unwrap_or(3) as u32, - keepalive: r.keepalive.unwrap_or(2) as u32, - }) - .collect(); + let mut port_settings_params = SwitchPortSettingsCreate { + identity, + port_config, + groups: vec![], + links: HashMap::new(), + interfaces: HashMap::new(), + routes: HashMap::new(), + bgp_peers: HashMap::new(), + addresses: HashMap::new(), + }; - port_settings_params - .bgp_peers - .insert("phy0".to_string(), BgpPeerConfig { peers }); + let addresses: Vec
= uplink_config + .addresses + .iter() + .map(|a| Address { + address_lot: NameOrId::Name(address_lot_name.clone()), + address: (*a).into(), + }) + .collect(); + + port_settings_params + .addresses + .insert("phy0".to_string(), AddressConfig { addresses }); + + let routes: Vec = uplink_config + .routes + .iter() + .map(|r| Route { + dst: r.destination.into(), + gw: r.nexthop, + vid: None, + }) + .collect(); + + port_settings_params + .routes + .insert("phy0".to_string(), RouteConfig { routes }); + + let peers: Vec = uplink_config + .bgp_peers + .iter() + .map(|r| BgpPeer { + bgp_announce_set: NameOrId::Name( + format!("as{}-announce", r.asn).parse().unwrap(), + ), + bgp_config: NameOrId::Name( + format!("as{}", r.asn).parse().unwrap(), + ), + interface_name: "phy0".into(), + addr: r.addr.into(), + hold_time: r.hold_time.unwrap_or(6) as u32, + idle_hold_time: r.idle_hold_time.unwrap_or(3) as u32, + delay_open: r.delay_open.unwrap_or(0) as u32, + connect_retry: r.connect_retry.unwrap_or(3) as u32, + keepalive: r.keepalive.unwrap_or(2) as u32, + }) + .collect(); + + port_settings_params + .bgp_peers + .insert("phy0".to_string(), BgpPeerConfig { peers }); + + let link = LinkConfigCreate { + mtu: 1500, //TODO https://github.com/oxidecomputer/omicron/issues/2274 + lldp: LldpServiceConfigCreate { + enabled: false, + lldp_config: None, + }, + fec: uplink_config.uplink_port_fec.into(), + speed: uplink_config.uplink_port_speed.into(), + autoneg: uplink_config.autoneg, + }; - let link = LinkConfigCreate { - mtu: 1500, //TODO https://github.com/oxidecomputer/omicron/issues/2274 - lldp: LldpServiceConfigCreate { - enabled: false, - lldp_config: None, - }, - fec: uplink_config.uplink_port_fec.into(), - speed: uplink_config.uplink_port_speed.into(), - autoneg: uplink_config.autoneg, - }; + port_settings_params.links.insert("phy".to_string(), link); - port_settings_params.links.insert("phy".to_string(), link); + match self + .db_datastore + .switch_port_settings_create(opctx, &port_settings_params, None) + .await + { + Ok(_) | Err(Error::ObjectAlreadyExists { .. }) => Ok(()), + Err(e) => Err(e), + }?; - match self - .db_datastore - .switch_port_settings_create( - opctx, - &port_settings_params, - None, - ) - .await - { - Ok(_) | Err(Error::ObjectAlreadyExists { .. 
}) => Ok(()), - Err(e) => Err(e), - }?; - - let port_settings_id = self - .db_datastore - .switch_port_settings_get_id( - opctx, - nexus_db_model::Name(name.clone()), - ) - .await?; + let port_settings_id = self + .db_datastore + .switch_port_settings_get_id( + opctx, + nexus_db_model::Name(name.clone()), + ) + .await?; - let switch_port_id = self - .db_datastore - .switch_port_get_id( - opctx, - rack_id, - switch_location.into(), - Name::from_str(&uplink_config.port).unwrap().into(), - ) - .await?; + let switch_port_id = self + .db_datastore + .switch_port_get_id( + opctx, + rack_id, + switch_location.into(), + Name::from_str(&uplink_config.port).unwrap().into(), + ) + .await?; + + self.db_datastore + .switch_port_set_settings_id( + opctx, + switch_port_id, + Some(port_settings_id), + db::datastore::UpdatePrecondition::Null, + ) + .await?; + } // TODO - https://github.com/oxidecomputer/omicron/issues/3277 + // record port speed - self.db_datastore - .switch_port_set_settings_id( - opctx, - switch_port_id, - Some(port_settings_id), - db::datastore::UpdatePrecondition::Null, - ) - .await?; - } // TODO - https://github.com/oxidecomputer/omicron/issues/3277 - // record port speed - }; self.initial_bootstore_sync(&opctx).await?; Ok(()) diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index e1392440a1..cb08bfcdc0 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -288,13 +288,15 @@ impl nexus_test_interface::NexusServer for Server { vec!["qsfp0".parse().unwrap()], )]), ), - rack_network_config: Some(RackNetworkConfig { - rack_subnet: "fd00:1122:3344:01::/56".parse().unwrap(), + rack_network_config: RackNetworkConfig { + rack_subnet: "fd00:1122:3344:0100::/56" + .parse() + .unwrap(), infra_ip_first: Ipv4Addr::UNSPECIFIED, infra_ip_last: Ipv4Addr::UNSPECIFIED, ports: Vec::new(), bgp: Vec::new(), - }), + }, }, ) .await diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index da21602cb1..7baacf97ce 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -65,7 +65,7 @@ pub const RACK_UUID: &str = "c19a698f-c6f9-4a17-ae30-20d711b8f7dc"; pub const SWITCH_UUID: &str = "dae4e1f1-410e-4314-bff1-fec0504be07e"; pub const OXIMETER_UUID: &str = "39e6175b-4df2-4730-b11d-cbc1e60a2e78"; pub const PRODUCER_UUID: &str = "a6458b7d-87c3-4483-be96-854d814c20de"; -pub const RACK_SUBNET: &str = "fd00:1122:3344:01::/56"; +pub const RACK_SUBNET: &str = "fd00:1122:3344:0100::/56"; /// Password for the user created by the test suite /// diff --git a/nexus/tests/integration_tests/rack.rs b/nexus/tests/integration_tests/rack.rs index a6fc93e92a..a58871ee71 100644 --- a/nexus/tests/integration_tests/rack.rs +++ b/nexus/tests/integration_tests/rack.rs @@ -110,7 +110,7 @@ async fn test_sled_list_uninitialized(cptestctx: &ControlPlaneTestContext) { let baseboard = uninitialized_sleds.pop().unwrap().baseboard; let sled_uuid = Uuid::new_v4(); let sa = SledAgentStartupInfo { - sa_address: "[fd00:1122:3344:01::1]:8080".parse().unwrap(), + sa_address: "[fd00:1122:3344:0100::1]:8080".parse().unwrap(), role: SledRole::Gimlet, baseboard: Baseboard { serial_number: baseboard.serial, diff --git a/nexus/types/src/internal_api/params.rs b/nexus/types/src/internal_api/params.rs index bc25e8d4bd..ab15ec26b7 100644 --- a/nexus/types/src/internal_api/params.rs +++ b/nexus/types/src/internal_api/params.rs @@ -263,7 +263,7 @@ pub struct RackInitializationRequest { /// The external qsfp ports per sidecar pub external_port_count: ExternalPortDiscovery, /// Initial rack network configuration - pub 
rack_network_config: Option, + pub rack_network_config: RackNetworkConfig, } pub type DnsConfigParams = dns_service_client::types::DnsConfigParams; diff --git a/openapi/bootstrap-agent.json b/openapi/bootstrap-agent.json index 6fd83cef47..a55803eda9 100644 --- a/openapi/bootstrap-agent.json +++ b/openapi/bootstrap-agent.json @@ -651,7 +651,6 @@ } }, "rack_network_config": { - "nullable": true, "description": "Initial rack network configuration", "allOf": [ { @@ -659,10 +658,6 @@ } ] }, - "rack_subnet": { - "type": "string", - "format": "ipv6" - }, "recovery_silo": { "description": "Configuration of the Recovery Silo (the initial Silo)", "allOf": [ @@ -688,7 +683,7 @@ "external_dns_zone_name", "internal_services_ip_pool_ranges", "ntp_servers", - "rack_subnet", + "rack_network_config", "recovery_silo" ] }, diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index bc26736b37..4714b64c52 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -5618,7 +5618,6 @@ } }, "rack_network_config": { - "nullable": true, "description": "Initial rack network configuration", "allOf": [ { @@ -5649,6 +5648,7 @@ "external_port_count", "internal_dns_zone_config", "internal_services_ip_pool_ranges", + "rack_network_config", "recovery_silo", "services" ] diff --git a/openapi/wicketd.json b/openapi/wicketd.json index 300e8412c3..b9645a174f 100644 --- a/openapi/wicketd.json +++ b/openapi/wicketd.json @@ -1132,7 +1132,7 @@ "nullable": true, "allOf": [ { - "$ref": "#/components/schemas/RackNetworkConfigV1" + "$ref": "#/components/schemas/UserSpecifiedRackNetworkConfig" } ] } @@ -2172,7 +2172,7 @@ } }, "rack_network_config": { - "$ref": "#/components/schemas/RackNetworkConfigV1" + "$ref": "#/components/schemas/UserSpecifiedRackNetworkConfig" } }, "required": [ @@ -2190,46 +2190,6 @@ "type": "string", "format": "uuid" }, - "RackNetworkConfigV1": { - "description": "Initial network configuration", - "type": "object", - "properties": { - "bgp": { - "description": "BGP configurations for connecting the rack to external networks", - "type": "array", - "items": { - "$ref": "#/components/schemas/BgpConfig" - } - }, - "infra_ip_first": { - "description": "First ip address to be used for configuring network infrastructure", - "type": "string", - "format": "ipv4" - }, - "infra_ip_last": { - "description": "Last ip address to be used for configuring network infrastructure", - "type": "string", - "format": "ipv4" - }, - "ports": { - "description": "Uplinks for connecting the rack to external networks", - "type": "array", - "items": { - "$ref": "#/components/schemas/PortConfigV1" - } - }, - "rack_subnet": { - "$ref": "#/components/schemas/Ipv6Network" - } - }, - "required": [ - "bgp", - "infra_ip_first", - "infra_ip_last", - "ports", - "rack_subnet" - ] - }, "RackOperationStatus": { "description": "Current status of any rack-level operation being performed by this bootstrap agent.\n\n
JSON schema\n\n```json { \"description\": \"Current status of any rack-level operation being performed by this bootstrap agent.\", \"oneOf\": [ { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackInitId\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initializing\" ] } } }, { \"description\": \"`id` will be none if the rack was already initialized on startup.\", \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"id\": { \"allOf\": [ { \"$ref\": \"#/components/schemas/RackInitId\" } ] }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialized\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"message\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackInitId\" }, \"message\": { \"type\": \"string\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialization_failed\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackInitId\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialization_panicked\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackResetId\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"resetting\" ] } } }, { \"description\": \"`reset_id` will be None if the rack is in an uninitialized-on-startup, or Some if it is in an uninitialized state due to a reset operation completing.\", \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"reset_id\": { \"allOf\": [ { \"$ref\": \"#/components/schemas/RackResetId\" } ] }, \"status\": { \"type\": \"string\", \"enum\": [ \"uninitialized\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"message\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackResetId\" }, \"message\": { \"type\": \"string\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"reset_failed\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackResetId\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"reset_panicked\" ] } } } ] } ```
", "oneOf": [ @@ -4698,6 +4658,38 @@ } ] }, + "UserSpecifiedRackNetworkConfig": { + "description": "User-specified parts of [`RackNetworkConfig`](omicron_common::api::internal::shared::RackNetworkConfig).", + "type": "object", + "properties": { + "bgp": { + "type": "array", + "items": { + "$ref": "#/components/schemas/BgpConfig" + } + }, + "infra_ip_first": { + "type": "string", + "format": "ipv4" + }, + "infra_ip_last": { + "type": "string", + "format": "ipv4" + }, + "ports": { + "type": "array", + "items": { + "$ref": "#/components/schemas/PortConfigV1" + } + } + }, + "required": [ + "bgp", + "infra_ip_first", + "infra_ip_last", + "ports" + ] + }, "IgnitionCommand": { "description": "Ignition command.\n\n
JSON schema\n\n```json { \"description\": \"Ignition command.\", \"type\": \"string\", \"enum\": [ \"power_on\", \"power_off\", \"power_reset\" ] } ```
", "type": "string", diff --git a/schema/rss-sled-plan.json b/schema/rss-sled-plan.json index cbd73ed066..f5ac5bd0ff 100644 --- a/schema/rss-sled-plan.json +++ b/schema/rss-sled-plan.json @@ -466,7 +466,7 @@ "external_dns_zone_name", "internal_services_ip_pool_ranges", "ntp_servers", - "rack_subnet", + "rack_network_config", "recovery_silo" ], "properties": { @@ -521,19 +521,12 @@ }, "rack_network_config": { "description": "Initial rack network configuration", - "anyOf": [ + "allOf": [ { "$ref": "#/definitions/RackNetworkConfigV1" - }, - { - "type": "null" } ] }, - "rack_subnet": { - "type": "string", - "format": "ipv6" - }, "recovery_silo": { "description": "Configuration of the Recovery Silo (the initial Silo)", "allOf": [ diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index b684d96763..48444af8d4 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -14,7 +14,7 @@ use serde::{Deserialize, Serialize}; use sha3::{Digest, Sha3_256}; use sled_hardware::Baseboard; use std::borrow::Cow; -use std::collections::HashSet; +use std::collections::BTreeSet; use std::net::{IpAddr, Ipv6Addr, SocketAddrV6}; use uuid::Uuid; @@ -24,14 +24,13 @@ pub enum BootstrapAddressDiscovery { /// Ignore all bootstrap addresses except our own. OnlyOurs, /// Ignore all bootstrap addresses except the following. - OnlyThese { addrs: HashSet }, + OnlyThese { addrs: BTreeSet }, } // "Shadow" copy of `RackInitializeRequest` that does no validation on its // fields. #[derive(Clone, Deserialize)] struct UnvalidatedRackInitializeRequest { - rack_subnet: Ipv6Addr, trust_quorum_peers: Option>, bootstrap_discovery: BootstrapAddressDiscovery, ntp_servers: Vec, @@ -41,7 +40,7 @@ struct UnvalidatedRackInitializeRequest { external_dns_zone_name: String, external_certificates: Vec, recovery_silo: RecoverySiloConfig, - rack_network_config: Option, + rack_network_config: RackNetworkConfig, } /// Configuration for the "rack setup service". @@ -53,8 +52,6 @@ struct UnvalidatedRackInitializeRequest { #[derive(Clone, Deserialize, Serialize, PartialEq, JsonSchema)] #[serde(try_from = "UnvalidatedRackInitializeRequest")] pub struct RackInitializeRequest { - pub rack_subnet: Ipv6Addr, - /// The set of peer_ids required to initialize trust quorum /// /// The value is `None` if we are not using trust quorum @@ -89,7 +86,7 @@ pub struct RackInitializeRequest { pub recovery_silo: RecoverySiloConfig, /// Initial rack network configuration - pub rack_network_config: Option, + pub rack_network_config: RackNetworkConfig, } // This custom debug implementation hides the private keys. @@ -98,7 +95,6 @@ impl std::fmt::Debug for RackInitializeRequest { // If you find a compiler error here, and you just added a field to this // struct, be sure to add it to the Debug impl below! 
let RackInitializeRequest { - rack_subnet, trust_quorum_peers: trust_qurorum_peers, bootstrap_discovery, ntp_servers, @@ -112,7 +108,6 @@ impl std::fmt::Debug for RackInitializeRequest { } = &self; f.debug_struct("RackInitializeRequest") - .field("rack_subnet", rack_subnet) .field("trust_quorum_peers", trust_qurorum_peers) .field("bootstrap_discovery", bootstrap_discovery) .field("ntp_servers", ntp_servers) @@ -155,7 +150,6 @@ impl TryFrom for RackInitializeRequest { } Ok(RackInitializeRequest { - rack_subnet: value.rack_subnet, trust_quorum_peers: value.trust_quorum_peers, bootstrap_discovery: value.bootstrap_discovery, ntp_servers: value.ntp_servers, @@ -368,6 +362,7 @@ pub fn test_config() -> RackInitializeRequest { #[cfg(test)] mod tests { + use std::net::Ipv4Addr; use std::net::Ipv6Addr; use super::*; @@ -395,7 +390,6 @@ mod tests { #[test] fn parse_rack_initialization_weak_hash() { let config = r#" - rack_subnet = "fd00:1122:3344:0100::" bootstrap_discovery.type = "only_ours" ntp_servers = [ "ntp.eng.oxide.computer" ] dns_servers = [ "1.1.1.1", "9.9.9.9" ] @@ -480,7 +474,6 @@ mod tests { // Conjure up a config; we'll tweak the internal services pools and // external DNS IPs, but no other fields matter. let mut config = UnvalidatedRackInitializeRequest { - rack_subnet: Ipv6Addr::LOCALHOST, trust_quorum_peers: None, bootstrap_discovery: BootstrapAddressDiscovery::OnlyOurs, ntp_servers: Vec::new(), @@ -494,7 +487,13 @@ mod tests { user_name: "recovery".parse().unwrap(), user_password_hash: "$argon2id$v=19$m=98304,t=13,p=1$RUlWc0ZxaHo0WFdrN0N6ZQ$S8p52j85GPvMhR/ek3GL0el/oProgTwWpHJZ8lsQQoY".parse().unwrap(), }, - rack_network_config: None, + rack_network_config: RackNetworkConfig { + rack_subnet: Ipv6Addr::LOCALHOST.into(), + infra_ip_first: Ipv4Addr::LOCALHOST, + infra_ip_last: Ipv4Addr::LOCALHOST, + ports: Vec::new(), + bgp: Vec::new(), + }, }; // Valid configs: all external DNS IPs are contained in the IP pool diff --git a/sled-agent/src/rack_setup/config.rs b/sled-agent/src/rack_setup/config.rs index 33de7121d4..52bea295a5 100644 --- a/sled-agent/src/rack_setup/config.rs +++ b/sled-agent/src/rack_setup/config.rs @@ -70,12 +70,14 @@ impl SetupServiceConfig { } pub fn az_subnet(&self) -> Ipv6Subnet { - Ipv6Subnet::::new(self.rack_subnet) + Ipv6Subnet::::new(self.rack_network_config.rack_subnet.ip()) } /// Returns the subnet for our rack. pub fn rack_subnet(&self) -> Ipv6Subnet { - Ipv6Subnet::::new(self.rack_subnet) + Ipv6Subnet::::new( + self.rack_network_config.rack_subnet.ip(), + ) } /// Returns the subnet for the `index`-th sled in the rack. 
@@ -92,12 +94,12 @@ mod test { use anyhow::Context; use camino::Utf8PathBuf; use omicron_common::address::IpRange; + use omicron_common::api::internal::shared::RackNetworkConfig; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; #[test] fn test_subnets() { let cfg = SetupServiceConfig { - rack_subnet: "fd00:1122:3344:0100::".parse().unwrap(), trust_quorum_peers: None, bootstrap_discovery: BootstrapAddressDiscovery::OnlyOurs, ntp_servers: vec![String::from("test.pool.example.com")], @@ -119,7 +121,13 @@ mod test { .parse() .unwrap(), }, - rack_network_config: None, + rack_network_config: RackNetworkConfig { + rack_subnet: "fd00:1122:3344:0100::".parse().unwrap(), + infra_ip_first: Ipv4Addr::LOCALHOST, + infra_ip_last: Ipv4Addr::LOCALHOST, + ports: Vec::new(), + bgp: Vec::new(), + }, }; assert_eq!( diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index bed82a7a01..220f0d686b 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -35,7 +35,7 @@ use sled_agent_client::{ use sled_storage::dataset::{DatasetKind, DatasetName, CONFIG_DATASET}; use sled_storage::manager::StorageHandle; use slog::Logger; -use std::collections::{BTreeSet, HashMap, HashSet}; +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV6}; use std::num::Wrapping; use thiserror::Error; @@ -708,7 +708,7 @@ impl Plan { log: &Logger, config: &Config, storage_manager: &StorageHandle, - sleds: &HashMap, + sleds: &BTreeMap, ) -> Result { // Load the information we need about each Sled to be able to allocate // components on it. @@ -1078,6 +1078,7 @@ mod tests { use crate::bootstrap::params::BootstrapAddressDiscovery; use crate::bootstrap::params::RecoverySiloConfig; use omicron_common::address::IpRange; + use omicron_common::api::internal::shared::RackNetworkConfig; const EXPECTED_RESERVED_ADDRESSES: u16 = 2; const EXPECTED_USABLE_ADDRESSES: u16 = @@ -1149,7 +1150,6 @@ mod tests { "fd01::103", ]; let config = Config { - rack_subnet: Ipv6Addr::LOCALHOST, trust_quorum_peers: None, bootstrap_discovery: BootstrapAddressDiscovery::OnlyOurs, ntp_servers: Vec::new(), @@ -1173,7 +1173,13 @@ mod tests { user_name: "recovery".parse().unwrap(), user_password_hash: "$argon2id$v=19$m=98304,t=13,p=1$RUlWc0ZxaHo0WFdrN0N6ZQ$S8p52j85GPvMhR/ek3GL0el/oProgTwWpHJZ8lsQQoY".parse().unwrap(), }, - rack_network_config: None, + rack_network_config: RackNetworkConfig { + rack_subnet: Ipv6Addr::LOCALHOST.into(), + infra_ip_first: Ipv4Addr::LOCALHOST, + infra_ip_last: Ipv4Addr::LOCALHOST, + ports: Vec::new(), + bgp: Vec::new(), + }, }; let mut svp = ServicePortBuilder::new(&config); diff --git a/sled-agent/src/rack_setup/plan/sled.rs b/sled-agent/src/rack_setup/plan/sled.rs index 07f33893fc..efdd86d2f9 100644 --- a/sled-agent/src/rack_setup/plan/sled.rs +++ b/sled-agent/src/rack_setup/plan/sled.rs @@ -16,7 +16,7 @@ use serde::{Deserialize, Serialize}; use sled_storage::dataset::CONFIG_DATASET; use sled_storage::manager::StorageHandle; use slog::Logger; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, BTreeSet}; use std::net::{Ipv6Addr, SocketAddrV6}; use thiserror::Error; use uuid::Uuid; @@ -46,7 +46,7 @@ const RSS_SLED_PLAN_FILENAME: &str = "rss-sled-plan.json"; #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] pub struct Plan { pub rack_id: Uuid, - pub sleds: HashMap, + pub sleds: BTreeMap, // Store the provided RSS configuration as part of the sled 
plan; if it // changes after reboot, we need to know. @@ -81,7 +81,7 @@ impl Plan { log: &Logger, config: &Config, storage_manager: &StorageHandle, - bootstrap_addrs: HashSet, + bootstrap_addrs: BTreeSet, use_trust_quorum: bool, ) -> Result { let rack_id = Uuid::new_v4(); @@ -117,7 +117,7 @@ impl Plan { info!(log, "Serializing plan"); - let mut sleds = std::collections::HashMap::new(); + let mut sleds = BTreeMap::new(); for (addr, allocation) in allocations { sleds.insert(addr, allocation); } @@ -152,4 +152,24 @@ mod tests { &serde_json::to_string_pretty(&schema).unwrap(), ); } + + #[test] + fn test_read_known_rss_sled_plans() { + let known_rss_sled_plans = &["madrid-rss-sled-plan.json"]; + + let path = Utf8PathBuf::from("tests/old-rss-sled-plans"); + let out_path = Utf8PathBuf::from("tests/output/new-rss-sled-plans"); + for sled_plan_basename in known_rss_sled_plans { + println!("checking {:?}", sled_plan_basename); + let contents = + std::fs::read_to_string(path.join(sled_plan_basename)) + .expect("failed to read file"); + let parsed: Plan = + serde_json::from_str(&contents).expect("failed to parse file"); + expectorate::assert_contents( + out_path.join(sled_plan_basename), + &serde_json::to_string_pretty(&parsed).unwrap(), + ); + } + } } diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index af81df52bb..2788e189cc 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -601,58 +601,55 @@ impl ServiceInner { .map(Into::into) .collect(); - let rack_network_config = match &config.rack_network_config { - Some(config) => { - let value = NexusTypes::RackNetworkConfigV1 { - rack_subnet: config.rack_subnet, - infra_ip_first: config.infra_ip_first, - infra_ip_last: config.infra_ip_last, - ports: config - .ports - .iter() - .map(|config| NexusTypes::PortConfigV1 { - port: config.port.clone(), - routes: config - .routes - .iter() - .map(|r| NexusTypes::RouteConfig { - destination: r.destination, - nexthop: r.nexthop, - }) - .collect(), - addresses: config.addresses.clone(), - switch: config.switch.into(), - uplink_port_speed: config.uplink_port_speed.into(), - uplink_port_fec: config.uplink_port_fec.into(), - autoneg: config.autoneg, - bgp_peers: config - .bgp_peers - .iter() - .map(|b| NexusTypes::BgpPeerConfig { - addr: b.addr, - asn: b.asn, - port: b.port.clone(), - hold_time: b.hold_time, - connect_retry: b.connect_retry, - delay_open: b.delay_open, - idle_hold_time: b.idle_hold_time, - keepalive: b.keepalive, - }) - .collect(), - }) - .collect(), - bgp: config - .bgp - .iter() - .map(|config| NexusTypes::BgpConfig { - asn: config.asn, - originate: config.originate.clone(), - }) - .collect(), - }; - Some(value) + let rack_network_config = { + let config = &config.rack_network_config; + NexusTypes::RackNetworkConfigV1 { + rack_subnet: config.rack_subnet, + infra_ip_first: config.infra_ip_first, + infra_ip_last: config.infra_ip_last, + ports: config + .ports + .iter() + .map(|config| NexusTypes::PortConfigV1 { + port: config.port.clone(), + routes: config + .routes + .iter() + .map(|r| NexusTypes::RouteConfig { + destination: r.destination, + nexthop: r.nexthop, + }) + .collect(), + addresses: config.addresses.clone(), + switch: config.switch.into(), + uplink_port_speed: config.uplink_port_speed.into(), + uplink_port_fec: config.uplink_port_fec.into(), + autoneg: config.autoneg, + bgp_peers: config + .bgp_peers + .iter() + .map(|b| NexusTypes::BgpPeerConfig { + addr: b.addr, + asn: b.asn, + port: b.port.clone(), + 
+                                hold_time: b.hold_time,
+                                connect_retry: b.connect_retry,
+                                delay_open: b.delay_open,
+                                idle_hold_time: b.idle_hold_time,
+                                keepalive: b.keepalive,
+                            })
+                            .collect(),
+                    })
+                    .collect(),
+                bgp: config
+                    .bgp
+                    .iter()
+                    .map(|config| NexusTypes::BgpConfig {
+                        asn: config.asn,
+                        originate: config.originate.clone(),
+                    })
+                    .collect(),
             }
-            None => None,
         };
 
         info!(self.log, "rack_network_config: {:#?}", rack_network_config);
@@ -868,14 +865,14 @@ impl ServiceInner {
         //   - Enough peers to create a new plan (if one does not exist)
         let bootstrap_addrs = match &config.bootstrap_discovery {
             BootstrapAddressDiscovery::OnlyOurs => {
-                HashSet::from([local_bootstrap_agent.our_address()])
+                BTreeSet::from([local_bootstrap_agent.our_address()])
             }
             BootstrapAddressDiscovery::OnlyThese { addrs } => addrs.clone(),
         };
         let maybe_sled_plan = SledPlan::load(&self.log, storage_manager).await?;
         if let Some(plan) = &maybe_sled_plan {
-            let stored_peers: HashSet<Ipv6Addr> =
+            let stored_peers: BTreeSet<Ipv6Addr> =
                 plan.sleds.keys().map(|a| *a.ip()).collect();
             if stored_peers != bootstrap_addrs {
                 let e = concat!(
@@ -931,7 +928,7 @@ impl ServiceInner {
             schema_version: 1,
             body: EarlyNetworkConfigBody {
                 ntp_servers: config.ntp_servers.clone(),
-                rack_network_config: config.rack_network_config.clone(),
+                rack_network_config: Some(config.rack_network_config.clone()),
             },
         };
         info!(self.log, "Writing Rack Network Configuration to bootstore");
diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs
index b214667631..fd5995b8f1 100644
--- a/sled-agent/src/sim/server.rs
+++ b/sled-agent/src/sim/server.rs
@@ -26,6 +26,8 @@ use omicron_common::FileKv;
 use slog::{info, Drain, Logger};
 use std::collections::HashMap;
 use std::net::IpAddr;
+use std::net::Ipv4Addr;
+use std::net::Ipv6Addr;
 use std::net::SocketAddr;
 use std::net::SocketAddrV6;
 use std::sync::Arc;
@@ -455,7 +457,13 @@ pub async fn run_standalone_server(
         external_port_count: NexusTypes::ExternalPortDiscovery::Static(
             HashMap::new(),
         ),
-        rack_network_config: None,
+        rack_network_config: NexusTypes::RackNetworkConfigV1 {
+            rack_subnet: Ipv6Addr::LOCALHOST.into(),
+            infra_ip_first: Ipv4Addr::LOCALHOST,
+            infra_ip_last: Ipv4Addr::LOCALHOST,
+            ports: Vec::new(),
+            bgp: Vec::new(),
+        },
     };
 
     handoff_to_nexus(&log, &config, &rack_init_request).await?;
diff --git a/sled-agent/tests/old-rss-sled-plans/madrid-rss-sled-plan.json b/sled-agent/tests/old-rss-sled-plans/madrid-rss-sled-plan.json
new file mode 100644
index 0000000000..5512247ee8
--- /dev/null
+++ b/sled-agent/tests/old-rss-sled-plans/madrid-rss-sled-plan.json
@@ -0,0 +1 @@
+{"rack_id":"ed6bcf59-9620-491d-8ebd-4a4eebf2e136","sleds":{"[fdb0:a840:2504:396::1]:12346":{"generation":0,"schema_version":1,"body":{"id":"b3e78a88-0f2e-476e-a8a9-2d8c90a169d6","rack_id":"ed6bcf59-9620-491d-8ebd-4a4eebf2e136","use_trust_quorum":true,"is_lrtq_learner":false,"subnet":{"net":"fd00:1122:3344:103::/64"}}},"[fdb0:a840:2504:157::1]:12346":{"generation":0,"schema_version":1,"body":{"id":"168e1ad6-1e4b-4f7a-b894-157974bd8bb8","rack_id":"ed6bcf59-9620-491d-8ebd-4a4eebf2e136","use_trust_quorum":true,"is_lrtq_learner":false,"subnet":{"net":"fd00:1122:3344:104::/64"}}},"[fdb0:a840:2504:355::1]:12346":{"generation":0,"schema_version":1,"body":{"id":"b9877212-212b-4588-b818-9c7b53c5b143","rack_id":"ed6bcf59-9620-491d-8ebd-4a4eebf2e136","use_trust_quorum":true,"is_lrtq_learner":false,"subnet":{"net":"fd00:1122:3344:102::/64"}}},"[fdb0:a840:2504:3d2::1]:12346":{"generation":0,"schema_version":1,"body":{"id":"c3a0f8be-5b05-4ee8-8c4e-2514de6501b6","rack_id":"ed6bcf59-9620-491d-8ebd-4a4eebf2e136","use_trust_quorum":true,"is_lrtq_learner":false,"subnet":{"net":"fd00:1122:3344:101::/64"}}}},"config":{"rack_subnet":"fd00:1122:3344:100::","trust_quorum_peers":[{"type":"gimlet","identifier":"BRM42220081","model":"913-0000019","revision":6},{"type":"gimlet","identifier":"BRM42220046","model":"913-0000019","revision":6},{"type":"gimlet","identifier":"BRM44220001","model":"913-0000019","revision":6},{"type":"gimlet","identifier":"BRM42220004","model":"913-0000019","revision":6}],"bootstrap_discovery":{"type":"only_these","addrs":["fdb0:a840:2504:3d2::1","fdb0:a840:2504:355::1","fdb0:a840:2504:396::1","fdb0:a840:2504:157::1"]},"ntp_servers":["ntp.eng.oxide.computer"],"dns_servers":["1.1.1.1","9.9.9.9"],"internal_services_ip_pool_ranges":[{"first":"172.20.28.1","last":"172.20.28.10"}],"external_dns_ips":["172.20.28.1"],"external_dns_zone_name":"madrid.eng.oxide.computer","external_certificates":[{"cert":"","key":""}],"recovery_silo":{"silo_name":"recovery","user_name":"recovery","user_password_hash":"$argon2id$v=19$m=98304,t=13,p=1$RUlWc0ZxaHo0WFdrN0N6ZQ$S8p52j85GPvMhR/ek3GL0el/oProgTwWpHJZ8lsQQoY"},"rack_network_config":{"rack_subnet":"fd00:1122:3344:1::/56","infra_ip_first":"172.20.15.37","infra_ip_last":"172.20.15.38","ports":[{"routes":[{"destination":"0.0.0.0/0","nexthop":"172.20.15.33"}],"addresses":["172.20.15.38/29"],"switch":"switch0","port":"qsfp0","uplink_port_speed":"speed40_g","uplink_port_fec":"none","bgp_peers":[],"autoneg":false},{"routes":[{"destination":"0.0.0.0/0","nexthop":"172.20.15.33"}],"addresses":["172.20.15.37/29"],"switch":"switch1","port":"qsfp0","uplink_port_speed":"speed40_g","uplink_port_fec":"none","bgp_peers":[],"autoneg":false}],"bgp":[]}}} diff --git a/sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json b/sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json new file mode 100644 index 0000000000..69f68c60ad --- /dev/null +++ b/sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json @@ -0,0 +1,164 @@ +{ + "rack_id": "ed6bcf59-9620-491d-8ebd-4a4eebf2e136", + "sleds": { + "[fdb0:a840:2504:157::1]:12346": { + "generation": 0, + "schema_version": 1, + "body": { + "id": "168e1ad6-1e4b-4f7a-b894-157974bd8bb8", + "rack_id": "ed6bcf59-9620-491d-8ebd-4a4eebf2e136", + "use_trust_quorum": true, + "is_lrtq_learner": false, + "subnet": { + "net": "fd00:1122:3344:104::/64" + } + } + }, + "[fdb0:a840:2504:355::1]:12346": { + "generation": 0, + "schema_version": 1, + "body": { + "id": "b9877212-212b-4588-b818-9c7b53c5b143", + "rack_id": 
"ed6bcf59-9620-491d-8ebd-4a4eebf2e136", + "use_trust_quorum": true, + "is_lrtq_learner": false, + "subnet": { + "net": "fd00:1122:3344:102::/64" + } + } + }, + "[fdb0:a840:2504:396::1]:12346": { + "generation": 0, + "schema_version": 1, + "body": { + "id": "b3e78a88-0f2e-476e-a8a9-2d8c90a169d6", + "rack_id": "ed6bcf59-9620-491d-8ebd-4a4eebf2e136", + "use_trust_quorum": true, + "is_lrtq_learner": false, + "subnet": { + "net": "fd00:1122:3344:103::/64" + } + } + }, + "[fdb0:a840:2504:3d2::1]:12346": { + "generation": 0, + "schema_version": 1, + "body": { + "id": "c3a0f8be-5b05-4ee8-8c4e-2514de6501b6", + "rack_id": "ed6bcf59-9620-491d-8ebd-4a4eebf2e136", + "use_trust_quorum": true, + "is_lrtq_learner": false, + "subnet": { + "net": "fd00:1122:3344:101::/64" + } + } + } + }, + "config": { + "trust_quorum_peers": [ + { + "type": "gimlet", + "identifier": "BRM42220081", + "model": "913-0000019", + "revision": 6 + }, + { + "type": "gimlet", + "identifier": "BRM42220046", + "model": "913-0000019", + "revision": 6 + }, + { + "type": "gimlet", + "identifier": "BRM44220001", + "model": "913-0000019", + "revision": 6 + }, + { + "type": "gimlet", + "identifier": "BRM42220004", + "model": "913-0000019", + "revision": 6 + } + ], + "bootstrap_discovery": { + "type": "only_these", + "addrs": [ + "fdb0:a840:2504:157::1", + "fdb0:a840:2504:355::1", + "fdb0:a840:2504:396::1", + "fdb0:a840:2504:3d2::1" + ] + }, + "ntp_servers": [ + "ntp.eng.oxide.computer" + ], + "dns_servers": [ + "1.1.1.1", + "9.9.9.9" + ], + "internal_services_ip_pool_ranges": [ + { + "first": "172.20.28.1", + "last": "172.20.28.10" + } + ], + "external_dns_ips": [ + "172.20.28.1" + ], + "external_dns_zone_name": "madrid.eng.oxide.computer", + "external_certificates": [ + { + "cert": "", + "key": "" + } + ], + "recovery_silo": { + "silo_name": "recovery", + "user_name": "recovery", + "user_password_hash": "$argon2id$v=19$m=98304,t=13,p=1$RUlWc0ZxaHo0WFdrN0N6ZQ$S8p52j85GPvMhR/ek3GL0el/oProgTwWpHJZ8lsQQoY" + }, + "rack_network_config": { + "rack_subnet": "fd00:1122:3344:1::/56", + "infra_ip_first": "172.20.15.37", + "infra_ip_last": "172.20.15.38", + "ports": [ + { + "routes": [ + { + "destination": "0.0.0.0/0", + "nexthop": "172.20.15.33" + } + ], + "addresses": [ + "172.20.15.38/29" + ], + "switch": "switch0", + "port": "qsfp0", + "uplink_port_speed": "speed40_g", + "uplink_port_fec": "none", + "bgp_peers": [], + "autoneg": false + }, + { + "routes": [ + { + "destination": "0.0.0.0/0", + "nexthop": "172.20.15.33" + } + ], + "addresses": [ + "172.20.15.37/29" + ], + "switch": "switch1", + "port": "qsfp0", + "uplink_port_speed": "speed40_g", + "uplink_port_fec": "none", + "bgp_peers": [], + "autoneg": false + } + ], + "bgp": [] + } + } +} \ No newline at end of file diff --git a/smf/sled-agent/gimlet-standalone/config-rss.toml b/smf/sled-agent/gimlet-standalone/config-rss.toml index f7a93260e3..6c874d9a70 100644 --- a/smf/sled-agent/gimlet-standalone/config-rss.toml +++ b/smf/sled-agent/gimlet-standalone/config-rss.toml @@ -4,14 +4,6 @@ # Agent API. See the `RackInitializeRequest` type in bootstrap-agent or its # OpenAPI spec (in openapi/bootstrap-agent.json in the root of this workspace). -# The /56 subnet for this rack. This subnet is internal to the rack and fully -# managed by Omicron, so you can pick anything you want within the IPv6 Unique -# Local Address (ULA) range. The rack-specific /56 subnet also implies the -# parent /48 AZ subnet. 
-# |............| <- This /48 is the AZ Subnet -# |...............| <- This /56 is the Rack Subnet -rack_subnet = "fd00:1122:3344:0100::" - # Only include "our own sled" in the bootstrap network bootstrap_discovery.type = "only_ours" @@ -88,7 +80,14 @@ last = "192.168.1.29" # Configuration to bring up Boundary Services and make Nexus reachable from the # outside. See docs/how-to-run.adoc for more on what to put here. [rack_network_config] -rack_subnet = "fd00:1122:3344:01::/56" +# The /56 subnet for this rack. This subnet is internal to the rack and fully +# managed by Omicron, so you can pick anything you want within the IPv6 Unique +# Local Address (ULA) range. The rack-specific /56 subnet also implies the +# parent /48 AZ subnet. +# |............| <- This /48 is the AZ Subnet +# |...............| <- This /56 is the Rack Subnet +rack_subnet = "fd00:1122:3344:0100::/56" + # A range of IP addresses used by Boundary Services on the external network. In # a real system, these would be addresses of the uplink ports on the Sidecar. # With softnpu, only one address is used. diff --git a/smf/sled-agent/non-gimlet/config-rss.toml b/smf/sled-agent/non-gimlet/config-rss.toml index 12cb2afd24..d0b4f94d9f 100644 --- a/smf/sled-agent/non-gimlet/config-rss.toml +++ b/smf/sled-agent/non-gimlet/config-rss.toml @@ -4,14 +4,6 @@ # Agent API. See the `RackInitializeRequest` type in bootstrap-agent or its # OpenAPI spec (in openapi/bootstrap-agent.json in the root of this workspace). -# The /56 subnet for this rack. This subnet is internal to the rack and fully -# managed by Omicron, so you can pick anything you want within the IPv6 Unique -# Local Address (ULA) range. The rack-specific /56 subnet also implies the -# parent /48 AZ subnet. -# |............| <- This /48 is the AZ Subnet -# |...............| <- This /56 is the Rack Subnet -rack_subnet = "fd00:1122:3344:0100::" - # Only include "our own sled" in the bootstrap network bootstrap_discovery.type = "only_ours" @@ -88,7 +80,14 @@ last = "192.168.1.29" # Configuration to bring up Boundary Services and make Nexus reachable from the # outside. See docs/how-to-run.adoc for more on what to put here. [rack_network_config] -rack_subnet = "fd00:1122:3344:01::/56" +# The /56 subnet for this rack. This subnet is internal to the rack and fully +# managed by Omicron, so you can pick anything you want within the IPv6 Unique +# Local Address (ULA) range. The rack-specific /56 subnet also implies the +# parent /48 AZ subnet. +# |............| <- This /48 is the AZ Subnet +# |...............| <- This /56 is the Rack Subnet +rack_subnet = "fd00:1122:3344:0100::/56" + # A range of IP addresses used by Boundary Services on the external network. In # a real system, these would be addresses of the uplink ports on the Sidecar. # With softnpu, only one address is used. 
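As an aside, the relationship the comments above describe — the /56 rack subnet implying its parent /48 AZ subnet — can be sketched in code. The following is illustrative only and not part of the patch: it assumes the `Ipv6Subnet`, `AZ_PREFIX` (/48), and `RACK_PREFIX` (/56) helpers from `omicron_common::address`, with `Ipv6Subnet::<N>::new` masking its argument to an N-bit prefix; the `az_subnet_of` helper is hypothetical.

[source,rust]
----
use omicron_common::address::{Ipv6Subnet, AZ_PREFIX, RACK_PREFIX};

// Hypothetical helper: derive the containing /48 AZ subnet by
// re-masking the rack subnet's base address at the shorter prefix.
fn az_subnet_of(rack: Ipv6Subnet<RACK_PREFIX>) -> Ipv6Subnet<AZ_PREFIX> {
    Ipv6Subnet::<AZ_PREFIX>::new(rack.net().network())
}

fn main() {
    let rack = Ipv6Subnet::<RACK_PREFIX>::new(
        "fd00:1122:3344:0100::".parse().unwrap(),
    );
    // Prints the /48 AZ subnet that contains the /56 rack subnet above.
    println!("{:?}", az_subnet_of(rack));
}
----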
diff --git a/wicket-common/src/rack_setup.rs b/wicket-common/src/rack_setup.rs
index e3d5fad5fb..f28c0639a9 100644
--- a/wicket-common/src/rack_setup.rs
+++ b/wicket-common/src/rack_setup.rs
@@ -5,12 +5,24 @@
 // Copyright 2023 Oxide Computer Company
 
 use omicron_common::address;
-use omicron_common::api::internal::shared::RackNetworkConfig;
+use omicron_common::api::internal::shared::BgpConfig;
+use omicron_common::api::internal::shared::PortConfigV1;
 use schemars::JsonSchema;
 use serde::Deserialize;
 use serde::Serialize;
 use std::collections::BTreeSet;
 use std::net::IpAddr;
+use std::net::Ipv4Addr;
+
+/// User-specified parts of
+/// [`RackNetworkConfig`](omicron_common::api::internal::shared::RackNetworkConfig).
+#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)]
+pub struct UserSpecifiedRackNetworkConfig {
+    pub infra_ip_first: Ipv4Addr,
+    pub infra_ip_last: Ipv4Addr,
+    pub ports: Vec<PortConfigV1>,
+    pub bgp: Vec<BgpConfig>,
+}
 
 // The portion of `CurrentRssUserConfig` that can be posted in one shot; it is
 // provided by the wicket user uploading a TOML file, currently.
@@ -27,5 +39,5 @@ pub struct PutRssUserConfigInsensitive {
     pub internal_services_ip_pool_ranges: Vec<address::IpRange>,
     pub external_dns_ips: Vec<IpAddr>,
     pub external_dns_zone_name: String,
-    pub rack_network_config: RackNetworkConfig,
+    pub rack_network_config: UserSpecifiedRackNetworkConfig,
 }
diff --git a/wicket/src/cli/rack_setup/config_template.toml b/wicket/src/cli/rack_setup/config_template.toml
index 2886fa01d7..d091237b5f 100644
--- a/wicket/src/cli/rack_setup/config_template.toml
+++ b/wicket/src/cli/rack_setup/config_template.toml
@@ -40,7 +40,6 @@ bootstrap_sleds = []
 
 # TODO: docs on network config
 [rack_network_config]
-rack_subnet = ""
 infra_ip_first = ""
 infra_ip_last = ""
 
diff --git a/wicket/src/cli/rack_setup/config_toml.rs b/wicket/src/cli/rack_setup/config_toml.rs
index 5a8e8a560e..d050610c30 100644
--- a/wicket/src/cli/rack_setup/config_toml.rs
+++ b/wicket/src/cli/rack_setup/config_toml.rs
@@ -19,7 +19,7 @@ use wicket_common::rack_update::SpType;
 use wicketd_client::types::BootstrapSledDescription;
 use wicketd_client::types::CurrentRssUserConfigInsensitive;
 use wicketd_client::types::IpRange;
-use wicketd_client::types::RackNetworkConfigV1;
+use wicketd_client::types::UserSpecifiedRackNetworkConfig;
 
 static TEMPLATE: &str = include_str!("config_template.toml");
 
@@ -176,7 +176,7 @@ fn build_sleds_array(sleds: &[BootstrapSledDescription]) -> Array {
 
 fn populate_network_table(
     table: &mut Table,
-    config: Option<&RackNetworkConfigV1>,
+    config: Option<&UserSpecifiedRackNetworkConfig>,
 ) {
     // Helper function to serialize enums into their appropriate string
     // representations.
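For orientation, here is a hedged sketch of constructing the `UserSpecifiedRackNetworkConfig` type added above; it is not part of the patch, and the field values are placeholders rather than defaults.

[source,rust]
----
use wicket_common::rack_setup::UserSpecifiedRackNetworkConfig;

#[test]
fn build_minimal_user_network_config() {
    // Minimal config with no uplink ports and no BGP peers; note that
    // the rack subnet is no longer user-specified (wicketd fills it in,
    // as rss_config.rs does later in this patch).
    let config = UserSpecifiedRackNetworkConfig {
        infra_ip_first: "172.30.0.1".parse().unwrap(),
        infra_ip_last: "172.30.0.10".parse().unwrap(),
        ports: Vec::new(),
        bgp: Vec::new(),
    };
    assert!(config.ports.is_empty() && config.bgp.is_empty());
}
----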
@@ -195,7 +195,6 @@ fn populate_network_table(
     };
 
     for (property, value) in [
-        ("rack_subnet", config.rack_subnet.to_string()),
         ("infra_ip_first", config.infra_ip_first.to_string()),
         ("infra_ip_last", config.infra_ip_last.to_string()),
     ] {
@@ -350,7 +349,6 @@ fn populate_network_table(
 #[cfg(test)]
 mod tests {
     use super::*;
-    use omicron_common::api::internal::shared::RackNetworkConfigV1 as InternalRackNetworkConfig;
     use std::net::Ipv6Addr;
     use wicket_common::rack_setup::PutRssUserConfigInsensitive;
     use wicket_common::rack_update::SpIdentifier;
@@ -373,6 +371,7 @@ mod tests {
         use omicron_common::api::internal::shared::PortSpeed as InternalPortSpeed;
         use omicron_common::api::internal::shared::RouteConfig as InternalRouteConfig;
         use omicron_common::api::internal::shared::SwitchLocation as InternalSwitchLocation;
+        use wicket_common::rack_setup::UserSpecifiedRackNetworkConfig as InternalUserSpecifiedRackNetworkConfig;
 
         let rnc = value.rack_network_config.unwrap();
 
@@ -401,8 +400,7 @@ mod tests {
                 .collect(),
             external_dns_ips: value.external_dns_ips,
             ntp_servers: value.ntp_servers,
-            rack_network_config: InternalRackNetworkConfig {
-                rack_subnet: rnc.rack_subnet,
+            rack_network_config: InternalUserSpecifiedRackNetworkConfig {
                 infra_ip_first: rnc.infra_ip_first,
                 infra_ip_last: rnc.infra_ip_last,
                 ports: rnc
@@ -514,8 +512,7 @@ mod tests {
             )],
             external_dns_ips: vec!["10.0.0.1".parse().unwrap()],
             ntp_servers: vec!["ntp1.com".into(), "ntp2.com".into()],
-            rack_network_config: Some(RackNetworkConfigV1 {
-                rack_subnet: "fd00:1122:3344:01::/56".parse().unwrap(),
+            rack_network_config: Some(UserSpecifiedRackNetworkConfig {
                 infra_ip_first: "172.30.0.1".parse().unwrap(),
                 infra_ip_last: "172.30.0.10".parse().unwrap(),
                 ports: vec![PortConfigV1 {
diff --git a/wicketd/src/http_entrypoints.rs b/wicketd/src/http_entrypoints.rs
index 9c1740679f..9748a93bd5 100644
--- a/wicketd/src/http_entrypoints.rs
+++ b/wicketd/src/http_entrypoints.rs
@@ -32,7 +32,6 @@ use http::StatusCode;
 use internal_dns::resolver::Resolver;
 use omicron_common::address;
 use omicron_common::api::external::SemverVersion;
-use omicron_common::api::internal::shared::RackNetworkConfig;
 use omicron_common::api::internal::shared::SwitchLocation;
 use omicron_common::update::ArtifactHashId;
 use omicron_common::update::ArtifactId;
@@ -47,6 +46,7 @@ use std::net::IpAddr;
 use std::net::Ipv6Addr;
 use std::time::Duration;
 use wicket_common::rack_setup::PutRssUserConfigInsensitive;
+use wicket_common::rack_setup::UserSpecifiedRackNetworkConfig;
 use wicket_common::update_events::EventReport;
 use wicket_common::WICKETD_TIMEOUT;
 
@@ -172,7 +172,7 @@ pub struct CurrentRssUserConfigInsensitive {
     pub internal_services_ip_pool_ranges: Vec<address::IpRange>,
     pub external_dns_ips: Vec<IpAddr>,
     pub external_dns_zone_name: String,
-    pub rack_network_config: Option<RackNetworkConfig>,
+    pub rack_network_config: Option<UserSpecifiedRackNetworkConfig>,
 }
 
 // This is a summary of the subset of `RackInitializeRequest` that is sensitive;
@@ -1189,12 +1189,14 @@ async fn post_start_preflight_uplink_check(
 
     let (network_config, dns_servers, ntp_servers) = {
         let rss_config = rqctx.rss_config.lock().unwrap();
-        let network_config =
-            rss_config.rack_network_config().cloned().ok_or_else(|| {
+        let network_config = rss_config
+            .user_specified_rack_network_config()
+            .cloned()
+            .ok_or_else(|| {
                 HttpError::for_bad_request(
                     None,
                     "uplink preflight check requires setting \
-                     the uplink config for RSS"
+                    the uplink config for RSS"
                         .to_string(),
                 )
             })?;
diff --git a/wicketd/src/preflight_check.rs b/wicketd/src/preflight_check.rs
index 75cc5f6e09..4cd17604a0 100644
--- a/wicketd/src/preflight_check.rs
+++ b/wicketd/src/preflight_check.rs
@@ -2,7 +2,6 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
-use omicron_common::api::internal::shared::RackNetworkConfig;
 use omicron_common::api::internal::shared::SwitchLocation;
 use slog::o;
 use slog::Logger;
@@ -12,6 +11,7 @@ use std::sync::Mutex;
 use tokio::sync::oneshot;
 use update_engine::events::EventReport;
 use update_engine::GenericSpec;
+use wicket_common::rack_setup::UserSpecifiedRackNetworkConfig;
 
 mod uplink;
 
@@ -44,7 +44,7 @@ impl PreflightCheckerHandler {
     pub(crate) async fn uplink_start(
         &self,
-        network_config: RackNetworkConfig,
+        network_config: UserSpecifiedRackNetworkConfig,
         dns_servers: Vec<IpAddr>,
         ntp_servers: Vec<String>,
         our_switch_location: SwitchLocation,
@@ -94,7 +94,7 @@ pub(crate) struct PreflightCheckerBusy;
 #[derive(Debug)]
 enum PreflightCheck {
     Uplink {
-        network_config: RackNetworkConfig,
+        network_config: UserSpecifiedRackNetworkConfig,
         dns_servers: Vec<IpAddr>,
         ntp_servers: Vec<String>,
         our_switch_location: SwitchLocation,
diff --git a/wicketd/src/preflight_check/uplink.rs b/wicketd/src/preflight_check/uplink.rs
index 47995f0c10..31d479a5ed 100644
--- a/wicketd/src/preflight_check/uplink.rs
+++ b/wicketd/src/preflight_check/uplink.rs
@@ -22,7 +22,6 @@ use omicron_common::address::DENDRITE_PORT;
 use omicron_common::api::internal::shared::PortConfigV1;
 use omicron_common::api::internal::shared::PortFec as OmicronPortFec;
 use omicron_common::api::internal::shared::PortSpeed as OmicronPortSpeed;
-use omicron_common::api::internal::shared::RackNetworkConfig;
 use omicron_common::api::internal::shared::SwitchLocation;
 use omicron_common::OMICRON_DPD_TAG;
 use schemars::JsonSchema;
@@ -49,6 +48,7 @@ use trust_dns_resolver::error::ResolveError;
 use trust_dns_resolver::error::ResolveErrorKind;
 use trust_dns_resolver::TokioAsyncResolver;
 use update_engine::StepSpec;
+use wicket_common::rack_setup::UserSpecifiedRackNetworkConfig;
 
 const DNS_PORT: u16 = 53;
 
@@ -68,7 +68,7 @@ const IPADM: &str = "/usr/sbin/ipadm";
 const ROUTE: &str = "/usr/sbin/route";
 
 pub(super) async fn run_local_uplink_preflight_check(
-    network_config: RackNetworkConfig,
+    network_config: UserSpecifiedRackNetworkConfig,
     dns_servers: Vec<IpAddr>,
     ntp_servers: Vec<String>,
     our_switch_location: SwitchLocation,
diff --git a/wicketd/src/rss_config.rs b/wicketd/src/rss_config.rs
index f654597d81..4bc1a6b62b 100644
--- a/wicketd/src/rss_config.rs
+++ b/wicketd/src/rss_config.rs
@@ -26,7 +26,6 @@ use gateway_client::types::SpType;
 use omicron_certificates::CertificateError;
 use omicron_common::address;
 use omicron_common::address::Ipv4Range;
-use omicron_common::api::internal::shared::RackNetworkConfig;
 use sled_hardware::Baseboard;
 use slog::warn;
 use std::collections::BTreeSet;
@@ -34,6 +33,7 @@ use std::mem;
 use std::net::IpAddr;
 use std::net::Ipv6Addr;
 use wicket_common::rack_setup::PutRssUserConfigInsensitive;
+use wicket_common::rack_setup::UserSpecifiedRackNetworkConfig;
 
 // TODO-correctness For now, we always use the same rack subnet when running
 // RSS. When we get to multirack, this will be wrong, but there are many other
@@ -64,7 +64,7 @@ pub(crate) struct CurrentRssConfig {
     external_dns_zone_name: String,
     external_certificates: Vec<Certificate>,
     recovery_silo_password_hash: Option<omicron_passwords::NewPasswordHash>,
-    rack_network_config: Option<RackNetworkConfig>,
+    rack_network_config: Option<UserSpecifiedRackNetworkConfig>,
 
     // External certificates are uploaded in two separate actions (cert then
    // key, or vice versa). Here we store a partial certificate; once we have
@@ -82,7 +82,9 @@ impl CurrentRssConfig {
         &self.ntp_servers
     }
 
-    pub(crate) fn rack_network_config(&self) -> Option<&RackNetworkConfig> {
+    pub(crate) fn user_specified_rack_network_config(
+        &self,
+    ) -> Option<&UserSpecifiedRackNetworkConfig> {
         self.rack_network_config.as_ref()
     }
 
@@ -252,7 +254,6 @@ impl CurrentRssConfig {
             .collect();
 
         let request = RackInitializeRequest {
-            rack_subnet: RACK_SUBNET,
             trust_quorum_peers,
             bootstrap_discovery: BootstrapAddressDiscovery::OnlyThese(
                 bootstrap_ips,
@@ -268,7 +269,7 @@ impl CurrentRssConfig {
                 user_name: UserId(RECOVERY_SILO_USERNAME.into()),
                 user_password_hash,
             },
-            rack_network_config: Some(rack_network_config),
+            rack_network_config,
         };
 
         Ok(request)
@@ -452,7 +453,7 @@ impl From<&'_ CurrentRssConfig> for CurrentRssUserConfig {
 }
 
 fn validate_rack_network_config(
-    config: &RackNetworkConfig,
+    config: &UserSpecifiedRackNetworkConfig,
 ) -> Result<bootstrap_agent_client::types::RackNetworkConfigV1, String> {
     use bootstrap_agent_client::types::BgpConfig as BaBgpConfig;
     use bootstrap_agent_client::types::BgpPeerConfig as BaBgpPeerConfig;
@@ -497,7 +498,7 @@ fn validate_rack_network_config(
     // TODO Add more client side checks on `rack_network_config` contents?
 
     Ok(bootstrap_agent_client::types::RackNetworkConfigV1 {
-        rack_subnet: config.rack_subnet,
+        rack_subnet: RACK_SUBNET.into(),
         infra_ip_first: config.infra_ip_first,
         infra_ip_last: config.infra_ip_last,
         ports: config

From a5430a6465f457175ca1cf0c0053ffc47f25bf62 Mon Sep 17 00:00:00 2001
From: Ryan Goodfellow
Date: Thu, 8 Feb 2024 10:25:33 -0800
Subject: [PATCH 24/27] bump maghemite (#5028)

---
 package-manifest.toml               | 12 ++++++------
 tools/maghemite_ddm_openapi_version |  2 +-
 tools/maghemite_mg_openapi_version  |  2 +-
 tools/maghemite_mgd_checksums       |  4 ++--
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/package-manifest.toml b/package-manifest.toml
index ee20bfd307..1e88ddc760 100644
--- a/package-manifest.toml
+++ b/package-manifest.toml
@@ -446,10 +446,10 @@ source.repo = "maghemite"
 # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when
 # building `ddm-admin-client` (which will instruct you to update
 # `tools/maghemite_openapi_version`).
-source.commit = "c5401ad7153bb6d28c7960d811fa3d8a1aa19c6a"
+source.commit = "41a69a11db6cfa8fc0c8686dc2d725708e0586ce"
 # The SHA256 digest is automatically posted to:
 # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image/<commit>/maghemite.sha256.txt
-source.sha256 = "097553ad7c8cb50f23852e9d6332d9c4e58050fddaa7137bfd5e2859354c2f25"
+source.sha256 = "19d5eaa744257c32ccdca52af79d718aeb88a0af188345d33a4564a69b259632"
 output.type = "tarball"
 
 [package.mg-ddm]
@@ -462,10 +462,10 @@ source.repo = "maghemite"
 # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when
 # building `ddm-admin-client` (which will instruct you to update
 # `tools/maghemite_openapi_version`).
-source.commit = "c5401ad7153bb6d28c7960d811fa3d8a1aa19c6a"
+source.commit = "41a69a11db6cfa8fc0c8686dc2d725708e0586ce"
 # The SHA256 digest is automatically posted to:
 # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image/<commit>/mg-ddm.sha256.txt
-source.sha256 = "cf42b987a81dc1ff102f8f603ff90d8fe9d8a3db890a19810a3ddbb04ab1b526"
+source.sha256 = "ffb647b3297ec616d3d9ea93396ad9edd16ed146048a660b34e9b86e85d466b7"
 output.type = "zone"
 output.intermediate_only = true
 
@@ -477,10 +477,10 @@ source.repo = "maghemite"
 # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when
 # building `ddm-admin-client` (which will instruct you to update
 # `tools/maghemite_openapi_version`).
-source.commit = "c5401ad7153bb6d28c7960d811fa3d8a1aa19c6a"
+source.commit = "41a69a11db6cfa8fc0c8686dc2d725708e0586ce"
 # The SHA256 digest is automatically posted to:
 # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image/<commit>/mg-ddm.sha256.txt
-source.sha256 = "92e96984663d2d57d01c200685a47d998a1fd75ea89777e79c00095ebc8de9aa"
+source.sha256 = "26d34f61589f63be64eaa77a6e9e2db4c95d6675798386a1d61721c1ccc59d4d"
 output.type = "zone"
 output.intermediate_only = true
 
diff --git a/tools/maghemite_ddm_openapi_version b/tools/maghemite_ddm_openapi_version
index f300b40aa1..6c58d83ea3 100644
--- a/tools/maghemite_ddm_openapi_version
+++ b/tools/maghemite_ddm_openapi_version
@@ -1,2 +1,2 @@
-COMMIT="c5401ad7153bb6d28c7960d811fa3d8a1aa19c6a"
+COMMIT="41a69a11db6cfa8fc0c8686dc2d725708e0586ce"
 SHA2="0b0dbc2f8bbc5d2d9be92d64c4865f8f9335355aae62f7de9f67f81dfb3f1803"
diff --git a/tools/maghemite_mg_openapi_version b/tools/maghemite_mg_openapi_version
index 7a650e00a3..896be8d38c 100644
--- a/tools/maghemite_mg_openapi_version
+++ b/tools/maghemite_mg_openapi_version
@@ -1,2 +1,2 @@
-COMMIT="c5401ad7153bb6d28c7960d811fa3d8a1aa19c6a"
+COMMIT="41a69a11db6cfa8fc0c8686dc2d725708e0586ce"
 SHA2="0ac038bbaa54d0ae0ac5ccaeff48f03070618372cca26c9d09b716b909bf9355"
diff --git a/tools/maghemite_mgd_checksums b/tools/maghemite_mgd_checksums
index 0c2898f954..8fc4d083f8 100644
--- a/tools/maghemite_mgd_checksums
+++ b/tools/maghemite_mgd_checksums
@@ -1,2 +1,2 @@
-CIDL_SHA256="92e96984663d2d57d01c200685a47d998a1fd75ea89777e79c00095ebc8de9aa"
-MGD_LINUX_SHA256="2d8f090161cbabddafa677954a3e6a69eff77ad9a73c686452884528260f0616"
\ No newline at end of file
+CIDL_SHA256="26d34f61589f63be64eaa77a6e9e2db4c95d6675798386a1d61721c1ccc59d4d"
+MGD_LINUX_SHA256="b2c823dd714fad67546a0e0c0d4ae56f2fe2e7c43434469b38e13b78de9f6968"
\ No newline at end of file

From 5cb119dea7822db4756e74ce4056216deb064d78 Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Thu, 8 Feb 2024 13:16:17 -0800
Subject: [PATCH 25/27] [nexus] Improve logging when transactions retry
 (#5029)

I was trying to determine whether or not transaction retries spiked
alongside a CRDB crash, and found that the current mechanism of tracking
"which transactions did we retry" could use a little improvement.

Although we do send these retry attempts to Clickhouse (and I'm able to
see them for historical data) this doesn't help us if Nexus crashes
before those queries can be pulled by Oximeter.

This PR adds logging for these cases:
- It logs with `warn` if we retry at all
- It logs with `info` if a retried transaction completes

---
 dev-tools/omdb/src/bin/omdb/db.rs         |  2 +-
 nexus/db-queries/src/db/datastore/mod.rs  |  8 +++-
 nexus/db-queries/src/transaction_retry.rs | 46 +++++++++++++++++++----
 nexus/src/bin/schema-updater.rs           |  3 +-
 4 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs
index 989655dfed..9c41c25cc0 100644
--- a/dev-tools/omdb/src/bin/omdb/db.rs
+++ b/dev-tools/omdb/src/bin/omdb/db.rs
@@ -413,7 +413,7 @@ impl DbArgs {
         // here. We will then check the schema version explicitly and warn the
         // user if it doesn't match.
         let datastore = Arc::new(
-            DataStore::new_unchecked(pool)
+            DataStore::new_unchecked(log.clone(), pool)
                 .map_err(|e| anyhow!(e).context("creating datastore"))?,
         );
         check_schema_version(&datastore).await;
diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs
index b9ad2ea610..f9e0be81c1 100644
--- a/nexus/db-queries/src/db/datastore/mod.rs
+++ b/nexus/db-queries/src/db/datastore/mod.rs
@@ -150,6 +150,7 @@ pub type DataStoreConnection<'a> =
     bb8::PooledConnection<'a, ConnectionManager<DbConnection>>;
 
 pub struct DataStore {
+    log: Logger,
     pool: Arc<Pool>,
     virtual_provisioning_collection_producer: crate::provisioning::Producer,
     transaction_retry_producer: crate::transaction_retry::Producer,
@@ -164,8 +165,9 @@ impl DataStore {
     /// Ignores the underlying DB version. Should be used with caution, as usage
     /// of this method can construct a Datastore which does not understand
     /// the underlying CockroachDB schema. Data corruption could result.
-    pub fn new_unchecked(pool: Arc<Pool>) -> Result<Self, String> {
+    pub fn new_unchecked(log: Logger, pool: Arc<Pool>) -> Result<Self, String> {
         let datastore = DataStore {
+            log,
             pool,
             virtual_provisioning_collection_producer:
                 crate::provisioning::Producer::new(),
@@ -184,7 +186,8 @@ impl DataStore {
         pool: Arc<Pool>,
         config: Option<&SchemaConfig>,
     ) -> Result<Self, String> {
-        let datastore = Self::new_unchecked(pool)?;
+        let datastore =
+            Self::new_unchecked(log.new(o!("component" => "datastore")), pool)?;
 
         // Keep looping until we find that the schema matches our expectation.
         const EXPECTED_VERSION: SemverVersion =
@@ -230,6 +233,7 @@ impl DataStore {
         name: &'static str,
     ) -> crate::transaction_retry::RetryHelper {
         crate::transaction_retry::RetryHelper::new(
+            &self.log,
             &self.transaction_retry_producer,
             name,
         )
diff --git a/nexus/db-queries/src/transaction_retry.rs b/nexus/db-queries/src/transaction_retry.rs
index c474b729f8..6b5098158b 100644
--- a/nexus/db-queries/src/transaction_retry.rs
+++ b/nexus/db-queries/src/transaction_retry.rs
@@ -9,6 +9,7 @@ use chrono::Utc;
 use diesel::result::Error as DieselError;
 use oximeter::{types::Sample, Metric, MetricsError, Target};
 use rand::{thread_rng, Rng};
+use slog::{info, warn, Logger};
 use std::sync::{Arc, Mutex};
 use std::time::Duration;
 
@@ -60,6 +61,10 @@ impl RetryHelperInner {
         Self { start: Utc::now(), attempts: 1 }
     }
 
+    fn has_retried(&self) -> bool {
+        self.attempts > 1
+    }
+
     fn tick(&mut self) -> Self {
         let start = self.start;
         let attempts = self.attempts;
@@ -74,6 +79,7 @@ impl RetryHelperInner {
 /// Helper utility for tracking retry attempts and latency.
 /// Intended to be used from within "transaction_async_with_retry".
 pub struct RetryHelper {
+    log: Logger,
     producer: Producer,
     name: &'static str,
     inner: Mutex<RetryHelperInner>,
@@ -86,8 +92,13 @@ const MAX_RETRY_ATTEMPTS: u32 = 10;
 impl RetryHelper {
     /// Creates a new RetryHelper, and starts a timer tracking the transaction
     /// duration.
-    pub(crate) fn new(producer: &Producer, name: &'static str) -> Self {
+    pub(crate) fn new(
+        log: &Logger,
+        producer: &Producer,
+        name: &'static str,
+    ) -> Self {
         Self {
+            log: log.new(o!("transaction" => name)),
             producer: producer.clone(),
             name,
             inner: Mutex::new(RetryHelperInner::new()),
@@ -107,7 +118,21 @@
             + Send
             + Sync,
     {
-        conn.transaction_async_with_retry(f, self.as_callback()).await
+        let slef = Arc::new(self);
+        let result = conn
+            .transaction_async_with_retry(f, slef.clone().as_callback())
+            .await;
+
+        let retry_info = slef.inner.lock().unwrap();
+        if retry_info.has_retried() {
+            info!(
+                slef.log,
+                "transaction completed";
+                "attempts" => retry_info.attempts,
+            );
+        }
+
+        result
     }
 
     // Called upon retryable transaction failure.
@@ -143,6 +168,12 @@ impl RetryHelper {
             let mut rng = thread_rng();
             rng.gen_range(MIN_RETRY_BACKOFF..MAX_RETRY_BACKOFF)
         };
+
+        warn!(
+            self.log,
+            "Retryable transaction failure";
+            "retry_after (ms)" => duration.as_millis(),
+        );
         tokio::time::sleep(duration).await;
 
         // Now that we've finished sleeping, reset the timer and bump the number
@@ -151,14 +182,13 @@ impl RetryHelper {
         return inner.attempts < MAX_RETRY_ATTEMPTS;
     }
 
-    /// Converts this function to a retryable callback that can be used from
-    /// "transaction_async_with_retry".
-    pub(crate) fn as_callback(
-        self,
+    // Converts this function to a retryable callback that can be used from
+    // "transaction_async_with_retry".
+    fn as_callback(
+        self: Arc<Self>,
     ) -> impl Fn() -> futures::future::BoxFuture<'static, bool> {
-        let r = Arc::new(self);
         move || {
-            let r = r.clone();
+            let r = self.clone();
             Box::pin(async move { r.retry_callback().await })
         }
     }
diff --git a/nexus/src/bin/schema-updater.rs b/nexus/src/bin/schema-updater.rs
index db179dc7f6..d016bd0421 100644
--- a/nexus/src/bin/schema-updater.rs
+++ b/nexus/src/bin/schema-updater.rs
@@ -76,7 +76,8 @@ async fn main() -> anyhow::Result<()> {
 
     // We use the unchecked constructor of the datastore because we
     // don't want to block on someone else applying an upgrade.
-    let datastore = DataStore::new_unchecked(pool).map_err(|e| anyhow!(e))?;
+    let datastore =
+        DataStore::new_unchecked(log.clone(), pool).map_err(|e| anyhow!(e))?;
 
     match args.cmd {
         Cmd::List => {

From 39dbc4e3e81afa52b2ea224827e29d8e9caf303c Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Thu, 8 Feb 2024 13:28:59 -0800
Subject: [PATCH 26/27] Add documentation example for querying Clickhouse
 (#5030)

This is somewhat of a corollary to
https://github.com/oxidecomputer/omicron/pull/5029.

While debugging this info, it took me a while to figure out what exactly
to poke, so I figured I'd document that experience.

---
 docs/clickhouse-debugging.adoc | 199 +++++++++++++++++++++++++++++++++
 1 file changed, 199 insertions(+)
 create mode 100644 docs/clickhouse-debugging.adoc

diff --git a/docs/clickhouse-debugging.adoc b/docs/clickhouse-debugging.adoc
new file mode 100644
index 0000000000..a906d1841f
--- /dev/null
+++ b/docs/clickhouse-debugging.adoc
@@ -0,0 +1,199 @@
+:showtitle:
+:numbered:
+:toc: left
+
+= Omicron Clickhouse Debugging Guide
+
+This is a guide for debugging Clickhouse on a variety of environments.
+
+If you have advice that is not covered here, consider adding it!
+
+== Debugging on a Live System
+
+The following provides instructions for connecting to a Clickhouse shell on a running system.
+
+. **Find the zone running Clickhouse**. This can be accomplished by running `zoneadm list -cv`, and finding the zone with a prefix of `oxz_clickhouse`. If you're running on a multi-machine system (e.g., dogfood, colo, etc) and you have access to the `pilot` binary, you can ask all sleds at once for the location of Clickhouse with:
+// '+' for list continuation to insert code blocks while keeping the list order
++
+[source,bash]
+----
+# Run from the switch zone.
+$ pilot host exec -c "zoneadm list -c | grep clickhouse" 0-31
+----
+. **Log into that zone**. This can be done using:
++
+[source,bash]
+----
+# Run from the switch zone
+$ pilot host login <sled>
+
+# Run from the machine with the Clickhouse zone
+$ pfexec zlogin oxz_clickhouse_<zone-uuid>
+----
+
+. **Identify the IP address of Clickhouse**. This is possible using `ipadm`:
++
+[source,bash]
+----
+# Run from within the Clickhouse zone
+$ ipadm
+ADDROBJ           TYPE     STATE        ADDR
+lo0/v4            static   ok           127.0.0.1/8
+lo0/v6            static   ok           ::1/128
+oxControlService8/ll addrconf ok        fe80::8:20ff:fe35:6b0a%oxControlService8/10
+oxControlService8/omicron6 static ok    fd00:1122:3344:107::4/64   <-- It's this one!
+----
+. **Log into Clickhouse using the CLI**
++
+[source,bash]
+----
+# Run from within the Clickhouse zone
+$ /opt/oxide/clickhouse/clickhouse client --host fd00:1122:3344:107::4
+ClickHouse client version 22.8.9.1.
+Connecting to fd00:1122:3344:107::4:9000 as user default.
+Connected to ClickHouse server version 22.8.9 revision 54460.
+
+oxz_clickhouse_aa646c82-c6d7-4d0c-8401-150130927759.local :)
+----
+. **Inspect the database**. At this point, you've successfully accessed the Clickhouse shell.
+The `oximeter` database is likely the most useful one for inspection:
++
+[source,bash]
+----
+oxz_clickhouse_aa646c82-c6d7-4d0c-8401-150130927759.local :) USE oximeter;
+oxz_clickhouse_aa646c82-c6d7-4d0c-8401-150130927759.local :) SHOW TABLES
+
+SHOW TABLES
+
+Query id: a8c82507-6179-40ee-8e51-4801ca5ff6f8
+
+┌─name───────────────────────┐
+│ fields_bool                │
+│ fields_i16                 │
+│ fields_i32                 │
+│ fields_i64                 │
+│ fields_i8                  │
+│ fields_ipaddr              │
+│ fields_string              │
+│ fields_u16                 │
+│ fields_u32                 │
+│ fields_u64                 │
+│ fields_u8                  │
+│ fields_uuid                │
+│ measurements_bool          │
+│ measurements_bytes         │
+│ measurements_cumulativef32 │
+│ measurements_cumulativef64 │
+│ measurements_cumulativei64 │
+│ measurements_cumulativeu64 │
+│ measurements_f32           │
+│ measurements_f64           │
+│ measurements_histogramf32  │
+│ measurements_histogramf64  │
+│ measurements_histogrami16  │
+│ measurements_histogrami32  │
+│ measurements_histogrami64  │
+│ measurements_histogrami8   │
+│ measurements_histogramu16  │
+│ measurements_histogramu32  │
+│ measurements_histogramu64  │
+│ measurements_histogramu8   │
+│ measurements_i16           │
+│ measurements_i32           │
+│ measurements_i64           │
+│ measurements_i8            │
+│ measurements_string        │
+│ measurements_u16           │
+│ measurements_u32           │
+│ measurements_u64           │
+│ measurements_u8            │
+│ timeseries_schema          │
+│ version                    │
+└────────────────────────────┘
+41 rows in set. Elapsed: 0.002 sec.
+----
+. **Query for your schema**. The `timeseries_schema` table can provide some additional context for your particular
+measurement. The rest of this document will contain an example looking for a very specific "transaction retry"
+timeseries, but you can substitute these values with your own.
If we know even part of the timeseries name (like the word "transaction") we can search for it with the following: ++ +[source,bash] +---- +oxz_clickhouse_aa646c82-c6d7-4d0c-8401-150130927759.local :) SELECT timeseries_name,fields.type,fields.source,datum_type FROM timeseries_schema WHERE timeseries_name LIKE '%transaction%' + +SELECT + timeseries_name, + fields.type, + fields.source, + datum_type +FROM timeseries_schema +WHERE timeseries_name LIKE '%transaction%' + +Query id: 09e6086f-fc5d-4905-abed-013be55d6706 + +┌─timeseries_name─────────────────┬─fields.type──────┬─fields.source───────┬─datum_type─┐ +│ database_transaction:retry_data │ ['U32','String'] │ ['Metric','Target'] │ F64 │ +└─────────────────────────────────┴──────────────────┴─────────────────────┴────────────┘ + +1 row in set. Elapsed: 0.003 sec. +---- +This tells us the following: first, our timeseries has fields (see: `fields.type`) from `fields_u32` and `fields_string`. Next, it also emits measurements (see: `datum_type`) into `measurements_f64`. + +. **Query for your data**. This next step is extremely specific to your particular timeseries. +However, for this "database_transaction:retry_data" example, we need to query for data related +to this timeseries from `fields_u32`, `fields_string`, and `measurements_f64`. This information +should be inferable from the query to the `timeseries_schema` table. + ++ +[source,bash] +---- +oxz_clickhouse_aa646c82-c6d7-4d0c-8401-150130927759.local :) SELECT + fields_string.field_value as transaction_name, + fields_u32.field_value as attempt, + measurements_f64.datum as attempt_duration, + measurements_f64.timestamp +FROM measurements_f64 +INNER JOIN fields_string ON fields_string.timeseries_key = measurements_f64.timeseries_key +INNER JOIN fields_u32 ON fields_u32.timeseries_key = measurements_f64.timeseries_key +WHERE measurements_f64.timeseries_name = 'database_transaction:retry_data' +ORDER BY measurements_f64.timestamp ASC + +Query id: 813c994e-058c-4af2-9d3a-11cf9f222cbf + +┌─transaction_name─────────┬─attempt─┬─attempt_duration─┬────measurements_f64.timestamp─┐ +│ sled_reservation_create │ 1 │ 0.014977911 │ 2024-01-11 22:41:13.667101491 │ +│ sled_reservation_create │ 1 │ 0.01660099 │ 2024-01-11 22:41:13.667610290 │ +│ sled_reservation_create │ 1 │ 0.014088819 │ 2024-01-11 22:41:13.672007505 │ +│ sled_reservation_create │ 1 │ 0.01501511 │ 2024-01-11 22:41:13.673713738 │ +│ sled_reservation_create │ 2 │ 0.156134143 │ 2024-01-11 22:41:13.843218486 │ +│ sled_reservation_create │ 2 │ 0.150804944 │ 2024-01-11 22:41:13.855771487 │ +│ sled_reservation_create │ 2 │ 0.17012195 │ 2024-01-11 22:41:13.855798649 │ +│ sled_reservation_create │ 1 │ 0.205570224 │ 2024-01-11 22:41:13.872957153 │ +│ sled_reservation_create │ 3 │ 0.006690087 │ 2024-01-11 22:41:13.891856215 │ +│ sled_reservation_create │ 4 │ 0.012846307 │ 2024-01-11 22:41:13.955465361 │ +│ sled_reservation_create │ 1 │ 0.020482506 │ 2024-01-18 23:22:48.146559108 │ +│ sled_reservation_create │ 1 │ 0.008722631 │ 2024-01-19 05:26:07.397242186 │ +│ sled_reservation_create │ 1 │ 0.007484627 │ 2024-01-19 05:26:07.590876948 │ +│ sled_reservation_create │ 1 │ 0.008384388 │ 2024-01-19 05:27:42.833060701 │ +│ sled_reservation_create │ 1 │ 0.009016489 │ 2024-01-19 05:28:15.860577501 │ +│ sled_reservation_create │ 1 │ 0.017649607 │ 2024-01-29 08:21:59.599608552 │ +│ sled_reservation_create │ 1 │ 0.017026628 │ 2024-01-29 08:23:30.278820785 │ +│ volume_create │ 1 │ 0.025257548 │ 2024-01-29 13:03:44.799614376 │ +│ volume_checkout │ 1 │ 0.009869392 
│ 2024-01-29 13:03:49.827578682 │ +│ sled_reservation_create │ 1 │ 0.018168935 │ 2024-01-29 13:03:56.876826535 │ +│ volume_checkout │ 1 │ 0.007425083 │ 2024-01-29 13:27:17.949365703 │ +│ sled_reservation_create │ 1 │ 0.017133937 │ 2024-01-29 13:27:39.534955222 │ +│ sled_reservation_create │ 1 │ 0.028159647 │ 2024-01-29 13:27:39.593375890 │ +│ sled_reservation_create │ 1 │ 0.053410541 │ 2024-01-29 13:27:39.593709195 │ +│ sled_reservation_create │ 2 │ 0.080795694 │ 2024-01-29 13:27:39.717689230 │ +│ sled_reservation_create │ 1 │ 0.071597836 │ 2024-01-29 13:27:39.722071303 │ +│ regions_hard_delete │ 1 │ 0.019350474 │ 2024-01-31 13:51:58.056808199 │ +│ sled_reservation_create │ 1 │ 0.032482692 │ 2024-02-01 06:41:51.647937599 │ +│ volume_checkout │ 1 │ 0.009380859 │ 2024-02-01 07:03:04.971258393 │ +│ sled_reservation_create │ 1 │ 0.018020138 │ 2024-02-01 07:04:17.110928203 │ +│ regions_hard_delete │ 1 │ 0.011993838 │ 2024-02-01 08:32:56.113587884 │ +│ volume_checkout │ 1 │ 0.223425122 │ 2024-02-01 15:47:31.240008185 │ +│ volume_checkout │ 1 │ 0.454675525 │ 2024-02-01 15:47:31.480408091 │ +│ volume_checkout │ 1 │ 0.445790132 │ 2024-02-01 15:47:31.480943824 │ +│ volume_checkout │ 2 │ 0.206526747 │ 2024-02-01 15:47:31.481037611 │ +└──────────────────────────┴─────────┴──────────────────┴───────────────────────────────┘ +---- From dcbc9cb3b14d527c773849d79ac31f13de041621 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 8 Feb 2024 16:54:48 -0500 Subject: [PATCH 27/27] blueprint planner: ensure target number of Nexus zones exist (#4959) --- Cargo.lock | 1 + clients/sled-agent-client/Cargo.toml | 1 + clients/sled-agent-client/src/lib.rs | 72 +- common/src/address.rs | 18 + .../db-queries/src/db/datastore/deployment.rs | 24 +- nexus/deployment/src/blueprint_builder.rs | 663 +++++++++++++++++- nexus/deployment/src/lib.rs | 4 +- nexus/deployment/src/planner.rs | 423 ++++++++++- nexus/src/app/deployment.rs | 49 +- nexus/types/src/deployment.rs | 16 + nexus/types/src/inventory.rs | 2 +- sled-agent/src/rack_setup/plan/service.rs | 9 +- 12 files changed, 1224 insertions(+), 58 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 234fbc5e45..5814dd101a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8168,6 +8168,7 @@ dependencies = [ name = "sled-agent-client" version = "0.1.0" dependencies = [ + "anyhow", "async-trait", "chrono", "ipnetwork", diff --git a/clients/sled-agent-client/Cargo.toml b/clients/sled-agent-client/Cargo.toml index 8630030b24..71b94441ed 100644 --- a/clients/sled-agent-client/Cargo.toml +++ b/clients/sled-agent-client/Cargo.toml @@ -5,6 +5,7 @@ edition = "2021" license = "MPL-2.0" [dependencies] +anyhow.workspace = true async-trait.workspace = true chrono.workspace = true omicron-common.workspace = true diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 39de64ec62..eb1e57b11f 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -4,8 +4,11 @@ //! Interface for making API requests to a Sled Agent +use anyhow::Context; use async_trait::async_trait; use std::convert::TryFrom; +use std::net::IpAddr; +use std::net::SocketAddr; use uuid::Uuid; progenitor::generate_api!( @@ -86,6 +89,74 @@ impl types::OmicronZoneType { | types::OmicronZoneType::Oximeter { .. } => false, } } + + /// Identifies whether this is a Nexus zone + pub fn is_nexus(&self) -> bool { + match self { + types::OmicronZoneType::Nexus { .. } => true, + + types::OmicronZoneType::BoundaryNtp { .. 
}
+            | types::OmicronZoneType::InternalNtp { .. }
+            | types::OmicronZoneType::Clickhouse { .. }
+            | types::OmicronZoneType::ClickhouseKeeper { .. }
+            | types::OmicronZoneType::CockroachDb { .. }
+            | types::OmicronZoneType::Crucible { .. }
+            | types::OmicronZoneType::CruciblePantry { .. }
+            | types::OmicronZoneType::ExternalDns { .. }
+            | types::OmicronZoneType::InternalDns { .. }
+            | types::OmicronZoneType::Oximeter { .. } => false,
+        }
+    }
+
+    /// This zone's external IP
+    pub fn external_ip(&self) -> anyhow::Result<Option<IpAddr>> {
+        match self {
+            types::OmicronZoneType::Nexus { external_ip, .. } => {
+                Ok(Some(*external_ip))
+            }
+
+            types::OmicronZoneType::ExternalDns { dns_address, .. } => {
+                let dns_address =
+                    dns_address.parse::<SocketAddr>().with_context(|| {
+                        format!(
+                            "failed to parse ExternalDns address {dns_address}"
+                        )
+                    })?;
+                Ok(Some(dns_address.ip()))
+            }
+
+            types::OmicronZoneType::BoundaryNtp { snat_cfg, .. } => {
+                Ok(Some(snat_cfg.ip))
+            }
+
+            types::OmicronZoneType::InternalNtp { .. }
+            | types::OmicronZoneType::Clickhouse { .. }
+            | types::OmicronZoneType::ClickhouseKeeper { .. }
+            | types::OmicronZoneType::CockroachDb { .. }
+            | types::OmicronZoneType::Crucible { .. }
+            | types::OmicronZoneType::CruciblePantry { .. }
+            | types::OmicronZoneType::InternalDns { .. }
+            | types::OmicronZoneType::Oximeter { .. } => Ok(None),
+        }
+    }
+
+    /// The service vNIC providing external connectivity to this zone
+    pub fn service_vnic(&self) -> Option<&types::NetworkInterface> {
+        match self {
+            types::OmicronZoneType::Nexus { nic, .. }
+            | types::OmicronZoneType::ExternalDns { nic, .. }
+            | types::OmicronZoneType::BoundaryNtp { nic, .. } => Some(nic),
+
+            types::OmicronZoneType::InternalNtp { .. }
+            | types::OmicronZoneType::Clickhouse { .. }
+            | types::OmicronZoneType::ClickhouseKeeper { .. }
+            | types::OmicronZoneType::CockroachDb { .. }
+            | types::OmicronZoneType::Crucible { .. }
+            | types::OmicronZoneType::CruciblePantry { .. }
+            | types::OmicronZoneType::InternalDns { .. }
+            | types::OmicronZoneType::Oximeter { .. } => None,
+        }
+    }
 }
 
 impl omicron_common::api::external::ClientError for types::Error {
@@ -351,7 +422,6 @@ impl From<Ipv6Net> for types::Ipv6Net {
 
 impl From<std::net::IpAddr> for types::IpNet {
     fn from(s: std::net::IpAddr) -> Self {
-        use std::net::IpAddr;
         match s {
             IpAddr::V4(v4) => Self::V4(v4.into()),
             IpAddr::V6(v6) => Self::V6(v6.into()),
diff --git a/common/src/address.rs b/common/src/address.rs
index 65a6604daf..152fb9319e 100644
--- a/common/src/address.rs
+++ b/common/src/address.rs
@@ -24,6 +24,12 @@ pub const MAX_PORT: u16 = u16::MAX;
 /// minimum possible value for a tcp or udp port
 pub const MIN_PORT: u16 = u16::MIN;
 
+/// The amount of redundancy for Nexus services.
+///
+/// This is used by both RSS (to distribute the initial set of services) and the
+/// Reconfigurator (to know whether to add new Nexus zones)
+pub const NEXUS_REDUNDANCY: usize = 3;
+
 /// The amount of redundancy for internal DNS servers.
 ///
 /// Must be less than or equal to MAX_DNS_REDUNDANCY.
@@ -457,6 +463,18 @@ impl TryFrom<(Ipv6Addr, Ipv6Addr)> for IpRange {
     }
 }
 
+impl From<Ipv4Range> for IpRange {
+    fn from(value: Ipv4Range) -> Self {
+        Self::V4(value)
+    }
+}
+
+impl From<Ipv6Range> for IpRange {
+    fn from(value: Ipv6Range) -> Self {
+        Self::V6(value)
+    }
+}
+
 /// A non-decreasing IPv4 address range, inclusive of both ends.
 ///
 /// The first address must be less than or equal to the last address.
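The two `From` impls added above let callers build an `IpRange` for the service IP pool directly from a typed range. The following is a brief illustrative sketch (not part of the patch), assuming `Ipv4Range::new` returns a `Result` that rejects ranges whose first address exceeds the last; the `example_service_pool` helper is hypothetical.

[source,rust]
----
use omicron_common::address::{IpRange, Ipv4Range};

fn example_service_pool() -> IpRange {
    // TEST-NET-1 addresses (RFC 5737), as used by the blueprint builder
    // tests later in this patch; From<Ipv4Range> does the conversion.
    IpRange::from(
        Ipv4Range::new(
            "192.0.2.2".parse().unwrap(),
            "192.0.2.20".parse().unwrap(),
        )
        .unwrap(),
    )
}
----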
diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs
index 72adb1d3df..cde6e7e8c6 100644
--- a/nexus/db-queries/src/db/datastore/deployment.rs
+++ b/nexus/db-queries/src/db/datastore/deployment.rs
@@ -1053,6 +1053,7 @@ mod tests {
     use nexus_test_utils::db::test_setup_database;
     use nexus_types::deployment::Policy;
     use nexus_types::deployment::SledResources;
+    use nexus_types::external_api::views::SledProvisionState;
     use nexus_types::inventory::Collection;
     use omicron_common::address::Ipv6Subnet;
     use omicron_test_utils::dev;
@@ -1061,7 +1062,11 @@ mod tests {
     use std::mem;
     use std::net::Ipv6Addr;
 
-    static EMPTY_POLICY: Policy = Policy { sleds: BTreeMap::new() };
+    static EMPTY_POLICY: Policy = Policy {
+        sleds: BTreeMap::new(),
+        service_ip_pool_ranges: Vec::new(),
+        target_nexus_zone_count: 0,
+    };
 
     // This is a not-super-future-maintainer-friendly helper to check that all
     // the subtables related to blueprints have been pruned of a specific
@@ -1111,7 +1116,11 @@ mod tests {
             })
             .collect();
         let ip = ip.unwrap_or_else(|| thread_rng().gen::<u128>().into());
-        SledResources { zpools, subnet: Ipv6Subnet::new(ip) }
+        SledResources {
+            provision_state: SledProvisionState::Provisionable,
+            zpools,
+            subnet: Ipv6Subnet::new(ip),
+        }
     }
 
     // Create a `Policy` that contains all the sleds found in `collection`
@@ -1131,6 +1140,11 @@ mod tests {
                     )
                 })
                 .collect(),
+            service_ip_pool_ranges: Vec::new(),
+            target_nexus_zone_count: collection
+                .all_omicron_zones()
+                .filter(|z| z.zone_type.is_nexus())
+                .count(),
         }
     }
 
@@ -1320,7 +1334,8 @@ mod tests {
 
         // Create a builder for a child blueprint.
         let mut builder =
-            BlueprintBuilder::new_based_on(&blueprint1, &policy, "test");
+            BlueprintBuilder::new_based_on(&blueprint1, &policy, "test")
+                .expect("failed to create builder");
 
         // Add zones to our new sled.
         assert_eq!(
@@ -1465,9 +1480,11 @@ mod tests {
             .unwrap();
         let blueprint2 =
             BlueprintBuilder::new_based_on(&blueprint1, &EMPTY_POLICY, "test2")
+                .expect("failed to create builder")
                 .build();
         let blueprint3 =
             BlueprintBuilder::new_based_on(&blueprint1, &EMPTY_POLICY, "test3")
+                .expect("failed to create builder")
                 .build();
         assert_eq!(blueprint1.parent_blueprint_id, None);
         assert_eq!(blueprint2.parent_blueprint_id, Some(blueprint1.id));
@@ -1559,6 +1576,7 @@ mod tests {
         // with enabled=false, that status is serialized.
         let blueprint4 =
             BlueprintBuilder::new_based_on(&blueprint3, &EMPTY_POLICY, "test3")
+                .expect("failed to create builder")
                 .build();
         assert_eq!(blueprint4.parent_blueprint_id, Some(blueprint3.id));
         datastore.blueprint_insert(&opctx, &blueprint4).await.unwrap();
diff --git a/nexus/deployment/src/blueprint_builder.rs b/nexus/deployment/src/blueprint_builder.rs
index 904e768e1b..1bf46d34b2 100644
--- a/nexus/deployment/src/blueprint_builder.rs
+++ b/nexus/deployment/src/blueprint_builder.rs
@@ -6,11 +6,14 @@
 
 use crate::ip_allocator::IpAllocator;
 use anyhow::anyhow;
+use anyhow::bail;
 use internal_dns::config::Host;
 use internal_dns::config::ZoneVariant;
 use ipnet::IpAdd;
 use nexus_inventory::now_db_precision;
 use nexus_types::deployment::Blueprint;
+use nexus_types::deployment::NetworkInterface;
+use nexus_types::deployment::NetworkInterfaceKind;
 use nexus_types::deployment::OmicronZoneConfig;
 use nexus_types::deployment::OmicronZoneDataset;
 use nexus_types::deployment::OmicronZoneType;
@@ -23,11 +26,20 @@ use omicron_common::address::get_internal_dns_server_addresses;
 use omicron_common::address::get_sled_address;
 use omicron_common::address::get_switch_zone_address;
 use omicron_common::address::CP_SERVICES_RESERVED_ADDRESSES;
+use omicron_common::address::NEXUS_OPTE_IPV4_SUBNET;
+use omicron_common::address::NEXUS_OPTE_IPV6_SUBNET;
 use omicron_common::address::NTP_PORT;
 use omicron_common::address::SLED_RESERVED_ADDRESSES;
 use omicron_common::api::external::Generation;
+use omicron_common::api::external::IpNet;
+use omicron_common::api::external::MacAddr;
+use omicron_common::api::external::Vni;
+use omicron_common::nexus_config::NUM_INITIAL_RESERVED_IP_ADDRESSES;
 use std::collections::BTreeMap;
 use std::collections::BTreeSet;
+use std::collections::HashSet;
+use std::net::IpAddr;
+use std::net::Ipv4Addr;
 use std::net::Ipv6Addr;
 use std::net::SocketAddrV6;
 use thiserror::Error;
@@ -38,6 +50,14 @@ use uuid::Uuid;
 pub enum Error {
     #[error("sled {sled_id}: ran out of available addresses for sled")]
     OutOfAddresses { sled_id: Uuid },
+    #[error("no Nexus zones exist in parent blueprint")]
+    NoNexusZonesInParentBlueprint,
+    #[error("no external service IP addresses are available")]
+    NoExternalServiceIpAvailable,
+    #[error("no system MAC addresses are available")]
+    NoSystemMacAddressAvailable,
+    #[error("exhausted available Nexus IP addresses")]
+    ExhaustedNexusIps,
     #[error("programming error in planner")]
     Planner(#[from] anyhow::Error),
 }
@@ -52,6 +72,16 @@ pub enum Ensure {
     NotNeeded,
 }
 
+/// Describes whether an idempotent "ensure" operation resulted in multiple
+/// actions taken or no action was necessary
+#[derive(Debug, Clone, Copy, Eq, PartialEq)]
+pub enum EnsureMultiple {
+    /// action was taken, and multiple items were added
+    Added(usize),
+    /// no action was necessary
+    NotNeeded,
+}
+
 /// Helper for assembling a blueprint
 ///
 /// There are two basic ways to assemble a new blueprint:
@@ -81,6 +111,16 @@ pub struct BlueprintBuilder<'a> {
     zones_in_service: BTreeSet<Uuid>,
     creator: String,
     comments: Vec<String>,
+
+    // These fields mirror how RSS chooses addresses for zone NICs.
+    nexus_v4_ips: Box<dyn Iterator<Item = Ipv4Addr> + Send>,
+    nexus_v6_ips: Box<dyn Iterator<Item = Ipv6Addr> + Send>,
+
+    // Iterator of available external IPs for service zones
+    available_external_ips: Box<dyn Iterator<Item = IpAddr> + Send + 'a>,
+
+    // Iterator of available MAC addresses in the system address range
+    available_system_macs: Box<dyn Iterator<Item = MacAddr>>,
 }
 
 impl<'a> BlueprintBuilder<'a> {
@@ -146,8 +186,103 @@ impl<'a> BlueprintBuilder<'a> {
         parent_blueprint: &'a Blueprint,
         policy: &'a Policy,
         creator: &str,
-    ) -> BlueprintBuilder<'a> {
-        BlueprintBuilder {
+    ) -> anyhow::Result<BlueprintBuilder<'a>> {
+        // Scan through the parent blueprint and build several sets of "used
+        // resources". When adding new control plane zones to a sled, we may
+        // need to allocate new resources to that zone. However, allocation at
+        // this point is entirely optimistic and theoretical: our caller may
+        // discard the blueprint we create without ever making it the new
+        // target, or it might be an arbitrarily long time before it becomes the
+        // target. We need to be able to make allocation decisions that we
+        // expect the blueprint executor to be able to realize successfully if
+        // and when we become the target, but we cannot _actually_ perform
+        // resource allocation.
+        //
+        // To do this, we look at our parent blueprint's used resources, and
+        // then choose new resources that aren't already in use (if possible; if
+        // we need to allocate a new resource and the parent blueprint appears
+        // to be using all the resources of that kind, our blueprint generation
+        // will fail).
+        //
+        // For example, RSS assigns Nexus NIC IPs by stepping through a list of
+        // addresses based on `NEXUS_OPTE_IPVx_SUBNET` (as in the iterators
+        // below). We use the same list of addresses, but additionally need to
+        // filter out the existing IPs for any Nexus instances that already
+        // exist.
+        //
+        // Note that by building these iterators up front based on
+        // `parent_blueprint`, we cannot reuse resources in a case where we
+        // remove a zone that used a resource and then add another zone that
+        // wants the same kind of resource. We don't support zone removal yet,
+        // but expect this to be okay: we don't anticipate removal and addition
+        // to frequently be combined into the exact same blueprint, particularly
+        // in a way that expects the addition to reuse resources from the
+        // removal; we won't want to attempt to reuse resources from a zone
+        // until we know it's been fully removed.
+        let mut existing_nexus_v4_ips: HashSet<Ipv4Addr> = HashSet::new();
+        let mut existing_nexus_v6_ips: HashSet<Ipv6Addr> = HashSet::new();
+        let mut used_external_ips: HashSet<IpAddr> = HashSet::new();
+        let mut used_macs: HashSet<MacAddr> = HashSet::new();
+
+        for (_, z) in parent_blueprint.all_omicron_zones() {
+            if let OmicronZoneType::Nexus { nic, .. } = &z.zone_type {
+                match nic.ip {
+                    IpAddr::V4(ip) => {
+                        if !existing_nexus_v4_ips.insert(ip) {
+                            bail!("duplicate Nexus NIC IP: {ip}");
+                        }
+                    }
+                    IpAddr::V6(ip) => {
+                        if !existing_nexus_v6_ips.insert(ip) {
+                            bail!("duplicate Nexus NIC IP: {ip}");
+                        }
+                    }
+                }
+            }
+            if let Some(external_ip) = z.zone_type.external_ip()? {
+                if !used_external_ips.insert(external_ip) {
+                    bail!("duplicate external IP: {external_ip}");
+                }
+            }
+            if let Some(nic) = z.zone_type.service_vnic() {
+                if !used_macs.insert(nic.mac) {
+                    bail!("duplicate service vNIC MAC: {}", nic.mac);
+                }
+            }
+        }
+
+        // TODO-performance Building these iterators as "walk through the
+        // list and skip anything we've used already" is fine as long as
+        // we're talking about a small number of resources (e.g.,
+        // single-digit number of Nexus instances), but wouldn't be ideal if
+        // we have many resources we need to skip. We could do something
+        // smarter here based on the sets of used resources we built above if
+        // needed.
+        let nexus_v4_ips = Box::new(
+            NEXUS_OPTE_IPV4_SUBNET
+                .0
+                .iter()
+                .skip(NUM_INITIAL_RESERVED_IP_ADDRESSES)
+                .filter(move |ip| !existing_nexus_v4_ips.contains(ip)),
+        );
+        let nexus_v6_ips = Box::new(
+            NEXUS_OPTE_IPV6_SUBNET
+                .0
+                .iter()
+                .skip(NUM_INITIAL_RESERVED_IP_ADDRESSES)
+                .filter(move |ip| !existing_nexus_v6_ips.contains(ip)),
+        );
+        let available_external_ips = Box::new(
+            policy
+                .service_ip_pool_ranges
+                .iter()
+                .flat_map(|r| r.iter())
+                .filter(move |ip| !used_external_ips.contains(ip)),
+        );
+        let available_system_macs = Box::new(
+            MacAddr::iter_system().filter(move |mac| !used_macs.contains(mac)),
+        );
+
+        Ok(BlueprintBuilder {
             parent_blueprint,
             policy,
             sled_ip_allocators: BTreeMap::new(),
@@ -155,7 +290,11 @@ impl<'a> BlueprintBuilder<'a> {
             zones_in_service: parent_blueprint.zones_in_service.clone(),
             creator: creator.to_owned(),
             comments: Vec::new(),
-        }
+            nexus_v4_ips,
+            nexus_v6_ips,
+            available_external_ips,
+            available_system_macs,
+        })
     }
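
The allocation approach `new_based_on` takes above is easy to see in miniature: collect everything the parent blueprint already consumes into a set, then expose the remaining candidates as a lazy iterator that skips a reserved prefix and filters out the used set. A minimal, self-contained sketch of that pattern (hypothetical names, not the omicron API):

```rust
use std::collections::HashSet;
use std::net::Ipv4Addr;

/// Hand out candidate IPs from a fixed range, skipping a reserved prefix
/// and anything the parent blueprint already uses.
fn available_ips(
    range: Vec<Ipv4Addr>,
    reserved: usize,
    used: HashSet<Ipv4Addr>,
) -> impl Iterator<Item = Ipv4Addr> {
    range
        .into_iter()
        .skip(reserved) // mirror RSS: never hand out the reserved prefix
        .filter(move |ip| !used.contains(ip))
}

fn main() {
    let range: Vec<Ipv4Addr> =
        (1..=10).map(|i| Ipv4Addr::new(192, 0, 2, i)).collect();
    let used: HashSet<Ipv4Addr> =
        [Ipv4Addr::new(192, 0, 2, 3)].into_iter().collect();
    let mut ips = available_ips(range, 2, used);
    // .1 and .2 are reserved; .3 is taken by the parent blueprint.
    assert_eq!(ips.next(), Some(Ipv4Addr::new(192, 0, 2, 4)));
}
```

Because the filter set is computed once, up front, this sketch carries the same caveat the comments above call out: a resource freed by removing a zone stays filtered out for the lifetime of the builder.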
 
     /// Assemble a final [`Blueprint`] based on the contents of the builder
@@ -288,6 +427,127 @@ impl<'a> BlueprintBuilder<'a> {
         Ok(Ensure::Added)
     }
 
+    /// Return the number of Nexus zones that would be configured to run on
+    /// the given sled if this builder generated a blueprint
+    ///
+    /// This value may change before a blueprint is actually generated if
+    /// further changes are made to the builder.
+    pub fn sled_num_nexus_zones(&self, sled_id: Uuid) -> usize {
+        self.zones
+            .current_sled_zones(sled_id)
+            .filter(|z| z.zone_type.is_nexus())
+            .count()
+    }
+
+    pub fn sled_ensure_zone_multiple_nexus(
+        &mut self,
+        sled_id: Uuid,
+        desired_zone_count: usize,
+    ) -> Result<EnsureMultiple, Error> {
+        // How many Nexus zones do we need to add?
+        let nexus_count = self.sled_num_nexus_zones(sled_id);
+        let num_nexus_to_add = match desired_zone_count.checked_sub(nexus_count)
+        {
+            Some(0) => return Ok(EnsureMultiple::NotNeeded),
+            Some(n) => n,
+            None => {
+                return Err(Error::Planner(anyhow!(
+                    "removing a Nexus zone not yet supported \
+                     (sled {sled_id} has {nexus_count}; \
+                     planner wants {desired_zone_count})"
+                )));
+            }
+        };
+
+        // Whether Nexus should use TLS and which external DNS servers it
+        // should use are currently provided at rack-setup time, and should
+        // be consistent across all Nexus instances. We'll assume we can copy
+        // them from any other Nexus zone in our parent blueprint.
+        //
+        // TODO-correctness Once these properties can be changed by a rack
+        // operator, this will need more work. At a minimum, if such a change
+        // goes through the blueprint system (which seems likely), we'll need
+        // to check that, if this builder is being used to make such a
+        // change, the change is also reflected here in any new zones.
+        // Perhaps these settings should be part of `Policy` instead?
+        let (external_tls, external_dns_servers) = self
+            .parent_blueprint
+            .omicron_zones
+            .values()
+            .find_map(|sled_zones| {
+                sled_zones.zones.iter().find_map(|z| match &z.zone_type {
+                    OmicronZoneType::Nexus {
+                        external_tls,
+                        external_dns_servers,
+                        ..
+                    } => Some((*external_tls, external_dns_servers.clone())),
+                    _ => None,
+                })
+            })
+            .ok_or(Error::NoNexusZonesInParentBlueprint)?;
+
+        for _ in 0..num_nexus_to_add {
+            let nexus_id = Uuid::new_v4();
+            let external_ip = self
+                .available_external_ips
+                .next()
+                .ok_or(Error::NoExternalServiceIpAvailable)?;
+
+            let nic = {
+                let (ip, subnet) = match external_ip {
+                    IpAddr::V4(_) => (
+                        self.nexus_v4_ips
+                            .next()
+                            .ok_or(Error::ExhaustedNexusIps)?
+                            .into(),
+                        IpNet::from(*NEXUS_OPTE_IPV4_SUBNET).into(),
+                    ),
+                    IpAddr::V6(_) => (
+                        self.nexus_v6_ips
+                            .next()
+                            .ok_or(Error::ExhaustedNexusIps)?
+                            .into(),
+                        IpNet::from(*NEXUS_OPTE_IPV6_SUBNET).into(),
+                    ),
+                };
+                let mac = self
+                    .available_system_macs
+                    .next()
+                    .ok_or(Error::NoSystemMacAddressAvailable)?;
+                NetworkInterface {
+                    id: Uuid::new_v4(),
+                    kind: NetworkInterfaceKind::Service(nexus_id),
+                    name: format!("nexus-{nexus_id}").parse().unwrap(),
+                    ip,
+                    mac,
+                    subnet,
+                    vni: Vni::SERVICES_VNI,
+                    primary: true,
+                    slot: 0,
+                }
+            };
+
+            let ip = self.sled_alloc_ip(sled_id)?;
+            let port = omicron_common::address::NEXUS_INTERNAL_PORT;
+            let internal_address =
+                SocketAddrV6::new(ip, port, 0, 0).to_string();
+            let zone = OmicronZoneConfig {
+                id: nexus_id,
+                underlay_address: ip,
+                zone_type: OmicronZoneType::Nexus {
+                    internal_address,
+                    external_ip,
+                    nic,
+                    external_tls,
+                    external_dns_servers: external_dns_servers.clone(),
+                },
+            };
+            self.sled_add_zone(sled_id, zone)?;
+        }
+
+        Ok(EnsureMultiple::Added(num_nexus_to_add))
+    }
+
     fn sled_add_zone(
         &mut self,
         sled_id: Uuid,
@@ -344,7 +604,7 @@ impl<'a> BlueprintBuilder<'a> {
             allocator
         });
 
-        allocator.alloc().ok_or_else(|| Error::OutOfAddresses { sled_id })
+        allocator.alloc().ok_or(Error::OutOfAddresses { sled_id })
     }
 
     fn sled_resources(&self, sled_id: Uuid) -> Result<&SledResources, Error> {
@@ -457,27 +717,18 @@ impl<'a> BlueprintZones<'a> {
 
 #[cfg(test)]
 pub mod test {
-    use super::BlueprintBuilder;
-    use ipnet::IpAdd;
-    use nexus_types::deployment::Blueprint;
-    use nexus_types::deployment::Policy;
-    use nexus_types::deployment::SledResources;
-    use nexus_types::deployment::ZpoolName;
-    use nexus_types::inventory::Collection;
+    use super::*;
+    use nexus_types::external_api::views::SledProvisionState;
+    use omicron_common::address::IpRange;
+    use omicron_common::address::Ipv4Range;
     use omicron_common::address::Ipv6Subnet;
     use omicron_common::address::SLED_PREFIX;
     use omicron_common::api::external::ByteCount;
-    use omicron_common::api::external::Generation;
     use sled_agent_client::types::{
         Baseboard, Inventory, OmicronZoneConfig, OmicronZoneDataset,
         OmicronZoneType, OmicronZonesConfig, SledRole,
     };
-    use std::collections::BTreeMap;
-    use std::collections::BTreeSet;
-    use std::net::Ipv6Addr;
-    use std::net::SocketAddrV6;
     use std::str::FromStr;
-    use uuid::Uuid;
 
     /// Returns a collection and policy describing a pretty simple system
     pub fn example() -> (Collection, Policy) {
@@ -488,7 +739,32 @@ pub mod test {
             "a5f3db3a-61aa-4f90-ad3e-02833c253bf5",
             "0d168386-2551-44e8-98dd-ae7a7570f8a0",
         ];
-        let mut policy = Policy { sleds: BTreeMap::new() };
+        let mut policy = Policy {
+            sleds: BTreeMap::new(),
+            // IPs from TEST-NET-1 (RFC 5737)
+            service_ip_pool_ranges: vec![Ipv4Range::new(
+                "192.0.2.2".parse().unwrap(),
"192.0.2.20".parse().unwrap(), + ) + .unwrap() + .into()], + target_nexus_zone_count: 3, + }; + let mut service_ip_pool_range = policy.service_ip_pool_ranges[0].iter(); + let mut nexus_nic_ips = NEXUS_OPTE_IPV4_SUBNET + .iter() + .skip(NUM_INITIAL_RESERVED_IP_ADDRESSES); + let mut nexus_nic_macs = { + let mut used = HashSet::new(); + std::iter::from_fn(move || { + let mut mac = MacAddr::random_system(); + while !used.insert(mac) { + mac = MacAddr::random_system(); + } + Some(mac) + }) + }; + for sled_id_str in sled_ids.iter() { let sled_id: Uuid = sled_id_str.parse().unwrap(); let sled_ip = policy_add_sled(&mut policy, sled_id); @@ -516,19 +792,58 @@ pub mod test { .unwrap(); let zpools = &policy.sleds.get(&sled_id).unwrap().zpools; - let ip1 = sled_ip.saturating_add(1); - let zones: Vec<_> = std::iter::once(OmicronZoneConfig { - id: Uuid::new_v4(), - underlay_address: sled_ip.saturating_add(1), - zone_type: OmicronZoneType::InternalNtp { - address: SocketAddrV6::new(ip1, 12345, 0, 0).to_string(), - dns_servers: vec![], - domain: None, - ntp_servers: vec![], - }, + let mut sled_ips = + std::iter::successors(Some(sled_ip.saturating_add(1)), |ip| { + println!("sled_ips iterator: currently {ip:?}"); + Some(ip.saturating_add(1)) + }); + let zones: Vec<_> = std::iter::once({ + let ip = sled_ips.next().unwrap(); + OmicronZoneConfig { + id: Uuid::new_v4(), + underlay_address: ip, + zone_type: OmicronZoneType::InternalNtp { + address: SocketAddrV6::new(ip, 12345, 0, 0).to_string(), + dns_servers: vec![], + domain: None, + ntp_servers: vec![], + }, + } }) - .chain(zpools.iter().enumerate().map(|(i, zpool_name)| { - let ip = sled_ip.saturating_add(u128::try_from(i + 2).unwrap()); + .chain(std::iter::once({ + let id = Uuid::new_v4(); + let ip = sled_ips.next().unwrap(); + let external_ip = + service_ip_pool_range.next().expect("no service IPs left"); + let nic_ip = + nexus_nic_ips.next().expect("no nexus nic IPs left"); + OmicronZoneConfig { + id, + underlay_address: ip, + zone_type: OmicronZoneType::Nexus { + internal_address: SocketAddrV6::new(ip, 12346, 0, 0) + .to_string(), + external_ip, + nic: NetworkInterface { + id: Uuid::new_v4(), + kind: NetworkInterfaceKind::Service(id), + name: format!("nexus-{id}").parse().unwrap(), + ip: nic_ip.into(), + mac: nexus_nic_macs + .next() + .expect("no nexus nic MACs left"), + subnet: IpNet::from(*NEXUS_OPTE_IPV4_SUBNET).into(), + vni: Vni::SERVICES_VNI, + primary: true, + slot: 0, + }, + external_tls: false, + external_dns_servers: Vec::new(), + }, + } + })) + .chain(zpools.iter().map(|zpool_name| { + let ip = sled_ips.next().unwrap(); OmicronZoneConfig { id: Uuid::new_v4(), underlay_address: ip, @@ -576,7 +891,14 @@ pub mod test { .collect(); let subnet = Ipv6Subnet::::new(sled_ip); - policy.sleds.insert(sled_id, SledResources { zpools, subnet }); + policy.sleds.insert( + sled_id, + SledResources { + provision_state: SledProvisionState::Provisionable, + zpools, + subnet, + }, + ); sled_ip } @@ -631,7 +953,8 @@ pub mod test { &blueprint_initial, &policy, "test_basic", - ); + ) + .expect("failed to create builder"); let blueprint = builder.build(); verify_blueprint(&blueprint); let diff = blueprint_initial.diff(&blueprint); @@ -656,7 +979,8 @@ pub mod test { verify_blueprint(&blueprint1); let mut builder = - BlueprintBuilder::new_based_on(&blueprint1, &policy, "test_basic"); + BlueprintBuilder::new_based_on(&blueprint1, &policy, "test_basic") + .expect("failed to create builder"); // The initial blueprint should have internal NTP zones on all the // existing 
         // existing sleds, plus Crucible zones on all pools. So if we ensure
@@ -685,7 +1009,8 @@ pub mod test {
         let new_sled_id = Uuid::new_v4();
         let _ = policy_add_sled(&mut policy, new_sled_id);
         let mut builder =
-            BlueprintBuilder::new_based_on(&blueprint2, &policy, "test_basic");
+            BlueprintBuilder::new_based_on(&blueprint2, &policy, "test_basic")
+                .expect("failed to create builder");
         builder.sled_ensure_zone_ntp(new_sled_id).unwrap();
         let new_sled_resources = policy.sleds.get(&new_sled_id).unwrap();
         for pool_name in &new_sled_resources.zpools {
@@ -751,4 +1076,274 @@ pub mod test {
             .collect::<BTreeSet<_>>();
         assert_eq!(crucible_pool_names, new_sled_resources.zpools);
     }
+
+    #[test]
+    fn test_add_nexus_with_no_existing_nexus_zones() {
+        let (mut collection, policy) = example();
+
+        // Adding a new Nexus zone currently requires copying settings from
+        // an existing Nexus zone. If we remove all Nexus zones from the
+        // collection, create a blueprint, then try to add a Nexus zone, it
+        // should fail.
+        for zones in collection.omicron_zones.values_mut() {
+            zones.zones.zones.retain(|z| {
+                !matches!(z.zone_type, OmicronZoneType::Nexus { .. })
+            });
+        }
+
+        let parent = BlueprintBuilder::build_initial_from_collection(
+            &collection,
+            &policy,
+            "test",
+        )
+        .expect("failed to create initial blueprint");
+
+        let mut builder =
+            BlueprintBuilder::new_based_on(&parent, &policy, "test")
+                .expect("failed to create builder");
+
+        let err = builder
+            .sled_ensure_zone_multiple_nexus(
+                collection
+                    .omicron_zones
+                    .keys()
+                    .next()
+                    .copied()
+                    .expect("no sleds present"),
+                1,
+            )
+            .unwrap_err();
+
+        assert!(
+            matches!(err, Error::NoNexusZonesInParentBlueprint),
+            "unexpected error {err}"
+        );
+    }
+
+    #[test]
+    fn test_add_nexus_error_cases() {
+        let (mut collection, policy) = example();
+
+        // Remove the Nexus zone from one of the sleds so that
+        // `sled_ensure_zone_multiple_nexus` can attempt to add a Nexus zone
+        // to `sled_id`.
+        let sled_id = {
+            let mut selected_sled_id = None;
+            for (sled_id, zones) in &mut collection.omicron_zones {
+                let nzones_before_retain = zones.zones.zones.len();
+                zones.zones.zones.retain(|z| {
+                    !matches!(z.zone_type, OmicronZoneType::Nexus { .. })
+                });
+                if zones.zones.zones.len() < nzones_before_retain {
+                    selected_sled_id = Some(*sled_id);
+                    break;
+                }
+            }
+            selected_sled_id.expect("found no sleds with Nexus zone")
+        };
+
+        let parent = BlueprintBuilder::build_initial_from_collection(
+            &collection,
+            &policy,
+            "test",
+        )
+        .expect("failed to create initial blueprint");
+
+        {
+            // Attempting to add Nexus to the sled we removed it from (with
+            // no other changes to the environment) should succeed.
+            let mut builder =
+                BlueprintBuilder::new_based_on(&parent, &policy, "test")
+                    .expect("failed to create builder");
+            let added = builder
+                .sled_ensure_zone_multiple_nexus(sled_id, 1)
+                .expect("failed to ensure nexus zone");
+
+            assert_eq!(added, EnsureMultiple::Added(1));
+        }
+
+        {
+            // Attempting to add multiple Nexus zones to the sled we removed
+            // it from (with no other changes to the environment) should also
+            // succeed.
+            let mut builder =
+                BlueprintBuilder::new_based_on(&parent, &policy, "test")
+                    .expect("failed to create builder");
+            let added = builder
+                .sled_ensure_zone_multiple_nexus(sled_id, 3)
+                .expect("failed to ensure nexus zone");
+
+            assert_eq!(added, EnsureMultiple::Added(3));
+        }
+
+        {
+            // Replace the policy's external service IP pool ranges with
+            // ranges that are already in use by existing zones.
+            // Attempting to add a Nexus with no remaining external IPs
+            // should fail.
+            let mut policy = policy.clone();
+            let mut used_ip_ranges = Vec::new();
+            for (_, z) in parent.all_omicron_zones() {
+                if let Some(ip) = z
+                    .zone_type
+                    .external_ip()
+                    .expect("failed to check for external IP")
+                {
+                    used_ip_ranges.push(IpRange::from(ip));
+                }
+            }
+            assert!(!used_ip_ranges.is_empty());
+            policy.service_ip_pool_ranges = used_ip_ranges;
+
+            let mut builder =
+                BlueprintBuilder::new_based_on(&parent, &policy, "test")
+                    .expect("failed to create builder");
+            let err = builder
+                .sled_ensure_zone_multiple_nexus(sled_id, 1)
+                .unwrap_err();
+
+            assert!(
+                matches!(err, Error::NoExternalServiceIpAvailable),
+                "unexpected error {err}"
+            );
+        }
+
+        // We're not testing the `ExhaustedNexusIps` error case (where we've
+        // run out of Nexus OPTE addresses), because it's fairly difficult to
+        // induce that from outside: we would need to start from a parent
+        // blueprint that contained a Nexus instance for every IP in the
+        // `NEXUS_OPTE_*_SUBNET`. We could hack around that by creating the
+        // `BlueprintBuilder` and mucking with its internals, but that
+        // doesn't seem like a particularly useful test either.
+    }
+
+    #[test]
+    fn test_invalid_parent_blueprint_two_zones_with_same_external_ip() {
+        let (mut collection, policy) = example();
+
+        // We should fail if the parent blueprint claims to contain two
+        // zones with the same external IP. Skim through the zones, copy the
+        // external IP from one Nexus zone, then assign it to a later Nexus
+        // zone.
+        let mut found_second_nexus_zone = false;
+        let mut nexus_external_ip = None;
+
+        'outer: for zones in collection.omicron_zones.values_mut() {
+            for z in zones.zones.zones.iter_mut() {
+                if let OmicronZoneType::Nexus { external_ip, .. } =
+                    &mut z.zone_type
+                {
+                    if let Some(ip) = nexus_external_ip {
+                        *external_ip = ip;
+                        found_second_nexus_zone = true;
+                        break 'outer;
+                    } else {
+                        nexus_external_ip = Some(*external_ip);
+                        continue 'outer;
+                    }
+                }
+            }
+        }
+        assert!(found_second_nexus_zone, "only one Nexus zone present?");
+
+        let parent = BlueprintBuilder::build_initial_from_collection(
+            &collection,
+            &policy,
+            "test",
+        )
+        .unwrap();
+
+        match BlueprintBuilder::new_based_on(&parent, &policy, "test") {
+            Ok(_) => panic!("unexpected success"),
+            Err(err) => assert!(
+                err.to_string().contains("duplicate external IP"),
+                "unexpected error: {err:#}"
+            ),
+        };
+    }
+
+    #[test]
+    fn test_invalid_parent_blueprint_two_nexus_zones_with_same_nic_ip() {
+        let (mut collection, policy) = example();
+
+        // We should fail if the parent blueprint claims to contain two
+        // Nexus zones with the same NIC IP. Skim through the zones, copy
+        // the NIC IP from one Nexus zone, then assign it to a later
+        // Nexus zone.
+        let mut found_second_nexus_zone = false;
+        let mut nexus_nic_ip = None;
+
+        'outer: for zones in collection.omicron_zones.values_mut() {
+            for z in zones.zones.zones.iter_mut() {
+                if let OmicronZoneType::Nexus { nic, .. } = &mut z.zone_type {
+                    if let Some(ip) = nexus_nic_ip {
+                        nic.ip = ip;
+                        found_second_nexus_zone = true;
+                        break 'outer;
+                    } else {
+                        nexus_nic_ip = Some(nic.ip);
+                        continue 'outer;
+                    }
+                }
+            }
+        }
+        assert!(found_second_nexus_zone, "only one Nexus zone present?");
+
+        let parent = BlueprintBuilder::build_initial_from_collection(
+            &collection,
+            &policy,
+            "test",
+        )
+        .unwrap();
+
+        match BlueprintBuilder::new_based_on(&parent, &policy, "test") {
+            Ok(_) => panic!("unexpected success"),
+            Err(err) => assert!(
+                err.to_string().contains("duplicate Nexus NIC IP"),
+                "unexpected error: {err:#}"
+            ),
+        };
+    }
+
+    #[test]
+    fn test_invalid_parent_blueprint_two_zones_with_same_vnic_mac() {
+        let (mut collection, policy) = example();
+
+        // We should fail if the parent blueprint claims to contain two
+        // zones with the same service vNIC MAC address. Skim through the
+        // zones, copy the NIC MAC from one Nexus zone, then assign it to a
+        // later Nexus zone.
+        let mut found_second_nexus_zone = false;
+        let mut nexus_nic_mac = None;
+
+        'outer: for zones in collection.omicron_zones.values_mut() {
+            for z in zones.zones.zones.iter_mut() {
+                if let OmicronZoneType::Nexus { nic, .. } = &mut z.zone_type {
+                    if let Some(mac) = nexus_nic_mac {
+                        nic.mac = mac;
+                        found_second_nexus_zone = true;
+                        break 'outer;
+                    } else {
+                        nexus_nic_mac = Some(nic.mac);
+                        continue 'outer;
+                    }
+                }
+            }
+        }
+        assert!(found_second_nexus_zone, "only one Nexus zone present?");
+
+        let parent = BlueprintBuilder::build_initial_from_collection(
+            &collection,
+            &policy,
+            "test",
+        )
+        .unwrap();
+
+        match BlueprintBuilder::new_based_on(&parent, &policy, "test") {
+            Ok(_) => panic!("unexpected success"),
+            Err(err) => assert!(
+                err.to_string().contains("duplicate service vNIC MAC"),
+                "unexpected error: {err:#}"
+            ),
+        };
+    }
 }
diff --git a/nexus/deployment/src/lib.rs b/nexus/deployment/src/lib.rs
index fd182ae613..546f2c1dc1 100644
--- a/nexus/deployment/src/lib.rs
+++ b/nexus/deployment/src/lib.rs
@@ -57,7 +57,7 @@
 //! The Planner
 //!
 //! fleet policy (latest inventory) (latest blueprint)
-//! \ | /
+//! \ | /
 //! \ | /
 //! +----------+ | +----------/
 //! | | |
@@ -85,7 +85,7 @@
 //! The Executor (better name?)
 //!
 //! latest committed blueprint latest inventory
-//! | |
+//! | |
 //! | |
 //! +----+ +----+
 //! | |
diff --git a/nexus/deployment/src/planner.rs b/nexus/deployment/src/planner.rs
index 002a1dbe2e..cbdcfd80c0 100644
--- a/nexus/deployment/src/planner.rs
+++ b/nexus/deployment/src/planner.rs
@@ -8,11 +8,17 @@
 use crate::blueprint_builder::BlueprintBuilder;
 use crate::blueprint_builder::Ensure;
+use crate::blueprint_builder::EnsureMultiple;
 use crate::blueprint_builder::Error;
 use nexus_types::deployment::Blueprint;
 use nexus_types::deployment::Policy;
+use nexus_types::external_api::views::SledProvisionState;
 use nexus_types::inventory::Collection;
+use slog::warn;
 use slog::{info, Logger};
+use std::collections::BTreeMap;
+use std::collections::BTreeSet;
+use uuid::Uuid;
 
 pub struct Planner<'a> {
     log: Logger,
@@ -39,10 +45,10 @@ impl<'a> Planner<'a> {
         // NOTE: Right now, we just assume that this is the latest inventory
         // collection. See the comment on the corresponding field in `Planner`.
         inventory: &'a Collection,
-    ) -> Planner<'a> {
+    ) -> anyhow::Result<Planner<'a>> {
         let blueprint =
-            BlueprintBuilder::new_based_on(parent_blueprint, policy, creator);
-        Planner { log, policy, blueprint, inventory }
+            BlueprintBuilder::new_based_on(parent_blueprint, policy, creator)?;
+        Ok(Planner { log, policy, blueprint, inventory })
     }
 
     pub fn plan(mut self) -> Result<Blueprint, Error> {
@@ -61,6 +67,17 @@ impl<'a> Planner<'a> {
         // added and where they should go. And the blueprint builder will need
         // to grow the ability to provision one.
 
+        // After we make our initial pass through the sleds below to check
+        // for zones every sled should have (NTP, Crucible), we'll start
+        // making decisions about placing other service zones. We need to
+        // _exclude_ any sleds for which we just added an NTP zone, as we
+        // won't be able to add additional services to them until that NTP
+        // zone has been brought up.
+        //
+        // We will not mark sleds getting Crucible zones as ineligible; other
+        // control plane service zones starting concurrently with Crucible
+        // zones is fine.
+        let mut sleds_ineligible_for_services = BTreeSet::new();
+
         for (sled_id, sled_info) in &self.policy.sleds {
             // Check for an NTP zone. Every sled should have one. If it's not
             // there, all we can do is provision that one zone. We have to wait
@@ -70,13 +87,14 @@ impl<'a> Planner<'a> {
                 info!(
                     &self.log,
                     "found sled missing NTP zone (will add one)";
-                    "sled_id" => ?sled_id
+                    "sled_id" => %sled_id
                 );
                 self.blueprint
                     .comment(&format!("sled {}: add NTP zone", sled_id));
                 // Don't make any other changes to this sled. However, this
                 // change is compatible with any other changes to other sleds,
                 // so we can "continue" here rather than "break".
+                sleds_ineligible_for_services.insert(*sled_id);
                 continue;
             }
 
@@ -100,7 +118,7 @@ impl<'a> Planner<'a> {
             let has_ntp_inventory = self
                 .inventory
                 .omicron_zones
-                .get(&sled_id)
+                .get(sled_id)
                 .map(|sled_zones| {
                     sled_zones.zones.zones.iter().any(|z| z.zone_type.is_ntp())
                 })
@@ -110,7 +128,7 @@ impl<'a> Planner<'a> {
                 &self.log,
                 "parent blueprint contains NTP zone, but it's not in \
                 inventory yet";
-                "sled_id" => ?sled_id,
+                "sled_id" => %sled_id,
             );
             continue;
         }
@@ -145,6 +163,139 @@ impl<'a> Planner<'a> {
             }
         }
 
+        // We've now placed all the services that should always exist on all
+        // sleds. Before moving on to make decisions about placing services
+        // that are _not_ present on all sleds, check the provision state of
+        // all our sleds so we can avoid any non-provisionable sleds under
+        // the assumption that there is something amiss with them.
+        sleds_ineligible_for_services.extend(
+            self.policy.sleds.iter().filter_map(|(sled_id, sled_info)| {
+                match sled_info.provision_state {
+                    SledProvisionState::Provisionable => None,
+                    SledProvisionState::NonProvisionable => Some(*sled_id),
+                }
+            }),
+        );
+
+        self.ensure_correct_number_of_nexus_zones(
+            &sleds_ineligible_for_services,
+        )?;
+
+        Ok(())
+    }
+
+    fn ensure_correct_number_of_nexus_zones(
+        &mut self,
+        sleds_ineligible_for_services: &BTreeSet<Uuid>,
+    ) -> Result<(), Error> {
+        // Bin every sled by the number of Nexus zones it currently has while
+        // counting the total number of Nexus zones.
+        let mut num_total_nexus = 0;
+        let mut sleds_by_num_nexus: BTreeMap<usize, Vec<Uuid>> =
+            BTreeMap::new();
+        for &sled_id in self.policy.sleds.keys() {
+            let num_nexus = self.blueprint.sled_num_nexus_zones(sled_id);
+            num_total_nexus += num_nexus;
+
+            // Only bin this sled if we're allowed to use it.
+            // If we have a sled we're not allowed to use that's already
+            // running a Nexus (seems fishy!), we counted its Nexus above but
+            // will ignore it here.
+            if !sleds_ineligible_for_services.contains(&sled_id) {
+                sleds_by_num_nexus.entry(num_nexus).or_default().push(sled_id);
+            }
+        }
+
+        // TODO-correctness What should we do if we have _too many_ Nexus
+        // instances? For now, just log the number of zones any time we have
+        // at least the minimum number.
+        let nexus_to_add =
+            self.policy.target_nexus_zone_count.saturating_sub(num_total_nexus);
+        if nexus_to_add == 0 {
+            info!(
+                self.log, "sufficient Nexus zones exist in plan";
+                "desired_count" => self.policy.target_nexus_zone_count,
+                "current_count" => num_total_nexus,
+            );
+            return Ok(());
+        }
+
+        // Ensure we have at least one sled on which we can add Nexus zones.
+        // If we don't, we have nothing else to do. This isn't a hard error,
+        // because we might be waiting for NTP on all eligible sleds
+        // (although it would be weird, since we're presumably running from
+        // within Nexus on some sled).
+        if sleds_by_num_nexus.is_empty() {
+            warn!(self.log, "want to add Nexus zones, but no eligible sleds");
+            return Ok(());
+        }
+
+        // Build a map of sled -> new nexus zone count.
+        let mut sleds_to_change: BTreeMap<Uuid, usize> = BTreeMap::new();
+
+        'outer: for _ in 0..nexus_to_add {
+            // `sleds_by_num_nexus` is sorted by key already, and we want to
+            // pick from the lowest-numbered bin. We can just loop over its
+            // keys, expecting to stop on the first iteration, with the only
+            // exception being when we've removed all the sleds from a bin.
+            for (&num_nexus, sleds) in sleds_by_num_nexus.iter_mut() {
+                // `sleds` contains all sleds with the minimum number of
+                // Nexus zones. Pick one arbitrarily but deterministically.
+                let Some(sled_id) = sleds.pop() else {
+                    // We already drained this bin; move on.
+                    continue;
+                };
+
+                // This insert might overwrite an old value for this sled
+                // (e.g., in the "we have 1 sled and need to add many Nexus
+                // instances to it" case). That's fine.
+                sleds_to_change.insert(sled_id, num_nexus + 1);
+
+                // Put this sled back in our map, but now with one more Nexus.
+                sleds_by_num_nexus
+                    .entry(num_nexus + 1)
+                    .or_default()
+                    .push(sled_id);
+
+                continue 'outer;
+            }
+
+            // This should be unreachable: it's only possible if we fail to
+            // find a nonempty vec in `sleds_by_num_nexus`, and we checked
+            // above that `sleds_by_num_nexus` is not empty.
+            unreachable!("logic error finding sleds for Nexus");
+        }
+
+        // For each sled we need to change, actually do so.
+        let mut total_added = 0;
+        for (sled_id, new_nexus_count) in sleds_to_change {
+            match self
+                .blueprint
+                .sled_ensure_zone_multiple_nexus(sled_id, new_nexus_count)?
+            {
+                EnsureMultiple::Added(n) => {
+                    info!(
+                        self.log, "will add {n} Nexus zone(s) to sled";
+                        "sled_id" => %sled_id,
+                    );
+                    total_added += n;
+                }
+                // This is only possible if we asked the sled to ensure the
+                // same number of zones it already has, but that's impossible
+                // based on the way we built up `sleds_to_change`.
+                EnsureMultiple::NotNeeded => unreachable!(
+                    "sled on which we added Nexus zones did not add any"
+                ),
+            }
+        }
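
The loop above is the entire placement policy: always grow a sled drawn from the lowest-occupancy bin, then move it up one bin. A standalone sketch of the same idea, with a plain integer standing in for the sled UUID:

```rust
use std::collections::BTreeMap;

type SledId = u32; // stand-in for the real sled Uuid

/// Spread `to_add` new zones across sleds, always picking from the bin of
/// sleds that currently run the fewest zones. Returns sled -> new count.
fn spread(
    mut bins: BTreeMap<usize, Vec<SledId>>,
    to_add: usize,
) -> BTreeMap<SledId, usize> {
    let mut changes: BTreeMap<SledId, usize> = BTreeMap::new();
    for _ in 0..to_add {
        // BTreeMap iterates keys in ascending order, so the first nonempty
        // bin holds the least-loaded sleds.
        let (&count, _) = bins
            .iter()
            .find(|(_, sleds)| !sleds.is_empty())
            .expect("no eligible sleds");
        let sled = bins.get_mut(&count).unwrap().pop().unwrap();
        changes.insert(sled, count + 1); // overwrite is fine, as above
        bins.entry(count + 1).or_default().push(sled);
    }
    changes
}

fn main() {
    // Three sleds, each currently running one Nexus; add five more.
    let bins = BTreeMap::from([(1usize, vec![1, 2, 3])]);
    let changes = spread(bins, 5);
    // The additions spread out: final counts are 3, 3, and 2.
    assert_eq!(changes.values().sum::<usize>(), 3 + 3 + 2);
}
```

Popping from a `Vec` held in a `BTreeMap` keyed on occupancy is what makes the choice "arbitrary but deterministic", as the comment in the real loop puts it.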
+
+        // Double check that we didn't make any arithmetic mistakes. If
+        // we've arrived here, we think we've added the number of Nexus
+        // zones we needed to.
+        assert_eq!(
+            total_added, nexus_to_add,
+            "internal error counting Nexus zones"
+        );
 
         Ok(())
     }
 }
@@ -157,6 +308,7 @@ mod test {
     use crate::blueprint_builder::test::verify_blueprint;
     use crate::blueprint_builder::BlueprintBuilder;
     use nexus_inventory::now_db_precision;
+    use nexus_types::external_api::views::SledProvisionState;
     use nexus_types::inventory::OmicronZoneType;
     use nexus_types::inventory::OmicronZonesFound;
     use omicron_common::api::external::Generation;
@@ -190,6 +342,7 @@ mod test {
             "no-op?",
             &collection,
         )
+        .expect("failed to create planner")
         .plan()
         .expect("failed to plan");
 
@@ -213,6 +366,7 @@ mod test {
             "test: add NTP?",
             &collection,
         )
+        .expect("failed to create planner")
        .plan()
        .expect("failed to plan");
 
@@ -244,6 +398,7 @@ mod test {
             "test: add nothing more",
             &collection,
         )
+        .expect("failed to create planner")
         .plan()
         .expect("failed to plan");
         let diff = blueprint3.diff(&blueprint4);
@@ -279,6 +434,7 @@ mod test {
             "test: add Crucible zones?",
             &collection,
         )
+        .expect("failed to create planner")
         .plan()
         .expect("failed to plan");
 
@@ -300,12 +456,12 @@ mod test {
         assert_eq!(zones.len(), 3);
         for zone in &zones {
             let OmicronZoneType::Crucible { .. } = zone.zone_type else {
-                panic!("unexpectedly added a non-Crucible zone");
+                panic!("unexpectedly added a non-Crucible zone: {zone:?}");
             };
         }
         verify_blueprint(&blueprint5);
 
-        // Check that there are no more steps
+        // Check that there are no more steps.
         let blueprint6 = Planner::new_based_on(
             logctx.log.clone(),
             &blueprint5,
@@ -313,6 +469,7 @@ mod test {
             "test: no-op?",
             &collection,
         )
+        .expect("failed to create planner")
         .plan()
         .expect("failed to plan");
 
@@ -325,4 +482,254 @@ mod test {
 
         logctx.cleanup_successful();
     }
+
+    /// Check that the planner will add more Nexus zones to a single sled,
+    /// if needed
+    #[test]
+    fn test_add_multiple_nexus_to_one_sled() {
+        let logctx = test_setup_log("planner_add_multiple_nexus_to_one_sled");
+
+        // Use our example inventory collection as a starting point, but
+        // strip it down to just one sled.
+        let (sled_id, collection, mut policy) = {
+            let (mut collection, mut policy) = example();
+
+            // Pick one sled ID to keep and remove the rest.
+            let keep_sled_id =
+                policy.sleds.keys().next().copied().expect("no sleds");
+            policy.sleds.retain(|&k, _v| keep_sled_id == k);
+            collection.sled_agents.retain(|&k, _v| keep_sled_id == k);
+            collection.omicron_zones.retain(|&k, _v| keep_sled_id == k);
+
+            assert_eq!(collection.sled_agents.len(), 1);
+            assert_eq!(collection.omicron_zones.len(), 1);
+
+            (keep_sled_id, collection, policy)
+        };
+
+        // Build the initial blueprint.
+        let blueprint1 = BlueprintBuilder::build_initial_from_collection(
+            &collection,
+            &policy,
+            "the_test",
+        )
+        .expect("failed to create initial blueprint");
+
+        // This blueprint should only have 1 Nexus instance on the one sled
+        // we kept.
+        assert_eq!(blueprint1.omicron_zones.len(), 1);
+        assert_eq!(
+            blueprint1
+                .omicron_zones
+                .get(&sled_id)
+                .expect("missing kept sled")
+                .zones
+                .iter()
+                .filter(|z| z.zone_type.is_nexus())
+                .count(),
+            1
+        );
+
+        // Now run the planner. It should add additional Nexus instances to
+        // the one sled we have.
+        policy.target_nexus_zone_count = 5;
+        let blueprint2 = Planner::new_based_on(
+            logctx.log.clone(),
+            &blueprint1,
+            &policy,
+            "add more Nexus",
+            &collection,
+        )
+        .expect("failed to create planner")
+        .plan()
+        .expect("failed to plan");
+
+        let diff = blueprint1.diff(&blueprint2);
+        println!("1 -> 2 (added additional Nexus zones):\n{}", diff);
+        assert_eq!(diff.sleds_added().count(), 0);
+        assert_eq!(diff.sleds_removed().count(), 0);
+        let mut sleds = diff.sleds_changed().collect::<Vec<_>>();
+        assert_eq!(sleds.len(), 1);
+        let (changed_sled_id, sled_changes) = sleds.pop().unwrap();
+        assert_eq!(changed_sled_id, sled_id);
+        assert_eq!(sled_changes.zones_removed().count(), 0);
+        assert_eq!(sled_changes.zones_changed().count(), 0);
+        let zones = sled_changes.zones_added().collect::<Vec<_>>();
+        assert_eq!(zones.len(), policy.target_nexus_zone_count - 1);
+        for zone in &zones {
+            let OmicronZoneType::Nexus { .. } = zone.zone_type else {
+                panic!("unexpectedly added a non-Nexus zone: {zone:?}");
+            };
+        }
+
+        logctx.cleanup_successful();
+    }
+
+    /// Check that the planner will spread additional Nexus zones out across
+    /// sleds as it adds them
+    #[test]
+    fn test_spread_additional_nexus_zones_across_sleds() {
+        let logctx = test_setup_log(
+            "planner_spread_additional_nexus_zones_across_sleds",
+        );
+
+        // Use our example inventory collection as a starting point.
+        let (collection, mut policy) = example();
+
+        // Build the initial blueprint.
+        let blueprint1 = BlueprintBuilder::build_initial_from_collection(
+            &collection,
+            &policy,
+            "the_test",
+        )
+        .expect("failed to create initial blueprint");
+
+        // This blueprint should only have 3 Nexus zones: one on each sled.
+        assert_eq!(blueprint1.omicron_zones.len(), 3);
+        for sled_config in blueprint1.omicron_zones.values() {
+            assert_eq!(
+                sled_config
+                    .zones
+                    .iter()
+                    .filter(|z| z.zone_type.is_nexus())
+                    .count(),
+                1
+            );
+        }
+
+        // Now run the planner with a high number of target Nexus zones.
+        policy.target_nexus_zone_count = 14;
+        let blueprint2 = Planner::new_based_on(
+            logctx.log.clone(),
+            &blueprint1,
+            &policy,
+            "add more Nexus",
+            &collection,
+        )
+        .expect("failed to create planner")
+        .plan()
+        .expect("failed to plan");
+
+        let diff = blueprint1.diff(&blueprint2);
+        println!("1 -> 2 (added additional Nexus zones):\n{}", diff);
+        assert_eq!(diff.sleds_added().count(), 0);
+        assert_eq!(diff.sleds_removed().count(), 0);
+        let sleds = diff.sleds_changed().collect::<Vec<_>>();
+
+        // All 3 sleds should get additional Nexus zones. We expect a total
+        // of 11 new Nexus zones, which should be spread evenly across the
+        // three sleds (two should get 4 and one should get 3).
+        assert_eq!(sleds.len(), 3);
+        let mut total_new_nexus_zones = 0;
+        for (sled_id, sled_changes) in sleds {
+            assert_eq!(sled_changes.zones_removed().count(), 0);
+            assert_eq!(sled_changes.zones_changed().count(), 0);
+            let zones = sled_changes.zones_added().collect::<Vec<_>>();
+            match zones.len() {
+                n @ (3 | 4) => {
+                    total_new_nexus_zones += n;
+                }
+                n => {
+                    panic!("unexpected number of zones added to {sled_id}: {n}")
+                }
+            }
+            for zone in &zones {
+                let OmicronZoneType::Nexus { .. } = zone.zone_type else {
+                    panic!("unexpectedly added a non-Nexus zone: {zone:?}");
+                };
+            }
+        }
+        assert_eq!(total_new_nexus_zones, 11);
+
+        logctx.cleanup_successful();
+    }
+
+    /// Check that the planner will skip non-provisionable sleds when
+    /// allocating extra Nexus zones
+    #[test]
+    fn test_nexus_allocation_skips_nonprovisionable_sleds() {
+        let logctx = test_setup_log(
+            "planner_nexus_allocation_skips_nonprovisionable_sleds",
+        );
+
+        // Use our example inventory collection as a starting point.
+        let (collection, mut policy) = example();
+
+        // Build the initial blueprint.
+        let blueprint1 = BlueprintBuilder::build_initial_from_collection(
+            &collection,
+            &policy,
+            "the_test",
+        )
+        .expect("failed to create initial blueprint");
+
+        // This blueprint should only have 3 Nexus zones: one on each sled.
+        assert_eq!(blueprint1.omicron_zones.len(), 3);
+        for sled_config in blueprint1.omicron_zones.values() {
+            assert_eq!(
+                sled_config
+                    .zones
+                    .iter()
+                    .filter(|z| z.zone_type.is_nexus())
+                    .count(),
+                1
+            );
+        }
+
+        // Arbitrarily choose one of the sleds and mark it non-provisionable.
+        let nonprovisionable_sled_id = {
+            let (sled_id, resources) =
+                policy.sleds.iter_mut().next().expect("no sleds");
+            resources.provision_state = SledProvisionState::NonProvisionable;
+            *sled_id
+        };
+
+        // Now run the planner with a high number of target Nexus zones.
+        policy.target_nexus_zone_count = 14;
+        let blueprint2 = Planner::new_based_on(
+            logctx.log.clone(),
+            &blueprint1,
+            &policy,
+            "add more Nexus",
+            &collection,
+        )
+        .expect("failed to create planner")
+        .plan()
+        .expect("failed to plan");
+
+        let diff = blueprint1.diff(&blueprint2);
+        println!("1 -> 2 (added additional Nexus zones):\n{}", diff);
+        assert_eq!(diff.sleds_added().count(), 0);
+        assert_eq!(diff.sleds_removed().count(), 0);
+        let sleds = diff.sleds_changed().collect::<Vec<_>>();
+
+        // Only 2 of the 3 sleds should get additional Nexus zones. We expect
+        // a total of 11 new Nexus zones, which should be spread evenly
+        // across the two sleds (one gets 6 and the other gets 5), while the
+        // non-provisionable sled should be unchanged.
+        assert_eq!(sleds.len(), 2);
+        let mut total_new_nexus_zones = 0;
+        for (sled_id, sled_changes) in sleds {
+            assert!(sled_id != nonprovisionable_sled_id);
+            assert_eq!(sled_changes.zones_removed().count(), 0);
+            assert_eq!(sled_changes.zones_changed().count(), 0);
+            let zones = sled_changes.zones_added().collect::<Vec<_>>();
+            match zones.len() {
+                n @ (5 | 6) => {
+                    total_new_nexus_zones += n;
+                }
+                n => {
+                    panic!("unexpected number of zones added to {sled_id}: {n}")
+                }
+            }
+            for zone in &zones {
+                let OmicronZoneType::Nexus { .. } = zone.zone_type else {
+                    panic!("unexpectedly added a non-Nexus zone: {zone:?}");
+                };
+            }
+        }
+        assert_eq!(total_new_nexus_zones, 11);
+
+        logctx.cleanup_successful();
+    }
 }
diff --git a/nexus/src/app/deployment.rs b/nexus/src/app/deployment.rs
index 65f8f4d028..b8cb6deabf 100644
--- a/nexus/src/app/deployment.rs
+++ b/nexus/src/app/deployment.rs
@@ -18,7 +18,9 @@ use nexus_types::deployment::SledResources;
 use nexus_types::deployment::ZpoolName;
 use nexus_types::identity::Asset;
 use nexus_types::inventory::Collection;
+use omicron_common::address::IpRange;
 use omicron_common::address::Ipv6Subnet;
+use omicron_common::address::NEXUS_REDUNDANCY;
 use omicron_common::address::SLED_PREFIX;
 use omicron_common::api::external::CreateResult;
 use omicron_common::api::external::DataPageParams;
@@ -169,11 +171,39 @@ impl super::Nexus {
                 let zpools = zpools_by_sled_id
                     .remove(&sled_id)
                     .unwrap_or_else(BTreeSet::new);
-                let sled_info = SledResources { subnet, zpools };
+                let sled_info = SledResources {
+                    provision_state: sled_row.provision_state().into(),
+                    subnet,
+                    zpools,
+                };
                 (sled_id, sled_info)
             })
             .collect();
 
+        let service_ip_pool_ranges = {
+            let (authz_service_ip_pool, _) =
+                datastore.ip_pools_service_lookup(opctx).await?;
+
+            let mut ip_ranges = Vec::new();
+            let mut paginator = Paginator::new(SQL_BATCH_SIZE);
+            while let Some(p) = paginator.next() {
+                let batch = datastore
+                    .ip_pool_list_ranges(
+                        opctx,
+                        &authz_service_ip_pool,
+                        &p.current_pagparams(),
+                    )
+                    .await?;
+                // The use of `last_address` here assumes `paginator` is
+                // sorting in Ascending order (which it does - see the
+                // implementation of `current_pagparams()`).
+                paginator = p.found_batch(&batch, &|r| r.last_address);
+                ip_ranges.extend(batch.iter().map(IpRange::from));
+            }
+
+            ip_ranges
+        };
+
         // The choice of which inventory collection to use here is not
         // necessarily trivial. Inventory collections may be incomplete due to
         // transient (or even persistent) errors. It's not yet clear what
@@ -192,7 +222,15 @@ impl super::Nexus {
             "fetching latest inventory collection for blueprint planner",
         )?;
 
-        Ok(PlanningContext { creator, policy: Policy { sleds }, inventory })
+        Ok(PlanningContext {
+            creator,
+            policy: Policy {
+                sleds,
+                service_ip_pool_ranges,
+                target_nexus_zone_count: NEXUS_REDUNDANCY,
+            },
+            inventory,
+        })
     }
 
     async fn blueprint_add(
@@ -252,7 +290,12 @@ impl super::Nexus {
             &planning_context.policy,
             &planning_context.creator,
             &inventory,
-        );
+        )
+        .map_err(|error| {
+            Error::internal_error(&format!(
+                "error creating blueprint planner: {error:#}",
+            ))
+        })?;
         let blueprint = planner.plan().map_err(|error| {
             Error::internal_error(&format!(
                 "error generating blueprint: {}",
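
The `service_ip_pool_ranges` block above walks the service IP pool with omicron's `Paginator`, fetching fixed-size batches keyed on the last address seen. The general shape of that scan, sketched generically (the `fetch` closure stands in for the datastore query; this is not the `Paginator` API itself):

```rust
/// Accumulate every item from a paginated source. `fetch` returns up to
/// `batch_size` items strictly after the given marker; a short batch means
/// we've reached the end.
fn collect_all<T, F>(batch_size: usize, mut fetch: F) -> Vec<T>
where
    F: FnMut(Option<&T>, usize) -> Vec<T>,
{
    let mut out: Vec<T> = Vec::new();
    loop {
        let batch = fetch(out.last(), batch_size);
        let done = batch.len() < batch_size;
        out.extend(batch);
        if done {
            break;
        }
    }
    out
}

fn main() {
    // Paginate over 0..10 in batches of 4, resuming after the last value
    // seen; this relies on ascending order, just like `last_address` does
    // in the hunk above.
    let data: Vec<u32> = (0..10).collect();
    let all = collect_all(4, |marker, n| {
        let start = marker.map(|m| (m + 1) as usize).unwrap_or(0);
        data[start.min(data.len())..].iter().take(n).copied().collect()
    });
    assert_eq!(all, data);
}
```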
diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs
index 324768b9d8..06427507d5 100644
--- a/nexus/types/src/deployment.rs
+++ b/nexus/types/src/deployment.rs
@@ -11,13 +11,17 @@
 //! nexus/deployment does not currently know about nexus/db-model and it's
 //! convenient to separate these concerns.)
 
+use crate::external_api::views::SledProvisionState;
 use crate::inventory::Collection;
+pub use crate::inventory::NetworkInterface;
+pub use crate::inventory::NetworkInterfaceKind;
 pub use crate::inventory::OmicronZoneConfig;
 pub use crate::inventory::OmicronZoneDataset;
 pub use crate::inventory::OmicronZoneType;
 pub use crate::inventory::OmicronZonesConfig;
 pub use crate::inventory::SourceNatConfig;
 pub use crate::inventory::ZpoolName;
+use omicron_common::address::IpRange;
 use omicron_common::address::Ipv6Subnet;
 use omicron_common::address::SLED_PREFIX;
 use omicron_common::api::external::Generation;
@@ -43,14 +47,26 @@ use uuid::Uuid;
 ///
 /// The current policy is pretty limited. It's aimed primarily at supporting
 /// the add/remove sled use case.
+#[derive(Debug, Clone)]
 pub struct Policy {
     /// set of sleds that are supposed to be part of the control plane, along
     /// with information about resources available to the planner
     pub sleds: BTreeMap<Uuid, SledResources>,
+
+    /// ranges specified by the IP pool for externally-visible control plane
+    /// services (e.g., external DNS, Nexus, boundary NTP)
+    pub service_ip_pool_ranges: Vec<IpRange>,
+
+    /// desired total number of deployed Nexus zones
+    pub target_nexus_zone_count: usize,
 }
 
 /// Describes the resources available on each sled for the planner
+#[derive(Debug, Clone)]
 pub struct SledResources {
+    /// provision state of this sled
+    pub provision_state: SledProvisionState,
+
     /// zpools on this sled
     ///
     /// (used to allocate storage for control plane zones with persistent
diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs
index c99e51af4f..50e8b380b3 100644
--- a/nexus/types/src/inventory.rs
+++ b/nexus/types/src/inventory.rs
@@ -48,7 +48,7 @@ use uuid::Uuid;
 /// database.
 ///
 /// See the documentation in the database schema for more background.
-#[derive(Debug, Eq, PartialEq)]
+#[derive(Debug, Eq, PartialEq, Clone)]
 pub struct Collection {
     /// unique identifier for this collection
     pub id: Uuid,
diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs
index 220f0d686b..0b633c2057 100644
--- a/sled-agent/src/rack_setup/plan/service.rs
+++ b/sled-agent/src/rack_setup/plan/service.rs
@@ -15,8 +15,8 @@ use internal_dns::ServiceName;
 use omicron_common::address::{
     get_sled_address, get_switch_zone_address, Ipv6Subnet, ReservedRackSubnet,
     DENDRITE_PORT, DNS_HTTP_PORT, DNS_PORT, DNS_REDUNDANCY, MAX_DNS_REDUNDANCY,
-    MGD_PORT, MGS_PORT, NTP_PORT, NUM_SOURCE_NAT_PORTS, RSS_RESERVED_ADDRESSES,
-    SLED_PREFIX,
+    MGD_PORT, MGS_PORT, NEXUS_REDUNDANCY, NTP_PORT, NUM_SOURCE_NAT_PORTS,
+    RSS_RESERVED_ADDRESSES, SLED_PREFIX,
 };
 use omicron_common::api::external::{MacAddr, Vni};
 use omicron_common::api::internal::shared::SwitchLocation;
@@ -44,9 +44,6 @@ use uuid::Uuid;
 // The number of boundary NTP servers to create from RSS.
 const BOUNDARY_NTP_COUNT: usize = 2;
 
-// The number of Nexus instances to create from RSS.
-const NEXUS_COUNT: usize = 3;
-
 // The number of CRDB instances to create from RSS.
 const CRDB_COUNT: usize = 5;
@@ -458,7 +455,7 @@ impl Plan {
         }
 
         // Provision Nexus zones, continuing to stripe across sleds.
-        for _ in 0..NEXUS_COUNT {
+        for _ in 0..NEXUS_REDUNDANCY {
             let sled = {
                 let which_sled =
                     sled_allocator.next().ok_or(PlanError::NotEnoughSleds)?;
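
For contrast with the planner's bin-based spreading, the RSS hunk above keeps its original striping behavior: `sled_allocator` hands out sleds one after another, so consecutive Nexus zones land on distinct sleds. A sketch of that striping, assuming the allocator cycles round-robin and a redundancy count of 3 (the value the tests above imply for `NEXUS_REDUNDANCY`):

```rust
fn main() {
    const NEXUS_REDUNDANCY: usize = 3; // assumed value, per the tests above
    let sled_names = ["sled-a", "sled-b", "sled-c", "sled-d"];
    // A cycling index is the simplest round-robin allocator.
    let mut sled_allocator = (0..sled_names.len()).cycle();

    let placements: Vec<&str> = (0..NEXUS_REDUNDANCY)
        .map(|_| {
            let which_sled = sled_allocator.next().expect("cycle never ends");
            sled_names[which_sled]
        })
        .collect();

    // Each of the three Nexus zones lands on a different sled.
    assert_eq!(placements, ["sled-a", "sled-b", "sled-c"]);
}
```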