From 106003efc454b32f3dafdb9f2ae953effc9d8eea Mon Sep 17 00:00:00 2001 From: "oxide-reflector-bot[bot]" <130185838+oxide-reflector-bot[bot]@users.noreply.github.com> Date: Fri, 9 Aug 2024 19:34:48 -0700 Subject: [PATCH 01/91] Update maghemite to 0c4292f (#6183) Updated maghemite to commit 0c4292f. --------- Co-authored-by: reflector[bot] <130185838+reflector[bot]@users.noreply.github.com> --- package-manifest.toml | 12 ++++++------ tools/maghemite_ddm_openapi_version | 2 +- tools/maghemite_mg_openapi_version | 2 +- tools/maghemite_mgd_checksums | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/package-manifest.toml b/package-manifest.toml index 24f848c31a..5ee81e722b 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -578,10 +578,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "220dd026e83142b83bd93123f465a64dd4600201" +source.commit = "0c4292fe5b3c8ac27d99b5a4502d595acdbf7441" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm-gz.sha256.txt -source.sha256 = "58c8fcec6b932f7e602ac82cc28460aa557cabae1b66947ab3cb7334b87c35d4" +source.sha256 = "b0f08e754f7c834d7ca05093b13a574863f500cff56210591ef4cc7eaf20159b" output.type = "tarball" [package.mg-ddm] @@ -594,10 +594,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "220dd026e83142b83bd93123f465a64dd4600201" +source.commit = "0c4292fe5b3c8ac27d99b5a4502d595acdbf7441" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm.sha256.txt -source.sha256 = "69fa43393a77f19713c7d76a320064e3eb58b3ea0b2953d2079a5c3edebc172e" +source.sha256 = "499962b57404626aff1ecd62d5045ba2ee06070d45f7cb2a8fc284e53eed17d6" output.type = "zone" output.intermediate_only = true @@ -609,10 +609,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). 
-source.commit = "220dd026e83142b83bd93123f465a64dd4600201" +source.commit = "0c4292fe5b3c8ac27d99b5a4502d595acdbf7441" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mgd.sha256.txt -source.sha256 = "f1103de5dda4830eb653f4d555995d08c31253116448387399a77392c08dfb54" +source.sha256 = "e15db7d262b5b2f08a2e2799668c67d0cb883e84c72736a30d299688115bf055" output.type = "zone" output.intermediate_only = true diff --git a/tools/maghemite_ddm_openapi_version b/tools/maghemite_ddm_openapi_version index 40d39b3dd0..c1e011e38d 100644 --- a/tools/maghemite_ddm_openapi_version +++ b/tools/maghemite_ddm_openapi_version @@ -1,2 +1,2 @@ -COMMIT="220dd026e83142b83bd93123f465a64dd4600201" +COMMIT="0c4292fe5b3c8ac27d99b5a4502d595acdbf7441" SHA2="007bfb717ccbc077c0250dee3121aeb0c5bb0d1c16795429a514fa4f8635a5ef" diff --git a/tools/maghemite_mg_openapi_version b/tools/maghemite_mg_openapi_version index 172c5c6f3d..1184f6e4fd 100644 --- a/tools/maghemite_mg_openapi_version +++ b/tools/maghemite_mg_openapi_version @@ -1,2 +1,2 @@ -COMMIT="220dd026e83142b83bd93123f465a64dd4600201" +COMMIT="0c4292fe5b3c8ac27d99b5a4502d595acdbf7441" SHA2="e4b42ab9daad90f0c561a830b62a9d17e294b4d0da0a6d44b4030929b0c37b7e" diff --git a/tools/maghemite_mgd_checksums b/tools/maghemite_mgd_checksums index 5479623d30..7ca642fa70 100644 --- a/tools/maghemite_mgd_checksums +++ b/tools/maghemite_mgd_checksums @@ -1,2 +1,2 @@ -CIDL_SHA256="f1103de5dda4830eb653f4d555995d08c31253116448387399a77392c08dfb54" -MGD_LINUX_SHA256="b4469b8ec3b2193f3eff2886fe1c7ac17dc135b8d7572e1a6c765811738402bd" \ No newline at end of file +CIDL_SHA256="e15db7d262b5b2f08a2e2799668c67d0cb883e84c72736a30d299688115bf055" +MGD_LINUX_SHA256="915e7b5cac8ff1deb6549b86e4ba49fd5c6adbdcc56ae5dc3c7b3e69555a7c2c" \ No newline at end of file From 8dfa0a4ab058b362b2aba9c781475ec9eb63f4d3 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Sat, 10 Aug 2024 23:11:50 +0000 Subject: [PATCH 02/91] Update Rust crate parse-display to 0.10.0 (#6265) --- Cargo.lock | 28 ++++++++++++++-------------- Cargo.toml | 2 +- workspace-hack/Cargo.toml | 4 ++-- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 61050081f7..eef8c03f57 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2474,7 +2474,7 @@ checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2" dependencies = [ "bit-set", "regex-automata 0.4.6", - "regex-syntax 0.8.3", + "regex-syntax 0.8.4", ] [[package]] @@ -3002,7 +3002,7 @@ dependencies = [ "bstr 1.9.1", "log", "regex-automata 0.4.6", - "regex-syntax 0.8.3", + "regex-syntax 0.8.4", ] [[package]] @@ -6199,7 +6199,7 @@ dependencies = [ "proc-macro2", "regex", "regex-automata 0.4.6", - "regex-syntax 0.8.3", + "regex-syntax 0.8.4", "reqwest", "ring 0.17.8", "rustix", @@ -6875,25 +6875,25 @@ dependencies = [ [[package]] name = "parse-display" -version = "0.9.1" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "914a1c2265c98e2446911282c6ac86d8524f495792c38c5bd884f80499c7538a" +checksum = "287d8d3ebdce117b8539f59411e4ed9ec226e0a4153c7f55495c6070d68e6f72" dependencies = [ "parse-display-derive", "regex", - "regex-syntax 0.8.3", + "regex-syntax 0.8.4", ] [[package]] name = "parse-display-derive" -version = "0.9.1" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2ae7800a4c974efd12df917266338e79a7a74415173caf7e70aa0a0707345281" +checksum = "7fc048687be30d79502dea2f623d052f3a074012c6eac41726b7ab17213616b1" dependencies = [ "proc-macro2", "quote", "regex", - "regex-syntax 0.8.3", + "regex-syntax 0.8.4", "structmeta 0.3.0", "syn 2.0.72", ] @@ -7662,7 +7662,7 @@ dependencies = [ "rand 0.8.5", "rand_chacha 0.3.1", "rand_xorshift", - "regex-syntax 0.8.3", + "regex-syntax 0.8.4", "rusty-fork", "tempfile", "unarray", @@ -8017,7 +8017,7 @@ dependencies = [ "aho-corasick", "memchr", "regex-automata 0.4.6", - "regex-syntax 0.8.3", + "regex-syntax 0.8.4", ] [[package]] @@ -8034,7 +8034,7 @@ checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.8.3", + "regex-syntax 0.8.4", ] [[package]] @@ -8045,9 +8045,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "regex-syntax" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "regress" diff --git a/Cargo.toml b/Cargo.toml index 536941a72d..a3042a997a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -452,7 +452,7 @@ oximeter-macro-impl = { path = "oximeter/oximeter-macro-impl" } oximeter-producer = { path = "oximeter/producer" } oximeter-timeseries-macro = { path = "oximeter/timeseries-macro" } p256 = "0.13" -parse-display = "0.9.1" +parse-display = "0.10.0" partial-io = { version = "0.5.4", features = ["proptest1", "tokio1"] } parse-size = "1.0.0" paste = "1.0.15" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 5f34c76db9..be786db56b 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -86,7 +86,7 @@ predicates = { version = "3.1.2" } proc-macro2 = { version = "1.0.86" } regex = { version = "1.10.6" } regex-automata = { version = "0.4.6", default-features = false, features = ["dfa", "hybrid", "meta", "nfa", "perf", "unicode"] } -regex-syntax = { version = "0.8.3" } +regex-syntax = { version = "0.8.4" } reqwest = { version = "0.11.27", features = ["blocking", "cookies", "json", "rustls-tls", "stream"] } ring = { version = "0.17.8", features = ["std"] } schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } @@ -190,7 +190,7 @@ predicates = { version = "3.1.2" } proc-macro2 = { version = "1.0.86" } regex = { version = "1.10.6" } regex-automata = { version = "0.4.6", default-features = false, features = ["dfa", "hybrid", "meta", "nfa", "perf", "unicode"] } -regex-syntax = { version = "0.8.3" } +regex-syntax = { version = "0.8.4" } reqwest = { version = "0.11.27", features = ["blocking", "cookies", "json", "rustls-tls", "stream"] } ring = { version = "0.17.8", features = ["std"] } schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } From c5778af29fd27dee77e5c2ff945cbbd152506ad8 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Sat, 10 Aug 2024 23:58:07 +0000 Subject: [PATCH 03/91] Update Rust crate ratatui to 0.28.0 (#6267) --- Cargo.lock | 97 +++++++++++++------- Cargo.toml | 6 +- dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs | 2 +- wicket/src/runner.rs | 2 +- wicket/src/ui/main.rs | 4 +- wicket/src/ui/splash.rs | 4 +- wicket/src/ui/widgets/animated_logo.rs | 2 +- wicket/src/ui/widgets/box_connector.rs | 8 +- 
wicket/src/ui/widgets/rack.rs | 6 +- workspace-hack/Cargo.toml | 26 ++++-- 10 files changed, 97 insertions(+), 60 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index eef8c03f57..b874f5a705 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -891,9 +891,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "castaway" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a17ed5635fc8536268e5d4de1e22e81ac34419e5f052d4d51f4e01dcc263fcc" +checksum = "0abae9be0aaf9ea96a3b1b8b1b55c602ca751eba1b1500220cea4ecbafe7c0d5" dependencies = [ "rustversion", ] @@ -1196,13 +1196,14 @@ dependencies = [ [[package]] name = "compact_str" -version = "0.7.1" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f86b9c4c00838774a6d902ef931eff7470720c51d90c2e32cfe15dc304737b3f" +checksum = "6050c3a16ddab2e412160b31f2c871015704239bca62f72f6e5f0be631d3f644" dependencies = [ "castaway", "cfg-if", "itoa", + "rustversion", "ryu", "static_assertions", ] @@ -1461,9 +1462,8 @@ checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df" dependencies = [ "bitflags 2.6.0", "crossterm_winapi", - "futures-core", "libc", - "mio", + "mio 0.8.11", "parking_lot 0.12.2", "serde", "signal-hook", @@ -1471,6 +1471,23 @@ dependencies = [ "winapi", ] +[[package]] +name = "crossterm" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" +dependencies = [ + "bitflags 2.6.0", + "crossterm_winapi", + "futures-core", + "mio 1.0.1", + "parking_lot 0.12.2", + "rustix", + "signal-hook", + "signal-hook-mio", + "winapi", +] + [[package]] name = "crossterm_winapi" version = "0.9.1" @@ -3725,6 +3742,16 @@ dependencies = [ "generic-array", ] +[[package]] +name = "instability" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b23a0c8dfe501baac4adf6ebbfa6eddf8f0c07f56b058cc1288017e32397846c" +dependencies = [ + "quote", + "syn 2.0.72", +] + [[package]] name = "installinator" version = "0.1.0" @@ -4518,6 +4545,19 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "mio" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4569e456d394deccd22ce1c1913e6ea0e54519f577285001215d33557431afe4" +dependencies = [ + "hermit-abi 0.3.9", + "libc", + "log", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys 0.52.0", +] + [[package]] name = "mockall" version = "0.13.0" @@ -5832,7 +5872,7 @@ dependencies = [ "camino-tempfile", "chrono", "clap", - "crossterm", + "crossterm 0.28.1", "crucible-agent-client", "csv", "diesel", @@ -6147,7 +6187,6 @@ dependencies = [ "const-oid", "crossbeam-epoch", "crossbeam-utils", - "crossterm", "crypto-common", "der", "digest", @@ -6183,7 +6222,7 @@ dependencies = [ "log", "managed", "memchr", - "mio", + "mio 0.8.11", "nom", "num-bigint", "num-integer", @@ -6209,6 +6248,7 @@ dependencies = [ "serde", "serde_json", "sha2", + "signal-hook-mio", "similar", "slog", "smallvec 1.13.2", @@ -6603,7 +6643,7 @@ dependencies = [ "chrono", "clap", "clickward", - "crossterm", + "crossterm 0.28.1", "dropshot", "expectorate", "futures", @@ -7837,18 +7877,18 @@ dependencies = [ [[package]] name = "ratatui" -version = "0.27.0" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d16546c5b5962abf8ce6e2881e722b4e0ae3b6f1a08a26ae3573c55853ca68d3" +checksum = "5ba6a365afbe5615999275bea2446b970b10a41102500e27ce7678d50d978303" dependencies = [ "bitflags 2.6.0", "cassowary", "compact_str", - "crossterm", + "crossterm 0.28.1", + "instability", "itertools 0.13.0", "lru", "paste", - "stability", "strum", "strum_macros 0.26.4", "unicode-segmentation", @@ -7975,7 +8015,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "65ebc241ed0ccea0bbbd775a55a76f0dd9971ef084589dea938751a03ffedc14" dependencies = [ "chrono", - "crossterm", + "crossterm 0.27.0", "fd-lock", "itertools 0.12.1", "nu-ansi-term", @@ -8994,12 +9034,13 @@ dependencies = [ [[package]] name = "signal-hook-mio" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af" +checksum = "34db1a06d485c9142248b7a054f034b349b212551f3dfd19c94d45a754a217cd" dependencies = [ "libc", - "mio", + "mio 0.8.11", + "mio 1.0.1", "signal-hook", ] @@ -9519,16 +9560,6 @@ dependencies = [ "syn 2.0.72", ] -[[package]] -name = "stability" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ff9eaf853dec4c8802325d8b6d3dffa86cc707fd7a1a4cdbf416e13b061787a" -dependencies = [ - "quote", - "syn 2.0.72", -] - [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -10138,7 +10169,7 @@ dependencies = [ "backtrace", "bytes", "libc", - "mio", + "mio 0.8.11", "num_cpus", "parking_lot 0.12.2", "pin-project-lite", @@ -10617,9 +10648,9 @@ dependencies = [ [[package]] name = "tui-tree-widget" -version = "0.21.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ac69db35529be6a75f9d27516ff33df299e2e8e961a1986d52185cef0427352" +checksum = "df0b54061d997162f225bed5d2147574af0648480214759a000e33f6cea0017a" dependencies = [ "ratatui", "unicode-width", @@ -11264,7 +11295,7 @@ dependencies = [ "camino", "ciborium", "clap", - "crossterm", + "crossterm 0.28.1", "expectorate", "futures", "humantime", @@ -11338,7 +11369,7 @@ dependencies = [ "camino", "ciborium", "clap", - "crossterm", + "crossterm 0.28.1", "omicron-workspace-hack", "reedline", "serde", diff --git a/Cargo.toml b/Cargo.toml index a3042a997a..32f11e456f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -301,7 +301,7 @@ const_format = "0.2.32" cookie = "0.18" criterion = { version = "0.5.1", features = [ "async_tokio" ] } crossbeam = "0.8" -crossterm = { version = "0.27.0", features = ["event-stream"] } +crossterm = { version = "0.28.1", features = ["event-stream"] } crucible-agent-client = { git = "https://github.com/oxidecomputer/crucible", rev = "e58ca3693cb9ce0438947beba10e97ee38a0966b" } crucible-pantry-client = { git = "https://github.com/oxidecomputer/crucible", rev = "e58ca3693cb9ce0438947beba10e97ee38a0966b" } crucible-smf = { git = "https://github.com/oxidecomputer/crucible", rev = "e58ca3693cb9ce0438947beba10e97ee38a0966b" } @@ -477,7 +477,7 @@ rand = "0.8.5" rand_core = "0.6.4" rand_distr = "0.4.3" rand_seeder = "0.3.0" -ratatui = "0.27.0" +ratatui = "0.28.0" rayon = "1.10" rcgen = "0.12.1" reedline = "0.31.0" @@ -569,7 +569,7 @@ trust-dns-server = "0.22" trybuild = "1.0.99" tufaceous = { path = "tufaceous" } tufaceous-lib = { path = "tufaceous-lib" } -tui-tree-widget = "0.21.0" +tui-tree-widget = "0.22.0" typed-rng = { path = "typed-rng" } unicode-width = "0.1.13" update-common = { path = "update-common" } diff --git 
a/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs b/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs index cd7628a840..b3f34d5791 100644 --- a/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs +++ b/dev-tools/omdb/src/bin/omdb/mgs/dashboard.rs @@ -1091,7 +1091,7 @@ fn draw_status(f: &mut Frame, parent: Rect, status: &[(&str, &str)]) { } fn draw(f: &mut Frame, dashboard: &mut Dashboard) { - let size = f.size(); + let size = f.area(); let screen = Layout::default() .direction(Direction::Vertical) diff --git a/wicket/src/runner.rs b/wicket/src/runner.rs index 3af68ccbec..0e201478a8 100644 --- a/wicket/src/runner.rs +++ b/wicket/src/runner.rs @@ -75,7 +75,7 @@ impl RunnerCore { /// Resize and draw the initial screen before handling `Event`s pub fn init_screen(&mut self) -> anyhow::Result<()> { // Size the initial screen - let rect = self.terminal.get_frame().size(); + let rect = self.terminal.get_frame().area(); self.screen.resize(&mut self.state, rect.width, rect.height); // Draw the initial screen diff --git a/wicket/src/ui/main.rs b/wicket/src/ui/main.rs index 379cbd03af..ae6924071a 100644 --- a/wicket/src/ui/main.rs +++ b/wicket/src/ui/main.rs @@ -64,7 +64,7 @@ impl MainScreen { terminal: &mut Term, ) -> anyhow::Result<()> { terminal.draw(|frame| { - let mut rect = frame.size(); + let mut rect = frame.area(); rect.height -= 1; let statusbar_rect = Rect { @@ -85,7 +85,7 @@ impl MainScreen { // Draw all the components, starting with the background let background = Block::default().style(style::background()); - frame.render_widget(background, frame.size()); + frame.render_widget(background, frame.area()); self.sidebar.draw(state, frame, chunks[0], self.sidebar.active); self.draw_pane(state, frame, chunks[1]); self.draw_statusbar(state, frame, statusbar_rect); diff --git a/wicket/src/ui/splash.rs b/wicket/src/ui/splash.rs index 9da9fa8648..18d7b37f08 100644 --- a/wicket/src/ui/splash.rs +++ b/wicket/src/ui/splash.rs @@ -33,7 +33,7 @@ impl SplashScreen { fn draw_background(&self, f: &mut Frame) { let block = Block::default().style(style::background()); - f.render_widget(block, f.size()); + f.render_widget(block, f.area()); } // Sweep left to right, painting the banner white, with @@ -41,7 +41,7 @@ impl SplashScreen { fn animate_logo(&self, f: &mut Frame) { // Center the banner let rect = f - .size() + .area() .center_horizontally(LOGO_WIDTH) .center_vertically(LOGO_HEIGHT); diff --git a/wicket/src/ui/widgets/animated_logo.rs b/wicket/src/ui/widgets/animated_logo.rs index cae6487fa8..f8919bad1d 100644 --- a/wicket/src/ui/widgets/animated_logo.rs +++ b/wicket/src/ui/widgets/animated_logo.rs @@ -67,7 +67,7 @@ impl<'a> Widget for Logo<'a> { for (x, c) in line.chars().enumerate() { if c == '#' { let cell = buf - .get_mut(x as u16 + area.left(), y as u16 + area.top()) + [(x as u16 + area.left(), y as u16 + area.top())] .set_symbol(" "); if x < paint_point { // The cell is highlighted diff --git a/wicket/src/ui/widgets/box_connector.rs b/wicket/src/ui/widgets/box_connector.rs index af4630773c..71af4d383e 100644 --- a/wicket/src/ui/widgets/box_connector.rs +++ b/wicket/src/ui/widgets/box_connector.rs @@ -31,15 +31,15 @@ impl Widget for BoxConnector { if self.kind == BoxConnectorKind::Top || self.kind == BoxConnectorKind::Both { - buf.get_mut(rect.x, rect.y - 1).set_symbol("├"); - buf.get_mut(rect.x + rect.width - 1, rect.y - 1).set_symbol("┤"); + buf[(rect.x, rect.y - 1)].set_symbol("├"); + buf[(rect.x + rect.width - 1, rect.y - 1)].set_symbol("┤"); } if self.kind == BoxConnectorKind::Bottom || self.kind == 
BoxConnectorKind::Both { - buf.get_mut(rect.x, rect.y + rect.height).set_symbol("├"); - buf.get_mut(rect.x + rect.width - 1, rect.y + rect.height) + buf[(rect.x, rect.y + rect.height)].set_symbol("├"); + buf[(rect.x + rect.width - 1, rect.y + rect.height)] .set_symbol("┤"); } } diff --git a/wicket/src/ui/widgets/rack.rs b/wicket/src/ui/widgets/rack.rs index 42ebf39d02..b393b63528 100644 --- a/wicket/src/ui/widgets/rack.rs +++ b/wicket/src/ui/widgets/rack.rs @@ -65,7 +65,7 @@ impl<'a> Rack<'a> { // TODO: Draw 10 only? - That may not scale down as well for x in inner.left()..inner.right() { for y in inner.top()..inner.bottom() { - let cell = buf.get_mut(x, y).set_symbol("▕"); + let cell = buf[(x, y)].set_symbol("▕"); if self.state.selected == component_id { if let Some(KnightRiderMode { count }) = self.state.knight_rider_mode @@ -118,7 +118,7 @@ impl<'a> Rack<'a> { if presence == ComponentPresence::Present { for x in inner.left()..inner.right() { for y in inner.top()..inner.bottom() { - buf.get_mut(x, y).set_symbol("❒"); + buf[(x, y)].set_symbol("❒"); } } } @@ -156,7 +156,7 @@ impl<'a> Rack<'a> { for x in inner.left() + border..inner.right() - border { for y in inner.top()..inner.bottom() { if x % step != 0 { - buf.get_mut(x, y).set_symbol("█"); + buf[(x, y)].set_symbol("█"); } } } diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index be786db56b..d90e6ccb9f 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -38,7 +38,6 @@ console = { version = "0.15.8" } const-oid = { version = "0.9.6", default-features = false, features = ["db", "std"] } crossbeam-epoch = { version = "0.9.18" } crossbeam-utils = { version = "0.8.19" } -crossterm = { version = "0.27.0", features = ["event-stream", "serde"] } crypto-common = { version = "0.1.6", default-features = false, features = ["getrandom", "std"] } der = { version = "0.7.9", default-features = false, features = ["derive", "flagset", "oid", "pem", "std"] } digest = { version = "0.10.7", features = ["mac", "oid", "std"] } @@ -142,7 +141,6 @@ console = { version = "0.15.8" } const-oid = { version = "0.9.6", default-features = false, features = ["db", "std"] } crossbeam-epoch = { version = "0.9.18" } crossbeam-utils = { version = "0.8.19" } -crossterm = { version = "0.27.0", features = ["event-stream", "serde"] } crypto-common = { version = "0.1.6", default-features = false, features = ["getrandom", "std"] } der = { version = "0.7.9", default-features = false, features = ["derive", "flagset", "oid", "pem", "std"] } digest = { version = "0.10.7", features = ["mac", "oid", "std"] } @@ -232,40 +230,47 @@ dof = { version = "0.3.0", default-features = false, features = ["des"] } linux-raw-sys = { version = "0.4.13", default-features = false, features = ["elf", "errno", "general", "ioctl", "no_std", "std", "system"] } mio = { version = "0.8.11", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.34", features = ["fs", "system", "termios"] } +rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } +signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.x86_64-unknown-linux-gnu.build-dependencies] dof = { version = "0.3.0", default-features = false, features = ["des"] } linux-raw-sys = { version = "0.4.13", default-features = false, features = ["elf", "errno", "general", "ioctl", "no_std", "std", "system"] } mio = { version = "0.8.11", features = ["net", "os-ext"] } once_cell = { version = 
"1.19.0" } -rustix = { version = "0.38.34", features = ["fs", "system", "termios"] } +rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } +signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.x86_64-apple-darwin.dependencies] mio = { version = "0.8.11", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.34", features = ["fs", "system", "termios"] } +rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } +signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.x86_64-apple-darwin.build-dependencies] mio = { version = "0.8.11", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.34", features = ["fs", "system", "termios"] } +rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } +signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.aarch64-apple-darwin.dependencies] mio = { version = "0.8.11", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.34", features = ["fs", "system", "termios"] } +rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } +signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.aarch64-apple-darwin.build-dependencies] mio = { version = "0.8.11", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.34", features = ["fs", "system", "termios"] } +rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } +signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.x86_64-unknown-illumos.dependencies] dof = { version = "0.3.0", default-features = false, features = ["des"] } mio = { version = "0.8.11", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.34", features = ["fs", "system", "termios"] } +rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } +signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } toml_datetime = { version = "0.6.8", default-features = false, features = ["serde"] } toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", features = ["serde"] } @@ -273,7 +278,8 @@ toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", featu dof = { version = "0.3.0", default-features = false, features = ["des"] } mio = { version = "0.8.11", features = ["net", "os-ext"] } once_cell = { version = "1.19.0" } -rustix = { version = "0.38.34", features = ["fs", "system", "termios"] } +rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } +signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } toml_datetime = { version = "0.6.8", default-features = false, features = ["serde"] } toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", features = ["serde"] } From ed13d7efd25b923303c10eca3b60fbefd5277d75 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Sun, 11 Aug 2024 04:34:29 +0000 Subject: [PATCH 04/91] Update taiki-e/install-action 
digest to 8efaa9b (#6284) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [taiki-e/install-action](https://togithub.com/taiki-e/install-action) | action | digest | [`ada21a8` -> `8efaa9b`](https://togithub.com/taiki-e/install-action/compare/ada21a8...8efaa9b) | --- ### Configuration 📅 **Schedule**: Branch creation - "after 8pm,before 6am" in timezone America/Los_Angeles, Automerge - "after 8pm,before 6am" in timezone America/Los_Angeles. 🚦 **Automerge**: Enabled. ♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [Renovate Bot](https://togithub.com/renovatebot/renovate). Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- .github/workflows/hakari.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index 21125cf034..7de7cb0ee1 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -24,7 +24,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@ada21a86dcbd8480ccdd77e11e167f51a002fb3e # v2 + uses: taiki-e/install-action@8efaa9bb37d22aefc9d331dfbd45e2d230acfc33 # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date From a7275314471fab510d2b5fd308a8ea696cd5e8a6 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Sat, 10 Aug 2024 23:51:06 -0700 Subject: [PATCH 05/91] Update Rust crate clap to v4.5.15 (#6286) --- Cargo.lock | 8 ++++---- workspace-hack/Cargo.toml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b874f5a705..b79091878b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1068,9 +1068,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.13" +version = "4.5.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fbb260a053428790f3de475e304ff84cdbc4face759ea7a3e64c1edd938a7fc" +checksum = "11d8838454fda655dafd3accb2b6e2bea645b9e4078abe84a22ceb947235c5cc" dependencies = [ "clap_builder", "clap_derive", @@ -1078,9 +1078,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.13" +version = "4.5.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64b17d7ea74e9f833c7dbf2cbe4fb12ff26783eda4782a8975b72f895c9b4d99" +checksum = "216aec2b177652e3846684cbfe25c9964d18ec45234f0f5da5157b207ed1aab6" dependencies = [ "anstream", "anstyle", diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index d90e6ccb9f..1774dd7b5c 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -32,8 +32,8 @@ byteorder = { version = "1.5.0" } bytes = { version = "1.7.1", features = ["serde"] } chrono = { version = "0.4.38", features = ["serde"] } cipher = { version = "0.4.4", default-features = false, features = ["block-padding", "zeroize"] } -clap = { version = "4.5.13", features = ["cargo", "derive", "env", "wrap_help"] } -clap_builder = { version = "4.5.13", default-features = false, features = ["cargo", "color", "env", "std", "suggestions", "usage", "wrap_help"] } +clap = { version = "4.5.15", features = ["cargo", "derive", "env", 
"wrap_help"] } +clap_builder = { version = "4.5.15", default-features = false, features = ["cargo", "color", "env", "std", "suggestions", "usage", "wrap_help"] } console = { version = "0.15.8" } const-oid = { version = "0.9.6", default-features = false, features = ["db", "std"] } crossbeam-epoch = { version = "0.9.18" } @@ -135,8 +135,8 @@ byteorder = { version = "1.5.0" } bytes = { version = "1.7.1", features = ["serde"] } chrono = { version = "0.4.38", features = ["serde"] } cipher = { version = "0.4.4", default-features = false, features = ["block-padding", "zeroize"] } -clap = { version = "4.5.13", features = ["cargo", "derive", "env", "wrap_help"] } -clap_builder = { version = "4.5.13", default-features = false, features = ["cargo", "color", "env", "std", "suggestions", "usage", "wrap_help"] } +clap = { version = "4.5.15", features = ["cargo", "derive", "env", "wrap_help"] } +clap_builder = { version = "4.5.15", default-features = false, features = ["cargo", "color", "env", "std", "suggestions", "usage", "wrap_help"] } console = { version = "0.15.8" } const-oid = { version = "0.9.6", default-features = false, features = ["db", "std"] } crossbeam-epoch = { version = "0.9.18" } From 75d74a90024921ef6f29dbcfe74bbc9c97d36c45 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Sat, 10 Aug 2024 23:51:38 -0700 Subject: [PATCH 06/91] Update Rust crate filetime to 0.2.24 (#6287) --- Cargo.lock | 82 ++++++++++++++++++++++++++++++------------------------ Cargo.toml | 2 +- 2 files changed, 47 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b79091878b..1c348a8803 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1014,7 +1014,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-targets 0.52.5", + "windows-targets 0.52.6", ] [[package]] @@ -2541,14 +2541,14 @@ checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" [[package]] name = "filetime" -version = "0.2.23" +version = "0.2.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ee447700ac8aa0b2f2bd7bc4462ad686ba06baa6727ac149a2d6277f0d240fd" +checksum = "bf401df4a4e3872c4fe8151134cf483738e74b67fc934d6532c882b3d24a4550" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.4.1", - "windows-sys 0.52.0", + "libredox", + "windows-sys 0.59.0", ] [[package]] @@ -2951,7 +2951,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc3655aa6818d65bc620d6911f05aa7b6aeb596291e1e9f79e52df85583d1e30" dependencies = [ "rustix", - "windows-targets 0.52.5", + "windows-targets 0.52.6", ] [[package]] @@ -4187,7 +4187,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" dependencies = [ "cfg-if", - "windows-targets 0.52.5", + "windows-targets 0.48.5", ] [[package]] @@ -4260,6 +4260,7 @@ checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" dependencies = [ "bitflags 2.6.0", "libc", + "redox_syscall 0.5.1", ] [[package]] @@ -6910,7 +6911,7 @@ dependencies = [ "libc", "redox_syscall 0.5.1", "smallvec 1.13.2", - "windows-targets 0.52.5", + "windows-targets 0.52.6", ] [[package]] @@ -11543,7 +11544,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" dependencies = [ "windows-core", - "windows-targets 0.52.5", + "windows-targets 0.52.6", ] [[package]] @@ -11552,7 +11553,7 @@ 
version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.52.5", + "windows-targets 0.52.6", ] [[package]] @@ -11570,7 +11571,16 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.5", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", ] [[package]] @@ -11590,18 +11600,18 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm 0.52.5", - "windows_aarch64_msvc 0.52.5", - "windows_i686_gnu 0.52.5", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", "windows_i686_gnullvm", - "windows_i686_msvc 0.52.5", - "windows_x86_64_gnu 0.52.5", - "windows_x86_64_gnullvm 0.52.5", - "windows_x86_64_msvc 0.52.5", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] @@ -11612,9 +11622,9 @@ checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" @@ -11624,9 +11634,9 @@ checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" @@ -11636,15 +11646,15 @@ checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] name = "windows_i686_gnullvm" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" @@ -11654,9 +11664,9 @@ checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" @@ -11666,9 +11676,9 @@ checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" @@ -11678,9 +11688,9 @@ checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" @@ -11690,9 +11700,9 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" diff --git a/Cargo.toml b/Cargo.toml index 32f11e456f..47542f2be2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -326,7 +326,7 @@ dyn-clone = "1.0.17" either = "1.13.0" expectorate = "1.1.0" fatfs = "0.3.6" -filetime = "0.2.23" +filetime = "0.2.24" flate2 = "1.0.31" float-ord = "0.3.2" flume = "0.11.0" From d3257b9d8d48fa94ed11020598a723644aec9f05 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Sun, 11 Aug 2024 08:10:57 +0000 Subject: [PATCH 07/91] Update Rust crate assert_cmd to 2.0.16 (#6285) --- Cargo.lock | 5 +++-- Cargo.toml | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1c348a8803..5b575be25a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -234,13 +234,14 @@ dependencies = [ [[package]] name = "assert_cmd" -version = "2.0.15" +version = "2.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc65048dd435533bb1baf2ed9956b9a278fbfdcf90301b39ee117f06c0199d37" +checksum = "dc1835b7f27878de8525dc71410b5a31cdcc5f230aed5ba5df968e09c201b23d" dependencies = [ "anstyle", "bstr 1.9.1", "doc-comment", + "libc", "predicates", "predicates-core", "predicates-tree", diff --git a/Cargo.toml b/Cargo.toml index 47542f2be2..c8ba9490b2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -267,7 +267,7 @@ anstyle = "1.0.8" api_identity = { path = "api_identity" } approx = "0.5.1" assert_matches = "1.5.0" -assert_cmd = "2.0.15" +assert_cmd = "2.0.16" async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", rev = "ed7ab5ef0513ba303d33efd41d3e9e381169d59b" } async-trait = "0.1.81" atomicwrites = "0.4.3" From 4f27433d1bca57eb02073a4ea1cd14557f70b8c7 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Mon, 12 Aug 2024 11:10:43 -0400 Subject: [PATCH 08/91] Return a 409 instead of 500 when grabbing lock (#6280) Return a `409 Conflict` instead of a `500 Internal Error` when grabbing a volume repair lock that's already grabbed. 
500s were seen trying to run the snapshot antagonist from omicron-stress, and a 409 makes more sense in this context. --- .../src/db/datastore/volume_repair.rs | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/volume_repair.rs b/nexus/db-queries/src/db/datastore/volume_repair.rs index 5230e60e3e..c4e9f8b090 100644 --- a/nexus/db-queries/src/db/datastore/volume_repair.rs +++ b/nexus/db-queries/src/db/datastore/volume_repair.rs @@ -13,6 +13,7 @@ use crate::db::error::ErrorHandler; use crate::db::model::VolumeRepair; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; +use diesel::result::DatabaseErrorKind; use diesel::result::Error as DieselError; use omicron_common::api::external::Error; use uuid::Uuid; @@ -39,7 +40,18 @@ impl DataStore { .execute_async(&*conn) .await .map(|_| ()) - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| match e { + DieselError::DatabaseError( + DatabaseErrorKind::UniqueViolation, + ref error_information, + ) if error_information.constraint_name() + == Some("volume_repair_pkey") => + { + Error::conflict("volume repair lock") + } + + _ => public_error_from_diesel(e, ErrorHandler::Server), + }) } pub(super) fn volume_repair_delete_query( @@ -83,3 +95,35 @@ impl DataStore { .await } } + +#[cfg(test)] +mod test { + use super::*; + + use crate::db::datastore::test_utils::datastore_test; + use nexus_test_utils::db::test_setup_database; + use omicron_test_utils::dev; + + #[tokio::test] + async fn volume_lock_conflict_error_returned() { + let logctx = dev::test_setup_log("volume_lock_conflict_error_returned"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + let lock_1 = Uuid::new_v4(); + let lock_2 = Uuid::new_v4(); + let volume_id = Uuid::new_v4(); + + datastore.volume_repair_lock(&opctx, volume_id, lock_1).await.unwrap(); + + let err = datastore + .volume_repair_lock(&opctx, volume_id, lock_2) + .await + .unwrap_err(); + + assert!(matches!(err, Error::Conflict { .. })); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } +} From de3d777ac897860acad5d348b2a127b046b72fd0 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Mon, 12 Aug 2024 12:52:20 -0400 Subject: [PATCH 09/91] [#5333 2/6] Region snapshot replacement omdb cmds (#6279) This commit adds some commands to omdb related to region snapshot replacement: Usage: omdb db region-snapshot-replacement [OPTIONS] Commands: list List region snapshot replacement requests status Show current region snapshot replacements and their status info Show detailed information for a region snapshot replacement request Manually request a region snapshot replacement `list` will list all region snapshot replacement requests, along with request time and state `status` will show a summary of all non-complete region snapshot replacements, along with their state and progress. `info` will show details for a single region snapshot replacement. Finally, `request` will request that a region snapshot be replaced, and return the ID of the replacement. 
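For illustration, a minimal sketch of the intended workflow (the UUID shown is a placeholder, and `request` is gated behind omdb's destructive-operation check via `check_allow_destructive`):

    # file a replacement request for one region snapshot
    omdb db region-snapshot-replacement request <DATASET_ID> <REGION_ID> <SNAPSHOT_ID>
    region snapshot replacement 45a4b2b6-... created

    # follow it along; completed requests drop out of `status`
    omdb db region-snapshot-replacement status
    omdb db region-snapshot-replacement info 45a4b2b6-...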
--- dev-tools/omdb/src/bin/omdb/db.rs | 326 ++++++++++++++++++++++++++ dev-tools/omdb/tests/usage_errors.out | 72 +++--- 2 files changed, 364 insertions(+), 34 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 2fddda0dbb..1030e4288b 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -72,6 +72,8 @@ use nexus_db_model::RegionReplacementState; use nexus_db_model::RegionReplacementStep; use nexus_db_model::RegionReplacementStepType; use nexus_db_model::RegionSnapshot; +use nexus_db_model::RegionSnapshotReplacement; +use nexus_db_model::RegionSnapshotReplacementState; use nexus_db_model::Sled; use nexus_db_model::Snapshot; use nexus_db_model::SnapshotState; @@ -302,6 +304,9 @@ enum DbCommands { /// Query for information about region replacements, optionally manually /// triggering one. RegionReplacement(RegionReplacementArgs), + /// Query for information about region snapshot replacements, optionally + /// manually triggering one. + RegionSnapshotReplacement(RegionSnapshotReplacementArgs), /// Print information about sleds Sleds(SledsArgs), /// Print information about customer instances @@ -659,6 +664,53 @@ struct SnapshotInfoArgs { uuid: Uuid, } +#[derive(Debug, Args)] +struct RegionSnapshotReplacementArgs { + #[command(subcommand)] + command: RegionSnapshotReplacementCommands, +} + +#[derive(Debug, Subcommand)] +enum RegionSnapshotReplacementCommands { + /// List region snapshot replacement requests + List(RegionSnapshotReplacementListArgs), + /// Show current region snapshot replacements and their status + Status, + /// Show detailed information for a region snapshot replacement + Info(RegionSnapshotReplacementInfoArgs), + /// Manually request a region snapshot replacement + Request(RegionSnapshotReplacementRequestArgs), +} + +#[derive(Debug, Args)] +struct RegionSnapshotReplacementListArgs { + /// Only show region snapshot replacement requests in this state + #[clap(long)] + state: Option, + + /// Only show region snapshot replacement requests after a certain date + #[clap(long)] + after: Option>, +} + +#[derive(Debug, Args)] +struct RegionSnapshotReplacementInfoArgs { + /// The UUID of the region snapshot replacement request + replacement_id: Uuid, +} + +#[derive(Debug, Args)] +struct RegionSnapshotReplacementRequestArgs { + /// The dataset id for a given region snapshot + dataset_id: Uuid, + + /// The region id for a given region snapshot + region_id: Uuid, + + /// The snapshot id for a given region snapshot + snapshot_id: Uuid, +} + #[derive(Debug, Args)] struct ValidateArgs { #[command(subcommand)] @@ -859,6 +911,51 @@ impl DbArgs { DbCommands::Snapshots(SnapshotArgs { command: SnapshotCommands::List, }) => cmd_db_snapshot_list(&datastore, &self.fetch_opts).await, + DbCommands::RegionSnapshotReplacement( + RegionSnapshotReplacementArgs { + command: RegionSnapshotReplacementCommands::List(args), + }, + ) => { + cmd_db_region_snapshot_replacement_list( + &datastore, + &self.fetch_opts, + args, + ) + .await + } + DbCommands::RegionSnapshotReplacement( + RegionSnapshotReplacementArgs { + command: RegionSnapshotReplacementCommands::Status, + }, + ) => { + cmd_db_region_snapshot_replacement_status( + &opctx, + &datastore, + &self.fetch_opts, + ) + .await + } + DbCommands::RegionSnapshotReplacement( + RegionSnapshotReplacementArgs { + command: RegionSnapshotReplacementCommands::Info(args), + }, + ) => { + cmd_db_region_snapshot_replacement_info( + &opctx, &datastore, args, + ) + .await + } + 
DbCommands::RegionSnapshotReplacement( + RegionSnapshotReplacementArgs { + command: RegionSnapshotReplacementCommands::Request(args), + }, + ) => { + let token = omdb.check_allow_destructive()?; + cmd_db_region_snapshot_replacement_request( + &opctx, &datastore, args, token, + ) + .await + } DbCommands::Validate(ValidateArgs { command: ValidateCommands::ValidateVolumeReferences, }) => cmd_db_validate_volume_references(&datastore).await, @@ -3390,6 +3487,235 @@ async fn cmd_db_network_list_vnics( Ok(()) } +// REGION SNAPSHOT REPLACEMENTS + +/// List all region snapshot replacement requests +async fn cmd_db_region_snapshot_replacement_list( + datastore: &DataStore, + fetch_opts: &DbFetchOptions, + args: &RegionSnapshotReplacementListArgs, +) -> Result<(), anyhow::Error> { + let ctx = || "listing region snapshot replacement requests".to_string(); + let limit = fetch_opts.fetch_limit; + + let requests: Vec = { + let conn = datastore.pool_connection_for_tests().await?; + + use db::schema::region_snapshot_replacement::dsl; + + match (args.state, args.after) { + (Some(state), Some(after)) => { + dsl::region_snapshot_replacement + .filter(dsl::replacement_state.eq(state)) + .filter(dsl::request_time.gt(after)) + .limit(i64::from(u32::from(limit))) + .select(RegionSnapshotReplacement::as_select()) + .get_results_async(&*conn) + .await? + } + + (Some(state), None) => { + dsl::region_snapshot_replacement + .filter(dsl::replacement_state.eq(state)) + .limit(i64::from(u32::from(limit))) + .select(RegionSnapshotReplacement::as_select()) + .get_results_async(&*conn) + .await? + } + + (None, Some(after)) => { + dsl::region_snapshot_replacement + .filter(dsl::request_time.gt(after)) + .limit(i64::from(u32::from(limit))) + .select(RegionSnapshotReplacement::as_select()) + .get_results_async(&*conn) + .await? + } + + (None, None) => { + dsl::region_snapshot_replacement + .limit(i64::from(u32::from(limit))) + .select(RegionSnapshotReplacement::as_select()) + .get_results_async(&*conn) + .await? + } + } + }; + + check_limit(&requests, limit, ctx); + + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct Row { + pub id: Uuid, + pub request_time: DateTime, + pub replacement_state: String, + } + + let mut rows = Vec::with_capacity(requests.len()); + + for request in requests { + rows.push(Row { + id: request.id, + request_time: request.request_time, + replacement_state: format!("{:?}", request.replacement_state), + }); + } + + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .with(tabled::settings::Panel::header( + "Region snapshot replacement requests", + )) + .to_string(); + + println!("{}", table); + + Ok(()) +} + +/// Display all non-complete region snapshot replacements +async fn cmd_db_region_snapshot_replacement_status( + opctx: &OpContext, + datastore: &DataStore, + fetch_opts: &DbFetchOptions, +) -> Result<(), anyhow::Error> { + let ctx = || "listing region snapshot replacement requests".to_string(); + let limit = fetch_opts.fetch_limit; + + let requests: Vec = { + let conn = datastore.pool_connection_for_tests().await?; + + use db::schema::region_snapshot_replacement::dsl; + + dsl::region_snapshot_replacement + .filter( + dsl::replacement_state + .ne(RegionSnapshotReplacementState::Complete), + ) + .limit(i64::from(u32::from(limit))) + .select(RegionSnapshotReplacement::as_select()) + .get_results_async(&*conn) + .await? 
+ }; + + check_limit(&requests, limit, ctx); + + for request in requests { + let steps_left = datastore + .in_progress_region_snapshot_replacement_steps(opctx, request.id) + .await?; + + println!("{}:", request.id); + println!(); + + println!(" started: {}", request.request_time); + println!( + " state: {:?}", + request.replacement_state + ); + println!( + " region snapshot: {} {} {}", + request.old_dataset_id, + request.old_region_id, + request.old_snapshot_id, + ); + println!(" new region id: {:?}", request.new_region_id); + println!(" in-progress steps left: {:?}", steps_left); + println!(); + } + + Ok(()) +} + +/// Show details for a single region snapshot replacement +async fn cmd_db_region_snapshot_replacement_info( + opctx: &OpContext, + datastore: &DataStore, + args: &RegionSnapshotReplacementInfoArgs, +) -> Result<(), anyhow::Error> { + let request = datastore + .get_region_snapshot_replacement_request_by_id( + opctx, + args.replacement_id, + ) + .await?; + + // Show details + let steps_left = datastore + .in_progress_region_snapshot_replacement_steps(opctx, request.id) + .await?; + + println!("{}:", request.id); + println!(); + + println!(" started: {}", request.request_time); + println!(" state: {:?}", request.replacement_state); + println!( + " region snapshot: {} {} {}", + request.old_dataset_id, request.old_region_id, request.old_snapshot_id, + ); + println!(" new region id: {:?}", request.new_region_id); + println!(" in-progress steps left: {:?}", steps_left); + println!(); + + Ok(()) +} + +/// Manually request a region snapshot replacement +async fn cmd_db_region_snapshot_replacement_request( + opctx: &OpContext, + datastore: &DataStore, + args: &RegionSnapshotReplacementRequestArgs, + _destruction_token: DestructiveOperationToken, +) -> Result<(), anyhow::Error> { + let Some(region_snapshot) = datastore + .region_snapshot_get(args.dataset_id, args.region_id, args.snapshot_id) + .await? + else { + bail!("region snapshot not found!"); + }; + + let request = + RegionSnapshotReplacement::for_region_snapshot(®ion_snapshot); + let request_id = request.id; + + // If this function indirectly uses + // `insert_region_snapshot_replacement_request`, there could be an authz + // related `ObjectNotFound` due to the opctx being for the privileged test + // user. Lookup the snapshot here, and directly use + // `insert_snapshot_replacement_request_with_volume_id` instead. + + let db_snapshots = { + use db::schema::snapshot::dsl; + let conn = datastore.pool_connection_for_tests().await?; + dsl::snapshot + .filter(dsl::id.eq(args.snapshot_id)) + .limit(1) + .select(Snapshot::as_select()) + .load_async(&*conn) + .await + .context("loading requested snapshot")? 
+ }; + + assert_eq!(db_snapshots.len(), 1); + + datastore + .insert_region_snapshot_replacement_request_with_volume_id( + opctx, + request, + db_snapshots[0].volume_id, + ) + .await?; + + println!("region snapshot replacement {request_id} created"); + + Ok(()) +} + +// VALIDATION + /// Validate the `volume_references` column of the region snapshots table async fn cmd_db_validate_volume_references( datastore: &DataStore, diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index d4ea5450a6..ca70412d84 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -104,23 +104,25 @@ Query the control plane database (CockroachDB) Usage: omdb db [OPTIONS] Commands: - rack Print information about the rack - disks Print information about virtual disks - dns Print information about internal and external DNS - inventory Print information about collected hardware/software inventory - physical-disks Print information about physical disks - reconfigurator-save Save the current Reconfigurator inputs to a file - region Print information about regions - region-replacement Query for information about region replacements, optionally manually - triggering one - sleds Print information about sleds - instances Print information about customer instances - network Print information about the network - migrations Print information about migrations - snapshots Print information about snapshots - validate Validate the contents of the database - volumes Print information about volumes - help Print this message or the help of the given subcommand(s) + rack Print information about the rack + disks Print information about virtual disks + dns Print information about internal and external DNS + inventory Print information about collected hardware/software inventory + physical-disks Print information about physical disks + reconfigurator-save Save the current Reconfigurator inputs to a file + region Print information about regions + region-replacement Query for information about region replacements, optionally manually + triggering one + region-snapshot-replacement Query for information about region snapshot replacements, optionally + manually triggering one + sleds Print information about sleds + instances Print information about customer instances + network Print information about the network + migrations Print information about migrations + snapshots Print information about snapshots + validate Validate the contents of the database + volumes Print information about volumes + help Print this message or the help of the given subcommand(s) Options: --log-level log level filter [env: LOG_LEVEL=] [default: warn] @@ -148,23 +150,25 @@ Query the control plane database (CockroachDB) Usage: omdb db [OPTIONS] Commands: - rack Print information about the rack - disks Print information about virtual disks - dns Print information about internal and external DNS - inventory Print information about collected hardware/software inventory - physical-disks Print information about physical disks - reconfigurator-save Save the current Reconfigurator inputs to a file - region Print information about regions - region-replacement Query for information about region replacements, optionally manually - triggering one - sleds Print information about sleds - instances Print information about customer instances - network Print information about the network - migrations Print information about migrations - snapshots Print information about snapshots - validate Validate the 
contents of the database - volumes Print information about volumes - help Print this message or the help of the given subcommand(s) + rack Print information about the rack + disks Print information about virtual disks + dns Print information about internal and external DNS + inventory Print information about collected hardware/software inventory + physical-disks Print information about physical disks + reconfigurator-save Save the current Reconfigurator inputs to a file + region Print information about regions + region-replacement Query for information about region replacements, optionally manually + triggering one + region-snapshot-replacement Query for information about region snapshot replacements, optionally + manually triggering one + sleds Print information about sleds + instances Print information about customer instances + network Print information about the network + migrations Print information about migrations + snapshots Print information about snapshots + validate Validate the contents of the database + volumes Print information about volumes + help Print this message or the help of the given subcommand(s) Options: --log-level log level filter [env: LOG_LEVEL=] [default: warn] From 1d791143a7aee45ae97d05a7434b8e4512a38042 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Mon, 12 Aug 2024 13:53:43 -0400 Subject: [PATCH 10/91] psc-v1.0.22 and gimlet-v1.0.23 (#6292) --- tools/permslip_staging | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/permslip_staging b/tools/permslip_staging index 6f5f925eb0..2d0603c3d8 100644 --- a/tools/permslip_staging +++ b/tools/permslip_staging @@ -1,5 +1,5 @@ -c28eaa13638f55100a42916727227242ee02d18cebecb1412d6af5c8aa945b99 manifest-gimlet-v1.0.22.toml +34cf117633f82cc8f665dc3b6c78dc2aff61ca87d2b2687290605080265dda30 manifest-gimlet-v1.0.23.toml 201ff5580bb4b0b01419d7c5e580af9926103e2b6d3024e6b49cee6fab415519 manifest-oxide-rot-1-v1.0.12.toml -6d53bfbfdd6baa3fc150153a003abfac6d4b46c34f61fa7a8ec2af8af19a7d5a manifest-psc-v1.0.21.toml +db995edfe91959df3cb20ea8156f75b9dcff5ec5e77f98a28766617a8ed2e0c5 manifest-psc-v1.0.22.toml 26b6096a377edb3d7da50b1b499af104e6195bc7c7c6eb1b2751b32434d7ac9e manifest-sidecar-v1.0.23.toml c0fecaefac7674138337f3bd4ce4ce5b884053dead5ec27b575701471631ea2f manifest-bootleby-v1.3.0.toml From afc564e7cfbfa15bac6a14e69b83469629afae3a Mon Sep 17 00:00:00 2001 From: Rain Date: Mon, 12 Aug 2024 11:14:03 -0700 Subject: [PATCH 11/91] [sled-agent] remove NexusClientWithResolver (#6283) Previously, we would carry around a DNS resolver -- however, the resolver has been unused for a while. And in fact there's a warning about being careful around using it. Because the type was public, the rustc dead code detector didn't figure that out until my change to switch to an API trait in #6159 exposed this. Remove `NexusClientWithResolver`, adding a couple of free functions to make a `NexusClient` instead. 
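To illustrate the shape of this change: the sketch below contrasts the old and new call sites, using only names that appear in the diff that follows (`NexusClientWithResolver`, `make_nexus_client`); the surrounding server-startup code is elided, so this is illustrative rather than a complete excerpt.

```rust
// Before: a fallible constructor on a wrapper type that also carried the
// (unused) DNS resolver, requiring callers to go through `.client()`.
let nexus_client = NexusClientWithResolver::new(&log, resolver)
    .map_err(|e| e.to_string())?;

// After: an infallible free function that returns the progenitor-generated
// client directly.
let nexus_client = make_nexus_client(&log, resolver);
```
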
--- sled-agent/src/instance.rs | 27 +++++----- sled-agent/src/instance_manager.rs | 8 +-- sled-agent/src/nexus.rs | 80 ++++++++++-------------------- sled-agent/src/probe_manager.rs | 8 ++- sled-agent/src/server.rs | 5 +- sled-agent/src/sled_agent.rs | 12 ++--- sled-agent/src/updates.rs | 2 +- 7 files changed, 53 insertions(+), 89 deletions(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 631f2b83f6..5dc4e1e6a2 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -12,7 +12,7 @@ use crate::instance_manager::{ Error as ManagerError, InstanceManagerServices, InstanceTicket, }; use crate::metrics::MetricsRequestQueue; -use crate::nexus::NexusClientWithResolver; +use crate::nexus::NexusClient; use crate::params::ZoneBundleMetadata; use crate::params::{InstanceExternalIpBody, ZoneBundleCause}; use crate::params::{ @@ -349,7 +349,7 @@ struct InstanceRunner { running_state: Option, // Connection to Nexus - nexus_client: NexusClientWithResolver, + nexus_client: NexusClient, // Storage resources storage: StorageHandle, @@ -528,7 +528,6 @@ impl InstanceRunner { ); self.nexus_client - .client() .cpapi_instances_put( &self.id().into_untyped_uuid(), &state.into(), @@ -1568,6 +1567,7 @@ mod tests { use super::*; use crate::fakes::nexus::{FakeNexusServer, ServerContext}; use crate::metrics; + use crate::nexus::make_nexus_client_with_port; use crate::vmm_reservoir::VmmReservoirManagerHandle; use crate::zone_bundle::CleanupContext; use camino_tempfile::Utf8TempDir; @@ -1634,7 +1634,7 @@ mod tests { } struct FakeNexusParts { - nexus_client: NexusClientWithResolver, + nexus_client: NexusClient, _nexus_server: HttpServer, state_rx: Receiver, _dns_server: TransientServer, @@ -1662,12 +1662,11 @@ mod tests { .unwrap(), ); - let nexus_client = - NexusClientWithResolver::new_from_resolver_with_port( - &log, - resolver, - _nexus_server.local_addr().port(), - ); + let nexus_client = make_nexus_client_with_port( + &log, + resolver, + _nexus_server.local_addr().port(), + ); Self { nexus_client, _nexus_server, state_rx, _dns_server } } @@ -1760,7 +1759,7 @@ mod tests { async fn instance_struct( log: &Logger, propolis_addr: SocketAddr, - nexus_client_with_resolver: NexusClientWithResolver, + nexus_client: NexusClient, storage_handle: StorageHandle, temp_dir: &String, ) -> (Instance, MetricsRx) { @@ -1774,7 +1773,7 @@ mod tests { let (services, rx) = fake_instance_manager_services( log, storage_handle, - nexus_client_with_resolver, + nexus_client, temp_dir, ); @@ -1850,7 +1849,7 @@ mod tests { fn fake_instance_manager_services( log: &Logger, storage_handle: StorageHandle, - nexus_client_with_resolver: NexusClientWithResolver, + nexus_client: NexusClient, temp_dir: &String, ) -> (InstanceManagerServices, MetricsRx) { let vnic_allocator = @@ -1869,7 +1868,7 @@ mod tests { let (metrics_queue, rx) = MetricsRequestQueue::for_test(); let services = InstanceManagerServices { - nexus_client: nexus_client_with_resolver, + nexus_client, vnic_allocator, port_manager, storage: storage_handle, diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 1b2fb204d0..fe070464ad 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -7,7 +7,7 @@ use crate::instance::propolis_zone_name; use crate::instance::Instance; use crate::metrics::MetricsRequestQueue; -use crate::nexus::NexusClientWithResolver; +use crate::nexus::NexusClient; use crate::params::InstanceExternalIpBody; use crate::params::InstanceMetadata; use 
crate::params::ZoneBundleMetadata; @@ -74,7 +74,7 @@ pub enum Error { } pub(crate) struct InstanceManagerServices { - pub nexus_client: NexusClientWithResolver, + pub nexus_client: NexusClient, pub vnic_allocator: VnicAllocator, pub port_manager: PortManager, pub storage: StorageHandle, @@ -103,7 +103,7 @@ impl InstanceManager { #[allow(clippy::too_many_arguments)] pub fn new( log: Logger, - nexus_client: NexusClientWithResolver, + nexus_client: NexusClient, etherstub: Etherstub, port_manager: PortManager, storage: StorageHandle, @@ -422,7 +422,7 @@ struct InstanceManagerRunner { terminate_tx: mpsc::UnboundedSender, terminate_rx: mpsc::UnboundedReceiver, - nexus_client: NexusClientWithResolver, + nexus_client: NexusClient, // TODO: If we held an object representing an enum of "Created OR Running" // instance, we could avoid the methods within "instance.rs" that panic diff --git a/sled-agent/src/nexus.rs b/sled-agent/src/nexus.rs index 9565e34b1f..26b0e3de59 100644 --- a/sled-agent/src/nexus.rs +++ b/sled-agent/src/nexus.rs @@ -2,12 +2,11 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -pub use nexus_client::Client as NexusClient; use omicron_common::api::external::Generation; use omicron_common::disk::DiskVariant; use crate::vmm_reservoir::VmmReservoirManagerHandle; -use internal_dns::resolver::{ResolveError, Resolver}; +use internal_dns::resolver::Resolver; use internal_dns::ServiceName; use nexus_client::types::SledAgentInfo; use omicron_common::address::NEXUS_INTERNAL_PORT; @@ -19,62 +18,33 @@ use tokio::sync::{broadcast, mpsc, oneshot, Notify}; use tokio::time::{interval, Duration, MissedTickBehavior}; use uuid::Uuid; -/// A thin wrapper over a progenitor-generated NexusClient. -/// -/// Also attaches the "DNS resolver" for historical reasons. -#[derive(Clone)] -pub struct NexusClientWithResolver { - client: NexusClient, +// Re-export the nexus_client::Client crate. (Use a type alias to be more +// rust-analyzer friendly.) +pub(crate) type NexusClient = nexus_client::Client; + +pub(crate) fn make_nexus_client( + log: &Logger, resolver: Arc, +) -> NexusClient { + make_nexus_client_with_port(log, resolver, NEXUS_INTERNAL_PORT) } -impl NexusClientWithResolver { - pub fn new( - log: &Logger, - resolver: Arc, - ) -> Result { - Ok(Self::new_from_resolver_with_port( - log, - resolver, - NEXUS_INTERNAL_PORT, - )) - } - - pub fn new_from_resolver_with_port( - log: &Logger, - resolver: Arc, - port: u16, - ) -> Self { - let client = reqwest::ClientBuilder::new() - .dns_resolver(resolver.clone()) - .build() - .expect("Failed to build client"); - - let dns_name = ServiceName::Nexus.srv_name(); - Self { - client: NexusClient::new_with_client( - &format!("http://{dns_name}:{port}"), - client, - log.new(o!("component" => "NexusClient")), - ), - resolver, - } - } - - /// Access the progenitor-based Nexus Client. - pub fn client(&self) -> &NexusClient { - &self.client - } - - /// Access the DNS resolver used by the Nexus Client. - /// - /// WARNING: If you're using this resolver to access an IP address of - /// another service, be aware that it might change if that service moves - /// around! Be cautious when accessing and persisting IP addresses of other - /// services. 
- pub fn resolver(&self) -> &Arc { - &self.resolver - } +pub(crate) fn make_nexus_client_with_port( + log: &Logger, + resolver: Arc, + port: u16, +) -> NexusClient { + let client = reqwest::ClientBuilder::new() + .dns_resolver(resolver) + .build() + .expect("Failed to build client"); + + let dns_name = ServiceName::Nexus.srv_name(); + NexusClient::new_with_client( + &format!("http://{dns_name}:{port}"), + client, + log.new(o!("component" => "NexusClient")), + ) } pub fn d2n_params( diff --git a/sled-agent/src/probe_manager.rs b/sled-agent/src/probe_manager.rs index fa2e9dfa3d..529ef392b7 100644 --- a/sled-agent/src/probe_manager.rs +++ b/sled-agent/src/probe_manager.rs @@ -1,5 +1,5 @@ use crate::metrics::MetricsRequestQueue; -use crate::nexus::NexusClientWithResolver; +use crate::nexus::NexusClient; use anyhow::{anyhow, Result}; use illumos_utils::dladm::Etherstub; use illumos_utils::link::VnicAllocator; @@ -54,7 +54,7 @@ struct RunningProbes { pub(crate) struct ProbeManagerInner { join_handle: Mutex>>, - nexus_client: NexusClientWithResolver, + nexus_client: NexusClient, log: Logger, sled_id: Uuid, vnic_allocator: VnicAllocator, @@ -67,7 +67,7 @@ pub(crate) struct ProbeManagerInner { impl ProbeManager { pub(crate) fn new( sled_id: Uuid, - nexus_client: NexusClientWithResolver, + nexus_client: NexusClient, etherstub: Etherstub, storage: StorageHandle, port_manager: PortManager, @@ -248,7 +248,6 @@ impl ProbeManagerInner { if n_added > 0 { if let Err(e) = self .nexus_client - .client() .bgtask_activate(&BackgroundTasksActivateRequest { bgtask_names: vec!["vpc_route_manager".into()], }) @@ -439,7 +438,6 @@ impl ProbeManagerInner { async fn target_state(self: &Arc) -> Result> { Ok(self .nexus_client - .client() .probes_get( &self.sled_id, None, //limit diff --git a/sled-agent/src/server.rs b/sled-agent/src/server.rs index ec86066096..43fb64914f 100644 --- a/sled-agent/src/server.rs +++ b/sled-agent/src/server.rs @@ -9,7 +9,7 @@ use super::http_entrypoints::api as http_api; use super::sled_agent::SledAgent; use crate::bootstrap::params::StartSledAgentRequest; use crate::long_running_tasks::LongRunningTaskHandles; -use crate::nexus::NexusClientWithResolver; +use crate::nexus::make_nexus_client; use crate::services::ServiceManager; use internal_dns::resolver::Resolver; use slog::Logger; @@ -52,8 +52,7 @@ impl Server { .map_err(|e| e.to_string())?, ); - let nexus_client = NexusClientWithResolver::new(&log, resolver) - .map_err(|e| e.to_string())?; + let nexus_client = make_nexus_client(&log, resolver); let sled_agent = SledAgent::new( &config, diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index f8454a0f7b..d87df0d7c5 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -13,8 +13,7 @@ use crate::instance_manager::InstanceManager; use crate::long_running_tasks::LongRunningTaskHandles; use crate::metrics::MetricsManager; use crate::nexus::{ - NexusClientWithResolver, NexusNotifierHandle, NexusNotifierInput, - NexusNotifierTask, + NexusClient, NexusNotifierHandle, NexusNotifierInput, NexusNotifierTask, }; use crate::params::{ DiskStateRequested, InstanceExternalIpBody, InstanceHardware, @@ -320,7 +319,7 @@ struct SledAgentInner { services: ServiceManager, // Connection to Nexus. 
- nexus_client: NexusClientWithResolver, + nexus_client: NexusClient, // A mechanism for notifiying nexus about sled-agent updates nexus_notifier: NexusNotifierHandle, @@ -365,7 +364,7 @@ impl SledAgent { pub async fn new( config: &Config, log: Logger, - nexus_client: NexusClientWithResolver, + nexus_client: NexusClient, request: StartSledAgentRequest, services: ServiceManager, long_running_task_handles: LongRunningTaskHandles, @@ -552,7 +551,7 @@ impl SledAgent { let nexus_notifier_input = NexusNotifierInput { sled_id: request.body.id, sled_address: get_sled_address(request.body.subnet), - nexus_client: nexus_client.client().clone(), + nexus_client: nexus_client.clone(), hardware: long_running_task_handles.hardware_manager.clone(), vmm_reservoir_manager: vmm_reservoir_manager.clone(), }; @@ -688,7 +687,6 @@ impl SledAgent { self.inner .nexus_client - .client() .sled_firewall_rules_request(&sled_id) .await .map_err(|err| Error::FirewallRequest(err))?; @@ -1074,7 +1072,7 @@ impl SledAgent { ) -> Result<(), Error> { self.inner .updates - .download_artifact(artifact, &self.inner.nexus_client.client()) + .download_artifact(artifact, &self.inner.nexus_client) .await?; Ok(()) } diff --git a/sled-agent/src/updates.rs b/sled-agent/src/updates.rs index 9193a855b0..a928abe9b3 100644 --- a/sled-agent/src/updates.rs +++ b/sled-agent/src/updates.rs @@ -252,8 +252,8 @@ impl UpdateManager { mod test { use super::*; use crate::fakes::nexus::FakeNexusServer; + use crate::nexus::NexusClient; use flate2::write::GzEncoder; - use nexus_client::Client as NexusClient; use omicron_common::api::external::{Error, SemverVersion}; use omicron_common::api::internal::nexus::UpdateArtifactId; use omicron_test_utils::dev::test_setup_log; From f483f9de84d8230bb595222bdde609f6de461187 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Mon, 12 Aug 2024 11:14:21 -0700 Subject: [PATCH 12/91] Update Rust crate serde to v1.0.206 (#6290) --- Cargo.lock | 8 ++++---- workspace-hack/Cargo.toml | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5b575be25a..a40e71259b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8795,9 +8795,9 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.205" +version = "1.0.206" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e33aedb1a7135da52b7c21791455563facbbcc43d0f0f66165b42c21b3dfb150" +checksum = "5b3e4cd94123dd520a128bcd11e34d9e9e423e7e3e50425cb1b4b1e3549d0284" dependencies = [ "serde_derive", ] @@ -8833,9 +8833,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.205" +version = "1.0.206" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "692d6f5ac90220161d6774db30c662202721e64aed9058d2c394f451261420c1" +checksum = "fabfb6138d2383ea8208cf98ccf69cdfb1aff4088460681d84189aa259762f97" dependencies = [ "proc-macro2", "quote", diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 1774dd7b5c..470cd1d621 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -91,7 +91,7 @@ ring = { version = "0.17.8", features = ["std"] } schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } scopeguard = { version = "1.2.0" } semver = { version = "1.0.23", features = ["serde"] } -serde = { version = "1.0.205", features = ["alloc", "derive", "rc"] } +serde = { version = "1.0.206", features = ["alloc", "derive", "rc"] } serde_json = { version = "1.0.122", features = 
["raw_value", "unbounded_depth"] } sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } @@ -194,7 +194,7 @@ ring = { version = "0.17.8", features = ["std"] } schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } scopeguard = { version = "1.2.0" } semver = { version = "1.0.23", features = ["serde"] } -serde = { version = "1.0.205", features = ["alloc", "derive", "rc"] } +serde = { version = "1.0.206", features = ["alloc", "derive", "rc"] } serde_json = { version = "1.0.122", features = ["raw_value", "unbounded_depth"] } sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } From 90d4b2ab98fe3041a95ba8a9504f5568904e4d69 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Mon, 12 Aug 2024 11:14:42 -0700 Subject: [PATCH 13/91] Update Rust crate reedline to 0.33.0 (#6289) --- Cargo.lock | 6 +++--- Cargo.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a40e71259b..eae64d1728 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4188,7 +4188,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -8012,9 +8012,9 @@ dependencies = [ [[package]] name = "reedline" -version = "0.31.0" +version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65ebc241ed0ccea0bbbd775a55a76f0dd9971ef084589dea938751a03ffedc14" +checksum = "2f8c676a3f3814a23c6a0fc9dff6b6c35b2e04df8134aae6f3929cc34de21a53" dependencies = [ "chrono", "crossterm 0.27.0", diff --git a/Cargo.toml b/Cargo.toml index c8ba9490b2..326a7a285e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -480,7 +480,7 @@ rand_seeder = "0.3.0" ratatui = "0.28.0" rayon = "1.10" rcgen = "0.12.1" -reedline = "0.31.0" +reedline = "0.33.0" ref-cast = "1.0" regex = "1.10.6" regress = "0.9.1" From a193490ca7418099e2d8f342aac77d414d1c160f Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Mon, 12 Aug 2024 16:19:04 -0400 Subject: [PATCH 14/91] Oxide rot 1 v1.0.13 (#6295) --- tools/permslip_production | 2 +- tools/permslip_staging | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/permslip_production b/tools/permslip_production index 5e9b76f980..4f82e4d6ed 100644 --- a/tools/permslip_production +++ b/tools/permslip_production @@ -1,2 +1,2 @@ -905d38cb8298c72ecac5cf31f792919fbcd69a4ad656c40e53b3ce2d80140111 manifest-oxide-rot-1-v1.0.12.toml +55336a274d0f100d5ef51cb653ec285b651eaba139c35a533e300e6d7d46032c manifest-oxide-rot-1-v1.0.13.toml 74e754e68705cf6fed4152a92bc1ee9667d1d98a21fc12993a2232dbe34bfccb manifest-bootleby-v1.3.0.toml diff --git a/tools/permslip_staging b/tools/permslip_staging index 2d0603c3d8..d886cc4246 100644 --- a/tools/permslip_staging +++ b/tools/permslip_staging @@ -1,5 +1,5 @@ 34cf117633f82cc8f665dc3b6c78dc2aff61ca87d2b2687290605080265dda30 manifest-gimlet-v1.0.23.toml -201ff5580bb4b0b01419d7c5e580af9926103e2b6d3024e6b49cee6fab415519 manifest-oxide-rot-1-v1.0.12.toml +85553dd164933a9b9e4f22409abd1190b1d632d192b5f7527129acaa778a671a manifest-oxide-rot-1-v1.0.13.toml db995edfe91959df3cb20ea8156f75b9dcff5ec5e77f98a28766617a8ed2e0c5 manifest-psc-v1.0.22.toml 26b6096a377edb3d7da50b1b499af104e6195bc7c7c6eb1b2751b32434d7ac9e 
manifest-sidecar-v1.0.23.toml c0fecaefac7674138337f3bd4ce4ce5b884053dead5ec27b575701471631ea2f manifest-bootleby-v1.3.0.toml From 9c04ddc5246d2102adb5db233a43023fd94be2d4 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Mon, 12 Aug 2024 13:49:59 -0700 Subject: [PATCH 15/91] Update Rust crate syn to v2.0.74 (#6288) --- Cargo.lock | 154 +++++++++++++++++++------------------- Cargo.toml | 2 +- workspace-hack/Cargo.toml | 4 +- 3 files changed, 80 insertions(+), 80 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index eae64d1728..74a7405e57 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -166,7 +166,7 @@ dependencies = [ "omicron-workspace-hack", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -274,7 +274,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -296,7 +296,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -307,7 +307,7 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -360,7 +360,7 @@ dependencies = [ "quote", "serde", "serde_tokenstream", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -518,7 +518,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.72", + "syn 2.0.74", "which", ] @@ -551,7 +551,7 @@ checksum = "1657dce144574f921af10a92876a96f0ca05dd830900598d21d91c8e4cf78f74" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1099,7 +1099,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1643,7 +1643,7 @@ checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1667,7 +1667,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1678,7 +1678,7 @@ checksum = "733cabb43482b1a1b53eee8583c2b9e8684d592215ea83efd305dd31bc2f0178" dependencies = [ "darling_core", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1733,7 +1733,7 @@ dependencies = [ "quote", "serde", "serde_tokenstream", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1777,7 +1777,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1810,7 +1810,7 @@ checksum = "5fe87ce4529967e0ba1dcf8450bab64d97dfd5010a6256187ffe2e43e6f0e049" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1831,7 +1831,7 @@ checksum = "62d671cc41a825ebabc75757b62d3d168c577f9149b2d49ece1dad1f72119d25" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1852,7 +1852,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1862,7 +1862,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b" dependencies = [ "derive_builder_core", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1875,7 +1875,7 @@ dependencies = [ "proc-macro2", "quote", "rustc_version 0.4.0", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1940,7 +1940,7 @@ dependencies = [ 
"dsl_auto_type", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1949,7 +1949,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25" dependencies = [ - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -2219,7 +2219,7 @@ dependencies = [ "quote", "serde", "serde_tokenstream", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -2233,7 +2233,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -2640,7 +2640,7 @@ checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -2752,7 +2752,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3750,7 +3750,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b23a0c8dfe501baac4adf6ebbfa6eddf8f0c07f56b058cc1288017e32397846c" dependencies = [ "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -4062,7 +4062,7 @@ version = "0.1.0" source = "git+https://github.com/oxidecomputer/opte?rev=3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d#3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d" dependencies = [ "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -4583,7 +4583,7 @@ dependencies = [ "cfg-if", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -4964,7 +4964,7 @@ dependencies = [ "omicron-workspace-hack", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -5210,7 +5210,7 @@ version = "0.1.0" dependencies = [ "omicron-workspace-hack", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -5381,7 +5381,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -6258,7 +6258,7 @@ dependencies = [ "string_cache", "subtle", "syn 1.0.109", - "syn 2.0.72", + "syn 2.0.74", "time", "time-macros", "tokio", @@ -6406,7 +6406,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -6557,7 +6557,7 @@ dependencies = [ "oximeter-macro-impl", "oximeter-timeseries-macro", "prettyplease", - "syn 2.0.72", + "syn 2.0.74", "toml 0.8.19", "uuid", ] @@ -6706,7 +6706,7 @@ dependencies = [ "serde_json", "slog-error-chain", "strum", - "syn 2.0.72", + "syn 2.0.74", "thiserror", "toml 0.8.19", "trybuild", @@ -6745,7 +6745,7 @@ dependencies = [ "omicron-workspace-hack", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -6781,7 +6781,7 @@ dependencies = [ "oximeter-impl", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -6937,7 +6937,7 @@ dependencies = [ "regex", "regex-syntax 0.8.4", "structmeta 0.3.0", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -7105,7 +7105,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -7175,7 +7175,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -7439,7 +7439,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f12335488a2f3b0a83b14edad48dca9879ce89b2edd10e80237e4e852dd645e" dependencies 
= [ "proc-macro2", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -7535,7 +7535,7 @@ dependencies = [ "schemars", "serde", "serde_json", - "syn 2.0.72", + "syn 2.0.74", "thiserror", "typify", "unicode-ident", @@ -7555,7 +7555,7 @@ dependencies = [ "serde_json", "serde_tokenstream", "serde_yaml", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -8047,7 +8047,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -8301,7 +8301,7 @@ dependencies = [ "regex", "relative-path", "rustc_version 0.4.0", - "syn 2.0.72", + "syn 2.0.74", "unicode-ident", ] @@ -8693,7 +8693,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -8719,7 +8719,7 @@ checksum = "7f81c2fde025af7e69b1d1420531c8a8811ca898919db177141a85313b1cb932" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -8839,7 +8839,7 @@ checksum = "fabfb6138d2383ea8208cf98ccf69cdfb1aff4088460681d84189aa259762f97" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -8850,7 +8850,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -8901,7 +8901,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -8922,7 +8922,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -8964,7 +8964,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9334,7 +9334,7 @@ source = "git+https://github.com/oxidecomputer/slog-error-chain?branch=main#15f6 dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9461,7 +9461,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9559,7 +9559,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9645,7 +9645,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive 0.2.0", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9657,7 +9657,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive 0.3.0", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9668,7 +9668,7 @@ checksum = "a60bcaff7397072dca0017d1db428e30d5002e00b6847703e2e42005c95fbe00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9679,7 +9679,7 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9714,7 +9714,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9727,7 +9727,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9774,9 +9774,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.72" +version = "2.0.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc4b9b9bf2add8093d3f2c0204471e951b2285580335de42f9d2534f3ae7a8af" +checksum = "1fceb41e3d546d0bd83421d3409b1460cc7444cd389341a4c880fe7a042cb3d7" dependencies = [ "proc-macro2", "quote", 
@@ -9950,7 +9950,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta 0.2.0", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9981,7 +9981,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -10118,7 +10118,7 @@ checksum = "8d9ef545650e79f30233c0003bcc2504d7efac6dad25fca40744de773fe2049c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -10189,7 +10189,7 @@ checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -10466,7 +10466,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -10743,7 +10743,7 @@ dependencies = [ "semver 1.0.23", "serde", "serde_json", - "syn 2.0.72", + "syn 2.0.74", "thiserror", "unicode-ident", ] @@ -10760,7 +10760,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.72", + "syn 2.0.74", "typify-impl", ] @@ -10977,7 +10977,7 @@ dependencies = [ "proc-macro2", "quote", "serde_tokenstream", - "syn 2.0.72", + "syn 2.0.74", "usdt-impl", ] @@ -10995,7 +10995,7 @@ dependencies = [ "quote", "serde", "serde_json", - "syn 2.0.72", + "syn 2.0.74", "thiserror", "thread-id", "version_check", @@ -11011,7 +11011,7 @@ dependencies = [ "proc-macro2", "quote", "serde_tokenstream", - "syn 2.0.72", + "syn 2.0.74", "usdt-impl", ] @@ -11190,7 +11190,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", "wasm-bindgen-shared", ] @@ -11224,7 +11224,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -11848,7 +11848,7 @@ checksum = "125139de3f6b9d625c39e2efdd73d41bdac468ccd556556440e322be0e1bbd91" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -11859,7 +11859,7 @@ checksum = "15e934569e47891f7d9411f1a451d947a60e000ab3bd24fbb970f000387d1b3b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -11879,7 +11879,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 326a7a285e..cf78ea4a79 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -267,7 +267,7 @@ anstyle = "1.0.8" api_identity = { path = "api_identity" } approx = "0.5.1" assert_matches = "1.5.0" -assert_cmd = "2.0.16" +assert_cmd = "2.0.15" async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", rev = "ed7ab5ef0513ba303d33efd41d3e9e381169d59b" } async-trait = "0.1.81" atomicwrites = "0.4.3" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 470cd1d621..f5562a279f 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -100,7 +100,7 @@ smallvec = { version = "1.13.2", default-features = false, features = ["const_ne spin = { version = "0.9.8" } string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } -syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.72", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } +syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.74", features = ["extra-traits", "fold", 
"full", "visit", "visit-mut"] } time = { version = "0.3.36", features = ["formatting", "local-offset", "macros", "parsing"] } tokio = { version = "1.38.1", features = ["full", "test-util"] } tokio-postgres = { version = "0.7.11", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } @@ -204,7 +204,7 @@ spin = { version = "0.9.8" } string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } syn-dff4ba8e3ae991db = { package = "syn", version = "1.0.109", features = ["extra-traits", "fold", "full", "visit"] } -syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.72", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } +syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.74", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time = { version = "0.3.36", features = ["formatting", "local-offset", "macros", "parsing"] } time-macros = { version = "0.2.18", default-features = false, features = ["formatting", "parsing"] } tokio = { version = "1.38.1", features = ["full", "test-util"] } From 914f5fd7d51f9b060dcc0382a30b607e25df49b2 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 12 Aug 2024 14:49:41 -0700 Subject: [PATCH 16/91] add internal demo saga (#6281) --- clients/nexus-client/src/lib.rs | 1 + dev-tools/omdb/src/bin/omdb/nexus.rs | 147 ++++++++++++++++ dev-tools/omdb/tests/env.out | 16 ++ dev-tools/omdb/tests/successes.out | 35 ++++ dev-tools/omdb/tests/test_all_output.rs | 23 +++ dev-tools/omdb/tests/usage_errors.out | 38 ++++ docs/demo-saga.adoc | 195 +++++++++++++++++++++ nexus/internal-api/src/lib.rs | 36 +++- nexus/src/app/mod.rs | 27 ++- nexus/src/app/saga.rs | 24 ++- nexus/src/app/sagas/demo.rs | 135 ++++++++++++++ nexus/src/app/sagas/mod.rs | 2 + nexus/src/internal_api/http_entrypoints.rs | 35 ++++ nexus/tests/integration_tests/demo_saga.rs | 74 ++++++++ nexus/tests/integration_tests/mod.rs | 1 + nexus/types/src/internal_api/views.rs | 8 + openapi/nexus-internal.json | 74 ++++++++ uuid-kinds/src/lib.rs | 1 + 18 files changed, 864 insertions(+), 8 deletions(-) create mode 100644 docs/demo-saga.adoc create mode 100644 nexus/src/app/sagas/demo.rs create mode 100644 nexus/tests/integration_tests/demo_saga.rs diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index b7722144fe..62366c45e1 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -42,6 +42,7 @@ progenitor::generate_api!( OmicronPhysicalDisksConfig = nexus_types::disk::OmicronPhysicalDisksConfig, RecoverySiloConfig = nexus_sled_agent_shared::recovery_silo::RecoverySiloConfig, TypedUuidForCollectionKind = omicron_uuid_kinds::CollectionUuid, + TypedUuidForDemoSagaKind = omicron_uuid_kinds::DemoSagaUuid, TypedUuidForDownstairsKind = omicron_uuid_kinds::TypedUuid, TypedUuidForPropolisKind = omicron_uuid_kinds::TypedUuid, TypedUuidForSledKind = omicron_uuid_kinds::TypedUuid, diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index ec3e519cbc..9aae6b2205 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -25,6 +25,7 @@ use nexus_client::types::BackgroundTasksActivateRequest; use nexus_client::types::CurrentStatus; use nexus_client::types::LastResult; use nexus_client::types::PhysicalDiskPath; +use nexus_client::types::SagaState; use nexus_client::types::SledSelector; use nexus_client::types::UninitializedSledId; use nexus_db_queries::db::lookup::LookupPath; @@ -34,6 +35,7 @@ use 
nexus_types::internal_api::background::LookupRegionPortStatus; use nexus_types::internal_api::background::RegionReplacementDriverStatus; use nexus_types::inventory::BaseboardId; use omicron_uuid_kinds::CollectionUuid; +use omicron_uuid_kinds::DemoSagaUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; @@ -71,6 +73,8 @@ enum NexusCommands { BackgroundTasks(BackgroundTasksArgs), /// interact with blueprints Blueprints(BlueprintsArgs), + /// view sagas, create and complete demo sagas + Sagas(SagasArgs), /// interact with sleds Sleds(SledsArgs), } @@ -244,6 +248,36 @@ struct BlueprintImportArgs { input: Utf8PathBuf, } +#[derive(Debug, Args)] +struct SagasArgs { + #[command(subcommand)] + command: SagasCommands, +} + +#[derive(Debug, Subcommand)] +enum SagasCommands { + /// List sagas run by this Nexus + /// + /// Note that this is reporting in-memory state about sagas run by *this* + /// Nexus instance. You'll get different answers if you ask different Nexus + /// instances. + List, + + /// Create a "demo" saga + /// + /// This saga will wait until it's explicitly completed using the + /// "demo-complete" subcommand. + DemoCreate, + + /// Complete a demo saga started with "demo-create". + DemoComplete(DemoSagaIdArgs), +} + +#[derive(Debug, Args)] +struct DemoSagaIdArgs { + demo_saga_id: DemoSagaUuid, +} + #[derive(Debug, Args)] struct SledsArgs { #[command(subcommand)] @@ -402,6 +436,34 @@ impl NexusArgs { cmd_nexus_blueprints_import(&client, token, args).await } + NexusCommands::Sagas(SagasArgs { command }) => { + if self.nexus_internal_url.is_none() { + eprintln!( + "{}", + textwrap::wrap( + "WARNING: A Nexus instance was selected from DNS \ + because a specific one was not specified. But \ + the `omdb nexus sagas` commands usually only make \ + sense when targeting a specific Nexus instance.", + 80 + ) + .join("\n") + ); + } + match command { + SagasCommands::List => cmd_nexus_sagas_list(&client).await, + SagasCommands::DemoCreate => { + let token = omdb.check_allow_destructive()?; + cmd_nexus_sagas_demo_create(&client, token).await + } + SagasCommands::DemoComplete(args) => { + let token = omdb.check_allow_destructive()?; + cmd_nexus_sagas_demo_complete(&client, args, token) + .await + } + } + } + NexusCommands::Sleds(SledsArgs { command: SledsCommands::ListUninitialized, }) => cmd_nexus_sleds_list_uninitialized(&client).await, @@ -1626,6 +1688,91 @@ async fn cmd_nexus_blueprints_import( Ok(()) } +/// Runs `omdb nexus sagas list` +async fn cmd_nexus_sagas_list( + client: &nexus_client::Client, +) -> Result<(), anyhow::Error> { + // We don't want users to confuse this with a general way to list all sagas. + // Such a command would read database state and it would go under "omdb db". + eprintln!( + "{}", + textwrap::wrap( + "NOTE: This command only reads in-memory state from the targeted \ + Nexus instance. 
Sagas may be missing if they were run by a \ + different Nexus instance or if they finished before this Nexus \ + instance last started up.", + 80 + ) + .join("\n") + ); + + let saga_stream = client.saga_list_stream(None, None); + let sagas = + saga_stream.try_collect::>().await.context("listing sagas")?; + + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct SagaRow { + saga_id: Uuid, + state: &'static str, + } + let rows = sagas.into_iter().map(|saga| SagaRow { + saga_id: saga.id, + state: match saga.state { + SagaState::Running => "running", + SagaState::Succeeded => "succeeded", + SagaState::Failed { .. } => "failed", + SagaState::Stuck { .. } => "stuck", + }, + }); + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + println!("{}", table); + Ok(()) +} + +/// Runs `omdb nexus sagas demo-create` +async fn cmd_nexus_sagas_demo_create( + client: &nexus_client::Client, + _destruction_token: DestructiveOperationToken, +) -> Result<(), anyhow::Error> { + let demo_saga = + client.saga_demo_create().await.context("creating demo saga")?; + println!("saga id: {}", demo_saga.saga_id); + println!( + "demo saga id: {} (use this with `demo-complete`)", + demo_saga.demo_saga_id, + ); + Ok(()) +} + +/// Runs `omdb nexus sagas demo-complete` +async fn cmd_nexus_sagas_demo_complete( + client: &nexus_client::Client, + args: &DemoSagaIdArgs, + _destruction_token: DestructiveOperationToken, +) -> Result<(), anyhow::Error> { + if let Err(error) = client + .saga_demo_complete(&args.demo_saga_id) + .await + .context("completing demo saga") + { + eprintln!("error: {:#}", error); + eprintln!( + "note: `demo-complete` must be run against the same Nexus \ + instance that is currently running that saga." + ); + eprintln!( + "note: Be sure that you're using the demo_saga_id, not the saga_id." + ); + Err(error) + } else { + Ok(()) + } +} + /// Runs `omdb nexus sleds list-uninitialized` async fn cmd_nexus_sleds_list_uninitialized( client: &nexus_client::Client, diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 67f113a801..5755df9488 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -472,6 +472,22 @@ note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=d note: database schema version matches expected () note: listing all commissioned sleds (use -F to filter, e.g. -F in-service) ============================================= +EXECUTING COMMAND: omdb ["nexus", "sagas", "list"] +termination: Exited(0) +--------------------------------------------- +stdout: +SAGA_ID STATE +--------------------------------------------- +stderr: +note: Nexus URL not specified. Will pick one from DNS. +note: using Nexus URL http://[::ffff:127.0.0.1]:REDACTED_PORT +WARNING: A Nexus instance was selected from DNS because a specific one was not +specified. But the `omdb nexus sagas` commands usually only make sense when +targeting a specific Nexus instance. +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. 
+============================================= EXECUTING COMMAND: omdb ["oximeter", "--oximeter-url", "junk", "list-producers"] termination: Exited(1) --------------------------------------------- diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index d4c07899f4..66f07cb2f0 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -625,6 +625,41 @@ warning: unknown background task: "vpc_route_manager" (don't know how to interpr stderr: note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ ============================================= +EXECUTING COMMAND: omdb ["nexus", "sagas", "list"] +termination: Exited(0) +--------------------------------------------- +stdout: +SAGA_ID STATE +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +============================================= +EXECUTING COMMAND: omdb ["--destructive", "nexus", "sagas", "demo-create"] +termination: Exited(0) +--------------------------------------------- +stdout: +saga id: ..................... +demo saga id: ..................... (use this with `demo-complete`) +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +============================================= +EXECUTING COMMAND: omdb ["nexus", "sagas", "list"] +termination: Exited(0) +--------------------------------------------- +stdout: +SAGA_ID STATE +..................... running +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +============================================= EXECUTING COMMAND: omdb ["--destructive", "nexus", "background-tasks", "activate", "inventory_collection"] termination: Exited(0) --------------------------------------------- diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index 6a959d726a..d0258aeaed 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -81,6 +81,16 @@ async fn test_omdb_usage_errors() { &["nexus"], &["nexus", "background-tasks"], &["nexus", "blueprints"], + &["nexus", "sagas"], + // Missing "--destructive" flag. The URL is bogus but just ensures that + // we get far enough to hit the error we care about. 
+ &[ + "nexus", + "--nexus-internal-url", + "http://[::1]:111", + "sagas", + "demo-create", + ], &["nexus", "sleds"], &["sled-agent"], &["sled-agent", "zones"], @@ -134,6 +144,9 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { &["mgs", "inventory"], &["nexus", "background-tasks", "doc"], &["nexus", "background-tasks", "show"], + &["nexus", "sagas", "list"], + &["--destructive", "nexus", "sagas", "demo-create"], + &["nexus", "sagas", "list"], &[ "--destructive", "nexus", @@ -326,6 +339,16 @@ async fn test_omdb_env_settings(cptestctx: &ControlPlaneTestContext) { let args = &["--dns-server", &dns_sockaddr.to_string(), "db", "sleds"]; do_run(&mut output, move |exec| exec, &cmd_path, args).await; + // That said, the "sagas" command prints an extra warning in this case. + let args = &["nexus", "sagas", "list"]; + do_run( + &mut output, + move |exec| exec.env("OMDB_DNS_SERVER", &dns_sockaddr.to_string()), + &cmd_path, + args, + ) + .await; + // Case: specified in multiple places (command-line argument wins) let args = &["oximeter", "--oximeter-url", "junk", "list-producers"]; let ox = ox_url.clone(); diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index ca70412d84..1ee07410bf 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -446,6 +446,7 @@ Usage: omdb nexus [OPTIONS] Commands: background-tasks print information about background tasks blueprints interact with blueprints + sagas view sagas, create and complete demo sagas sleds interact with sleds help Print this message or the help of the given subcommand(s) @@ -522,6 +523,43 @@ Connection Options: Safety Options: -w, --destructive Allow potentially-destructive subcommands ============================================= +EXECUTING COMMAND: omdb ["nexus", "sagas"] +termination: Exited(2) +--------------------------------------------- +stdout: +--------------------------------------------- +stderr: +view sagas, create and complete demo sagas + +Usage: omdb nexus sagas [OPTIONS] + +Commands: + list List sagas run by this Nexus + demo-create Create a "demo" saga + demo-complete Complete a demo saga started with "demo-create" + help Print this message or the help of the given subcommand(s) + +Options: + --log-level log level filter [env: LOG_LEVEL=] [default: warn] + -h, --help Print help + +Connection Options: + --nexus-internal-url URL of the Nexus internal API [env: + OMDB_NEXUS_URL=] + --dns-server [env: OMDB_DNS_SERVER=] + +Safety Options: + -w, --destructive Allow potentially-destructive subcommands +============================================= +EXECUTING COMMAND: omdb ["nexus", "--nexus-internal-url", "http://[::1]:111", "sagas", "demo-create"] +termination: Exited(1) +--------------------------------------------- +stdout: +--------------------------------------------- +stderr: +note: using Nexus URL http://[::1]:111 +Error: This command is potentially destructive. Pass the `-w` / `--destructive` flag to allow it. 
+============================================= EXECUTING COMMAND: omdb ["nexus", "sleds"] termination: Exited(2) --------------------------------------------- diff --git a/docs/demo-saga.adoc b/docs/demo-saga.adoc new file mode 100644 index 0000000000..316050fc23 --- /dev/null +++ b/docs/demo-saga.adoc @@ -0,0 +1,195 @@ +:showtitle: +:numbered: +:toc: left + += Demo saga + +Nexus ships with a "demo" saga that can be used to interactively experiment with sagas, saga recovery, and saga transfer (after Nexus zone expungement). The demo saga consists of a single action that blocks until it's instructed to proceed. You instruct it to proceed using a request to the Nexus _internal_ API. + +In the example below, we'll: + +. Use `omicron-dev run-all` to run a simulated control plane stack +. Start a second Nexus whose execution we can control precisely +. Use the `omdb nexus sagas demo-create` command to kick off a demo saga +. Use the `omdb nexus sagas demo-complete` command to instruct that saga to finish + +For steps 1-2, we're just following the https://github.com/oxidecomputer/omicron/blob/main/docs/how-to-run-simulated.adoc#using-both-omicron-dev-run-all-and-running-nexus-manually[docs for running a simulated stack and a second Nexus]. First, run `omicron-dev run-all`: + +```terminal +$ cargo xtask omicron-dev run-all +... +omicron-dev: setting up all services ... +log file: /dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.0.log +note: configured to log to "/dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.0.log" +DB URL: postgresql://root@[::1]:43428/omicron?sslmode=disable +DB address: [::1]:43428 +log file: /dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.2.log +note: configured to log to "/dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.2.log" +log file: /dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.3.log +note: configured to log to "/dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.3.log" +omicron-dev: services are running. +omicron-dev: nexus external API: 127.0.0.1:12220 +omicron-dev: nexus internal API: [::1]:12221 +omicron-dev: cockroachdb pid: 7166 +omicron-dev: cockroachdb URL: postgresql://root@[::1]:43428/omicron?sslmode=disable +omicron-dev: cockroachdb directory: /dangerzone/omicron_tmp/.tmpkzPi6h +omicron-dev: internal DNS HTTP: http://[::1]:55952 +omicron-dev: internal DNS: [::1]:36474 +omicron-dev: external DNS name: oxide-dev.test +omicron-dev: external DNS HTTP: http://[::1]:64396 +omicron-dev: external DNS: [::1]:35977 +omicron-dev: e.g. `dig @::1 -p 35977 test-suite-silo.sys.oxide-dev.test` +omicron-dev: management gateway: http://[::1]:33325 (switch0) +omicron-dev: management gateway: http://[::1]:61144 (switch1) +omicron-dev: silo name: test-suite-silo +omicron-dev: privileged user name: test-privileged +``` + +Then follow those docs to configure and start a second Nexus: + +```terminal +$ cargo run --bin=nexus -- config-second.toml +... +Aug 12 20:16:25.405 INFO listening, local_addr: [::1]:12223, component: dropshot_internal, name: a4ef738a-1fb0-47b1-9da2-4919c7ec7c7f, file: /home/dap/.cargo/git/checkouts/dropshot-a4a923d29dccc492/52d900a/dropshot/src/server.rs:205 +... 
+``` + +The rest of these instructions will use `omdb` pointed at the second Nexus instance, so we'll set OMDB_NEXUS_URL in the environment: + +```terminal +$ export OMDB_NEXUS_URL=http://[::1]:12223 +``` + +Now we can use `omdb nexus sagas list` to list the sagas that have run _in that second Nexus process_ only: + +```terminal +$ cargo run --bin=omdb -- nexus sagas list +... +note: using Nexus URL http://[::1]:12223 +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +SAGA_ID STATE +``` + +Now we can create a demo saga: + +```terminal +$ cargo run --bin=omdb -- --destructive nexus sagas demo-create +... +note: using Nexus URL http://[::1]:12223 +saga id: f7765d6a-6e45-4c13-8904-2677b79a97eb +demo saga id: 88eddf09-dda3-4d70-8d99-1d3b441c57da (use this with `demo-complete`) +``` + +We have to use the `--destructive` option because this command by nature changes state in Nexus and `omdb` won't allow commands that change state by default. + +We can see the new saga in the list of sagas now. It's running: + +```terminal +$ cargo run --bin=omdb -- nexus sagas list +... +note: using Nexus URL http://[::1]:12223 +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +SAGA_ID STATE +f7765d6a-6e45-4c13-8904-2677b79a97eb running +``` + +and it will stay running indefinitely until we run `demo-complete`. Let's do that: + +```terminal +$ cargo run --bin=omdb -- --destructive nexus sagas demo-complete 88eddf09-dda3-4d70-8d99-1d3b441c57da +... +note: using Nexus URL http://[::1]:12223 +``` + +and then list sagas again: + +```terminal +$ cargo run --bin=omdb -- nexus sagas list +... +note: using Nexus URL http://[::1]:12223 +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +SAGA_ID STATE +f7765d6a-6e45-4c13-8904-2677b79a97eb succeeded +``` + +It works across recovery, too. You can go through the same loop again, but this time kill Nexus and start it again: + +```terminal +$ cargo run --bin=omdb -- --destructive nexus sagas demo-create +... +note: using Nexus URL http://[::1]:12223 +saga id: 65253cb6-4428-4aa7-9afc-bf9b42166cb5 +demo saga id: 208ebc89-acc6-42d3-9f40-7f5567c8a39b (use this with `demo-complete`) +``` + +Now restart Nexus (^C the second invocation and run it again). Now if we use `omdb` we don't see the earlier saga because it was finished when this new Nexus process started. But we see the one we created later because it was recovered: + +```terminal +$ cargo run --bin=omdb -- nexus sagas list +... +note: using Nexus URL http://[::1]:12223 +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +SAGA_ID STATE +65253cb6-4428-4aa7-9afc-bf9b42166cb5 running +``` + +Side note: we can see it was recovered: + +```terminal +$ cargo run --bin=omdb -- nexus background-tasks show +... 
+task: "saga_recovery" + configured period: every 10m + currently executing: no + last completed activation: iter 1, triggered by a periodic timer firing + started at 2024-08-12T20:20:41.714Z (44s ago) and ran for 79ms + since Nexus started: + sagas recovered: 1 + sagas recovery errors: 0 + sagas observed started: 0 + sagas inferred finished: 0 + missing from SEC: 0 + bad state in SEC: 0 + last pass: + found sagas: 1 (in-progress, assigned to this Nexus) + recovered: 1 (successfully) + failed: 0 + skipped: 0 (already running) + removed: 0 (newly finished) + recently recovered sagas (1): + TIME SAGA_ID + 2024-08-12T20:20:41Z 65253cb6-4428-4aa7-9afc-bf9b42166cb5 + no saga recovery failures +... +``` + +Now we can complete that saga: + +```terminal +$ cargo run --bin=omdb -- --destructive nexus sagas demo-complete 208ebc89-acc6-42d3-9f40-7f5567c8a39b +... +note: using Nexus URL http://[::1]:12223 +``` + +and see it finish: + +``` +$ cargo run --bin=omdb -- nexus sagas list +... +note: using Nexus URL http://[::1]:12223 +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +SAGA_ID STATE +65253cb6-4428-4aa7-9afc-bf9b42166cb5 succeeded +``` + +Note too that the completion is not synchronous with the `demo-complete` command, though it usually _is_ pretty quick. It's possible you'll catch it `running` if you run `nexus sagas list` right after running `nexus sagas demo-complete`, but you should quickly see it `succeeded` if you keep running `nexus sagas list`. diff --git a/nexus/internal-api/src/lib.rs b/nexus/internal-api/src/lib.rs index c6ade3b1a2..6a98c44614 100644 --- a/nexus/internal-api/src/lib.rs +++ b/nexus/internal-api/src/lib.rs @@ -23,7 +23,7 @@ use nexus_types::{ OximeterInfo, RackInitializationRequest, SledAgentInfo, SwitchPutRequest, SwitchPutResponse, }, - views::{BackgroundTask, Ipv4NatEntryView, Saga}, + views::{BackgroundTask, DemoSaga, Ipv4NatEntryView, Saga}, }, }; use omicron_common::{ @@ -39,7 +39,8 @@ use omicron_common::{ update::ArtifactId, }; use omicron_uuid_kinds::{ - DownstairsKind, SledUuid, TypedUuid, UpstairsKind, UpstairsRepairKind, + DemoSagaUuid, DownstairsKind, SledUuid, TypedUuid, UpstairsKind, + UpstairsRepairKind, }; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -282,6 +283,31 @@ pub trait NexusInternalApi { path_params: Path, ) -> Result, HttpError>; + /// Kick off an instance of the "demo" saga + /// + /// This saga is used for demo and testing. The saga just waits until you + /// complete using the `saga_demo_complete` API. + #[endpoint { + method = POST, + path = "/demo-saga", + }] + async fn saga_demo_create( + rqctx: RequestContext, + ) -> Result, HttpError>; + + /// Complete a waiting demo saga + /// + /// Note that the id used here is not the same as the id of the saga. It's + /// the one returned by the `saga_demo_create` API. 
+    #[endpoint {
+        method = POST,
+        path = "/demo-saga/{demo_saga_id}/complete",
+    }]
+    async fn saga_demo_complete(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<DemoSagaPathParam>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError>;
+
     // Background Tasks
 
     /// List background tasks
@@ -565,6 +591,12 @@ pub struct SagaPathParam {
     pub saga_id: Uuid,
 }
 
+/// Path parameters for DemoSaga requests
+#[derive(Deserialize, JsonSchema)]
+pub struct DemoSagaPathParam {
+    pub demo_saga_id: DemoSagaUuid,
+}
+
 /// Path parameters for Background Task requests
 #[derive(Deserialize, JsonSchema)]
 pub struct BackgroundTaskPathParam {
diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs
index 60ed611bd7..5cfacd0c9c 100644
--- a/nexus/src/app/mod.rs
+++ b/nexus/src/app/mod.rs
@@ -7,6 +7,7 @@ use self::external_endpoints::NexusCertResolver;
 use self::saga::SagaExecutor;
 use crate::app::background::BackgroundTasksData;
+use crate::app::background::SagaRecoveryHelpers;
 use crate::app::oximeter::LazyTimeseriesClient;
 use crate::populate::populate_start;
 use crate::populate::PopulateArgs;
@@ -19,6 +20,7 @@ use nexus_config::NexusConfig;
 use nexus_config::RegionAllocationStrategy;
 use nexus_config::Tunables;
 use nexus_config::UpdatesConfig;
+use nexus_db_model::AllSchemaVersions;
 use nexus_db_queries::authn;
 use nexus_db_queries::authz;
 use nexus_db_queries::context::OpContext;
@@ -35,6 +37,7 @@ use std::net::SocketAddrV6;
 use std::net::{IpAddr, Ipv6Addr};
 use std::sync::Arc;
 use std::sync::OnceLock;
+use tokio::sync::mpsc;
 use uuid::Uuid;
 
 // The implementation of Nexus is large, and split into a number of submodules
@@ -89,12 +92,9 @@ pub(crate) mod sagas;
 
 // TODO: When referring to API types, we should try to include
 // the prefix unless it is unambiguous.
-pub(crate) use nexus_db_queries::db::queries::disk::MAX_DISKS_PER_INSTANCE;
-
-use crate::app::background::SagaRecoveryHelpers;
-use nexus_db_model::AllSchemaVersions;
 pub(crate) use nexus_db_model::MAX_NICS_PER_INSTANCE;
-use tokio::sync::mpsc;
+pub(crate) use nexus_db_queries::db::queries::disk::MAX_DISKS_PER_INSTANCE;
+use sagas::demo::CompletingDemoSagas;
 
 // XXX: Might want to recast as max *floating* IPs, we have at most one
 // ephemeral (so bounded in saga by design).
@@ -204,6 +204,9 @@ pub struct Nexus {
 
     /// Default Crucible region allocation strategy
     default_region_allocation_strategy: RegionAllocationStrategy,
+
+    /// List of demo sagas awaiting a request to complete them
+    demo_sagas: Arc<std::sync::Mutex<CompletingDemoSagas>>,
 }
 
 impl Nexus {
@@ -480,6 +483,9 @@ impl Nexus {
                 .pkg
                 .default_region_allocation_strategy
                 .clone(),
+            demo_sagas: Arc::new(std::sync::Mutex::new(
+                CompletingDemoSagas::new(),
+            )),
         };
 
         // TODO-cleanup all the extra Arcs here seems wrong
@@ -955,6 +961,17 @@ impl Nexus {
         }
         Ok(clients.into_iter().collect::<Vec<_>>())
     }
+
+    pub(crate) fn demo_sagas(
+        &self,
+    ) -> Result<std::sync::MutexGuard<CompletingDemoSagas>, Error> {
+        self.demo_sagas.lock().map_err(|error| {
+            Error::internal_error(&format!(
+                "failed to acquire demo_sagas lock: {:#}",
+                error
+            ))
+        })
+    }
 }
 
 /// For unimplemented endpoints, indicates whether the resource identified
diff --git a/nexus/src/app/saga.rs b/nexus/src/app/saga.rs
index fcdbb0db59..5bc69946ad 100644
--- a/nexus/src/app/saga.rs
+++ b/nexus/src/app/saga.rs
@@ -58,12 +58,14 @@ use futures::FutureExt;
 use futures::StreamExt;
 use nexus_db_queries::authz;
 use nexus_db_queries::context::OpContext;
+use nexus_types::internal_api::views::DemoSaga;
 use omicron_common::api::external::DataPageParams;
 use omicron_common::api::external::Error;
 use omicron_common::api::external::ListResult;
 use omicron_common::api::external::LookupResult;
 use omicron_common::api::external::ResourceType;
 use omicron_common::bail_unless;
+use omicron_uuid_kinds::DemoSagaUuid;
 use std::sync::Arc;
 use std::sync::OnceLock;
 use steno::SagaDag;
@@ -296,7 +298,6 @@ pub(crate) struct RunnableSaga {
 }
 
 impl RunnableSaga {
-    #[cfg(test)]
     pub(crate) fn id(&self) -> SagaId {
         self.id
     }
@@ -457,4 +458,25 @@ impl super::Nexus {
     pub(crate) fn sec(&self) -> &steno::SecClient {
         &self.sagas.sec_client
     }
+
+    pub(crate) async fn saga_demo_create(&self) -> Result<DemoSaga, Error> {
+        use crate::app::sagas::demo;
+        let demo_saga_id = DemoSagaUuid::new_v4();
+        let saga_params = demo::Params { id: demo_saga_id };
+        let saga_dag = create_saga_dag::<demo::SagaDemo>(saga_params)?;
+        let runnable_saga = self.sagas.saga_prepare(saga_dag).await?;
+        let saga_id = runnable_saga.id().0;
+        // We don't need the handle that runnable_saga.start() returns because
+        // we're not going to wait for the saga to finish here.
+        let _ = runnable_saga.start().await?;
+        Ok(DemoSaga { saga_id, demo_saga_id })
+    }
+
+    pub(crate) fn saga_demo_complete(
+        &self,
+        demo_saga_id: DemoSagaUuid,
+    ) -> Result<(), Error> {
+        let mut demo_sagas = self.demo_sagas()?;
+        demo_sagas.complete(demo_saga_id)
+    }
 }
diff --git a/nexus/src/app/sagas/demo.rs b/nexus/src/app/sagas/demo.rs
new file mode 100644
index 0000000000..4a8eda8b80
--- /dev/null
+++ b/nexus/src/app/sagas/demo.rs
@@ -0,0 +1,135 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Demo saga, used for testing and interactive debugging
+//!
+//! The "Demo" saga exists so that developers and automated tests can create a
+//! saga that will not complete until they take some action to complete it. The
+//! saga just waits until it gets the message that it should finish. Users
+//! create demo sagas and complete them using requests to the internal API.
+//!
+//! The implementation is entirely in-memory, which means you have to send the
+//! completion message to the Nexus that's running the saga. However, it does
+//! work across Nexus restarts, so this can be used to exercise the saga
+//! recovery path.
+//!
+//! It's tempting to build this only for development and not official releases,
+//! but that'd be more work, there's little downside to always including it, and
+//! it's conceivable that it'd be useful for production systems, too.
+
+use super::NexusActionContext;
+use super::{ActionRegistry, NexusSaga, SagaInitError};
+use crate::app::sagas::declare_saga_actions;
+use anyhow::ensure;
+use omicron_common::api::external::Error;
+use omicron_uuid_kinds::DemoSagaUuid;
+use serde::Deserialize;
+use serde::Serialize;
+use slog::info;
+use std::collections::BTreeMap;
+use steno::ActionError;
+use tokio::sync::oneshot;
+
+/// Set of demo sagas that have been marked completed
+///
+/// Nexus maintains one of these at the top level. Individual demo sagas wait
+/// until their id shows up here, then remove it and proceed.
+pub struct CompletingDemoSagas {
+    ids: BTreeMap<DemoSagaUuid, oneshot::Sender<()>>,
+}
+
+impl CompletingDemoSagas {
+    pub fn new() -> CompletingDemoSagas {
+        CompletingDemoSagas { ids: BTreeMap::new() }
+    }
+
+    pub fn complete(&mut self, id: DemoSagaUuid) -> Result<(), Error> {
+        self.ids
+            .remove(&id)
+            .ok_or_else(|| {
+                Error::non_resourcetype_not_found(format!(
+                    "demo saga with id {:?}",
+                    id
+                ))
+            })?
+            .send(())
+            .map_err(|_| {
+                Error::internal_error(
+                    "saga stopped listening (Nexus shutting down?)",
+                )
+            })
+    }
+
+    pub fn subscribe(
+        &mut self,
+        id: DemoSagaUuid,
+    ) -> Result<oneshot::Receiver<()>, anyhow::Error> {
+        let (tx, rx) = oneshot::channel();
+        ensure!(
+            self.ids.insert(id, tx).is_none(),
+            "multiple subscriptions for the same demo saga"
+        );
+        Ok(rx)
+    }
+}
+
+#[derive(Debug, Deserialize, Serialize)]
+pub(crate) struct Params {
+    pub id: DemoSagaUuid,
+}
+
+declare_saga_actions! {
+    demo;
+    DEMO_WAIT -> "demo_wait" {
+        + demo_wait
+    }
+}
+
+#[derive(Debug)]
+pub(crate) struct SagaDemo;
+impl NexusSaga for SagaDemo {
+    const NAME: &'static str = "demo";
+    type Params = Params;
+
+    fn register_actions(registry: &mut ActionRegistry) {
+        demo_register_actions(registry);
+    }
+
+    fn make_saga_dag(
+        _params: &Self::Params,
+        mut builder: steno::DagBuilder,
+    ) -> Result<steno::Dag, SagaInitError> {
+        builder.append(demo_wait_action());
+        Ok(builder.build()?)
+    }
+}
+
+async fn demo_wait(sagactx: NexusActionContext) -> Result<(), ActionError> {
+    let osagactx = sagactx.user_data();
+    let demo_id = sagactx.saga_params::<Params>()?.id;
+    let log = osagactx.log();
+    info!(log, "demo saga: begin wait"; "id" => %demo_id);
+    let rx = {
+        let mut demo_sagas = osagactx
+            .nexus()
+            .demo_sagas()
+            .map_err(ActionError::action_failed)?;
+        demo_sagas.subscribe(demo_id).map_err(|e| {
+            ActionError::action_failed(Error::internal_error(&format!(
+                "demo saga subscribe failed: {:#}",
+                e
+            )))
+        })?
+    };
+    match rx.await {
+        Ok(_) => {
+            info!(log, "demo saga: completing"; "id" => %demo_id);
+        }
+        Err(_) => {
+            info!(log, "demo saga: waiting failed (Nexus shutting down?)";
+                "id" => %demo_id);
+        }
+    }
+    Ok(())
+}
diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs
index 0c57a5b2dc..b944fb4d2b 100644
--- a/nexus/src/app/sagas/mod.rs
+++ b/nexus/src/app/sagas/mod.rs
@@ -22,6 +22,7 @@ use steno::SagaType;
 use thiserror::Error;
 use uuid::Uuid;
 
+pub mod demo;
 pub mod disk_create;
 pub mod disk_delete;
 pub mod finalize_disk;
@@ -134,6 +135,7 @@ fn make_action_registry() -> ActionRegistry {
     let mut registry = steno::ActionRegistry::new();
     registry.register(Arc::clone(&*ACTION_GENERATE_ID));
 
+    <demo::SagaDemo as NexusSaga>::register_actions(&mut registry);
     <disk_create::SagaDiskCreate as NexusSaga>::register_actions(&mut registry);
     <disk_delete::SagaDiskDelete as NexusSaga>::register_actions(&mut registry);
     <finalize_disk::SagaFinalizeDisk as NexusSaga>::register_actions(
diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs
index 33b626a7fc..c5322e3930 100644
--- a/nexus/src/internal_api/http_entrypoints.rs
+++ b/nexus/src/internal_api/http_entrypoints.rs
@@ -35,6 +35,7 @@ use nexus_types::internal_api::params::SwitchPutRequest;
 use nexus_types::internal_api::params::SwitchPutResponse;
 use nexus_types::internal_api::views::to_list;
 use nexus_types::internal_api::views::BackgroundTask;
+use nexus_types::internal_api::views::DemoSaga;
 use nexus_types::internal_api::views::Ipv4NatEntryView;
 use nexus_types::internal_api::views::Saga;
 use omicron_common::api::external::http_pagination::data_page_params_for;
@@ -530,6 +531,40 @@ impl NexusInternalApi for NexusInternalApiImpl {
             .await
     }
 
+    async fn saga_demo_create(
+        rqctx: RequestContext<Self::Context>,
+    ) -> Result<HttpResponseOk<DemoSaga>, HttpError> {
+        let apictx = &rqctx.context().context;
+        let handler = async {
+            let nexus = &apictx.nexus;
+            let demo_saga = nexus.saga_demo_create().await?;
+            Ok(HttpResponseOk(demo_saga))
+        };
+
+        apictx
+            .internal_latencies
+            .instrument_dropshot_handler(&rqctx, handler)
+            .await
+    }
+
+    async fn saga_demo_complete(
+        rqctx: RequestContext<Self::Context>,
+        path_params: Path<DemoSagaPathParam>,
+    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+        let apictx = &rqctx.context().context;
+        let handler = async {
+            let nexus = &apictx.nexus;
+            let path = path_params.into_inner();
+            nexus.saga_demo_complete(path.demo_saga_id)?;
+            Ok(HttpResponseUpdatedNoContent())
+        };
+
+        apictx
+            .internal_latencies
+            .instrument_dropshot_handler(&rqctx, handler)
+            .await
+    }
+
     // Background Tasks
 
     async fn bgtask_list(
diff --git a/nexus/tests/integration_tests/demo_saga.rs b/nexus/tests/integration_tests/demo_saga.rs
new file mode 100644
index 0000000000..888fa35965
--- /dev/null
+++ b/nexus/tests/integration_tests/demo_saga.rs
@@ -0,0 +1,74 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Smoke test for the demo saga
+
+use futures::TryStreamExt;
+use nexus_client::types::Saga;
+use nexus_client::types::SagaState;
+use nexus_test_interface::NexusServer;
+use nexus_test_utils_macros::nexus_test;
+use omicron_test_utils::dev::poll::wait_for_condition;
+use omicron_test_utils::dev::poll::CondCheckError;
+use std::time::Duration;
+
+type ControlPlaneTestContext =
+    nexus_test_utils::ControlPlaneTestContext<omicron_nexus::Server>;
+
+// Tests that we can create a demo saga, then mark it completed, and the actual
+// saga's state matches what we expect along the way.
+#[nexus_test]
+async fn test_demo_saga(cptestctx: &ControlPlaneTestContext) {
+    let log = &cptestctx.logctx.log;
+    let nexus_internal_url = format!(
+        "http://{}",
+        cptestctx.server.get_http_server_internal_address().await
+    );
+    let nexus_client =
+        nexus_client::Client::new(&nexus_internal_url, log.clone());
+
+    let sagas_before = list_sagas(&nexus_client).await;
+    eprintln!("found sagas (before): {:?}", sagas_before);
+    let demo_saga = nexus_client.saga_demo_create().await.unwrap();
+    let saga_id = demo_saga.saga_id;
+    assert!(!sagas_before.into_iter().any(|s| s.id == saga_id));
+
+    let sagas_after = list_sagas(&nexus_client).await;
+    eprintln!("found sagas (after): {:?}", sagas_after);
+    let found = sagas_after.into_iter().find(|s| s.id == saga_id).unwrap();
+    assert!(matches!(found.state, SagaState::Running));
+
+    // It is hard to verify that the saga is not going to complete by itself.
+    // No matter how long we wait and make sure it didn't complete, it might
+    // have completed after that. And then we've made the test suite take that
+    // much longer. But we can at least make sure that completing the saga
+    // does cause it to finish.
+    nexus_client.saga_demo_complete(&demo_saga.demo_saga_id).await.unwrap();
+
+    // Completion is not synchronous -- that just unblocked the saga. So we
+    // need to poll a bit to wait for it to actually finish.
+    let found = wait_for_condition(
+        || async {
+            let sagas = list_sagas(&nexus_client).await;
+            eprintln!("found sagas (last): {:?}", sagas);
+            let found = sagas.into_iter().find(|s| s.id == saga_id).unwrap();
+            if matches!(found.state, SagaState::Succeeded) {
+                Ok(found)
+            } else {
+                Err(CondCheckError::<()>::NotYet)
+            }
+        },
+        &Duration::from_millis(50),
+        &Duration::from_secs(30),
+    )
+    .await
+    .unwrap();
+
+    assert_eq!(found.id, saga_id);
+    assert!(matches!(found.state, SagaState::Succeeded));
+}
+
+async fn list_sagas(client: &nexus_client::Client) -> Vec<Saga> {
+    client.saga_list_stream(None, None).try_collect::<Vec<_>>().await.unwrap()
+}
diff --git a/nexus/tests/integration_tests/mod.rs b/nexus/tests/integration_tests/mod.rs
index 5054527c63..fdf14dbd07 100644
--- a/nexus/tests/integration_tests/mod.rs
+++ b/nexus/tests/integration_tests/mod.rs
@@ -11,6 +11,7 @@ mod basic;
 mod certificates;
 mod commands;
 mod console_api;
+mod demo_saga;
 mod device_auth;
 mod disks;
 mod external_ips;
diff --git a/nexus/types/src/internal_api/views.rs b/nexus/types/src/internal_api/views.rs
index b71fd04779..a4557ffd31 100644
--- a/nexus/types/src/internal_api/views.rs
+++ b/nexus/types/src/internal_api/views.rs
@@ -9,6 +9,7 @@ use futures::stream::StreamExt;
 use omicron_common::api::external::MacAddr;
 use omicron_common::api::external::ObjectStream;
 use omicron_common::api::external::Vni;
+use omicron_uuid_kinds::DemoSagaUuid;
 use schemars::JsonSchema;
 use serde::Serialize;
 use std::net::Ipv4Addr;
@@ -152,6 +153,13 @@ impl From<steno::SagaStateView> for SagaState {
     }
 }
 
+/// Identifies an instance of the demo saga
+#[derive(Clone, Debug, Serialize, JsonSchema)]
+pub struct DemoSaga {
+    pub saga_id: Uuid,
+    pub demo_saga_id: DemoSagaUuid,
+}
+
 /// Background tasks
 ///
 /// These are currently only intended for observability by developers. We will
diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json
index 7e4d6e6c02..a181d14540 100644
--- a/openapi/nexus-internal.json
+++ b/openapi/nexus-internal.json
@@ -364,6 +364,59 @@
         }
       }
     },
+    "/demo-saga": {
+      "post": {
+        "summary": "Kick off an instance of the \"demo\" saga",
+        "description": "This saga is used for demo and testing. 
The saga just waits until you complete using the `saga_demo_complete` API.", + "operationId": "saga_demo_create", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DemoSaga" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/demo-saga/{demo_saga_id}/complete": { + "post": { + "summary": "Complete a waiting demo saga", + "description": "Note that the id used here is not the same as the id of the saga. It's the one returned by the `saga_demo_create` API.", + "operationId": "saga_demo_complete", + "parameters": [ + { + "in": "path", + "name": "demo_saga_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForDemoSagaKind" + } + } + ], + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/deployment/blueprints/all": { "get": { "summary": "Lists blueprints", @@ -2624,6 +2677,23 @@ "kind" ] }, + "DemoSaga": { + "description": "Identifies an instance of the demo saga", + "type": "object", + "properties": { + "demo_saga_id": { + "$ref": "#/components/schemas/TypedUuidForDemoSagaKind" + }, + "saga_id": { + "type": "string", + "format": "uuid" + } + }, + "required": [ + "demo_saga_id", + "saga_id" + ] + }, "DiskIdentity": { "description": "Uniquely identifies a disk.", "type": "object", @@ -4897,6 +4967,10 @@ "SwitchPutResponse": { "type": "object" }, + "TypedUuidForDemoSagaKind": { + "type": "string", + "format": "uuid" + }, "TypedUuidForDownstairsRegionKind": { "type": "string", "format": "uuid" diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 8f695d2399..ba586c03a5 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -51,6 +51,7 @@ macro_rules! impl_typed_uuid_kind { impl_typed_uuid_kind! { Collection => "collection", Dataset => "dataset", + DemoSaga => "demo_saga", Downstairs => "downstairs", DownstairsRegion => "downstairs_region", ExternalIp => "external_ip", From 3ad79c1cbdc6fb77515bc10ce5f4a7d7c8687624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karen=20C=C3=A1rcamo?= Date: Tue, 13 Aug 2024 10:15:00 +1200 Subject: [PATCH 17/91] Remove outdated TODO file (#6291) --- TODO.adoc | 113 ------------------------------------------------------ 1 file changed, 113 deletions(-) delete mode 100644 TODO.adoc diff --git a/TODO.adoc b/TODO.adoc deleted file mode 100644 index 40c38e14b3..0000000000 --- a/TODO.adoc +++ /dev/null @@ -1,113 +0,0 @@ -:showtitle: -:icons: font - -= TODO - -API endpoints: - -* RFD 24: regions, AZs, etc -* (lots more) - -Work queue (see also: existing GitHub issues): - -* use CARGO_BIN_EXE for paths to binaries -https://doc.rust-lang.org/cargo/reference/environment-variables.html#environment-variables-cargo-sets-for-crates -* dropshot: allow consumers to provide error codes for dropshot errors -* general maintenance and cleanup -** replace &Arc with &T, and some instances of Arc as well -** all identifiers could be newtypes, with a prefix for the type (like AWS - "i-123" for instances) -** rethinking ApiError a bit -- should it use thiserror, or at least impl - std::error::Error? 
-** scope out switching to sync (see RFD 79) -** proper config + logging for sled agent -* settle on an approach for modification of resources and implement it once -* implement behavior of server restarting (e.g., sled agent starting up) -** This would help validate some of the architectural choices. Current thinking - is that this will notify OXCP of the restart, and OXCP will find instances - that are supposed to be on that server and run instance_ensure(). It will - also want to do that for the disks associated with those instances. - IMPORTANT: this process should also _remove_ any resources that are currently - on that system, so the notification to OXCP about a restart may need to - include the list of resources that the SA knows about and their current - states. -* implement audit log -* implement alerts -* implement external user authentication -* implement external user authorization mechanism -* implement throttling and load shedding described in RFD 6 -* implement hardening in RFD 10 -* implement ETag / If-Match / If-None-Match -* implement limits for all types of resources -* implement scheme for API versioning -** how to identify the requested version -- header or URI? -** translators for older versions? -** integration of supported API versions into build artifact? -** Should all the uses of serde_json disallow unrecognized fields? Should any? -* debugging/monitoring: Prometheus? -* debugging/monitoring: OpenTracing? OpenTelemetry? -* debugging/monitoring: Dynamic tracing? -* debugging/monitoring: Core files? -* Automated testing -** General API testing: there's a lot of boilerplate in hand-generated tests - for each kind of resource. Would it be reasonable / possible to have a sort - of omnibus test that's given the OpenAPI spec (or something like it), - creates a hierarchy with at least one of every possible resource, and does - things like: For each possible resource -*** attempt to (create, get, put, delete) one with an invalid name -*** attempt to (GET, DELETE, PUT) one that does not exist -*** attempt to create one with invalid JSON -*** attempt to create one with a duplicate name of the one we know about -*** exercise list operation with marker and limit (may need to create many of them) -*** for each required input property: -**** attempt to create a resource without that property -*** for each input property: attempt to create a resource with invalid values - for that property -*** list instances of that resource and expect to find the one we know about -*** GET the one instance we know about -*** DELETE the one instance we know about -*** GET the one instance we know about again and expect it to fail -*** list instances again and expect to find nothing -* We will need archivers for deleted records -- especially saga logs - -External dependencies / open questions: - -* Should we create a more first-class notion of objects in the API? -** This would be a good way to enforce built-in limits. -** This would be a good way to enforce uniformity of pagination. -** If each resource provides a way to construct ETags, we could provide - automatic implementation of If-Match, etc. -** With the right interface, we could provide automatic implementations of PUT - or PATCH with JSON Merge Patch and JSON Patch given any one of these. -* would like to require that servers have unique, immutable uuids -* TLS: -** How will we do TLS termination? -** How will we manage server certificates? -** How will we manage client certificates? 
-* what does bootstrapping / key management look like? -* what does internal authorization look like? - -Other activities: - -* Performance testing -* Stress testing -* Fault testing / under load -* Fuzz testing -* Security review - -Nice-to-haves: - -* API consistency checks: e.g., camel case every where - -Things we're going to want to build once: - -* metric export -* structured event reporting (e.g., audit log, alert log, fault log) -* opentracing-type reporting -* client-side circuit breakers -* service discovery -* client connection pooling -* server-side throttling -* command-line utilities - -Check out linkerd (for inspiration -- it looks K8s-specific) From b83f6094acdbce8e32878b201632a0b9a3a84966 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Tue, 13 Aug 2024 04:30:15 +0000 Subject: [PATCH 18/91] Update taiki-e/install-action digest to 7f737c1 (#6301) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [taiki-e/install-action](https://togithub.com/taiki-e/install-action) | action | digest | [`8efaa9b` -> `7f737c1`](https://togithub.com/taiki-e/install-action/compare/8efaa9b...7f737c1) | --- ### Configuration 📅 **Schedule**: Branch creation - "after 8pm,before 6am" in timezone America/Los_Angeles, Automerge - "after 8pm,before 6am" in timezone America/Los_Angeles. 🚦 **Automerge**: Enabled. ♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [Renovate Bot](https://togithub.com/renovatebot/renovate). 
Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- .github/workflows/hakari.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index 7de7cb0ee1..e310c011e7 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -24,7 +24,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@8efaa9bb37d22aefc9d331dfbd45e2d230acfc33 # v2 + uses: taiki-e/install-action@7f737c1056bae14d45b3daec1a2d26ad480e50f7 # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date From a3062747f03bdcf91dc80ee98ab26f48a6364267 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Tue, 13 Aug 2024 04:39:15 +0000 Subject: [PATCH 19/91] Update Rust crate assert_cmd to 2.0.16 (#6302) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index cf78ea4a79..326a7a285e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -267,7 +267,7 @@ anstyle = "1.0.8" api_identity = { path = "api_identity" } approx = "0.5.1" assert_matches = "1.5.0" -assert_cmd = "2.0.15" +assert_cmd = "2.0.16" async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", rev = "ed7ab5ef0513ba303d33efd41d3e9e381169d59b" } async-trait = "0.1.81" atomicwrites = "0.4.3" From 8592a6bdb56500f2c920bf07d49c85ed8c1ecef6 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Tue, 13 Aug 2024 05:36:04 +0000 Subject: [PATCH 20/91] Update Rust crate serde to v1.0.207 (#6303) --- Cargo.lock | 8 ++++---- workspace-hack/Cargo.toml | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 74a7405e57..a686b41823 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8795,9 +8795,9 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.206" +version = "1.0.207" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b3e4cd94123dd520a128bcd11e34d9e9e423e7e3e50425cb1b4b1e3549d0284" +checksum = "5665e14a49a4ea1b91029ba7d3bca9f299e1f7cfa194388ccc20f14743e784f2" dependencies = [ "serde_derive", ] @@ -8833,9 +8833,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.206" +version = "1.0.207" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fabfb6138d2383ea8208cf98ccf69cdfb1aff4088460681d84189aa259762f97" +checksum = "6aea2634c86b0e8ef2cfdc0c340baede54ec27b1e46febd7f80dffb2aa44a00e" dependencies = [ "proc-macro2", "quote", diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index f5562a279f..ad845e8073 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -91,7 +91,7 @@ ring = { version = "0.17.8", features = ["std"] } schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } scopeguard = { version = "1.2.0" } semver = { version = "1.0.23", features = ["serde"] } -serde = { version = "1.0.206", features = ["alloc", "derive", "rc"] } +serde = { version = "1.0.207", features = ["alloc", "derive", "rc"] } serde_json = { version = "1.0.122", features = ["raw_value", "unbounded_depth"] } sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } @@ -194,7 +194,7 @@ ring = { version = "0.17.8", features = ["std"] } schemars = { version = "0.8.21", features = ["bytes", 
"chrono", "uuid1"] } scopeguard = { version = "1.2.0" } semver = { version = "1.0.23", features = ["serde"] } -serde = { version = "1.0.206", features = ["alloc", "derive", "rc"] } +serde = { version = "1.0.207", features = ["alloc", "derive", "rc"] } serde_json = { version = "1.0.122", features = ["raw_value", "unbounded_depth"] } sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } From b08cce78804c94e19a8404d58beafa3d6d296d4a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 13 Aug 2024 09:20:46 -0700 Subject: [PATCH 21/91] [oximeter] Add some units from physical reality (#6296) In service of future changes to record data from power, temperature, and fan speed sensors in Oximeter, this branch adds some physical quantities to the `Units` enum: `Volts`, `Amps`, `DegreesCelcius`, and `Rpm`. I've added all of these as whole numbers of the measured quantities, with the expectation that they will probably be recorded as floating-point. We could consider instead using a smaller unit like `MilliAmps`, and recording them as integers, but that introduces a bunch of dimensional analysis that I'm not sure if we want to be doing. --- openapi/nexus.json | 12 +++++++++++- oximeter/impl/src/schema/codegen.rs | 6 ++++++ oximeter/impl/src/schema/mod.rs | 6 +++++- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/openapi/nexus.json b/openapi/nexus.json index ae5eaeae64..da77eec2a8 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -19805,7 +19805,10 @@ "count", "bytes", "seconds", - "nanoseconds" + "nanoseconds", + "volts", + "amps", + "degrees_celcius" ] }, { @@ -19814,6 +19817,13 @@ "enum": [ "none" ] + }, + { + "description": "Rotations per minute.", + "type": "string", + "enum": [ + "rpm" + ] } ] }, diff --git a/oximeter/impl/src/schema/codegen.rs b/oximeter/impl/src/schema/codegen.rs index d433441718..4778cf4970 100644 --- a/oximeter/impl/src/schema/codegen.rs +++ b/oximeter/impl/src/schema/codegen.rs @@ -522,6 +522,12 @@ impl quote::ToTokens for Units { Units::Nanoseconds => { quote! { ::oximeter::schema::Units::Nanoseconds } } + Units::Amps => quote! { ::oximeter::schema::Units::Amps }, + Units::Volts => quote! { ::oximeter::schema::Units::Volts }, + Units::DegreesCelcius => { + quote! { ::oximeter::schema::Units::DegreesCelcius } + } + Units::Rpm => quote! { ::oximeter::schema::Units::Rpm }, }; toks.to_tokens(tokens); } diff --git a/oximeter/impl/src/schema/mod.rs b/oximeter/impl/src/schema/mod.rs index 7743034e31..250604d7be 100644 --- a/oximeter/impl/src/schema/mod.rs +++ b/oximeter/impl/src/schema/mod.rs @@ -179,7 +179,6 @@ pub struct TimeseriesDescription { /// Measurement units for timeseries samples. #[derive(Clone, Copy, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] #[serde(rename_all = "snake_case")] -// TODO-completeness: Include more units, such as power / temperature. // TODO-completeness: Decide whether and how to handle dimensional analysis // during queries, if needed. pub enum Units { @@ -189,6 +188,11 @@ pub enum Units { Bytes, Seconds, Nanoseconds, + Volts, + Amps, + DegreesCelcius, + /// Rotations per minute. + Rpm, } /// The schema for a timeseries. 
From 33ab24fc05c2755c352641f8265094af792bac75 Mon Sep 17 00:00:00 2001
From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com>
Date: Tue, 13 Aug 2024 09:22:29 -0700
Subject: [PATCH 22/91] Update Rust crate serde_json to 1.0.124 (#6305)

---
 Cargo.lock                | 4 ++--
 Cargo.toml                | 2 +-
 workspace-hack/Cargo.toml | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index a686b41823..5b38a4905e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -8864,9 +8864,9 @@ dependencies = [

 [[package]]
 name = "serde_json"
-version = "1.0.122"
+version = "1.0.124"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "784b6203951c57ff748476b126ccb5e8e2959a5c19e5c617ab1956be3dbc68da"
+checksum = "66ad62847a56b3dba58cc891acd13884b9c61138d330c0d7b6181713d4fce38d"
 dependencies = [
  "itoa",
  "memchr",
diff --git a/Cargo.toml b/Cargo.toml
index 326a7a285e..bb899f8825 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -498,7 +498,7 @@ secrecy = "0.8.0"
 semver = { version = "1.0.23", features = ["std", "serde"] }
 serde = { version = "1.0", default-features = false, features = [ "derive", "rc" ] }
 serde_human_bytes = { git = "https://github.com/oxidecomputer/serde_human_bytes", branch = "main" }
-serde_json = "1.0.122"
+serde_json = "1.0.124"
 serde_path_to_error = "0.1.16"
 serde_tokenstream = "0.2"
 serde_urlencoded = "0.7.1"
diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml
index ad845e8073..854a020167 100644
--- a/workspace-hack/Cargo.toml
+++ b/workspace-hack/Cargo.toml
@@ -92,7 +92,7 @@ schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] }
 scopeguard = { version = "1.2.0" }
 semver = { version = "1.0.23", features = ["serde"] }
 serde = { version = "1.0.207", features = ["alloc", "derive", "rc"] }
-serde_json = { version = "1.0.122", features = ["raw_value", "unbounded_depth"] }
+serde_json = { version = "1.0.124", features = ["raw_value", "unbounded_depth"] }
 sha2 = { version = "0.10.8", features = ["oid"] }
 similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] }
 slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] }
@@ -195,7 +195,7 @@ schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] }
 scopeguard = { version = "1.2.0" }
 semver = { version = "1.0.23", features = ["serde"] }
 serde = { version = "1.0.207", features = ["alloc", "derive", "rc"] }
-serde_json = { version = "1.0.122", features = ["raw_value", "unbounded_depth"] }
+serde_json = { version = "1.0.124", features = ["raw_value", "unbounded_depth"] }
 sha2 = { version = "0.10.8", features = ["oid"] }
 similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] }
 slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] }

From d25b10258ff479c2081a67e4e523bf161f44b7b7 Mon Sep 17 00:00:00 2001
From: "Andrew J. Stone" 
Date: Tue, 13 Aug 2024 13:45:02 -0400
Subject: [PATCH 23/91] Add a new `ClickhouseServer` Omicron Zone (#6297)

The new zone type reflects the zone with which we'll deploy clickhouse
server nodes in a replicated setup. In last Tuesday's (Aug 6, 2024)
update huddle, we decided to use a new zone type rather than add a
boolean field to the existing `Clickhouse` zone type that is used for
single server deployments. This is the first part of the work to be
done for replicated clickhouse deployments that are automated via
reconfigurator.
As such, the actual zone deployment is left as a `todo`. --- common/src/api/internal/shared.rs | 5 ++ dev-tools/omdb/src/bin/omdb/db.rs | 1 + internal-dns/src/names.rs | 3 + nexus-sled-agent-shared/src/inventory.rs | 24 +++++++ nexus/db-model/src/dataset_kind.rs | 4 ++ nexus/db-model/src/inventory.rs | 4 ++ nexus/db-model/src/omicron_zone_config.rs | 13 ++++ nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-model/src/service_kind.rs | 1 + .../deployment/external_networking.rs | 1 + nexus/db-queries/src/db/datastore/rack.rs | 1 + nexus/reconfigurator/execution/src/dns.rs | 3 + .../execution/src/omicron_zones.rs | 1 + .../src/planner/omicron_zone_placement.rs | 1 + .../background/tasks/sync_service_zone_nat.rs | 1 + nexus/types/src/deployment.rs | 5 ++ nexus/types/src/deployment/zone_type.rs | 25 ++++++++ openapi/nexus-internal.json | 63 ++++++++++++++++--- openapi/sled-agent.json | 25 ++++++++ schema/all-zones-requests.json | 25 ++++++++ .../up1.sql | 1 + .../up2.sql | 1 + .../up3.sql | 1 + schema/crdb/dbinit.sql | 5 +- schema/rss-service-plan-v3.json | 25 ++++++++ sled-agent/src/params.rs | 3 + sled-agent/src/services.rs | 18 ++++++ sled-storage/src/dataset.rs | 3 + 28 files changed, 256 insertions(+), 10 deletions(-) create mode 100644 schema/crdb/add-clickhouse-server-enum-variants/up1.sql create mode 100644 schema/crdb/add-clickhouse-server-enum-variants/up2.sql create mode 100644 schema/crdb/add-clickhouse-server-enum-variants/up3.sql diff --git a/common/src/api/internal/shared.rs b/common/src/api/internal/shared.rs index e457d08fb2..3856a472ab 100644 --- a/common/src/api/internal/shared.rs +++ b/common/src/api/internal/shared.rs @@ -710,8 +710,12 @@ pub struct ResolvedVpcRouteSet { pub enum DatasetKind { Crucible, Cockroach, + /// Used for single-node clickhouse deployments Clickhouse, + /// Used for replicated clickhouse deployments ClickhouseKeeper, + /// Used for replicated clickhouse deployments + ClickhouseServer, ExternalDns, InternalDns, } @@ -724,6 +728,7 @@ impl fmt::Display for DatasetKind { Cockroach => "cockroach", Clickhouse => "clickhouse", ClickhouseKeeper => "clickhouse_keeper", + ClickhouseServer => "clickhouse_server", ExternalDns => "external_dns", InternalDns => "internal_dns", }; diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 1030e4288b..9ce4c66a80 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -1106,6 +1106,7 @@ async fn lookup_service_info( | BlueprintZoneType::InternalNtp(_) => ServiceKind::Ntp, BlueprintZoneType::Clickhouse(_) => ServiceKind::Clickhouse, BlueprintZoneType::ClickhouseKeeper(_) => ServiceKind::ClickhouseKeeper, + BlueprintZoneType::ClickhouseServer(_) => ServiceKind::ClickhouseServer, BlueprintZoneType::CockroachDb(_) => ServiceKind::Cockroach, BlueprintZoneType::Crucible(_) => ServiceKind::Crucible, BlueprintZoneType::CruciblePantry(_) => ServiceKind::CruciblePantry, diff --git a/internal-dns/src/names.rs b/internal-dns/src/names.rs index f975029d69..a9fe1a36bf 100644 --- a/internal-dns/src/names.rs +++ b/internal-dns/src/names.rs @@ -25,6 +25,7 @@ pub const DNS_ZONE_EXTERNAL_TESTING: &str = "oxide-dev.test"; pub enum ServiceName { Clickhouse, ClickhouseKeeper, + ClickhouseServer, Cockroach, InternalDns, ExternalDns, @@ -48,6 +49,7 @@ impl ServiceName { match self { ServiceName::Clickhouse => "clickhouse", ServiceName::ClickhouseKeeper => "clickhouse-keeper", + ServiceName::ClickhouseServer => "clickhouse-server", ServiceName::Cockroach => "cockroach", 
ServiceName::ExternalDns => "external-dns", ServiceName::InternalDns => "nameservice", @@ -73,6 +75,7 @@ impl ServiceName { match self { ServiceName::Clickhouse | ServiceName::ClickhouseKeeper + | ServiceName::ClickhouseServer | ServiceName::Cockroach | ServiceName::InternalDns | ServiceName::ExternalDns diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index 8a793f6150..2f1361a6f2 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -134,15 +134,26 @@ pub enum OmicronZoneType { snat_cfg: SourceNatConfig, }, + /// Type of clickhouse zone used for a single node clickhouse deployment Clickhouse { address: SocketAddrV6, dataset: OmicronZoneDataset, }, + /// A zone used to run a Clickhouse Keeper node + /// + /// Keepers are only used in replicated clickhouse setups ClickhouseKeeper { address: SocketAddrV6, dataset: OmicronZoneDataset, }, + + /// A zone used to run a Clickhouse Server in a replicated deployment + ClickhouseServer { + address: SocketAddrV6, + dataset: OmicronZoneDataset, + }, + CockroachDb { address: SocketAddrV6, dataset: OmicronZoneDataset, @@ -212,6 +223,9 @@ impl OmicronZoneType { OmicronZoneType::ClickhouseKeeper { .. } => { ZoneKind::ClickhouseKeeper } + OmicronZoneType::ClickhouseServer { .. } => { + ZoneKind::ClickhouseServer + } OmicronZoneType::CockroachDb { .. } => ZoneKind::CockroachDb, OmicronZoneType::Crucible { .. } => ZoneKind::Crucible, OmicronZoneType::CruciblePantry { .. } => ZoneKind::CruciblePantry, @@ -252,6 +266,7 @@ impl OmicronZoneType { OmicronZoneType::Clickhouse { .. } | OmicronZoneType::ClickhouseKeeper { .. } + | OmicronZoneType::ClickhouseServer { .. } | OmicronZoneType::CockroachDb { .. } | OmicronZoneType::Crucible { .. } | OmicronZoneType::CruciblePantry { .. } @@ -271,6 +286,7 @@ impl OmicronZoneType { | OmicronZoneType::InternalNtp { .. } | OmicronZoneType::Clickhouse { .. } | OmicronZoneType::ClickhouseKeeper { .. } + | OmicronZoneType::ClickhouseServer { .. } | OmicronZoneType::CockroachDb { .. } | OmicronZoneType::Crucible { .. } | OmicronZoneType::CruciblePantry { .. } @@ -289,6 +305,7 @@ impl OmicronZoneType { | OmicronZoneType::InternalNtp { .. } | OmicronZoneType::Clickhouse { .. } | OmicronZoneType::ClickhouseKeeper { .. } + | OmicronZoneType::ClickhouseServer { .. } | OmicronZoneType::CockroachDb { .. } | OmicronZoneType::CruciblePantry { .. } | OmicronZoneType::ExternalDns { .. } @@ -310,6 +327,7 @@ impl OmicronZoneType { OmicronZoneType::InternalNtp { .. } | OmicronZoneType::Clickhouse { .. } | OmicronZoneType::ClickhouseKeeper { .. } + | OmicronZoneType::ClickhouseServer { .. } | OmicronZoneType::CockroachDb { .. } | OmicronZoneType::Crucible { .. } | OmicronZoneType::CruciblePantry { .. } @@ -328,6 +346,7 @@ impl OmicronZoneType { OmicronZoneType::InternalNtp { .. } | OmicronZoneType::Clickhouse { .. } | OmicronZoneType::ClickhouseKeeper { .. } + | OmicronZoneType::ClickhouseServer { .. } | OmicronZoneType::CockroachDb { .. } | OmicronZoneType::Crucible { .. } | OmicronZoneType::CruciblePantry { .. } @@ -367,6 +386,7 @@ pub enum ZoneKind { BoundaryNtp, Clickhouse, ClickhouseKeeper, + ClickhouseServer, CockroachDb, Crucible, CruciblePantry, @@ -390,6 +410,7 @@ impl ZoneKind { ZoneKind::BoundaryNtp | ZoneKind::InternalNtp => Self::NTP_PREFIX, ZoneKind::Clickhouse => "clickhouse", ZoneKind::ClickhouseKeeper => "clickhouse_keeper", + ZoneKind::ClickhouseServer => "clickhouse_server", // Note "cockroachdb" for historical reasons. 
ZoneKind::CockroachDb => "cockroachdb", ZoneKind::Crucible => "crucible", @@ -409,6 +430,7 @@ impl ZoneKind { ZoneKind::BoundaryNtp | ZoneKind::InternalNtp => Self::NTP_PREFIX, ZoneKind::Clickhouse => "clickhouse", ZoneKind::ClickhouseKeeper => "clickhouse_keeper", + ZoneKind::ClickhouseServer => "clickhouse_server", // Note "cockroachdb" for historical reasons. ZoneKind::CockroachDb => "cockroachdb", ZoneKind::Crucible => "crucible", @@ -431,6 +453,7 @@ impl ZoneKind { ZoneKind::BoundaryNtp | ZoneKind::InternalNtp => Self::NTP_PREFIX, ZoneKind::Clickhouse => "clickhouse", ZoneKind::ClickhouseKeeper => "clickhouse-keeper", + ZoneKind::ClickhouseServer => "clickhouse_server", // Note "cockroach" for historical reasons. ZoneKind::CockroachDb => "cockroach", ZoneKind::Crucible => "crucible", @@ -451,6 +474,7 @@ impl ZoneKind { ZoneKind::BoundaryNtp => "boundary_ntp", ZoneKind::Clickhouse => "clickhouse", ZoneKind::ClickhouseKeeper => "clickhouse_keeper", + ZoneKind::ClickhouseServer => "clickhouse_server", ZoneKind::CockroachDb => "cockroach_db", ZoneKind::Crucible => "crucible", ZoneKind::CruciblePantry => "crucible_pantry", diff --git a/nexus/db-model/src/dataset_kind.rs b/nexus/db-model/src/dataset_kind.rs index 395d01353e..4a86efaca1 100644 --- a/nexus/db-model/src/dataset_kind.rs +++ b/nexus/db-model/src/dataset_kind.rs @@ -20,6 +20,7 @@ impl_enum_type!( Cockroach => b"cockroach" Clickhouse => b"clickhouse" ClickhouseKeeper => b"clickhouse_keeper" + ClickhouseServer => b"clickhouse_server" ExternalDns => b"external_dns" InternalDns => b"internal_dns" ); @@ -35,6 +36,9 @@ impl From for DatasetKind { internal::shared::DatasetKind::ClickhouseKeeper => { DatasetKind::ClickhouseKeeper } + internal::shared::DatasetKind::ClickhouseServer => { + DatasetKind::ClickhouseServer + } internal::shared::DatasetKind::ExternalDns => { DatasetKind::ExternalDns } diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 400acc68b3..87986c4f54 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -985,6 +985,7 @@ impl_enum_type!( BoundaryNtp => b"boundary_ntp" Clickhouse => b"clickhouse" ClickhouseKeeper => b"clickhouse_keeper" + ClickhouseServer => b"clickhouse_server" CockroachDb => b"cockroach_db" Crucible => b"crucible" CruciblePantry => b"crucible_pantry" @@ -1001,6 +1002,7 @@ impl From for ServiceKind { ZoneType::BoundaryNtp | ZoneType::InternalNtp => Self::Ntp, ZoneType::Clickhouse => Self::Clickhouse, ZoneType::ClickhouseKeeper => Self::ClickhouseKeeper, + ZoneType::ClickhouseServer => Self::ClickhouseServer, ZoneType::CockroachDb => Self::Cockroach, ZoneType::Crucible => Self::Crucible, ZoneType::CruciblePantry => Self::CruciblePantry, @@ -1020,6 +1022,7 @@ impl From for nexus_sled_agent_shared::inventory::ZoneKind { ZoneType::BoundaryNtp => BoundaryNtp, ZoneType::Clickhouse => Clickhouse, ZoneType::ClickhouseKeeper => ClickhouseKeeper, + ZoneType::ClickhouseServer => ClickhouseServer, ZoneType::CockroachDb => CockroachDb, ZoneType::Crucible => Crucible, ZoneType::CruciblePantry => CruciblePantry, @@ -1040,6 +1043,7 @@ impl From for ZoneType { BoundaryNtp => ZoneType::BoundaryNtp, Clickhouse => ZoneType::Clickhouse, ClickhouseKeeper => ZoneType::ClickhouseKeeper, + ClickhouseServer => ZoneType::ClickhouseServer, CockroachDb => ZoneType::CockroachDb, Crucible => ZoneType::Crucible, CruciblePantry => ZoneType::CruciblePantry, diff --git a/nexus/db-model/src/omicron_zone_config.rs b/nexus/db-model/src/omicron_zone_config.rs index 
9236fc9407..23e1ef2dd9 100644 --- a/nexus/db-model/src/omicron_zone_config.rs +++ b/nexus/db-model/src/omicron_zone_config.rs @@ -109,6 +109,9 @@ impl OmicronZone { OmicronZoneType::ClickhouseKeeper { address, dataset } => { (ZoneType::ClickhouseKeeper, address, Some(dataset)) } + OmicronZoneType::ClickhouseServer { address, dataset } => { + (ZoneType::ClickhouseServer, address, Some(dataset)) + } OmicronZoneType::CockroachDb { address, dataset } => { (ZoneType::CockroachDb, address, Some(dataset)) } @@ -258,6 +261,12 @@ impl OmicronZone { dataset: common.dataset?, }, ), + ZoneType::ClickhouseServer => BlueprintZoneType::ClickhouseServer( + blueprint_zone_type::ClickhouseServer { + address, + dataset: common.dataset?, + }, + ), ZoneType::CockroachDb => BlueprintZoneType::CockroachDb( blueprint_zone_type::CockroachDb { address, @@ -392,6 +401,10 @@ impl OmicronZone { address, dataset: common.dataset?, }, + ZoneType::ClickhouseServer => OmicronZoneType::ClickhouseServer { + address, + dataset: common.dataset?, + }, ZoneType::CockroachDb => OmicronZoneType::CockroachDb { address, dataset: common.dataset?, diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index dd9c9dc667..1e0caabb02 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(86, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(87, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(87, "add-clickhouse-server-enum-variants"), KnownVersion::new(86, "snapshot-replacement"), KnownVersion::new(85, "add-migrations-by-time-created-index"), KnownVersion::new(84, "region-read-only"), diff --git a/nexus/db-model/src/service_kind.rs b/nexus/db-model/src/service_kind.rs index 016de9c44e..04fbab20b2 100644 --- a/nexus/db-model/src/service_kind.rs +++ b/nexus/db-model/src/service_kind.rs @@ -20,6 +20,7 @@ impl_enum_type!( // Enum values Clickhouse => b"clickhouse" ClickhouseKeeper => b"clickhouse_keeper" + ClickhouseServer => b"clickhouse_server" Cockroach => b"cockroach" Crucible => b"crucible" CruciblePantry => b"crucible_pantry" diff --git a/nexus/db-queries/src/db/datastore/deployment/external_networking.rs b/nexus/db-queries/src/db/datastore/deployment/external_networking.rs index b6ced8e2c5..7ace07305d 100644 --- a/nexus/db-queries/src/db/datastore/deployment/external_networking.rs +++ b/nexus/db-queries/src/db/datastore/deployment/external_networking.rs @@ -327,6 +327,7 @@ impl DataStore { ZoneKind::Nexus => &*NEXUS_VPC_SUBNET, ZoneKind::Clickhouse | ZoneKind::ClickhouseKeeper + | ZoneKind::ClickhouseServer | ZoneKind::CockroachDb | ZoneKind::Crucible | ZoneKind::CruciblePantry diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index c9fb61b15a..c069a72955 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -595,6 +595,7 @@ impl DataStore { BlueprintZoneType::InternalNtp(_) | BlueprintZoneType::Clickhouse(_) | BlueprintZoneType::ClickhouseKeeper(_) + | BlueprintZoneType::ClickhouseServer(_) | BlueprintZoneType::CockroachDb(_) | BlueprintZoneType::Crucible(_) | BlueprintZoneType::CruciblePantry(_) diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 8bcae27bc0..885ffa67d1 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -275,6 +275,9 @@ pub fn blueprint_internal_dns_config( BlueprintZoneType::ClickhouseKeeper( blueprint_zone_type::ClickhouseKeeper { address, .. }, ) => (ServiceName::ClickhouseKeeper, address.port()), + BlueprintZoneType::ClickhouseServer( + blueprint_zone_type::ClickhouseServer { address, .. }, + ) => (ServiceName::ClickhouseServer, address.port()), BlueprintZoneType::CockroachDb( blueprint_zone_type::CockroachDb { address, .. 
}, ) => (ServiceName::Cockroach, address.port()), diff --git a/nexus/reconfigurator/execution/src/omicron_zones.rs b/nexus/reconfigurator/execution/src/omicron_zones.rs index acbb7a6b33..b40bbab982 100644 --- a/nexus/reconfigurator/execution/src/omicron_zones.rs +++ b/nexus/reconfigurator/execution/src/omicron_zones.rs @@ -138,6 +138,7 @@ pub(crate) async fn clean_up_expunged_zones( BlueprintZoneType::BoundaryNtp(_) | BlueprintZoneType::Clickhouse(_) | BlueprintZoneType::ClickhouseKeeper(_) + | BlueprintZoneType::ClickhouseServer(_) | BlueprintZoneType::Crucible(_) | BlueprintZoneType::CruciblePantry(_) | BlueprintZoneType::ExternalDns(_) diff --git a/nexus/reconfigurator/planning/src/planner/omicron_zone_placement.rs b/nexus/reconfigurator/planning/src/planner/omicron_zone_placement.rs index c08f30124c..2fb60e66f8 100644 --- a/nexus/reconfigurator/planning/src/planner/omicron_zone_placement.rs +++ b/nexus/reconfigurator/planning/src/planner/omicron_zone_placement.rs @@ -31,6 +31,7 @@ impl DiscretionaryOmicronZone { // Zones that we should place but don't yet. BlueprintZoneType::Clickhouse(_) | BlueprintZoneType::ClickhouseKeeper(_) + | BlueprintZoneType::ClickhouseServer(_) | BlueprintZoneType::CruciblePantry(_) | BlueprintZoneType::ExternalDns(_) | BlueprintZoneType::InternalDns(_) diff --git a/nexus/src/app/background/tasks/sync_service_zone_nat.rs b/nexus/src/app/background/tasks/sync_service_zone_nat.rs index eb9554cff7..4fbef3ae2e 100644 --- a/nexus/src/app/background/tasks/sync_service_zone_nat.rs +++ b/nexus/src/app/background/tasks/sync_service_zone_nat.rs @@ -239,6 +239,7 @@ impl BackgroundTask for ServiceZoneNatTracker { // well OmicronZoneType::Clickhouse {..} => continue, OmicronZoneType::ClickhouseKeeper {..} => continue, + OmicronZoneType::ClickhouseServer{..} => continue, OmicronZoneType::CockroachDb {..} => continue, OmicronZoneType::Crucible {..} => continue, OmicronZoneType::CruciblePantry {..} => continue, diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 4342adb02b..cc48f2646a 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -671,6 +671,11 @@ impl BlueprintZoneConfig { blueprint_zone_type::ClickhouseKeeper { address, dataset }, ) } + OmicronZoneType::ClickhouseServer { address, dataset } => { + BlueprintZoneType::ClickhouseServer( + blueprint_zone_type::ClickhouseServer { address, dataset }, + ) + } OmicronZoneType::CockroachDb { address, dataset } => { BlueprintZoneType::CockroachDb( blueprint_zone_type::CockroachDb { address, dataset }, diff --git a/nexus/types/src/deployment/zone_type.rs b/nexus/types/src/deployment/zone_type.rs index 789a0215b7..e4958fc3c3 100644 --- a/nexus/types/src/deployment/zone_type.rs +++ b/nexus/types/src/deployment/zone_type.rs @@ -25,6 +25,7 @@ pub enum BlueprintZoneType { BoundaryNtp(blueprint_zone_type::BoundaryNtp), Clickhouse(blueprint_zone_type::Clickhouse), ClickhouseKeeper(blueprint_zone_type::ClickhouseKeeper), + ClickhouseServer(blueprint_zone_type::ClickhouseServer), CockroachDb(blueprint_zone_type::CockroachDb), Crucible(blueprint_zone_type::Crucible), CruciblePantry(blueprint_zone_type::CruciblePantry), @@ -60,6 +61,7 @@ impl BlueprintZoneType { } BlueprintZoneType::Clickhouse(_) | BlueprintZoneType::ClickhouseKeeper(_) + | BlueprintZoneType::ClickhouseServer(_) | BlueprintZoneType::CockroachDb(_) | BlueprintZoneType::Crucible(_) | BlueprintZoneType::CruciblePantry(_) @@ -78,6 +80,7 @@ impl BlueprintZoneType { | BlueprintZoneType::ExternalDns(_) | 
BlueprintZoneType::Clickhouse(_) | BlueprintZoneType::ClickhouseKeeper(_) + | BlueprintZoneType::ClickhouseServer(_) | BlueprintZoneType::CockroachDb(_) | BlueprintZoneType::Crucible(_) | BlueprintZoneType::CruciblePantry(_) @@ -94,6 +97,7 @@ impl BlueprintZoneType { | BlueprintZoneType::ExternalDns(_) | BlueprintZoneType::Clickhouse(_) | BlueprintZoneType::ClickhouseKeeper(_) + | BlueprintZoneType::ClickhouseServer(_) | BlueprintZoneType::CockroachDb(_) | BlueprintZoneType::Crucible(_) | BlueprintZoneType::CruciblePantry(_) @@ -110,6 +114,7 @@ impl BlueprintZoneType { BlueprintZoneType::BoundaryNtp(_) | BlueprintZoneType::Clickhouse(_) | BlueprintZoneType::ClickhouseKeeper(_) + | BlueprintZoneType::ClickhouseServer(_) | BlueprintZoneType::CockroachDb(_) | BlueprintZoneType::CruciblePantry(_) | BlueprintZoneType::ExternalDns(_) @@ -129,6 +134,9 @@ impl BlueprintZoneType { BlueprintZoneType::ClickhouseKeeper( blueprint_zone_type::ClickhouseKeeper { dataset, address }, ) => (dataset, DatasetKind::ClickhouseKeeper, address), + BlueprintZoneType::ClickhouseServer( + blueprint_zone_type::ClickhouseServer { dataset, address }, + ) => (dataset, DatasetKind::ClickhouseServer, address), BlueprintZoneType::CockroachDb( blueprint_zone_type::CockroachDb { dataset, address }, ) => (dataset, DatasetKind::Cockroach, address), @@ -185,6 +193,12 @@ impl From for OmicronZoneType { dataset: zone.dataset, } } + BlueprintZoneType::ClickhouseServer(zone) => { + Self::ClickhouseServer { + address: zone.address, + dataset: zone.dataset, + } + } BlueprintZoneType::CockroachDb(zone) => Self::CockroachDb { address: zone.address, dataset: zone.dataset, @@ -235,6 +249,7 @@ impl BlueprintZoneType { Self::BoundaryNtp(_) => ZoneKind::BoundaryNtp, Self::Clickhouse(_) => ZoneKind::Clickhouse, Self::ClickhouseKeeper(_) => ZoneKind::ClickhouseKeeper, + Self::ClickhouseServer(_) => ZoneKind::ClickhouseServer, Self::CockroachDb(_) => ZoneKind::CockroachDb, Self::Crucible(_) => ZoneKind::Crucible, Self::CruciblePantry(_) => ZoneKind::CruciblePantry, @@ -273,6 +288,7 @@ pub mod blueprint_zone_type { pub external_ip: OmicronZoneExternalSnatIp, } + /// Used in single-node clickhouse setups #[derive( Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, )] @@ -289,6 +305,15 @@ pub mod blueprint_zone_type { pub dataset: OmicronZoneDataset, } + /// Used in replicated clickhouse setups + #[derive( + Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, + )] + pub struct ClickhouseServer { + pub address: SocketAddrV6, + pub dataset: OmicronZoneDataset, + } + #[derive( Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, )] diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index a181d14540..5dd7d3dea3 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -2154,6 +2154,7 @@ ] }, { + "description": "Used in single-node clickhouse setups", "type": "object", "properties": { "address": { @@ -2197,6 +2198,29 @@ "type" ] }, + { + "description": "Used in replicated clickhouse setups", + "type": "object", + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/components/schemas/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "clickhouse_server" + ] + } + }, + "required": [ + "address", + "dataset", + "type" + ] + }, { "type": "object", "properties": { @@ -2645,14 +2669,37 @@ }, "DatasetKind": { "description": "Describes the purpose of the dataset.", - "type": "string", - "enum": [ - "crucible", - "cockroach", - 
"clickhouse", - "clickhouse_keeper", - "external_dns", - "internal_dns" + "oneOf": [ + { + "type": "string", + "enum": [ + "crucible", + "cockroach", + "external_dns", + "internal_dns" + ] + }, + { + "description": "Used for single-node clickhouse deployments", + "type": "string", + "enum": [ + "clickhouse" + ] + }, + { + "description": "Used for replicated clickhouse deployments", + "type": "string", + "enum": [ + "clickhouse_keeper" + ] + }, + { + "description": "Used for replicated clickhouse deployments", + "type": "string", + "enum": [ + "clickhouse_server" + ] + } ] }, "DatasetPutRequest": { diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index ecaff33042..21e1451689 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -3747,6 +3747,7 @@ ] }, { + "description": "Type of clickhouse zone used for a single node clickhouse deployment", "type": "object", "properties": { "address": { @@ -3769,6 +3770,7 @@ ] }, { + "description": "A zone used to run a Clickhouse Keeper node\n\nKeepers are only used in replicated clickhouse setups", "type": "object", "properties": { "address": { @@ -3790,6 +3792,29 @@ "type" ] }, + { + "description": "A zone used to run a Clickhouse Server in a replicated deployment", + "type": "object", + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/components/schemas/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "clickhouse_server" + ] + } + }, + "required": [ + "address", + "dataset", + "type" + ] + }, { "type": "object", "properties": { diff --git a/schema/all-zones-requests.json b/schema/all-zones-requests.json index 910feb8c74..4d20959ad1 100644 --- a/schema/all-zones-requests.json +++ b/schema/all-zones-requests.json @@ -353,6 +353,7 @@ } }, { + "description": "Type of clickhouse zone used for a single node clickhouse deployment", "type": "object", "required": [ "address", @@ -375,6 +376,7 @@ } }, { + "description": "A zone used to run a Clickhouse Keeper node\n\nKeepers are only used in replicated clickhouse setups", "type": "object", "required": [ "address", @@ -396,6 +398,29 @@ } } }, + { + "description": "A zone used to run a Clickhouse Server in a replicated deployment", + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "clickhouse_server" + ] + } + } + }, { "type": "object", "required": [ diff --git a/schema/crdb/add-clickhouse-server-enum-variants/up1.sql b/schema/crdb/add-clickhouse-server-enum-variants/up1.sql new file mode 100644 index 0000000000..9f1b4e419c --- /dev/null +++ b/schema/crdb/add-clickhouse-server-enum-variants/up1.sql @@ -0,0 +1 @@ +ALTER TYPE omicron.public.service_kind ADD VALUE IF NOT EXISTS 'clickhouse_server' AFTER 'clickhouse_keeper'; diff --git a/schema/crdb/add-clickhouse-server-enum-variants/up2.sql b/schema/crdb/add-clickhouse-server-enum-variants/up2.sql new file mode 100644 index 0000000000..b94a4df0cf --- /dev/null +++ b/schema/crdb/add-clickhouse-server-enum-variants/up2.sql @@ -0,0 +1 @@ +ALTER TYPE omicron.public.dataset_kind ADD VALUE IF NOT EXISTS 'clickhouse_server' AFTER 'clickhouse_keeper'; diff --git a/schema/crdb/add-clickhouse-server-enum-variants/up3.sql b/schema/crdb/add-clickhouse-server-enum-variants/up3.sql new file mode 100644 index 0000000000..874ccec8f2 --- /dev/null +++ b/schema/crdb/add-clickhouse-server-enum-variants/up3.sql 
@@ -0,0 +1 @@ +ALTER TYPE omicron.public.zone_type ADD VALUE IF NOT EXISTS 'clickhouse_server' AFTER 'clickhouse_keeper'; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 2a83f01298..ddc399d282 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -288,6 +288,7 @@ CREATE UNIQUE INDEX IF NOT EXISTS lookup_switch_by_rack ON omicron.public.switch CREATE TYPE IF NOT EXISTS omicron.public.service_kind AS ENUM ( 'clickhouse', 'clickhouse_keeper', + 'clickhouse_server', 'cockroach', 'crucible', 'crucible_pantry', @@ -506,6 +507,7 @@ CREATE TYPE IF NOT EXISTS omicron.public.dataset_kind AS ENUM ( 'cockroach', 'clickhouse', 'clickhouse_keeper', + 'clickhouse_server', 'external_dns', 'internal_dns' ); @@ -3209,6 +3211,7 @@ CREATE TYPE IF NOT EXISTS omicron.public.zone_type AS ENUM ( 'boundary_ntp', 'clickhouse', 'clickhouse_keeper', + 'clickhouse_server', 'cockroach_db', 'crucible', 'crucible_pantry', @@ -4214,7 +4217,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '86.0.0', NULL) + (TRUE, NOW(), NOW(), '87.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/rss-service-plan-v3.json b/schema/rss-service-plan-v3.json index fd4b9c7064..a003cde6f0 100644 --- a/schema/rss-service-plan-v3.json +++ b/schema/rss-service-plan-v3.json @@ -494,6 +494,7 @@ } }, { + "description": "Type of clickhouse zone used for a single node clickhouse deployment", "type": "object", "required": [ "address", @@ -516,6 +517,7 @@ } }, { + "description": "A zone used to run a Clickhouse Keeper node\n\nKeepers are only used in replicated clickhouse setups", "type": "object", "required": [ "address", @@ -537,6 +539,29 @@ } } }, + { + "description": "A zone used to run a Clickhouse Server in a replicated deployment", + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "clickhouse_server" + ] + } + } + }, { "type": "object", "required": [ diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index 4a7885279c..aa5e8fd26f 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -277,6 +277,9 @@ pub(crate) trait OmicronZoneTypeExt { OmicronZoneType::ClickhouseKeeper { dataset, address, .. } => { Some((dataset, DatasetType::ClickhouseKeeper, address)) } + OmicronZoneType::ClickhouseServer { dataset, address, .. } => { + Some((dataset, DatasetType::ClickhouseServer, address)) + } OmicronZoneType::CockroachDb { dataset, address, .. } => { Some((dataset, DatasetType::CockroachDb, address)) } diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index a79d5b68e7..e319b3fa15 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -1587,6 +1587,24 @@ impl ServiceManager { RunningZone::boot(installed_zone).await? } + ZoneArgs::Omicron(OmicronZoneConfigLocal { + zone: + OmicronZoneConfig { + zone_type: OmicronZoneType::ClickhouseServer { .. }, + underlay_address: _, + .. + }, + .. 
+ }) => { + // We aren't yet deploying this service + error!( + &self.inner.log, + "Deploying ClickhouseServer zones is not yet supported" + ); + + todo!() + } + ZoneArgs::Omicron(OmicronZoneConfigLocal { zone: OmicronZoneConfig { diff --git a/sled-storage/src/dataset.rs b/sled-storage/src/dataset.rs index 26b5085609..74f2be782f 100644 --- a/sled-storage/src/dataset.rs +++ b/sled-storage/src/dataset.rs @@ -142,6 +142,7 @@ pub enum DatasetType { Crucible, Clickhouse, ClickhouseKeeper, + ClickhouseServer, ExternalDns, InternalDns, } @@ -164,6 +165,7 @@ impl DatasetType { Self::CockroachDb => DatasetKind::Cockroach, Self::Clickhouse => DatasetKind::Clickhouse, Self::ClickhouseKeeper => DatasetKind::ClickhouseKeeper, + Self::ClickhouseServer => DatasetKind::ClickhouseServer, Self::ExternalDns => DatasetKind::ExternalDns, Self::InternalDns => DatasetKind::InternalDns, } @@ -206,6 +208,7 @@ impl std::fmt::Display for DatasetType { CockroachDb => "cockroachdb", Clickhouse => "clickhouse", ClickhouseKeeper => "clickhouse_keeper", + ClickhouseServer => "clickhouse_server", ExternalDns => "external_dns", InternalDns => "internal_dns", }; From cf0b0fda6c792bfc3d347fdefec368dd1e7b1f93 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Tue, 13 Aug 2024 10:46:18 -0700 Subject: [PATCH 24/91] Update Dendrite to use new timeseries, expunge the old (#6271) --- .../replicated/9/timeseries-to-delete.txt | 47 +++++++++++++++++++ .../single-node/9/timeseries-to-delete.txt | 47 +++++++++++++++++++ oximeter/db/src/model.rs | 2 +- package-manifest.toml | 12 ++--- tools/dendrite_openapi_version | 4 +- tools/dendrite_stub_checksums | 6 +-- 6 files changed, 106 insertions(+), 12 deletions(-) create mode 100644 oximeter/db/schema/replicated/9/timeseries-to-delete.txt create mode 100644 oximeter/db/schema/single-node/9/timeseries-to-delete.txt diff --git a/oximeter/db/schema/replicated/9/timeseries-to-delete.txt b/oximeter/db/schema/replicated/9/timeseries-to-delete.txt new file mode 100644 index 0000000000..449d2e9155 --- /dev/null +++ b/oximeter/db/schema/replicated/9/timeseries-to-delete.txt @@ -0,0 +1,47 @@ +data_link:abort +data_link:b_e_r_check_done +data_link:b_e_r_check_start +data_link:bad_sync_headers +data_link:disabled +data_link:enabled +data_link:end +data_link:errored_blocks +data_link:fec_align +data_link:fec_corr_cnt +data_link:fec_hi_ser +data_link:fec_ser_lane0 +data_link:fec_ser_lane1 +data_link:fec_ser_lane2 +data_link:fec_ser_lane3 +data_link:fec_ser_lane4 +data_link:fec_ser_lane5 +data_link:fec_ser_lane6 +data_link:fec_ser_lane7 +data_link:fec_uncorr_cnt +data_link:idle +data_link:link_down +data_link:link_up +data_link:monitor_p_r_b_s_errors +data_link:pci_hi_ber +data_link:pcs_block_lock_loss +data_link:pcs_invalid_errors +data_link:pcs_sync_loss +data_link:pcs_unknown_errors +data_link:pcs_valid_errors +data_link:remote_fault +data_link:rx_buf_full +data_link:rx_bytes +data_link:rx_crc_errs +data_link:rx_errs +data_link:rx_pkts +data_link:tofino3_states +data_link:tx_bytes +data_link:tx_errs +data_link:tx_pkts +data_link:wait_auto_neg_done +data_link:wait_auto_neg_link_training_done +data_link:wait_d_f_e_done +data_link:wait_p_l_l_ready +data_link:wait_signal_o_k +data_link:wait_test_done +sidecar:sample_time diff --git a/oximeter/db/schema/single-node/9/timeseries-to-delete.txt b/oximeter/db/schema/single-node/9/timeseries-to-delete.txt new file mode 100644 index 0000000000..449d2e9155 --- /dev/null +++ b/oximeter/db/schema/single-node/9/timeseries-to-delete.txt @@ -0,0 +1,47 @@ 
+data_link:abort +data_link:b_e_r_check_done +data_link:b_e_r_check_start +data_link:bad_sync_headers +data_link:disabled +data_link:enabled +data_link:end +data_link:errored_blocks +data_link:fec_align +data_link:fec_corr_cnt +data_link:fec_hi_ser +data_link:fec_ser_lane0 +data_link:fec_ser_lane1 +data_link:fec_ser_lane2 +data_link:fec_ser_lane3 +data_link:fec_ser_lane4 +data_link:fec_ser_lane5 +data_link:fec_ser_lane6 +data_link:fec_ser_lane7 +data_link:fec_uncorr_cnt +data_link:idle +data_link:link_down +data_link:link_up +data_link:monitor_p_r_b_s_errors +data_link:pci_hi_ber +data_link:pcs_block_lock_loss +data_link:pcs_invalid_errors +data_link:pcs_sync_loss +data_link:pcs_unknown_errors +data_link:pcs_valid_errors +data_link:remote_fault +data_link:rx_buf_full +data_link:rx_bytes +data_link:rx_crc_errs +data_link:rx_errs +data_link:rx_pkts +data_link:tofino3_states +data_link:tx_bytes +data_link:tx_errs +data_link:tx_pkts +data_link:wait_auto_neg_done +data_link:wait_auto_neg_link_training_done +data_link:wait_d_f_e_done +data_link:wait_p_l_l_ready +data_link:wait_signal_o_k +data_link:wait_test_done +sidecar:sample_time diff --git a/oximeter/db/src/model.rs b/oximeter/db/src/model.rs index 05667058b5..f27df4ed49 100644 --- a/oximeter/db/src/model.rs +++ b/oximeter/db/src/model.rs @@ -45,7 +45,7 @@ use uuid::Uuid; /// - [`crate::Client::initialize_db_with_version`] /// - [`crate::Client::ensure_schema`] /// - The `clickhouse-schema-updater` binary in this crate -pub const OXIMETER_VERSION: u64 = 8; +pub const OXIMETER_VERSION: u64 = 9; // Wrapper type to represent a boolean in the database. // diff --git a/package-manifest.toml b/package-manifest.toml index 5ee81e722b..2c68257050 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -660,8 +660,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. source.type = "prebuilt" source.repo = "dendrite" -source.commit = "9811438cc91c6ec4e8a8ca12479c920bb25fec81" -source.sha256 = "4b09ea6d89af353fd4240a3cfde8655c555f6f42e05c6fc4a4e32724f86bb749" +source.commit = "8293f28df659c070b48e13f87a51b836238b406e" +source.sha256 = "7400e4b0942b33af64a9aad1a429b0e2446e126f58a780328cf10eb46c63b7f8" output.type = "zone" output.intermediate_only = true @@ -687,8 +687,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. source.type = "prebuilt" source.repo = "dendrite" -source.commit = "9811438cc91c6ec4e8a8ca12479c920bb25fec81" -source.sha256 = "224ff076a3031d5b913e40084a48fce7bdd08e8ef1abd1ab74df0058963bb3b2" +source.commit = "8293f28df659c070b48e13f87a51b836238b406e" +source.sha256 = "68bf16452a3159529fb1bd11f43adfb002020d086e0f64f48bd766bf47843ae9" output.type = "zone" output.intermediate_only = true @@ -707,8 +707,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. 
source.type = "prebuilt" source.repo = "dendrite" -source.commit = "9811438cc91c6ec4e8a8ca12479c920bb25fec81" -source.sha256 = "66b04128c41ad9cd26ca3746d51fff5d295ca65f48e7aabee616026934cc8d5e" +source.commit = "8293f28df659c070b48e13f87a51b836238b406e" +source.sha256 = "b7d6a1a20f302ded9c6e4bbba66b9432bec5edda593edfcdbb9429a95201655a" output.type = "zone" output.intermediate_only = true diff --git a/tools/dendrite_openapi_version b/tools/dendrite_openapi_version index b7d34debd1..652ebc31eb 100755 --- a/tools/dendrite_openapi_version +++ b/tools/dendrite_openapi_version @@ -1,2 +1,2 @@ -COMMIT="9811438cc91c6ec4e8a8ca12479c920bb25fec81" -SHA2="12dc61e7c62b2e1ee1cf3c2bf7cdda6bee6ec96925d2fc1c021c6c1a8fdd56cd" +COMMIT="8293f28df659c070b48e13f87a51b836238b406e" +SHA2="3a54305ab4b1270c9a5fb0603f481fce199f3767c174a03559ff642f7f44687e" diff --git a/tools/dendrite_stub_checksums b/tools/dendrite_stub_checksums index 5d5b60ff57..cd8eb65a3e 100644 --- a/tools/dendrite_stub_checksums +++ b/tools/dendrite_stub_checksums @@ -1,3 +1,3 @@ -CIDL_SHA256_ILLUMOS="4b09ea6d89af353fd4240a3cfde8655c555f6f42e05c6fc4a4e32724f86bb749" -CIDL_SHA256_LINUX_DPD="fb597785b6fd94b0840a80ff82bc596426aa6b815dd64793075f05d2ba5db38d" -CIDL_SHA256_LINUX_SWADM="9be30b688301debe4103057730ff9a426c96b45d571a6287268f381d8a11dbc1" +CIDL_SHA256_ILLUMOS="7400e4b0942b33af64a9aad1a429b0e2446e126f58a780328cf10eb46c63b7f8" +CIDL_SHA256_LINUX_DPD="290edfc4076d31d6f70aa7cc16ce758e10d14777d8542b688fa2880fdfde398c" +CIDL_SHA256_LINUX_SWADM="e1e35784538a4fdd76dc257cc636ac3f43f7ef2842dabfe981f17f8ce6b8e1a2" From b6e9078b6e15b2bc48874312d876e3aa01b52881 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 13 Aug 2024 10:49:53 -0700 Subject: [PATCH 25/91] move relevant docs from RFDs 48, 61 into Omicron (#6206) --- README.adoc | 2 +- docs/architecture-rev-2024-08-01.svg | 1 + docs/control-plane-architecture.adoc | 254 +++++++++++++++++++++++++++ 3 files changed, 256 insertions(+), 1 deletion(-) create mode 100644 docs/architecture-rev-2024-08-01.svg create mode 100644 docs/control-plane-architecture.adoc diff --git a/README.adoc b/README.adoc index 6b24821c6e..449bd3e5ea 100644 --- a/README.adoc +++ b/README.adoc @@ -14,7 +14,7 @@ Omicron is open-source. But we're pretty focused on our own goals for the forese https://docs.oxide.computer/api[Docs are automatically generated for the public (externally-facing) API] based on the OpenAPI spec that itself is automatically generated from the server implementation. You can generate your own docs for either the public API or any of the internal APIs by feeding the corresponding OpenAPI specs (in link:./openapi[]) into an OpenAPI doc generator. -There are some internal design docs in the link:./docs[] directory. +There are some internal design docs in the link:./docs[] directory. You might start with link:./docs/control-plane-architecture.adoc[]. For more design documentation and internal Rust API docs, see the https://rust.docs.corp.oxide.computer/omicron/[generated Rust documentation]. 
You can generate this yourself with:
diff --git a/docs/architecture-rev-2024-08-01.svg b/docs/architecture-rev-2024-08-01.svg
new file mode 100644
index 0000000000..a952297de4
--- /dev/null
+++ b/docs/architecture-rev-2024-08-01.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/control-plane-architecture.adoc b/docs/control-plane-architecture.adoc
new file mode 100644
index 0000000000..931da9ce02
--- /dev/null
+++ b/docs/control-plane-architecture.adoc
@@ -0,0 +1,254 @@
+:showtitle:
+:numbered:
+:toc: left
+
+//
+// The sources for the diagrams in this document are in the Oxide Google Drive
+// folder for Control Plane Architecture:
+// https://drive.google.com/open?id=1OI-QxmapK7oYGFRGp0suJdpQDft-qVAz
+//
+
+= Control plane architecture
+
+NOTE: Much of this material originally came from <<rfd48>> and <<rfd61>>. This is now the living documentation for all the material covered here.
+
+NOTE: The RFD references in this documentation may be Oxide-internal. Where possible, we're trying to move relevant documentation from those RFDs into docs here.
+
+== What is the control plane
+
+In software systems the terms **data plane** and **control plane** are often used to refer to the parts of the system that directly provide resources to users (the data plane) and the parts that support the configuration, control, monitoring, and operation of the system (the control plane). Within the Oxide system, we say that the data plane comprises those parts that provide CPU resources (including both the host CPU and hypervisor software), storage resources, and network resources. The control plane provides the APIs through which users provision, configure, and monitor these resources and the mechanisms through which these APIs are implemented. Also part of the control plane are the APIs and facilities through which operators manage the system itself, including fault management, alerting, software updates for various components of the system, and so on.
+
+Broadly, the control plane must provide:
+
+* an externally-facing API endpoint described in <<rfd4>> through which users can provision elastic infrastructure backed by the system. This includes APIs for compute instances, storage, networking, as well as supporting resources like organizations, users, groups, ssh keys, tags, and so on. This API may be used by developers directly as well as the developer console backend. See <<rfd30>>.
+* an externally-facing API endpoint for all operator functions. This is a long list, including configuration and management of hardware and software components and monitoring.
+* implementation of lifecycle activities, like initial system setup; adding, removing, or replacing servers or other components; and the like.
+* facilities for remote support by Oxide, including secure access to crash dumps, core files, log files, and system consoles.
+
+== Fundamental properties
+
+NOTE: These are design goals. They have not all been fully implemented yet.
+
+**Availability.** Availability of the control plane refers to the property that requests to provision resources succeed when the underlying resources are available within the system and requests to reconfigure or monitor resources succeed as long as they are well-formed. Unavailability refers to request failure due to hardware or software failure.
+
+IMPORTANT: Generally, the control plane is expected to remain **available** in the face of any two hardware or software failures, including transient failures of individual compute sleds, power rectifiers, switches, or the like.
+
+**Durability.** Along the same lines, resources created in the control plane are expected to be durable unless otherwise specified. That is, if the whole system is powered off and on again ("cold start"), the system should converge to a point where all instances, disks, and networking resources that were running before the power outage are available as they were from the user's perspective before the event. Similarly, if a compute server is lost (either through graceful decommissioning or otherwise), it should be possible to resume service of resources that were running on that server (e.g., instances, disks) on other servers in the system. There may be additional constraints on how many servers can fail permanently before data is lost, but in no case should it be possible to permanently lose an instance, disk, or other resource after the permanent failure of two compute sleds.
+
+IMPORTANT: Resources created by users should generally survive permanent failure of any two hardware or software components.
+
+**Consistency.** Generally, users can expect strong consistency for resources within some namespace. The bounds of the namespace for a particular resource may vary as described in <<rfd24>>. For example, if a user creates an instance, another user with appropriate permissions should immediately see that instance. In terms of https://en.wikipedia.org/wiki/CAP_theorem[**CAP**], the system is generally CP, with an emphasis on avoiding partitions through reliable software and hardware.
+
+IMPORTANT: The API namespace is generally expected to provide strong consistency.
+
+**Scalability and performance.** The API is designed with a scheme for naming and pagination that supports operating on arbitrarily large collections, so in principle it's expected to support arbitrary numbers of most resources. In practice, the system is intended to support on the order of 100 servers in a rack and 10,000 VMs in a rack. While these numbers are unlikely to change drastically in the future, the long-term goal of providing a single view over multiple racks means the system will need to support much larger numbers of servers and other resources. To avoid catastrophic degradation in performance (to the point of unavailability) as the system is scaled, aggressive limits will be imposed on the numbers of most resources. Operators may choose to raise these limits but will be advised to test the system's performance at the new scale.
+
+IMPORTANT: The API should support arbitrarily large systems. The system itself should be clear about its target scale and avoid catastrophic degradation due to users consuming too many resources.
+
+**Security.** Older versions of <<rfd6>> discussed control plane security in great detail. That content needs to be extracted from the history and probably put here.
+
+**Supportability and debuggability.** Effective customer support includes rapidly diagnosing issues and releasing fixes with low-risk updates. To achieve this, all the software in the system, including the control plane, must be built with supportability in mind, which means being able to collect enough information about failures to diagnose them from their first occurrence in the field as much as possible and being able to update software with low risk to the system. Details will be covered in an RFD to-be-named-later.
+
+== Parts of the control plane
+
+=== Crash course on hardware architecture
+
+For our purposes, an Oxide rack comprises three types of boards (systems):
+
+* Up to 32 compute **sleds** (servers).
These are sometimes called **Gimlets**, though "Gimlet" technically refers to a particular hardware generation. Within the sled, the **host system** is the x86 box we generally think of as "the server".
+* 1 or 2 **switches**, each attached via PCIe to one of the 32 compute sleds. (The switches are _also_ connected to each of the 32 sleds for networking. This PCIe connection we're talking about is for control of the switch itself, which is only done by one sled.) The chassis that house the switches are sometimes called **Sidecars**, though "Sidecar" technically refers to a particular hardware generation. Sleds that are attached to switches are often called **Scrimlets** (which is a little unfortunate since the name obviously comes from "Gimlet", but it might not be a Gimlet (since Gimlet refers to a specific hardware generation)).
+* 1-2 power shelves, each with a **Power Shelf Controller (PSC)** that provides basic monitoring and control for the rectifiers that make up the power shelf.
+
+Each type of system (Gimlet, Sidecar, and PSC) contains a **service processor** (SP) that's responsible for basic monitoring and control, typically including power control and thermal management.
+
+<<rfd210>> discusses service processors in more detail.
+
+=== Components that run alongside specific hardware
+
+.Overview of the control plane
+image::architecture-rev-2024-08-01.svg[Control Plane Architecture]
+
+At the "bottom" of the stack, we have a few basic components that reside alongside the specific pieces of hardware that they manage:
+
+* On each sled, the **sled agent** manages instances, storage, networking, and the sled's other resources. Sled agent also collects information about hardware and reports it to Nexus. Each sled also runs either a **boundary NTP** or **internal NTP** service to synchronize the sled's clock. More on boundary NTP below.
+* On the two Scrimlets, a "switch zone" provides additional functionality related to the switch:
+** **Dendrite** provides APIs for configuring the switch itself (e.g., populating various tables used for packet forwarding, NAT, etc.).
+** **Management Gateway Service (MGS)** provides APIs for communicating with all the rack's service processors (including those on the sleds, Sidecars, and PSCs). See <<rfd210>> for details.
+** **Wicket** and its associated service **wicketd** provide a text user interface (TUI) that's accessible over the rack's technician ports. Wicket is used for initial system setup (before networking has been configured) and for support.
+** **Boundary NTP** provides NTP service for all sleds in the rack based on upstream NTP servers provided by the customer.
+
+.Components deployed alongside specific hardware
+[cols="1h,2,4",stripes="none",options="header"]
+|===
+| Component
+| How it's deployed
+| Availability/scalability
+
+| Sled agent
+| One per sled, tied to that specific sled
+| N/A
+
+| Internal NTP
+| One zone per non-Scrimlet sled
+| N/A
+
+| Boundary NTP
+| One zone per Scrimlet. Both instances within a rack are fungible.
+| There are two. Short-term failure (order of hours or even days) is unlikely to affect anything since sled clocks do not drift that quickly.
+
+| Dendrite
+| Part of the switch zone (one per Scrimlet), tied to that specific switch
+| Unavailability of either instance results in loss of ability to configure and monitor the corresponding switch.
+
+| Management Gateway
+| Part of the switch zone (one per Scrimlet). Both instances within one rack are fungible.
+| Only one of the two instances is generally required to maintain service.
+
+| Wicket
+| Part of the switch zone (one per Scrimlet). Both instances within one rack are fungible.
+| Wickets operate independently. Failure of one means unavailability of the TUI over that technician port.
+
+|===
+
+=== Higher-level components
+
+Most other components:
+
+* are deployed in illumos zones
+* don't care where they run and can even be deployed multiple times on the same sled
+* can be deployed multiple times for availability, horizontal scalability, or both
+
+They are:
+
+* **Nexus** provides primary control for the whole control plane. Nexus hosts all user-facing APIs (both operator and customer), the web console, and internal APIs for other control plane components to report inventory, generate alerts, and so on. Nexus is also responsible for background control plane activity, including utilization management, server failure detection and recovery, and the like. Persistent state is stored elsewhere (in CockroachDB), which allows Nexus to be scaled separately.
+* **CockroachDB** provides a replicated, strongly-consistent, horizontally scalable database that stores virtually all control plane data. See <<rfd53>> and <<rfd110>> for details.
+* **Clickhouse** provides storage and querying services for metric data collected from all components in the rack. See <<rfd125>> for more information.
+* **Oximeter** collects metric data from the other components and stores it into Clickhouse. See <<rfd162>> for more information.
+* **External DNS** operates authoritative DNS nameservers for end users and operators. These are authoritative nameservers for whatever DNS name the customer specifies. They currently just provide DNS names for the external API and web console.
+* **Internal DNS** provides DNS names for all control plane components. This is how most of the control plane discovers its dependencies. (See <<rfd206>> and <<rfd248>>.)
+
+
+.Hardware-agnostic components
+[cols="1h,2,4,4",stripes="none",options="header"]
+|===
+| Component
+| How it's deployed
+| Horizontal scalability
+| Availability
+
+| Nexus
+| Using zones, as many as needed. Instances are fungible.
+| Not architecturally limited. State provided by CockroachDB.
+| With N instances needed to handle load, and M instances deployed, can survive M - N failures.
+
+| CockroachDB
+| Using zones, as many as needed. Instances are fungible.
+| Required, provided by CockroachDB cluster expansion.
+| Required, provided by CockroachDB range replication.
+
+| Clickhouse
+| Using zones, as many as needed. Instances are fungible.
+| TBD
+| Required, provided by Clickhouse replication (see <<rfd468>>).
+
+| Oximeter
+| Using zones, as many as needed.
+| Yes. Configuration managed by Nexus, stored in CockroachDB, and cached in local storage for improved availability when other components are down
+| TBD.
+
+| External DNS
+| Using zones, as many as needed. Instances are fungible.
+| Not architecturally limited. Generally limited by the number of external DNS server IP addresses provided by the customer, which is usually 2-5.
+| Generally, only one is needed for service.
+
+| Internal DNS
+| Using zones, as many as needed. Instances are fungible.
+| Hardcoded limit of 5.
+| With N instances needed to handle load, and M instances deployed, can survive M - N failures.
+
+|===
+
+== Design principles
+
+=== Basics
+
+As much as possible, components are deployed in illumos zones.
These are lightweight containers that act as their own complete systems (e.g., with their own dedicated networking stack with its own interfaces, IPs, etc.).
+
+Oxide-produced components are written in Rust. They communicate over HTTP using APIs managed via OpenAPI using Dropshot. HTTP may not provide the best latency, but we don't expect the throughput of API requests to be so high or the target latency so low that the overhead of HTTP internally will noticeably impact the customer experience. Using OpenAPI enables us to leverage investments in OpenAPI libraries, tooling, and documentation that we need for the external API. Rigorous use of OpenAPI, including automatically generating OpenAPI specifications from server implementations, allows us to automatically identify potentially breaking API changes. This information will eventually be included in metadata associated with each component's update images so that the upgrade software can use this to ensure that only compatible combinations of components are deployed.
+
+Service discovery happens via DNS. See <<rfd206>> and <<rfd248>>.
+
+=== Nexus, data flow
+
+Nexus is the place where system-wide decisions get made. CockroachDB is the source of truth for all configuration.
+
+Nexus stores all of its state in CockroachDB. It's the only component that communicates directly with CockroachDB.
+
+Nexus instances operate independently, without directly coordinating with each other except through CockroachDB.
+
+Generally, when a change gets made, the process is:
+
+1. Nexus receives a request to make the change (e.g., via the external API)
+2. Nexus validates the requested change
+3. Nexus stores the information into CockroachDB. (This is the point where the change is serialized against any concurrent changes.)
+4. Nexus propagates the change to other components that need to know about it.
+
+There are a few basic contexts in Nexus:
+
+* **API requests** from either the external or internal API. Here, Nexus is latency-sensitive. When we make database queries or other requests in this context, we usually do _not_ retry transient failures, but leave that to callers (See https://en.wikipedia.org/wiki/End-to-end_principle["end-to-end principle"]). API request handlers may kick off sagas or activate background tasks.
+* **Distributed sagas** are a https://www.youtube.com/watch?v=0UTOLRTwOX0[design pattern] for carrying out multi-step operations in a distributed system. Saga actions generally _do_ retry transient errors indefinitely.
+* **Background tasks** are periodic or event-triggered activities that manage everything else that has to happen in the system (e.g., change propagation, CockroachDB cluster management, fault tolerance, etc.). Nexus has a framework for background tasks that's oriented around the "reconciler" pattern (see <<rfd373>>). In this context, we also usually don't retry individual operations -- instead, the entire activity will be retried on a periodic basis. Background tasks are structured to re-evaluate the state of the world each time they're run and then determine what to do, on the assumption that things may have changed since the last time they ran.
+
+It's essential that components provide visibility into what they're doing for debugging and support. Software should be able to exonerate itself when things are broken.
+
+* API requests are short-lived. The Nexus log is currently the only real way to see what these have done.
+* Sagas are potentially long-lived.
Without needing any per-saga work, the saga log provides detailed information about which steps have run, which steps are in-progress, and the results of each step that completed.
+* Background tasks are continuous processes. They can provide whatever detailed status they want to, including things like: activity counters, error counters, ringbuffers of recent events, data produced by the task, etc. These can be viewed with `omdb`.
+
+== Cold start
+
+"Cold start" refers to starting the control plane from a rack that's completely powered off. Achieving this requires careful consideration of where configuration is stored and how configuration changes flow through the system.
+
+We'll start from the point where sleds are powered on, even though a lot happens with the rectifiers, service processors, Sidecars, etc. before that point. Once host systems are powered on:
+
+* Sled agents start up, communicate with each other, and form a trust quorum that enables each of them to decrypt their local storage. This local storage includes:
+** a **bootstore** containing basic network configuration needed to bring up the rack
+** information about what control plane services are running on this sled
+* Sled agents apply any needed network configuration and start any services they're supposed to be running:
+** On Scrimlets, the switch zone and boundary NTP are started. Boundary NTP synchronizes time from the customer-provided NTP servers.
+** On non-Scrimlets, internal NTP is started. The rest of cold boot waits until time has been synchronized from the boundary NTP instances.
+** Once time is synchronized, internal DNS services are started so that components can find each other.
+** Once internal DNS is available, all other services are started concurrently.
+*** CockroachDB nodes start up, discover the rest of the cluster via DNS, and form a cluster.
+*** Nexus starts up and waits for CockroachDB to become available.
+*** All other services start up and wait for their dependencies to become available.
+
+For this to work:
+
+* **Bootstore** must contain enough information to configure networking on the switches and each host to reach other services within the rack as well as the outside world (for NTP).
+* **Internal DNS** must be able to come up without any external dependencies, meaning it stores a complete copy of all DNS data locally.
+
+However, Nexus is the place where all _changes_ to configuration are made, and CockroachDB is the source of truth for all configuration. As a result, when changing bootstore contents or internal DNS, the change is first made at Nexus, stored into CockroachDB, and then propagated to all sleds and internal DNS instances for local persistent storage so that it's available on cold start (of the _sled_) without the rest of the control plane being up.
+
+This is a very rough approximation, but gives an idea of the dependencies associated with cold start.
+
+[bibliography]
+== References
+
+Unfortunately, most of these RFDs are not yet public.
+
+* [[[rfd4, RFD 4]]] https://rfd.shared.oxide.computer/rfd/4/[RFD 4 User Facing API]
+* [[[rfd6, RFD 6]]] https://rfd.shared.oxide.computer/rfd/6/[RFD 6 Threat Model]. Note the reference above comes from an earlier version of RFD 6 (7e44771b239c0458aea2b6e2045294d41b79cb22 or earlier).
+* [[[rfd24, RFD 24]]] https://rfd.shared.oxide.computer/rfd/24/[RFD 24 Multi-Rack Oxide Deployments] +* [[[rfd30, RFD 30]]] https://rfd.shared.oxide.computer/rfd/30/[RFD 30 Oxide Console Prototype] +* [[[rfd48, RFD 48]]] https://rfd.shared.oxide.computer/rfd/48/[RFD 48 Control Plane Requirements] +* [[[rfd53, RFD 53]]] https://rfd.shared.oxide.computer/rfd/53/[RFD 53 Control plane data storage requirements] +* [[[rfd61, RFD 61]]] https://rfd.shared.oxide.computer/rfd/61/[RFD 61 Control Plane Architecture and Design] +* [[[rfd110, RFD 110]]] https://rfd.shared.oxide.computer/rfd/110/[RFD 110 CockroachDB for the control plane database] +* [[[rfd125, RFD 125]]] https://rfd.shared.oxide.computer/rfd/125/[RFD 125 Telemetry requirements and building blocks] +* [[[rfd162, RFD 162]]] https://rfd.shared.oxide.computer/rfd/162/[RFD 162 Metrics collection architecture and design] +* [[[rfd206, RFD 206]]] https://rfd.shared.oxide.computer/rfd/206/[RFD 206 Service Discovery] +* [[[rfd210, RFD 210]]] https://rfd.shared.oxide.computer/rfd/210/[RFD 210 Omicron, service processors, and power shelf controllers] +* [[[rfd248, RFD 248]]] https://rfd.shared.oxide.computer/rfd/248/[RFD 248 Omicron service discovery: server side] +* [[[rfd373, RFD 373]]] https://rfd.shared.oxide.computer/rfd/373/[RFD 373 Reliable Persistent Workflows] +* [[[rfd468, RFD 468]]] https://rfd.shared.oxide.computer/rfd/468/[RFD 468 Rolling out replicated ClickHouse to new and existing racks] From e48cd90c6eaca31a7e377256d46aad7aa5167958 Mon Sep 17 00:00:00 2001 From: Rain Date: Tue, 13 Aug 2024 12:24:30 -0700 Subject: [PATCH 26/91] [omicron-zones] ensure name_prefix for clickhouse-server is valid (#6312) Followup from #6297 -- `name_prefix` requires dashes. --- Cargo.lock | 1 + nexus-sled-agent-shared/Cargo.toml | 1 + nexus-sled-agent-shared/src/inventory.rs | 28 ++++++++++++++++++++++-- 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5b38a4905e..3f7b669e37 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5140,6 +5140,7 @@ dependencies = [ "schemars", "serde", "sled-hardware-types", + "strum", "uuid", ] diff --git a/nexus-sled-agent-shared/Cargo.toml b/nexus-sled-agent-shared/Cargo.toml index 8e2358e902..544cebfbe4 100644 --- a/nexus-sled-agent-shared/Cargo.toml +++ b/nexus-sled-agent-shared/Cargo.toml @@ -14,4 +14,5 @@ omicron-workspace-hack.workspace = true schemars.workspace = true serde.workspace = true sled-hardware-types.workspace = true +strum.workspace = true uuid.workspace = true diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index 2f1361a6f2..2a94fc50db 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -20,6 +20,7 @@ use serde::{Deserialize, Serialize}; // Export this type for convenience -- this way, dependents don't have to // depend on sled-hardware-types. pub use sled_hardware_types::Baseboard; +use strum::EnumIter; use uuid::Uuid; /// Identifies information about disks which may be attached to Sleds. @@ -381,7 +382,9 @@ impl OmicronZoneType { /// the four representations if at all possible. If you must add a new one, /// please add it here rather than doing something ad-hoc in the calling code /// so it's more legible. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
+#[derive(
+ Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, EnumIter,
+)]
pub enum ZoneKind {
BoundaryNtp,
Clickhouse,
@@ -453,7 +456,7 @@ impl ZoneKind {
ZoneKind::BoundaryNtp | ZoneKind::InternalNtp => Self::NTP_PREFIX,
ZoneKind::Clickhouse => "clickhouse",
ZoneKind::ClickhouseKeeper => "clickhouse-keeper",
- ZoneKind::ClickhouseServer => "clickhouse_server",
+ ZoneKind::ClickhouseServer => "clickhouse-server",
// Note "cockroach" for historical reasons.
ZoneKind::CockroachDb => "cockroach",
ZoneKind::Crucible => "crucible",
@@ -486,3 +489,24 @@ impl ZoneKind {
}
}
}
+
+#[cfg(test)]
+mod tests {
+ use omicron_common::api::external::Name;
+ use strum::IntoEnumIterator;
+
+ use super::*;
+
+ #[test]
+ fn test_name_prefixes() {
+ for zone_kind in ZoneKind::iter() {
+ let name_prefix = zone_kind.name_prefix();
+ name_prefix.parse::<Name>().unwrap_or_else(|e| {
+ panic!(
+ "failed to parse name prefix {:?} for zone kind {:?}: {}",
+ name_prefix, zone_kind, e
+ );
+ });
+ }
+ }
+}

From 2c588522a02d8a8de34148f648ad7c1963c695dc Mon Sep 17 00:00:00 2001
From: Eliza Weisman
Date: Tue, 13 Aug 2024 15:08:04 -0700
Subject: [PATCH 27/91] [nix flake] update lockfile (#6314)

The current Rust version required by our `rust-toolchain.toml` wasn't in the version of Nixpkgs that the flake's lockfile was pointed at. This commit updates it so that Nix users can once again get the proper toolchain.

---
flake.lock | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/flake.lock b/flake.lock
index 5a70a42881..2c0393f722 100644
--- a/flake.lock
+++ b/flake.lock
@@ -2,11 +2,11 @@
"nodes": {
"nixpkgs": {
"locked": {
- "lastModified": 1712791164,
- "narHash": "sha256-3sbWO1mbpWsLepZGbWaMovSO7ndZeFqDSdX0hZ9nVyw=",
"owner": "NixOS",
"repo": "nixpkgs",
- "rev": "1042fd8b148a9105f3c0aca3a6177fd1d9360ba5",
+ "lastModified": 1723175592,
+ "narHash": "sha256-M0xJ3FbDUc4fRZ84dPGx5VvgFsOzds77KiBMW/mMTnI=",
+ "rev": "5e0ca22929f3342b19569b21b2f3462f053e497b",
"type": "github"
},
"original": {
@@ -29,11 +29,11 @@
]
},
"locked": {
- "lastModified": 1719368303,
- "narHash": "sha256-vhkKOUs9eOZgcPrA6wMw7a7J48pEjVuhzQfitVwVv1g=",
"owner": "oxalica",
"repo": "rust-overlay",
- "rev": "32415b22fd3b454e4a1385af64aa5cef9766ff4c",
+ "lastModified": 1723429325,
+ "narHash": "sha256-4x/32xTCd+xCwFoI/kKSiCr5LQA2ZlyTRYXKEni5HR8=",
+ "rev": "65e3dc0fe079fe8df087cd38f1fe6836a0373aad",
"type": "github"
},
"original": {

From 544fa4c681fb7794778d3abe4dcc3ac9d150fd5f Mon Sep 17 00:00:00 2001
From: Rain
Date: Tue, 13 Aug 2024 15:24:50 -0700
Subject: [PATCH 28/91] [sled-agent] turn API into a trait (#6159)

Building on the changes in #6138, this PR turns the sled-agent API into a trait. This is high-impact because it currently is part of a circular dependency with the Nexus internal API -- this PR gets us to the point where the OpenAPI documents can be generated independently of each other.

There is some code movement but there are no functional changes in this PR -- I didn't use `replace` here to keep the PR small. I'll try out `replace` for some of the types with `From` impls in a subsequent PR. (I did change the internal `VpcFirewallRule` to `ResolvedVpcFirewallRule` to match the other types in the destination file.)

I've also added some documentation about when `nexus-sled-agent-shared` should be used.

Depends on #6283.
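
For illustration, the trait-based pattern looks roughly like this -- a minimal sketch with one representative endpoint, where the import paths and the endpoint shown are illustrative; the real trait in `sled-agent/api` defines the full sled-agent surface:

```rust
use dropshot::{HttpError, HttpResponseOk, RequestContext};
// Assumes the response type lives in the sled-agent-types crate
// (see sled-agent/types/src/time_sync.rs below).
use sled_agent_types::time_sync::TimeSync;

/// Sketch of the sled-agent API as a Dropshot trait: endpoints are
/// declared on the trait, and the real sled-agent and the simulator
/// each supply an implementation via their own `Context` type.
#[dropshot::api_description]
pub trait SledAgentApi {
    type Context;

    /// Example endpoint: report the sled's time-sync status.
    #[endpoint { method = GET, path = "/timesync" }]
    async fn timesync_get(
        rqctx: RequestContext<Self::Context>,
    ) -> Result<HttpResponseOk<TimeSync>, HttpError>;
}
```

The macro also generates a `sled_agent_api_mod` module whose `stub_api_description()` builds the OpenAPI description from the trait alone (see the `openapi-manager` change below), which is what lets the document be generated without linking in a server implementation.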
--- Cargo.lock | 32 +- Cargo.toml | 3 + clients/installinator-client/Cargo.toml | 1 + clients/installinator-client/src/lib.rs | 2 +- common/src/api/internal/shared.rs | 49 + common/src/disk.rs | 115 ++ dev-tools/openapi-manager/Cargo.toml | 1 + dev-tools/openapi-manager/src/spec.rs | 10 + illumos-utils/src/opte/firewall_rules.rs | 6 +- illumos-utils/src/opte/mod.rs | 1 - illumos-utils/src/opte/params.rs | 65 - illumos-utils/src/opte/port_manager.rs | 10 +- installinator-common/Cargo.toml | 1 + installinator-common/src/progress.rs | 43 +- installinator/src/write.rs | 11 +- nexus-sled-agent-shared/Cargo.toml | 3 + nexus-sled-agent-shared/README.md | 17 +- nexus-sled-agent-shared/src/lib.rs | 8 + nexus/Cargo.toml | 1 + nexus/networking/src/firewall_rules.rs | 24 +- nexus/src/app/sagas/instance_ip_attach.rs | 15 +- nexus/src/app/vpc.rs | 3 +- openapi/installinator.json | 2 +- openapi/sled-agent.json | 115 +- sled-agent/Cargo.toml | 3 +- sled-agent/api/Cargo.toml | 21 + sled-agent/api/src/lib.rs | 549 ++++++ sled-agent/src/bin/sled-agent.rs | 18 +- sled-agent/src/boot_disk_os_writer.rs | 41 +- sled-agent/src/bootstrap/client.rs | 2 +- sled-agent/src/bootstrap/params.rs | 208 +-- sled-agent/src/bootstrap/rss_handle.rs | 2 +- sled-agent/src/bootstrap/server.rs | 5 +- sled-agent/src/bootstrap/sprockets_server.rs | 2 +- sled-agent/src/common/disk.rs | 2 +- sled-agent/src/http_entrypoints.rs | 1560 +++++++---------- sled-agent/src/instance.rs | 24 +- sled-agent/src/instance_manager.rs | 9 +- sled-agent/src/long_running_tasks.rs | 3 +- sled-agent/src/params.rs | 310 +--- sled-agent/src/probe_manager.rs | 7 +- sled-agent/src/rack_setup/plan/service.rs | 2 +- sled-agent/src/rack_setup/plan/sled.rs | 7 +- sled-agent/src/rack_setup/service.rs | 5 +- sled-agent/src/server.rs | 13 +- sled-agent/src/services.rs | 9 +- sled-agent/src/sim/collection.rs | 3 +- sled-agent/src/sim/disk.rs | 2 +- sled-agent/src/sim/http_entrypoints.rs | 18 +- sled-agent/src/sim/instance.rs | 2 +- sled-agent/src/sim/sled_agent.rs | 17 +- sled-agent/src/sim/storage.rs | 4 +- sled-agent/src/sled_agent.rs | 52 +- sled-agent/src/zone_bundle.rs | 519 +----- .../tests/integration_tests/commands.rs | 25 - .../tests/output/cmd-sled-agent-noargs-stderr | 5 +- .../cmd-sled-agent-openapi-bootstrap-stderr | 0 .../output/cmd-sled-agent-openapi-sled-stderr | 0 sled-agent/types/Cargo.toml | 6 +- sled-agent/types/src/boot_disk.rs | 62 + sled-agent/types/src/bootstore.rs | 51 + sled-agent/types/src/disk.rs | 41 + sled-agent/types/src/firewall_rules.rs | 16 + sled-agent/types/src/instance.rs | 172 ++ sled-agent/types/src/lib.rs | 8 + sled-agent/types/src/sled.rs | 219 +++ sled-agent/types/src/time_sync.rs | 30 + sled-agent/types/src/zone_bundle.rs | 529 ++++++ sled-storage/src/manager.rs | 7 +- sled-storage/src/resources.rs | 76 +- wicketd/src/installinator_progress.rs | 6 +- wicketd/src/update_tracker.rs | 2 +- 72 files changed, 2791 insertions(+), 2421 deletions(-) delete mode 100644 illumos-utils/src/opte/params.rs create mode 100644 sled-agent/api/Cargo.toml create mode 100644 sled-agent/api/src/lib.rs delete mode 100644 sled-agent/tests/output/cmd-sled-agent-openapi-bootstrap-stderr delete mode 100644 sled-agent/tests/output/cmd-sled-agent-openapi-sled-stderr create mode 100644 sled-agent/types/src/boot_disk.rs create mode 100644 sled-agent/types/src/bootstore.rs create mode 100644 sled-agent/types/src/disk.rs create mode 100644 sled-agent/types/src/firewall_rules.rs create mode 100644 sled-agent/types/src/instance.rs create mode 100644 
sled-agent/types/src/sled.rs create mode 100644 sled-agent/types/src/time_sync.rs create mode 100644 sled-agent/types/src/zone_bundle.rs diff --git a/Cargo.lock b/Cargo.lock index 3f7b669e37..b588f2738d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3822,6 +3822,7 @@ name = "installinator-client" version = "0.1.0" dependencies = [ "installinator-common", + "omicron-common", "omicron-workspace-hack", "progenitor", "regress", @@ -3842,6 +3843,7 @@ dependencies = [ "camino", "illumos-utils", "libc", + "omicron-common", "omicron-workspace-hack", "proptest", "schemars", @@ -5139,8 +5141,10 @@ dependencies = [ "omicron-workspace-hack", "schemars", "serde", + "serde_json", "sled-hardware-types", "strum", + "thiserror", "uuid", ] @@ -5842,6 +5846,7 @@ dependencies = [ "serde_with", "similar-asserts", "sled-agent-client", + "sled-agent-types", "slog", "slog-async", "slog-dtrace", @@ -6071,8 +6076,6 @@ dependencies = [ "omicron-uuid-kinds", "omicron-workspace-hack", "once_cell", - "openapi-lint", - "openapiv3", "opte-ioctl", "oximeter", "oximeter-instruments", @@ -6090,6 +6093,7 @@ dependencies = [ "serde_human_bytes", "serde_json", "sha3", + "sled-agent-api", "sled-agent-client", "sled-agent-types", "sled-hardware", @@ -6369,6 +6373,7 @@ dependencies = [ "oximeter-api", "serde_json", "similar", + "sled-agent-api", "supports-color", "wicketd-api", ] @@ -9138,6 +9143,23 @@ dependencies = [ "parking_lot 0.11.2", ] +[[package]] +name = "sled-agent-api" +version = "0.1.0" +dependencies = [ + "camino", + "dropshot", + "nexus-sled-agent-shared", + "omicron-common", + "omicron-uuid-kinds", + "omicron-workspace-hack", + "schemars", + "serde", + "sled-agent-types", + "sled-hardware-types", + "uuid", +] + [[package]] name = "sled-agent-client" version = "0.1.0" @@ -9165,21 +9187,25 @@ name = "sled-agent-types" version = "0.1.0" dependencies = [ "anyhow", + "async-trait", "bootstore", "camino", "camino-tempfile", + "chrono", "nexus-sled-agent-shared", "omicron-common", "omicron-test-utils", "omicron-uuid-kinds", "omicron-workspace-hack", "oxnet", + "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=24a74d0c76b6a63961ecef76acb1516b6e66c5c9)", "rcgen", "schemars", "serde", + "serde_human_bytes", "serde_json", + "sha3", "sled-hardware-types", - "sled-storage", "slog", "thiserror", "toml 0.8.19", diff --git a/Cargo.toml b/Cargo.toml index bb899f8825..962bfb82de 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -85,6 +85,7 @@ members = [ "passwords", "rpaths", "sled-agent", + "sled-agent/api", "sled-agent/bootstrap-agent-api", "sled-agent/types", "sled-hardware", @@ -196,6 +197,7 @@ default-members = [ "passwords", "rpaths", "sled-agent", + "sled-agent/api", "sled-agent/bootstrap-agent-api", "sled-agent/types", "sled-hardware", @@ -517,6 +519,7 @@ similar-asserts = "1.5.0" # are still doing mupdate a change to the on-disk format will break existing DNS # server zones. 
sled = "=0.34.7"
+sled-agent-api = { path = "sled-agent/api" }
sled-agent-client = { path = "clients/sled-agent-client" }
sled-agent-types = { path = "sled-agent/types" }
sled-hardware = { path = "sled-hardware" }
diff --git a/clients/installinator-client/Cargo.toml b/clients/installinator-client/Cargo.toml
index ca2de0476a..ba869d79bd 100644
--- a/clients/installinator-client/Cargo.toml
+++ b/clients/installinator-client/Cargo.toml
@@ -9,6 +9,7 @@ workspace = true

[dependencies]
installinator-common.workspace = true
+omicron-common.workspace = true
progenitor.workspace = true
regress.workspace = true
reqwest = { workspace = true, features = ["rustls-tls", "stream"] }
diff --git a/clients/installinator-client/src/lib.rs b/clients/installinator-client/src/lib.rs
index a39ff3ff80..3b7abc333b 100644
--- a/clients/installinator-client/src/lib.rs
+++ b/clients/installinator-client/src/lib.rs
@@ -21,7 +21,7 @@ progenitor::generate_api!(
replace = {
Duration = std::time::Duration,
EventReportForInstallinatorSpec = installinator_common::EventReport,
- M2Slot = installinator_common::M2Slot,
+ M2Slot = omicron_common::disk::M2Slot,
ProgressEventForGenericSpec = installinator_common::ProgressEvent,
ProgressEventForInstallinatorSpec = installinator_common::ProgressEvent,
StepEventForGenericSpec = installinator_common::StepEvent,
diff --git a/common/src/api/internal/shared.rs b/common/src/api/internal/shared.rs
index 3856a472ab..089ff9b324 100644
--- a/common/src/api/internal/shared.rs
+++ b/common/src/api/internal/shared.rs
@@ -19,6 +19,8 @@ use std::{
};
use uuid::Uuid;

+use super::nexus::HostIdentifier;
+
/// The type of network interface
#[derive(
Clone,
@@ -635,6 +637,53 @@ pub struct ResolvedVpcRoute {
pub target: RouterTarget,
}

+/// VPC firewall rule after object name resolution has been performed by Nexus
+#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
+pub struct ResolvedVpcFirewallRule {
+ pub status: external::VpcFirewallRuleStatus,
+ pub direction: external::VpcFirewallRuleDirection,
+ pub targets: Vec<NetworkInterface>,
+ pub filter_hosts: Option<Vec<HostIdentifier>>,
+ pub filter_ports: Option<Vec<external::L4PortRange>>,
+ pub filter_protocols: Option<Vec<external::VpcFirewallRuleProtocol>>,
+ pub action: external::VpcFirewallRuleAction,
+ pub priority: external::VpcFirewallRulePriority,
+}
+
+/// A mapping from a virtual NIC to a physical host
+#[derive(
+ Clone, Debug, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash,
+)]
+pub struct VirtualNetworkInterfaceHost {
+ pub virtual_ip: IpAddr,
+ pub virtual_mac: external::MacAddr,
+ pub physical_host_ip: Ipv6Addr,
+ pub vni: external::Vni,
+}
+
+/// DHCP configuration for a port
+///
+/// Not present here: Hostname (DHCPv4 option 12; used in DHCPv6 option 39); we
+/// use `InstanceRuntimeState::hostname` for this value.
+#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
+pub struct DhcpConfig {
+ /// DNS servers to send to the instance
+ ///
+ /// (DHCPv4 option 6; DHCPv6 option 23)
+ pub dns_servers: Vec<IpAddr>,
+
+ /// DNS zone this instance's hostname belongs to (e.g. the `project.example`
+ /// part of `instance1.project.example`)
+ ///
+ /// (DHCPv4 option 15; used in DHCPv6 option 39)
+ pub host_domain: Option<String>,
+
+ /// DNS search domains
+ ///
+ /// (DHCPv4 option 119; DHCPv6 option 24)
+ pub search_domains: Vec<String>,
+}
+
/// The target for a given router entry.
#[derive(
Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash,
)]
diff --git a/common/src/disk.rs b/common/src/disk.rs
index 4b4cd2e69d..d8b4c2e0a1 100644
--- a/common/src/disk.rs
+++ b/common/src/disk.rs
@@ -4,6 +4,9 @@
//!
Disk related types shared among crates
+use std::fmt;
+
+use anyhow::bail;
use omicron_uuid_kinds::ZpoolUuid;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
@@ -114,3 +117,115 @@ impl From<PhysicalDiskKind> for DiskVariant {
}
}
}
+
+/// Identifies how a single disk management operation may have succeeded or
+/// failed.
+#[derive(Debug, JsonSchema, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub struct DiskManagementStatus {
+ pub identity: DiskIdentity,
+ pub err: Option<DiskManagementError>,
+}
+
+/// The result from attempting to manage underlying disks.
+///
+/// This is more complex than a simple "Error" type because it's possible
+/// for some disks to be initialized correctly, while others can fail.
+///
+/// This structure provides a mechanism for callers to learn about partial
+/// failures, and handle them appropriately on a per-disk basis.
+#[derive(Default, Debug, JsonSchema, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+#[must_use = "this `DiskManagementResult` may contain errors, which should be handled"]
+pub struct DisksManagementResult {
+ pub status: Vec<DiskManagementStatus>,
+}
+
+impl DisksManagementResult {
+ pub fn has_error(&self) -> bool {
+ for status in &self.status {
+ if status.err.is_some() {
+ return true;
+ }
+ }
+ false
+ }
+
+ pub fn has_retryable_error(&self) -> bool {
+ for status in &self.status {
+ if let Some(err) = &status.err {
+ if err.retryable() {
+ return true;
+ }
+ }
+ }
+ false
+ }
+}
+
+#[derive(Debug, thiserror::Error, JsonSchema, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case", tag = "type", content = "value")]
+pub enum DiskManagementError {
+ #[error("Disk requested by control plane, but not found on device")]
+ NotFound,
+
+ #[error("Expected zpool UUID of {expected}, but saw {observed}")]
+ ZpoolUuidMismatch { expected: ZpoolUuid, observed: ZpoolUuid },
+
+ #[error("Failed to access keys necessary to unlock storage. This error may be transient.")]
+ KeyManager(String),
+
+ #[error("Other error starting disk management: {0}")]
+ Other(String),
+}
+
+impl DiskManagementError {
+ fn retryable(&self) -> bool {
+ match self {
+ DiskManagementError::KeyManager(_) => true,
+ _ => false,
+ }
+ }
+}
+
+/// Describes an M.2 slot, often in the context of writing a system image to
+/// it.
+#[derive(
+ Debug,
+ Clone,
+ Copy,
+ PartialEq,
+ Eq,
+ PartialOrd,
+ Ord,
+ Deserialize,
+ Serialize,
+ JsonSchema,
+)]
+pub enum M2Slot {
+ A,
+ B,
+}
+
+impl fmt::Display for M2Slot {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match self {
+ Self::A => f.write_str("A"),
+ Self::B => f.write_str("B"),
+ }
+ }
+}
+
+impl TryFrom<i64> for M2Slot {
+ type Error = anyhow::Error;
+
+ fn try_from(value: i64) -> Result<Self, Self::Error> {
+ match value {
+ // Gimlet should have 2 M.2 drives: drive A is assigned slot 17, and
+ // drive B is assigned slot 18.
+ 17 => Ok(Self::A),
+ 18 => Ok(Self::B),
+ _ => bail!("unexpected M.2 slot {value}"),
+ }
+ }
+}
diff --git a/dev-tools/openapi-manager/Cargo.toml b/dev-tools/openapi-manager/Cargo.toml
index e60000cc06..85d27aaafd 100644
--- a/dev-tools/openapi-manager/Cargo.toml
+++ b/dev-tools/openapi-manager/Cargo.toml
@@ -27,6 +27,7 @@ openapi-lint.workspace = true
owo-colors.workspace = true
oximeter-api.workspace = true
serde_json.workspace = true
+sled-agent-api.workspace = true
similar.workspace = true
supports-color.workspace = true
wicketd-api.workspace = true
diff --git a/dev-tools/openapi-manager/src/spec.rs b/dev-tools/openapi-manager/src/spec.rs
index f991d35ec4..37a657ee93 100644
--- a/dev-tools/openapi-manager/src/spec.rs
+++ b/dev-tools/openapi-manager/src/spec.rs
@@ -87,6 +87,16 @@ pub fn all_apis() -> Vec<ApiSpec> {
filename: "oximeter.json",
extra_validation: None,
},
+ ApiSpec {
+ title: "Oxide Sled Agent API",
+ version: "0.0.1",
+ description: "API for interacting with individual sleds",
+ boundary: ApiBoundary::Internal,
+ api_description:
+ sled_agent_api::sled_agent_api_mod::stub_api_description,
+ filename: "sled-agent.json",
+ extra_validation: None,
+ },
ApiSpec {
title: "Oxide Technician Port Control Service",
version: "0.0.1",
diff --git a/illumos-utils/src/opte/firewall_rules.rs b/illumos-utils/src/opte/firewall_rules.rs
index 4dcb390e9e..26ab4d6218 100644
--- a/illumos-utils/src/opte/firewall_rules.rs
+++ b/illumos-utils/src/opte/firewall_rules.rs
@@ -5,7 +5,6 @@
//! Convert Omicron VPC firewall rules to OPTE firewall rules.
use super::net_to_cidr;
-use crate::opte::params::VpcFirewallRule;
use crate::opte::Vni;
use macaddr::MacAddr6;
use omicron_common::api::external::VpcFirewallRuleAction;
@@ -13,6 +12,7 @@ use omicron_common::api::external::VpcFirewallRuleDirection;
use omicron_common::api::external::VpcFirewallRuleProtocol;
use omicron_common::api::external::VpcFirewallRuleStatus;
use omicron_common::api::internal::nexus::HostIdentifier;
+use omicron_common::api::internal::shared::ResolvedVpcFirewallRule;
use oxide_vpc::api::Address;
use oxide_vpc::api::Direction;
use oxide_vpc::api::Filters;
@@ -34,7 +34,7 @@ trait FromVpcFirewallRule {
fn protos(&self) -> Vec<ProtoFilter>;
}
-impl FromVpcFirewallRule for VpcFirewallRule {
+impl FromVpcFirewallRule for ResolvedVpcFirewallRule {
fn action(&self) -> FirewallAction {
match self.action {
VpcFirewallRuleAction::Allow => FirewallAction::Allow,
@@ -118,7 +118,7 @@ impl FromVpcFirewallRule for VpcFirewallRule {
/// a single host address and protocol, so we must unroll rules with multiple
/// hosts/protocols.
pub fn opte_firewall_rules(
- rules: &[VpcFirewallRule],
+ rules: &[ResolvedVpcFirewallRule],
vni: &Vni,
mac: &MacAddr6,
) -> Vec<FirewallRule> {
diff --git a/illumos-utils/src/opte/mod.rs b/illumos-utils/src/opte/mod.rs
index d7fd96b0c0..9a86711ae6 100644
--- a/illumos-utils/src/opte/mod.rs
+++ b/illumos-utils/src/opte/mod.rs
@@ -13,7 +13,6 @@ cfg_if::cfg_if! {
}
mod firewall_rules;
-pub mod params;
mod port;
mod port_manager;
diff --git a/illumos-utils/src/opte/params.rs b/illumos-utils/src/opte/params.rs
deleted file mode 100644
index 17c61d680f..0000000000
--- a/illumos-utils/src/opte/params.rs
+++ /dev/null
@@ -1,65 +0,0 @@
-// This Source Code Form is subject to the terms of the Mozilla Public
-// License, v. 2.0. If a copy of the MPL was not distributed with this
-// file, You can obtain one at https://mozilla.org/MPL/2.0/.
-
-use omicron_common::api::external;
-use omicron_common::api::internal::nexus::HostIdentifier;
-use omicron_common::api::internal::shared::NetworkInterface;
-use schemars::JsonSchema;
-use serde::{Deserialize, Serialize};
-use std::net::IpAddr;
-use std::net::Ipv6Addr;
-
-/// Update firewall rules for a VPC
-#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
-pub struct VpcFirewallRulesEnsureBody {
- pub vni: external::Vni,
- pub rules: Vec<VpcFirewallRule>,
-}
-
-/// VPC firewall rule after object name resolution has been performed by Nexus
-#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
-pub struct VpcFirewallRule {
- pub status: external::VpcFirewallRuleStatus,
- pub direction: external::VpcFirewallRuleDirection,
- pub targets: Vec<NetworkInterface>,
- pub filter_hosts: Option<Vec<HostIdentifier>>,
- pub filter_ports: Option<Vec<external::L4PortRange>>,
- pub filter_protocols: Option<Vec<external::VpcFirewallRuleProtocol>>,
- pub action: external::VpcFirewallRuleAction,
- pub priority: external::VpcFirewallRulePriority,
-}
-
-/// A mapping from a virtual NIC to a physical host
-#[derive(
- Clone, Debug, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Hash,
-)]
-pub struct VirtualNetworkInterfaceHost {
- pub virtual_ip: IpAddr,
- pub virtual_mac: external::MacAddr,
- pub physical_host_ip: Ipv6Addr,
- pub vni: external::Vni,
-}
-
-/// DHCP configuration for a port
-///
-/// Not present here: Hostname (DHCPv4 option 12; used in DHCPv6 option 39); we
-/// use `InstanceRuntimeState::hostname` for this value.
-#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
-pub struct DhcpConfig {
- /// DNS servers to send to the instance
- ///
- /// (DHCPv4 option 6; DHCPv6 option 23)
- pub dns_servers: Vec<IpAddr>,
-
- /// DNS zone this instance's hostname belongs to (e.g. the `project.example`
- /// part of `instance1.project.example`)
- ///
- /// (DHCPv4 option 15; used in DHCPv6 option 39)
- pub host_domain: Option<String>,
-
- /// DNS search domains
- ///
- /// (DHCPv4 option 119; DHCPv6 option 24)
- pub search_domains: Vec<String>,
-}
diff --git a/illumos-utils/src/opte/port_manager.rs b/illumos-utils/src/opte/port_manager.rs
index 93c646cfab..735428907e 100644
--- a/illumos-utils/src/opte/port_manager.rs
+++ b/illumos-utils/src/opte/port_manager.rs
@@ -6,8 +6,6 @@
use crate::dladm::OPTE_LINK_PREFIX;
use crate::opte::opte_firewall_rules;
-use crate::opte::params::VirtualNetworkInterfaceHost;
-use crate::opte::params::VpcFirewallRule;
use crate::opte::port::PortData;
use crate::opte::Error;
use crate::opte::Gateway;
@@ -17,6 +15,7 @@ use ipnetwork::IpNetwork;
use omicron_common::api::external;
use omicron_common::api::internal::shared::NetworkInterface;
use omicron_common::api::internal::shared::NetworkInterfaceKind;
+use omicron_common::api::internal::shared::ResolvedVpcFirewallRule;
use omicron_common::api::internal::shared::ResolvedVpcRoute;
use omicron_common::api::internal::shared::ResolvedVpcRouteSet;
use omicron_common::api::internal::shared::ResolvedVpcRouteState;
@@ -24,6 +23,7 @@ use omicron_common::api::internal::shared::RouterId;
use omicron_common::api::internal::shared::RouterTarget as ApiRouterTarget;
use omicron_common::api::internal::shared::RouterVersion;
use omicron_common::api::internal::shared::SourceNatConfig;
+use omicron_common::api::internal::shared::VirtualNetworkInterfaceHost;
use oxide_vpc::api::AddRouterEntryReq;
use oxide_vpc::api::DelRouterEntryReq;
use oxide_vpc::api::DhcpCfg;
@@ -96,7 +96,7 @@ pub struct PortCreateParams<'a> {
pub source_nat: Option<SourceNatConfig>,
pub ephemeral_ip: Option<IpAddr>,
pub floating_ips: &'a [IpAddr],
- pub firewall_rules: &'a [VpcFirewallRule],
+ pub firewall_rules:
     pub dhcp_config: DhcpCfg,
     pub is_service: bool,
 }
@@ -664,7 +664,7 @@ impl PortManager {
     pub fn firewall_rules_ensure(
         &self,
         vni: external::Vni,
-        rules: &[VpcFirewallRule],
+        rules: &[ResolvedVpcFirewallRule],
     ) -> Result<(), Error> {
         use opte_ioctl::OpteHdl;
@@ -705,7 +705,7 @@ impl PortManager {
     pub fn firewall_rules_ensure(
         &self,
         vni: external::Vni,
-        rules: &[VpcFirewallRule],
+        rules: &[ResolvedVpcFirewallRule],
     ) -> Result<(), Error> {
         info!(
             self.inner.log,
diff --git a/installinator-common/Cargo.toml b/installinator-common/Cargo.toml
index 4c5560148f..039304c9de 100644
--- a/installinator-common/Cargo.toml
+++ b/installinator-common/Cargo.toml
@@ -11,6 +11,7 @@ workspace = true
 anyhow.workspace = true
 camino.workspace = true
 illumos-utils.workspace = true
+omicron-common.workspace = true
 libc.workspace = true
 schemars.workspace = true
 serde.workspace = true
diff --git a/installinator-common/src/progress.rs b/installinator-common/src/progress.rs
index 900fe70028..9078da6ba5 100644
--- a/installinator-common/src/progress.rs
+++ b/installinator-common/src/progress.rs
@@ -4,9 +4,9 @@
 
 use std::{collections::BTreeSet, fmt, net::SocketAddr};
 
-use anyhow::bail;
 use camino::Utf8PathBuf;
 use illumos_utils::zpool;
+use omicron_common::disk::M2Slot;
 use schemars::{
     gen::SchemaGenerator,
     schema::{Schema, SchemaObject},
@@ -165,47 +165,6 @@ impl WriteOutput {
     }
 }
 
-/// An M.2 slot that was written.
-#[derive(
-    Debug,
-    Clone,
-    Copy,
-    PartialEq,
-    Eq,
-    PartialOrd,
-    Ord,
-    Deserialize,
-    Serialize,
-    JsonSchema,
-)]
-pub enum M2Slot {
-    A,
-    B,
-}
-
-impl fmt::Display for M2Slot {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match self {
-            Self::A => f.write_str("A"),
-            Self::B => f.write_str("B"),
-        }
-    }
-}
-
-impl TryFrom<i64> for M2Slot {
-    type Error = anyhow::Error;
-
-    fn try_from(value: i64) -> Result<Self, Self::Error> {
-        match value {
-            // Gimlet should have 2 M.2 drives: drive A is assigned slot 17, and
-            // drive B is assigned slot 18.
-            17 => Ok(Self::A),
-            18 => Ok(Self::B),
-            _ => bail!("unexpected M.2 slot {value}"),
-        }
-    }
-}
-
 /// The specification for write events.
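// Editor's aside (illustrative sketch, not part of the patch): `M2Slot` now
// lives in `omicron_common::disk`, and the `TryFrom<i64>` impl above encodes
// the Gimlet slot assignment (slot 17 -> drive A, slot 18 -> drive B). A
// minimal use of the conversion, assuming only what is shown above:

fn m2_slot_for_device_slot(
    slot: i64,
) -> anyhow::Result<omicron_common::disk::M2Slot> {
    // Succeeds only for slots 17 and 18; anything else is rejected with
    // "unexpected M.2 slot".
    omicron_common::disk::M2Slot::try_from(slot)
}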
#[derive(JsonSchema)] pub enum WriteSpec {} diff --git a/installinator/src/write.rs b/installinator/src/write.rs index c7710baff7..fdc83cffa2 100644 --- a/installinator/src/write.rs +++ b/installinator/src/write.rs @@ -16,11 +16,14 @@ use bytes::Buf; use camino::{Utf8Path, Utf8PathBuf}; use illumos_utils::zpool::{Zpool, ZpoolName}; use installinator_common::{ - ControlPlaneZonesSpec, ControlPlaneZonesStepId, M2Slot, RawDiskWriter, - StepContext, StepProgress, StepResult, StepSuccess, UpdateEngine, - WriteComponent, WriteError, WriteOutput, WriteSpec, WriteStepId, + ControlPlaneZonesSpec, ControlPlaneZonesStepId, RawDiskWriter, StepContext, + StepProgress, StepResult, StepSuccess, UpdateEngine, WriteComponent, + WriteError, WriteOutput, WriteSpec, WriteStepId, +}; +use omicron_common::{ + disk::M2Slot, + update::{ArtifactHash, ArtifactHashId}, }; -use omicron_common::update::{ArtifactHash, ArtifactHashId}; use sha2::{Digest, Sha256}; use slog::{info, warn, Logger}; use tokio::{ diff --git a/nexus-sled-agent-shared/Cargo.toml b/nexus-sled-agent-shared/Cargo.toml index 544cebfbe4..144c755f34 100644 --- a/nexus-sled-agent-shared/Cargo.toml +++ b/nexus-sled-agent-shared/Cargo.toml @@ -11,8 +11,11 @@ omicron-common.workspace = true omicron-passwords.workspace = true omicron-uuid-kinds.workspace = true omicron-workspace-hack.workspace = true +# TODO: replace uses of propolis_client with local types schemars.workspace = true serde.workspace = true +serde_json.workspace = true sled-hardware-types.workspace = true strum.workspace = true +thiserror.workspace = true uuid.workspace = true diff --git a/nexus-sled-agent-shared/README.md b/nexus-sled-agent-shared/README.md index eeb3492eea..77b4d64486 100644 --- a/nexus-sled-agent-shared/README.md +++ b/nexus-sled-agent-shared/README.md @@ -3,7 +3,15 @@ Internal types shared between Nexus and sled-agent, with extra dependencies not in omicron-common. -**This crate should only be used for internal types and data structures.** +## Guidelines + +This crate should only be used for **internal types and data structures.** + +It should only be used for types that are used by **both `sled-agent-types` and `nexus-types`**. Prefer to put types in `sled-agent-types` or `nexus-types` if possible. + +- If a type is used by `sled-agent-api`, as well as any part of Nexus except `nexus-types`, put it in `sled-agent-types`. +- If a type is used by `nexus-internal-api`, as well as any part of sled-agent except `sled-agent-types`, put it in `nexus-types`. +- Only if a type is used by both `sled-agent-types` and `nexus-types` should it go here. ## Why not omicron-common? @@ -28,9 +36,10 @@ tokio-postgres, a dependency that is not a necessary component of sled-agent. ## Why not sled-agent-types or nexus-types? Types that are primarily used by sled-agent or nexus should continue to go in -those crates. However, types shared by both should go here. `sled-agent-types` -and `nexus-types` can thus avoid a dependency on each other: they're both "on -the same level" and neither dependency direction is clearly correct. +those crates. However, types used by both `nexus-types` and `sled-agent-types` +should go here. `sled-agent-types` and `nexus-types` can thus avoid a +dependency on each other: they're both "on the same level" and neither +dependency direction is clearly correct. ## Why not Progenitor-generated types? 
diff --git a/nexus-sled-agent-shared/src/lib.rs b/nexus-sled-agent-shared/src/lib.rs
index 6781568d62..12fc040bbb 100644
--- a/nexus-sled-agent-shared/src/lib.rs
+++ b/nexus-sled-agent-shared/src/lib.rs
@@ -5,6 +5,14 @@
 //! Internal types shared between Nexus and sled-agent, with extra dependencies
 //! not in omicron-common.
 //!
+//! Only types that are shared between `nexus-types` and `sled-agent-types`
+//! should go here.
+//!
+//! - If a type is used by `sled-agent-api` and Nexus, but is not required by
+//!   `nexus-types`, it should go in `sled-agent-types` instead.
+//! - If a type is used by `nexus-internal-api` and Nexus, but is not required
+//!   by `sled-agent-types`, it should go in `nexus-types` instead.
+//!
 //! For more information, see the crate [README](../README.md).
 
 pub mod inventory;
diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml
index a949b31f0d..86d9abc460 100644
--- a/nexus/Cargo.toml
+++ b/nexus/Cargo.toml
@@ -137,6 +137,7 @@ pretty_assertions.workspace = true
 rcgen.workspace = true
 regex.workspace = true
 similar-asserts.workspace = true
+sled-agent-types.workspace = true
 sp-sim.workspace = true
 rustls.workspace = true
 subprocess.workspace = true
diff --git a/nexus/networking/src/firewall_rules.rs b/nexus/networking/src/firewall_rules.rs
index 4ba66ec9f3..8491092353 100644
--- a/nexus/networking/src/firewall_rules.rs
+++ b/nexus/networking/src/firewall_rules.rs
@@ -49,7 +49,7 @@ pub async fn resolve_firewall_rules_for_sled_agent(
     vpc: &db::model::Vpc,
     rules: &[db::model::VpcFirewallRule],
     log: &Logger,
-) -> Result<Vec<sled_agent_client::types::VpcFirewallRule>, Error> {
+) -> Result<Vec<sled_agent_client::types::ResolvedVpcFirewallRule>, Error> {
     // Collect the names of instances, subnets, and VPCs that are either
     // targets or host filters. We have to find the sleds for all the
     // targets, and we'll need information about the IP addresses or
@@ -417,16 +417,18 @@ pub async fn resolve_firewall_rules_for_sled_agent(
             .as_ref()
             .map(|protocols| protocols.iter().map(|v| v.0.into()).collect());
 
-        sled_agent_rules.push(sled_agent_client::types::VpcFirewallRule {
-            status: rule.status.0.into(),
-            direction: rule.direction.0.into(),
-            targets,
-            filter_hosts,
-            filter_ports,
-            filter_protocols,
-            action: rule.action.0.into(),
-            priority: rule.priority.0 .0,
-        });
+        sled_agent_rules.push(
+            sled_agent_client::types::ResolvedVpcFirewallRule {
+                status: rule.status.0.into(),
+                direction: rule.direction.0.into(),
+                targets,
+                filter_hosts,
+                filter_ports,
+                filter_protocols,
+                action: rule.action.0.into(),
+                priority: rule.priority.0 .0,
+            },
+        );
     }
     debug!(
         log,
diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs
index b18ac3109f..a14054cf66 100644
--- a/nexus/src/app/sagas/instance_ip_attach.rs
+++ b/nexus/src/app/sagas/instance_ip_attach.rs
@@ -346,6 +346,7 @@ pub(crate) mod test {
     };
     use nexus_test_utils_macros::nexus_test;
     use omicron_common::api::external::SimpleIdentity;
+    use sled_agent_types::instance::InstanceExternalIpBody;
 
     type ControlPlaneTestContext =
         nexus_test_utils::ControlPlaneTestContext<crate::Server>;
@@ -437,14 +438,12 @@ pub(crate) mod test {
         // Sled agent has a record of the new external IPs.
let mut eips = sled_agent.external_ips.lock().await; let my_eips = eips.entry(instance_id.into_untyped_uuid()).or_default(); - assert!(my_eips.iter().any(|v| matches!( - v, - omicron_sled_agent::params::InstanceExternalIpBody::Floating(_) - ))); - assert!(my_eips.iter().any(|v| matches!( - v, - omicron_sled_agent::params::InstanceExternalIpBody::Ephemeral(_) - ))); + assert!(my_eips + .iter() + .any(|v| matches!(v, InstanceExternalIpBody::Floating(_)))); + assert!(my_eips + .iter() + .any(|v| matches!(v, InstanceExternalIpBody::Ephemeral(_)))); // DB has records for SNAT plus the new IPs. let db_eips = datastore diff --git a/nexus/src/app/vpc.rs b/nexus/src/app/vpc.rs index b3605945d3..56a7777f0e 100644 --- a/nexus/src/app/vpc.rs +++ b/nexus/src/app/vpc.rs @@ -260,7 +260,8 @@ impl super::Nexus { opctx: &OpContext, vpc: &db::model::Vpc, rules: &[db::model::VpcFirewallRule], - ) -> Result, Error> { + ) -> Result, Error> + { nexus_networking::resolve_firewall_rules_for_sled_agent( &self.db_datastore, opctx, diff --git a/openapi/installinator.json b/openapi/installinator.json index 0631344b25..6419760fbd 100644 --- a/openapi/installinator.json +++ b/openapi/installinator.json @@ -397,7 +397,7 @@ ] }, "M2Slot": { - "description": "An M.2 slot that was written.", + "description": "Describes an M.2 slot, often in the context of writing a system image to it.", "type": "string", "enum": [ "A", diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 21e1451689..1241248a5e 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -68,7 +68,6 @@ }, "/boot-disk/{boot_disk}/os/write/status": { "get": { - "summary": "Get the status of writing a new host OS", "operationId": "host_os_write_status_get", "parameters": [ { @@ -2945,7 +2944,7 @@ "firewall_rules": { "type": "array", "items": { - "$ref": "#/components/schemas/VpcFirewallRule" + "$ref": "#/components/schemas/ResolvedVpcFirewallRule" } }, "floating_ips": { @@ -4251,6 +4250,60 @@ "rack_subnet" ] }, + "ResolvedVpcFirewallRule": { + "description": "VPC firewall rule after object name resolution has been performed by Nexus", + "type": "object", + "properties": { + "action": { + "$ref": "#/components/schemas/VpcFirewallRuleAction" + }, + "direction": { + "$ref": "#/components/schemas/VpcFirewallRuleDirection" + }, + "filter_hosts": { + "nullable": true, + "type": "array", + "items": { + "$ref": "#/components/schemas/HostIdentifier" + } + }, + "filter_ports": { + "nullable": true, + "type": "array", + "items": { + "$ref": "#/components/schemas/L4PortRange" + } + }, + "filter_protocols": { + "nullable": true, + "type": "array", + "items": { + "$ref": "#/components/schemas/VpcFirewallRuleProtocol" + } + }, + "priority": { + "type": "integer", + "format": "uint16", + "minimum": 0 + }, + "status": { + "$ref": "#/components/schemas/VpcFirewallRuleStatus" + }, + "targets": { + "type": "array", + "items": { + "$ref": "#/components/schemas/NetworkInterface" + } + } + }, + "required": [ + "action", + "direction", + "priority", + "status", + "targets" + ] + }, "ResolvedVpcRoute": { "description": "A VPC route resolved into a concrete target.", "type": "object", @@ -5101,60 +5154,6 @@ } ] }, - "VpcFirewallRule": { - "description": "VPC firewall rule after object name resolution has been performed by Nexus", - "type": "object", - "properties": { - "action": { - "$ref": "#/components/schemas/VpcFirewallRuleAction" - }, - "direction": { - "$ref": "#/components/schemas/VpcFirewallRuleDirection" - }, - "filter_hosts": { - "nullable": true, - 
"type": "array", - "items": { - "$ref": "#/components/schemas/HostIdentifier" - } - }, - "filter_ports": { - "nullable": true, - "type": "array", - "items": { - "$ref": "#/components/schemas/L4PortRange" - } - }, - "filter_protocols": { - "nullable": true, - "type": "array", - "items": { - "$ref": "#/components/schemas/VpcFirewallRuleProtocol" - } - }, - "priority": { - "type": "integer", - "format": "uint16", - "minimum": 0 - }, - "status": { - "$ref": "#/components/schemas/VpcFirewallRuleStatus" - }, - "targets": { - "type": "array", - "items": { - "$ref": "#/components/schemas/NetworkInterface" - } - } - }, - "required": [ - "action", - "direction", - "priority", - "status", - "targets" - ] - }, "VpcFirewallRuleAction": { "type": "string", "enum": [ @@ -5192,7 +5191,7 @@ "rules": { "type": "array", "items": { - "$ref": "#/components/schemas/VpcFirewallRule" + "$ref": "#/components/schemas/ResolvedVpcFirewallRule" } }, "vni": { @@ -5317,7 +5316,7 @@ "pattern": "^ox[ip]_[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$" }, "M2Slot": { - "description": "An M.2 slot that was written.", + "description": "Describes an M.2 slot, often in the context of writing a system image to it.", "type": "string", "enum": [ "A", diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index 52889d8fa2..2aefd8f464 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -69,6 +69,7 @@ serde.workspace = true serde_human_bytes.workspace = true serde_json = { workspace = true, features = ["raw_value"] } sha3.workspace = true +sled-agent-api.workspace = true sled-agent-client.workspace = true sled-agent-types.workspace = true sled-hardware.workspace = true @@ -103,8 +104,6 @@ guppy.workspace = true http.workspace = true hyper.workspace = true omicron-test-utils.workspace = true -openapi-lint.workspace = true -openapiv3.workspace = true pretty_assertions.workspace = true rcgen.workspace = true subprocess.workspace = true diff --git a/sled-agent/api/Cargo.toml b/sled-agent/api/Cargo.toml new file mode 100644 index 0000000000..046f17574b --- /dev/null +++ b/sled-agent/api/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "sled-agent-api" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +camino.workspace = true +dropshot.workspace = true +nexus-sled-agent-shared.workspace = true +omicron-common.workspace = true +omicron-uuid-kinds.workspace = true +omicron-workspace-hack.workspace = true +schemars.workspace = true +serde.workspace = true +sled-agent-types.workspace = true +sled-hardware-types.workspace = true +uuid.workspace = true diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs new file mode 100644 index 0000000000..c44b24d712 --- /dev/null +++ b/sled-agent/api/src/lib.rs @@ -0,0 +1,549 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use std::{collections::BTreeMap, time::Duration}; + +use camino::Utf8PathBuf; +use dropshot::{ + FreeformBody, HttpError, HttpResponseCreated, HttpResponseDeleted, + HttpResponseHeaders, HttpResponseOk, HttpResponseUpdatedNoContent, Path, + Query, RequestContext, StreamingBody, TypedBody, +}; +use nexus_sled_agent_shared::inventory::{ + Inventory, OmicronZonesConfig, SledRole, +}; +use omicron_common::{ + api::internal::{ + nexus::{DiskRuntimeState, SledInstanceState, UpdateArtifactId}, + shared::{ + ResolvedVpcRouteSet, ResolvedVpcRouteState, SledIdentifiers, + SwitchPorts, VirtualNetworkInterfaceHost, + }, + }, + disk::{DiskVariant, DisksManagementResult, OmicronPhysicalDisksConfig}, +}; +use omicron_uuid_kinds::{InstanceUuid, ZpoolUuid}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use sled_agent_types::{ + boot_disk::{ + BootDiskOsWriteStatus, BootDiskPathParams, BootDiskUpdatePathParams, + BootDiskWriteStartQueryParams, + }, + bootstore::BootstoreStatus, + disk::DiskEnsureBody, + early_networking::EarlyNetworkConfig, + firewall_rules::VpcFirewallRulesEnsureBody, + instance::{ + InstanceEnsureBody, InstanceExternalIpBody, InstancePutStateBody, + InstancePutStateResponse, InstanceUnregisterResponse, + }, + sled::AddSledRequest, + time_sync::TimeSync, + zone_bundle::{ + BundleUtilization, CleanupContext, CleanupCount, PriorityOrder, + ZoneBundleId, ZoneBundleMetadata, + }, +}; +use uuid::Uuid; + +#[dropshot::api_description] +pub trait SledAgentApi { + type Context; + + /// List all zone bundles that exist, even for now-deleted zones. + #[endpoint { + method = GET, + path = "/zones/bundles", + }] + async fn zone_bundle_list_all( + rqctx: RequestContext, + query: Query, + ) -> Result>, HttpError>; + + /// List the zone bundles that are available for a running zone. + #[endpoint { + method = GET, + path = "/zones/bundles/{zone_name}", + }] + async fn zone_bundle_list( + rqctx: RequestContext, + params: Path, + ) -> Result>, HttpError>; + + /// Ask the sled agent to create a zone bundle. + #[endpoint { + method = POST, + path = "/zones/bundles/{zone_name}", + }] + async fn zone_bundle_create( + rqctx: RequestContext, + params: Path, + ) -> Result, HttpError>; + + /// Fetch the binary content of a single zone bundle. + #[endpoint { + method = GET, + path = "/zones/bundles/{zone_name}/{bundle_id}", + }] + async fn zone_bundle_get( + rqctx: RequestContext, + params: Path, + ) -> Result>, HttpError>; + + /// Delete a zone bundle. + #[endpoint { + method = DELETE, + path = "/zones/bundles/{zone_name}/{bundle_id}", + }] + async fn zone_bundle_delete( + rqctx: RequestContext, + params: Path, + ) -> Result; + + /// Return utilization information about all zone bundles. + #[endpoint { + method = GET, + path = "/zones/bundle-cleanup/utilization", + }] + async fn zone_bundle_utilization( + rqctx: RequestContext, + ) -> Result< + HttpResponseOk>, + HttpError, + >; + + /// Return context used by the zone-bundle cleanup task. + #[endpoint { + method = GET, + path = "/zones/bundle-cleanup/context", + }] + async fn zone_bundle_cleanup_context( + rqctx: RequestContext, + ) -> Result, HttpError>; + + /// Update context used by the zone-bundle cleanup task. + #[endpoint { + method = PUT, + path = "/zones/bundle-cleanup/context", + }] + async fn zone_bundle_cleanup_context_update( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// Trigger a zone bundle cleanup. 
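// Editor's aside (sketch only). The endpoint signatures in this trait use
// dropshot's extractors: `Query<T>`, `Path<T>`, and `TypedBody<T>` each
// deserialize one part of the request and hand back `T` via `into_inner()`.
// A reduced example with a hypothetical filter type (`ZoneBundleFilter`
// above plays the same role):

use dropshot::Query;
use schemars::JsonSchema;
use serde::Deserialize;

#[derive(Deserialize, JsonSchema)]
struct NameFilter {
    /// Optional substring to match against names.
    name: Option<String>,
}

fn name_from_query(query: Query<NameFilter>) -> Option<String> {
    // `into_inner` surrenders the deserialized query parameters.
    query.into_inner().name
}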
+ #[endpoint { + method = POST, + path = "/zones/bundle-cleanup", + }] + async fn zone_bundle_cleanup( + rqctx: RequestContext, + ) -> Result>, HttpError>; + + /// List the zones that are currently managed by the sled agent. + #[endpoint { + method = GET, + path = "/zones", + }] + async fn zones_list( + rqctx: RequestContext, + ) -> Result>, HttpError>; + + #[endpoint { + method = GET, + path = "/omicron-zones", + }] + async fn omicron_zones_get( + rqctx: RequestContext, + ) -> Result, HttpError>; + + #[endpoint { + method = PUT, + path = "/omicron-zones", + }] + async fn omicron_zones_put( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + #[endpoint { + method = GET, + path = "/omicron-physical-disks", + }] + async fn omicron_physical_disks_get( + rqctx: RequestContext, + ) -> Result, HttpError>; + + #[endpoint { + method = PUT, + path = "/omicron-physical-disks", + }] + async fn omicron_physical_disks_put( + rqctx: RequestContext, + body: TypedBody, + ) -> Result, HttpError>; + + #[endpoint { + method = GET, + path = "/zpools", + }] + async fn zpools_get( + rqctx: RequestContext, + ) -> Result>, HttpError>; + + #[endpoint { + method = GET, + path = "/sled-role", + }] + async fn sled_role_get( + rqctx: RequestContext, + ) -> Result, HttpError>; + + /// Initializes a CockroachDB cluster + #[endpoint { + method = POST, + path = "/cockroachdb", + }] + async fn cockroachdb_init( + rqctx: RequestContext, + ) -> Result; + + #[endpoint { + method = PUT, + path = "/instances/{instance_id}", + }] + async fn instance_register( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result, HttpError>; + + #[endpoint { + method = DELETE, + path = "/instances/{instance_id}", + }] + async fn instance_unregister( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + #[endpoint { + method = PUT, + path = "/instances/{instance_id}/state", + }] + async fn instance_put_state( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result, HttpError>; + + #[endpoint { + method = GET, + path = "/instances/{instance_id}/state", + }] + async fn instance_get_state( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + #[endpoint { + method = PUT, + path = "/instances/{instance_id}/external-ip", + }] + async fn instance_put_external_ip( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result; + + #[endpoint { + method = DELETE, + path = "/instances/{instance_id}/external-ip", + }] + async fn instance_delete_external_ip( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result; + + #[endpoint { + method = PUT, + path = "/disks/{disk_id}", + }] + async fn disk_put( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result, HttpError>; + + #[endpoint { + method = POST, + path = "/update" + }] + async fn update_artifact( + rqctx: RequestContext, + artifact: TypedBody, + ) -> Result; + + /// Take a snapshot of a disk that is attached to an instance + #[endpoint { + method = POST, + path = "/instances/{instance_id}/disks/{disk_id}/snapshot", + }] + async fn instance_issue_disk_snapshot_request( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result< + HttpResponseOk, + HttpError, + >; + + #[endpoint { + method = PUT, + path = "/vpc/{vpc_id}/firewall/rules", + }] + async fn vpc_firewall_rules_put( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result; + + /// Create a mapping from a virtual NIC to a physical host + 
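// Editor's aside (hedged sketch). The v2p endpoints below carry a
// `VirtualNetworkInterfaceHost` body: the virtual IP/MAC of a guest NIC plus
// the IPv6 address of the physical sled hosting it, scoped by VNI (see the
// struct removed from `illumos-utils` earlier in this patch). A hypothetical
// stand-in for that mapping, for illustration only:

use std::collections::HashMap;
use std::net::{IpAddr, Ipv6Addr};

// Illustrative stand-ins; the real types live in
// `omicron_common::api::internal::shared`.
type Vni = u32;

struct V2pMapping {
    virtual_ip: IpAddr,
    virtual_mac: [u8; 6],
    physical_host_ip: Ipv6Addr,
    vni: Vni,
}

// A sled-side table answering "which physical host owns this virtual IP?"
type V2pTable = HashMap<(Vni, IpAddr), V2pMapping>;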
// Keep interface_id to maintain parity with the simulated sled agent, which + // requires interface_id on the path. + #[endpoint { + method = PUT, + path = "/v2p/", + }] + async fn set_v2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// Delete a mapping from a virtual NIC to a physical host + // Keep interface_id to maintain parity with the simulated sled agent, which + // requires interface_id on the path. + #[endpoint { + method = DELETE, + path = "/v2p/", + }] + async fn del_v2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// List v2p mappings present on sled + // Used by nexus background task + #[endpoint { + method = GET, + path = "/v2p/", + }] + async fn list_v2p( + rqctx: RequestContext, + ) -> Result>, HttpError>; + + #[endpoint { + method = GET, + path = "/timesync", + }] + async fn timesync_get( + rqctx: RequestContext, + ) -> Result, HttpError>; + + #[endpoint { + method = POST, + path = "/switch-ports", + }] + async fn uplink_ensure( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// This API endpoint is only reading the local sled agent's view of the + /// bootstore. The boostore is a distributed data store that is eventually + /// consistent. Reads from individual nodes may not represent the latest state. + #[endpoint { + method = GET, + path = "/network-bootstore-config", + }] + async fn read_network_bootstore_config_cache( + rqctx: RequestContext, + ) -> Result, HttpError>; + + #[endpoint { + method = PUT, + path = "/network-bootstore-config", + }] + async fn write_network_bootstore_config( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// Add a sled to a rack that was already initialized via RSS + #[endpoint { + method = PUT, + path = "/sleds" + }] + async fn sled_add( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// Write a new host OS image to the specified boot disk + #[endpoint { + method = POST, + path = "/boot-disk/{boot_disk}/os/write", + }] + async fn host_os_write_start( + rqctx: RequestContext, + path_params: Path, + query_params: Query, + body: StreamingBody, + ) -> Result; + + #[endpoint { + method = GET, + path = "/boot-disk/{boot_disk}/os/write/status", + }] + async fn host_os_write_status_get( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Clear the status of a completed write of a new host OS + #[endpoint { + method = DELETE, + path = "/boot-disk/{boot_disk}/os/write/status/{update_id}", + }] + async fn host_os_write_status_delete( + rqctx: RequestContext, + path_params: Path, + ) -> Result; + + /// Fetch basic information about this sled + #[endpoint { + method = GET, + path = "/inventory", + }] + async fn inventory( + rqctx: RequestContext, + ) -> Result, HttpError>; + + /// Fetch sled identifiers + #[endpoint { + method = GET, + path = "/sled-identifiers", + }] + async fn sled_identifiers( + rqctx: RequestContext, + ) -> Result, HttpError>; + + /// Get the internal state of the local bootstore node + #[endpoint { + method = GET, + path = "/bootstore/status", + }] + async fn bootstore_status( + request_context: RequestContext, + ) -> Result, HttpError>; + + /// Get the current versions of VPC routing rules. + #[endpoint { + method = GET, + path = "/vpc-routes", + }] + async fn list_vpc_routes( + rqctx: RequestContext, + ) -> Result>, HttpError>; + + /// Update VPC routing rules. 
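// Editor's aside (sketch only). `list_vpc_routes` / `set_vpc_routes` form a
// read-then-write pair: a Nexus background task reads the route versions the
// sled currently holds and pushes only the route sets that are missing or
// stale. Reduced to a standalone sketch with simplified types:

use std::collections::HashMap;

type RouterId = u64;

fn routes_to_push(
    sled_versions: &HashMap<RouterId, u64>,
    desired: &HashMap<RouterId, (u64, Vec<String>)>,
) -> Vec<(RouterId, Vec<String>)> {
    desired
        .iter()
        .filter(|(id, (version, _))| {
            // Push when the sled holds no state for this router, or holds an
            // older version than the desired one.
            sled_versions.get(*id).map_or(true, |have| have < version)
        })
        .map(|(id, (_, routes))| (*id, routes.clone()))
        .collect()
}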
+ #[endpoint { + method = PUT, + path = "/vpc-routes", + }] + async fn set_vpc_routes( + request_context: RequestContext, + body: TypedBody>, + ) -> Result; +} + +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct ZoneBundleFilter { + /// An optional substring used to filter zone bundles. + pub filter: Option, +} + +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct ZonePathParam { + /// The name of the zone. + pub zone_name: String, +} + +/// Parameters used to update the zone bundle cleanup context. +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct CleanupContextUpdate { + /// The new period on which automatic cleanups are run. + pub period: Option, + /// The priority ordering for preserving old zone bundles. + pub priority: Option, + /// The new limit on the underlying dataset quota allowed for bundles. + pub storage_limit: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] +pub struct Zpool { + pub id: ZpoolUuid, + pub disk_type: DiskType, +} + +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] +pub enum DiskType { + U2, + M2, +} + +impl From for DiskType { + fn from(v: DiskVariant) -> Self { + match v { + DiskVariant::U2 => Self::U2, + DiskVariant::M2 => Self::M2, + } + } +} + +/// Path parameters for Instance requests (sled agent API) +#[derive(Deserialize, JsonSchema)] +pub struct InstancePathParam { + pub instance_id: InstanceUuid, +} + +/// Path parameters for Disk requests (sled agent API) +#[derive(Deserialize, JsonSchema)] +pub struct DiskPathParam { + pub disk_id: Uuid, +} + +#[derive(Deserialize, JsonSchema)] +pub struct InstanceIssueDiskSnapshotRequestPathParam { + pub instance_id: Uuid, + pub disk_id: Uuid, +} + +#[derive(Deserialize, JsonSchema)] +pub struct InstanceIssueDiskSnapshotRequestBody { + pub snapshot_id: Uuid, +} + +#[derive(Serialize, JsonSchema)] +pub struct InstanceIssueDiskSnapshotRequestResponse { + pub snapshot_id: Uuid, +} + +/// Path parameters for VPC requests (sled agent API) +#[derive(Deserialize, JsonSchema)] +pub struct VpcPathParam { + pub vpc_id: Uuid, +} diff --git a/sled-agent/src/bin/sled-agent.rs b/sled-agent/src/bin/sled-agent.rs index 6feeffd302..446103e982 100644 --- a/sled-agent/src/bin/sled-agent.rs +++ b/sled-agent/src/bin/sled-agent.rs @@ -6,20 +6,14 @@ use anyhow::anyhow; use camino::Utf8PathBuf; -use clap::{Parser, Subcommand}; +use clap::Parser; use omicron_common::cmd::fatal; use omicron_common::cmd::CmdError; use omicron_sled_agent::bootstrap::server as bootstrap_server; use omicron_sled_agent::bootstrap::RssAccessError; -use omicron_sled_agent::{config::Config as SledConfig, server as sled_server}; +use omicron_sled_agent::config::Config as SledConfig; use sled_agent_types::rack_init::RackInitializeRequest; -#[derive(Subcommand, Debug)] -enum OpenapiFlavor { - /// Generates sled agent openapi spec - Sled, -} - #[derive(Debug, Parser)] #[clap( name = "sled_agent", @@ -27,10 +21,6 @@ enum OpenapiFlavor { version )] enum Args { - /// Generates the OpenAPI specification. - #[command(subcommand)] - Openapi(OpenapiFlavor), - /// Runs the Sled Agent server. 
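// Editor's aside (hedged sketch, not part of the patch). The `openapi`
// subcommand below is removed because the spec is now generated from the
// `SledAgentApi` trait rather than from a running server. Wiring the
// trait-generated description into a dropshot server could look roughly like
// this, assuming dropshot's long-standing `HttpServerStarter` API and
// hypothetical `config`/`log`/`sled_agent` values:
//
//     let api = sled_agent_api_mod::api_description::<SledAgentImpl>()?;
//     let server =
//         dropshot::HttpServerStarter::new(&config, api, sled_agent, &log)?
//             .start();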
Run { #[clap(name = "CONFIG_FILE_PATH", action)] @@ -49,10 +39,6 @@ async fn do_run() -> Result<(), CmdError> { let args = Args::parse(); match args { - Args::Openapi(flavor) => match flavor { - OpenapiFlavor::Sled => sled_server::run_openapi() - .map_err(|err| CmdError::Failure(anyhow!(err))), - }, Args::Run { config_path } => { let config = SledConfig::from_file(&config_path) .map_err(|e| CmdError::Failure(anyhow!(e)))?; diff --git a/sled-agent/src/boot_disk_os_writer.rs b/sled-agent/src/boot_disk_os_writer.rs index a0798ed174..59e79c418f 100644 --- a/sled-agent/src/boot_disk_os_writer.rs +++ b/sled-agent/src/boot_disk_os_writer.rs @@ -5,8 +5,6 @@ //! This module provides `BootDiskOsWriter`, via which sled-agent can write new //! OS images to its boot disks. -use crate::http_entrypoints::BootDiskOsWriteProgress; -use crate::http_entrypoints::BootDiskOsWriteStatus; use async_trait::async_trait; use bytes::Bytes; use camino::Utf8PathBuf; @@ -14,10 +12,12 @@ use display_error_chain::DisplayErrorChain; use dropshot::HttpError; use futures::Stream; use futures::TryStreamExt; -use installinator_common::M2Slot; use installinator_common::RawDiskWriter; +use omicron_common::disk::M2Slot; use sha3::Digest; use sha3::Sha3_256; +use sled_agent_types::boot_disk::BootDiskOsWriteProgress; +use sled_agent_types::boot_disk::BootDiskOsWriteStatus; use slog::Logger; use std::collections::btree_map::Entry; use std::collections::BTreeMap; @@ -37,18 +37,16 @@ use tokio::sync::oneshot::error::TryRecvError; use tokio::sync::watch; use uuid::Uuid; -impl BootDiskOsWriteStatus { - fn from_result( - update_id: Uuid, - result: &Result<(), Arc>, - ) -> Self { - match result { - Ok(()) => Self::Complete { update_id }, - Err(err) => Self::Failed { - update_id, - message: DisplayErrorChain::new(err).to_string(), - }, - } +fn to_boot_disk_status( + update_id: Uuid, + result: &Result<(), Arc>, +) -> BootDiskOsWriteStatus { + match result { + Ok(()) => BootDiskOsWriteStatus::Complete { update_id }, + Err(err) => BootDiskOsWriteStatus::Failed { + update_id, + message: DisplayErrorChain::new(err).to_string(), + }, } } @@ -393,9 +391,7 @@ impl BootDiskOsWriter { match running.complete_rx.try_recv() { Ok(result) => { let update_id = running.update_id; - let status = BootDiskOsWriteStatus::from_result( - update_id, &result, - ); + let status = to_boot_disk_status(update_id, &result); slot.insert(WriterState::Complete(TaskCompleteState { update_id, result, @@ -413,9 +409,7 @@ impl BootDiskOsWriter { let update_id = running.update_id; let result = Err(Arc::new(BootDiskOsWriteError::TaskPanic)); - let status = BootDiskOsWriteStatus::from_result( - update_id, &result, - ); + let status = to_boot_disk_status(update_id, &result); slot.insert(WriterState::Complete(TaskCompleteState { update_id, result, @@ -425,10 +419,7 @@ impl BootDiskOsWriter { } } WriterState::Complete(complete) => { - BootDiskOsWriteStatus::from_result( - complete.update_id, - &complete.result, - ) + to_boot_disk_status(complete.update_id, &complete.result) } } } diff --git a/sled-agent/src/bootstrap/client.rs b/sled-agent/src/bootstrap/client.rs index 10f1ab6f25..bfdaf6e6d4 100644 --- a/sled-agent/src/bootstrap/client.rs +++ b/sled-agent/src/bootstrap/client.rs @@ -7,10 +7,10 @@ use super::params::version; use super::params::Request; use super::params::RequestEnvelope; -use super::params::StartSledAgentRequest; use super::views::SledAgentResponse; use crate::bootstrap::views::Response; use crate::bootstrap::views::ResponseEnvelope; +use 
sled_agent_types::sled::StartSledAgentRequest; use slog::Logger; use std::borrow::Cow; use std::io; diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index 9fe399419f..5aedf848fe 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -4,181 +4,9 @@ //! Request types for the bootstrap agent -use anyhow::Result; -use async_trait::async_trait; -use omicron_common::address::{self, Ipv6Subnet, SLED_PREFIX}; -use omicron_common::ledger::Ledgerable; -use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use sha3::{Digest, Sha3_256}; +use sled_agent_types::sled::StartSledAgentRequest; use std::borrow::Cow; -use std::net::{IpAddr, Ipv6Addr, SocketAddrV6}; -use uuid::Uuid; - -/// A representation of a Baseboard ID as used in the inventory subsystem -/// This type is essentially the same as a `Baseboard` except it doesn't have a -/// revision or HW type (Gimlet, PC, Unknown). -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] -pub struct BaseboardId { - /// Oxide Part Number - pub part_number: String, - /// Serial number (unique for a given part number) - pub serial_number: String, -} - -/// A request to Add a given sled after rack initialization has occurred -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] -pub struct AddSledRequest { - pub sled_id: BaseboardId, - pub start_request: StartSledAgentRequest, -} - -// A wrapper around StartSledAgentRequestV0 that was used -// for the ledger format. -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] -struct PersistentSledAgentRequest { - request: StartSledAgentRequestV0, -} - -/// The version of `StartSledAgentRequest` we originally shipped with. -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] -pub struct StartSledAgentRequestV0 { - /// Uuid of the Sled Agent to be created. - pub id: Uuid, - - /// Uuid of the rack to which this sled agent belongs. - pub rack_id: Uuid, - - /// The external NTP servers to use - pub ntp_servers: Vec, - - /// The external DNS servers to use - pub dns_servers: Vec, - - /// Use trust quorum for key generation - pub use_trust_quorum: bool, - - // Note: The order of these fields is load bearing, because we serialize - // `SledAgentRequest`s as toml. `subnet` serializes as a TOML table, so it - // must come after non-table fields. - /// Portion of the IP space to be managed by the Sled Agent. - pub subnet: Ipv6Subnet, -} - -/// Configuration information for launching a Sled Agent. -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] -pub struct StartSledAgentRequest { - /// The current generation number of data as stored in CRDB. - /// - /// The initial generation is set during RSS time and then only mutated - /// by Nexus. For now, we don't actually anticipate mutating this data, - /// but we leave open the possiblity. - pub generation: u64, - - // Which version of the data structure do we have. This is to help with - // deserialization and conversion in future updates. - pub schema_version: u32, - - // The actual configuration details - pub body: StartSledAgentRequestBody, -} - -/// This is the actual app level data of `StartSledAgentRequest` -/// -/// We nest it below the "header" of `generation` and `schema_version` so that -/// we can perform partial deserialization of `EarlyNetworkConfig` to only read -/// the header and defer deserialization of the body once we know the schema -/// version. 
This is possible via the use of [`serde_json::value::RawValue`] in -/// future (post-v1) deserialization paths. -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] -pub struct StartSledAgentRequestBody { - /// Uuid of the Sled Agent to be created. - pub id: Uuid, - - /// Uuid of the rack to which this sled agent belongs. - pub rack_id: Uuid, - - /// Use trust quorum for key generation - pub use_trust_quorum: bool, - - /// Is this node an LRTQ learner node? - /// - /// We only put the node into learner mode if `use_trust_quorum` is also - /// true. - pub is_lrtq_learner: bool, - - /// Portion of the IP space to be managed by the Sled Agent. - pub subnet: Ipv6Subnet, -} - -impl StartSledAgentRequest { - pub fn sled_address(&self) -> SocketAddrV6 { - address::get_sled_address(self.body.subnet) - } - - pub fn switch_zone_ip(&self) -> Ipv6Addr { - address::get_switch_zone_address(self.body.subnet) - } - - /// Compute the sha3_256 digest of `self.rack_id` to use as a `salt` - /// for disk encryption. We don't want to include other values that are - /// consistent across sleds as it would prevent us from moving drives - /// between sleds. - pub fn hash_rack_id(&self) -> [u8; 32] { - // We know the unwrap succeeds as a Sha3_256 digest is 32 bytes - Sha3_256::digest(self.body.rack_id.as_bytes()) - .as_slice() - .try_into() - .unwrap() - } -} - -impl From for StartSledAgentRequest { - fn from(v0: StartSledAgentRequestV0) -> Self { - StartSledAgentRequest { - generation: 0, - schema_version: 1, - body: StartSledAgentRequestBody { - id: v0.id, - rack_id: v0.rack_id, - use_trust_quorum: v0.use_trust_quorum, - is_lrtq_learner: false, - subnet: v0.subnet, - }, - } - } -} - -#[async_trait] -impl Ledgerable for StartSledAgentRequest { - fn is_newer_than(&self, other: &Self) -> bool { - self.generation > other.generation - } - - fn generation_bump(&mut self) { - // DO NOTHING! - // - // Generation bumps must only ever come from nexus and will be encoded - // in the struct itself - } - - // Attempt to deserialize the v1 or v0 version and return - // the v1 version. - fn deserialize( - s: &str, - ) -> Result { - // Try to deserialize the latest version of the data structure (v1). If - // that succeeds we are done. - if let Ok(val) = serde_json::from_str::(s) { - return Ok(val); - } - - // We don't have the latest version. Try to deserialize v0 and then - // convert it to the latest version. 
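// Editor's aside (illustrative sketch). `deserialize` above implements a
// try-latest-then-fall-back scheme: parse the v1 schema first, and only on
// failure parse v0 and upgrade it through `From<StartSledAgentRequestV0>`.
// The same shape reduced to a standalone example with hypothetical types:

use serde::Deserialize;

#[derive(Deserialize)]
struct ConfigV0 {
    name: String,
}

#[derive(Deserialize)]
struct ConfigV1 {
    name: String,
    retries: u32,
}

impl From<ConfigV0> for ConfigV1 {
    fn from(v0: ConfigV0) -> Self {
        // Upgrade path: new fields get defaults.
        ConfigV1 { name: v0.name, retries: 0 }
    }
}

fn load(s: &str) -> Result<ConfigV1, serde_json::Error> {
    // Newest schema first; fall back to the old one and convert.
    if let Ok(v1) = serde_json::from_str::<ConfigV1>(s) {
        return Ok(v1);
    }
    serde_json::from_str::<ConfigV0>(s).map(ConfigV1::from)
}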
- let v0 = serde_json::from_str::(s)?.request; - Ok(v0.into()) - } -} #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum Request<'a> { @@ -200,6 +28,10 @@ pub(super) mod version { mod tests { use std::net::Ipv6Addr; + use omicron_common::address::Ipv6Subnet; + use sled_agent_types::sled::StartSledAgentRequestBody; + use uuid::Uuid; + use super::*; #[test] @@ -227,34 +59,4 @@ mod tests { assert!(envelope == deserialized, "serialization round trip failed"); } - - #[test] - fn serialize_start_sled_agent_v0_deserialize_v1() { - let v0 = PersistentSledAgentRequest { - request: StartSledAgentRequestV0 { - id: Uuid::new_v4(), - rack_id: Uuid::new_v4(), - ntp_servers: vec![String::from("test.pool.example.com")], - dns_servers: vec!["1.1.1.1".parse().unwrap()], - use_trust_quorum: false, - subnet: Ipv6Subnet::new(Ipv6Addr::LOCALHOST), - }, - }; - let serialized = serde_json::to_string(&v0).unwrap(); - let expected = StartSledAgentRequest { - generation: 0, - schema_version: 1, - body: StartSledAgentRequestBody { - id: v0.request.id, - rack_id: v0.request.rack_id, - use_trust_quorum: v0.request.use_trust_quorum, - is_lrtq_learner: false, - subnet: v0.request.subnet, - }, - }; - - let actual: StartSledAgentRequest = - Ledgerable::deserialize(&serialized).unwrap(); - assert_eq!(expected, actual); - } } diff --git a/sled-agent/src/bootstrap/rss_handle.rs b/sled-agent/src/bootstrap/rss_handle.rs index 73f7537853..eee7eed085 100644 --- a/sled-agent/src/bootstrap/rss_handle.rs +++ b/sled-agent/src/bootstrap/rss_handle.rs @@ -5,7 +5,6 @@ //! sled-agent's handle to the Rack Setup Service it spawns use super::client as bootstrap_agent_client; -use super::params::StartSledAgentRequest; use crate::rack_setup::service::RackSetupService; use crate::rack_setup::service::SetupServiceError; use ::bootstrap_agent_client::Client as BootstrapAgentClient; @@ -16,6 +15,7 @@ use omicron_common::backoff::retry_notify; use omicron_common::backoff::retry_policy_local; use omicron_common::backoff::BackoffError; use sled_agent_types::rack_init::RackInitializeRequest; +use sled_agent_types::sled::StartSledAgentRequest; use sled_storage::manager::StorageHandle; use slog::Logger; use std::net::Ipv6Addr; diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index fa1d781a96..6681f396b4 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -6,7 +6,6 @@ use super::config::BOOTSTRAP_AGENT_HTTP_PORT; use super::http_entrypoints; -use super::params::StartSledAgentRequest; use super::views::SledAgentResponse; use super::BootstrapError; use super::RssAccessError; @@ -41,6 +40,7 @@ use omicron_ddm_admin_client::Client as DdmAdminClient; use omicron_ddm_admin_client::DdmError; use omicron_uuid_kinds::RackInitUuid; use sled_agent_types::rack_init::RackInitializeRequest; +use sled_agent_types::sled::StartSledAgentRequest; use sled_hardware::underlay; use sled_storage::dataset::CONFIG_DATASET; use sled_storage::manager::StorageHandle; @@ -714,11 +714,10 @@ impl Inner { #[cfg(test)] mod tests { - use crate::bootstrap::params::StartSledAgentRequestBody; - use super::*; use omicron_common::address::Ipv6Subnet; use omicron_test_utils::dev::test_setup_log; + use sled_agent_types::sled::StartSledAgentRequestBody; use std::net::Ipv6Addr; use uuid::Uuid; diff --git a/sled-agent/src/bootstrap/sprockets_server.rs b/sled-agent/src/bootstrap/sprockets_server.rs index 796883b578..8d92970d54 100644 --- a/sled-agent/src/bootstrap/sprockets_server.rs +++ 
b/sled-agent/src/bootstrap/sprockets_server.rs @@ -7,10 +7,10 @@ use crate::bootstrap::params::version; use crate::bootstrap::params::Request; use crate::bootstrap::params::RequestEnvelope; -use crate::bootstrap::params::StartSledAgentRequest; use crate::bootstrap::views::Response; use crate::bootstrap::views::ResponseEnvelope; use crate::bootstrap::views::SledAgentResponse; +use sled_agent_types::sled::StartSledAgentRequest; use slog::Logger; use std::io; use std::net::SocketAddrV6; diff --git a/sled-agent/src/common/disk.rs b/sled-agent/src/common/disk.rs index 54c56825eb..7bef28ac7c 100644 --- a/sled-agent/src/common/disk.rs +++ b/sled-agent/src/common/disk.rs @@ -4,12 +4,12 @@ //! Describes the states of network-attached storage. -use crate::params::DiskStateRequested; use chrono::Utc; use omicron_common::api::external::DiskState; use omicron_common::api::external::Error; use omicron_common::api::internal::nexus::DiskRuntimeState; use propolis_client::types::DiskAttachmentState as PropolisDiskState; +use sled_agent_types::disk::DiskStateRequested; use uuid::Uuid; /// Action to be taken on behalf of state transition. diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 820ec746b8..2bf8067d1c 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -5,26 +5,17 @@ //! HTTP entrypoint functions for the sled agent's exposed API use super::sled_agent::SledAgent; -use crate::bootstrap::params::AddSledRequest; -use crate::params::{ - BootstoreStatus, CleanupContextUpdate, DiskEnsureBody, InstanceEnsureBody, - InstanceExternalIpBody, InstancePutStateBody, InstancePutStateResponse, - InstanceUnregisterResponse, TimeSync, VpcFirewallRulesEnsureBody, - ZoneBundleId, ZoneBundleMetadata, Zpool, -}; use crate::sled_agent::Error as SledAgentError; -use crate::zone_bundle; +use crate::zone_bundle::BundleError; use bootstore::schemes::v0::NetworkConfig; use camino::Utf8PathBuf; use display_error_chain::DisplayErrorChain; use dropshot::{ - endpoint, ApiDescription, ApiDescriptionRegisterError, FreeformBody, - HttpError, HttpResponseCreated, HttpResponseDeleted, HttpResponseHeaders, - HttpResponseOk, HttpResponseUpdatedNoContent, Path, Query, RequestContext, - StreamingBody, TypedBody, + ApiDescription, FreeformBody, HttpError, HttpResponseCreated, + HttpResponseDeleted, HttpResponseHeaders, HttpResponseOk, + HttpResponseUpdatedNoContent, Path, Query, RequestContext, StreamingBody, + TypedBody, }; -use illumos_utils::opte::params::VirtualNetworkInterfaceHost; -use installinator_common::M2Slot; use nexus_sled_agent_shared::inventory::{ Inventory, OmicronZonesConfig, SledRole, }; @@ -34,1017 +25,696 @@ use omicron_common::api::internal::nexus::{ }; use omicron_common::api::internal::shared::{ ResolvedVpcRouteSet, ResolvedVpcRouteState, SledIdentifiers, SwitchPorts, + VirtualNetworkInterfaceHost, +}; +use omicron_common::disk::{ + DiskVariant, DisksManagementResult, M2Slot, OmicronPhysicalDisksConfig, }; -use omicron_common::disk::{DiskVariant, OmicronPhysicalDisksConfig}; use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; +use sled_agent_api::*; +use sled_agent_types::boot_disk::{ + BootDiskOsWriteStatus, BootDiskPathParams, BootDiskUpdatePathParams, + BootDiskWriteStartQueryParams, +}; +use sled_agent_types::bootstore::BootstoreStatus; +use sled_agent_types::disk::DiskEnsureBody; use sled_agent_types::early_networking::EarlyNetworkConfig; -use 
sled_storage::resources::DisksManagementResult; +use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody; +use sled_agent_types::instance::{ + InstanceEnsureBody, InstanceExternalIpBody, InstancePutStateBody, + InstancePutStateResponse, InstanceUnregisterResponse, +}; +use sled_agent_types::sled::AddSledRequest; +use sled_agent_types::time_sync::TimeSync; +use sled_agent_types::zone_bundle::{ + BundleUtilization, CleanupContext, CleanupCount, CleanupPeriod, + StorageLimit, ZoneBundleId, ZoneBundleMetadata, +}; use std::collections::BTreeMap; -use uuid::Uuid; type SledApiDescription = ApiDescription; /// Returns a description of the sled agent API pub fn api() -> SledApiDescription { - fn register_endpoints( - api: &mut SledApiDescription, - ) -> Result<(), ApiDescriptionRegisterError> { - api.register(disk_put)?; - api.register(cockroachdb_init)?; - api.register(instance_issue_disk_snapshot_request)?; - api.register(instance_put_state)?; - api.register(instance_get_state)?; - api.register(instance_put_external_ip)?; - api.register(instance_delete_external_ip)?; - api.register(instance_register)?; - api.register(instance_unregister)?; - api.register(omicron_zones_get)?; - api.register(omicron_zones_put)?; - api.register(zones_list)?; - api.register(omicron_physical_disks_get)?; - api.register(omicron_physical_disks_put)?; - api.register(zone_bundle_list)?; - api.register(zone_bundle_list_all)?; - api.register(zone_bundle_create)?; - api.register(zone_bundle_get)?; - api.register(zone_bundle_delete)?; - api.register(zone_bundle_utilization)?; - api.register(zone_bundle_cleanup_context)?; - api.register(zone_bundle_cleanup_context_update)?; - api.register(zone_bundle_cleanup)?; - api.register(sled_role_get)?; - api.register(list_v2p)?; - api.register(set_v2p)?; - api.register(del_v2p)?; - api.register(timesync_get)?; - api.register(update_artifact)?; - api.register(vpc_firewall_rules_put)?; - api.register(zpools_get)?; - api.register(uplink_ensure)?; - api.register(read_network_bootstore_config_cache)?; - api.register(write_network_bootstore_config)?; - api.register(sled_add)?; - api.register(host_os_write_start)?; - api.register(host_os_write_status_get)?; - api.register(host_os_write_status_delete)?; - api.register(inventory)?; - api.register(sled_identifiers)?; - api.register(bootstore_status)?; - api.register(list_vpc_routes)?; - api.register(set_vpc_routes)?; - - Ok(()) - } - - let mut api = SledApiDescription::new(); - if let Err(err) = register_endpoints(&mut api) { - panic!("failed to register entrypoints: {}", err); - } - api + sled_agent_api_mod::api_description::() + .expect("registered entrypoints") } -#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] -struct ZonePathParam { - /// The name of the zone. - zone_name: String, -} +enum SledAgentImpl {} -#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] -struct ZoneBundleFilter { - /// An optional substring used to filter zone bundles. - filter: Option, -} +impl SledAgentApi for SledAgentImpl { + type Context = SledAgent; -/// List all zone bundles that exist, even for now-deleted zones. -#[endpoint { - method = GET, - path = "/zones/bundles", -}] -async fn zone_bundle_list_all( - rqctx: RequestContext, - query: Query, -) -> Result>, HttpError> { - let sa = rqctx.context(); - let filter = query.into_inner().filter; - sa.list_all_zone_bundles(filter.as_deref()) - .await - .map(HttpResponseOk) - .map_err(HttpError::from) -} - -/// List the zone bundles that are available for a running zone. 
-#[endpoint { - method = GET, - path = "/zones/bundles/{zone_name}", -}] -async fn zone_bundle_list( - rqctx: RequestContext, - params: Path, -) -> Result>, HttpError> { - let params = params.into_inner(); - let zone_name = params.zone_name; - let sa = rqctx.context(); - sa.list_zone_bundles(&zone_name) - .await - .map(HttpResponseOk) - .map_err(HttpError::from) -} + async fn zone_bundle_list_all( + rqctx: RequestContext, + query: Query, + ) -> Result>, HttpError> { + let sa = rqctx.context(); + let filter = query.into_inner().filter; + sa.list_all_zone_bundles(filter.as_deref()) + .await + .map(HttpResponseOk) + .map_err(HttpError::from) + } -/// Ask the sled agent to create a zone bundle. -#[endpoint { - method = POST, - path = "/zones/bundles/{zone_name}", -}] -async fn zone_bundle_create( - rqctx: RequestContext, - params: Path, -) -> Result, HttpError> { - let params = params.into_inner(); - let zone_name = params.zone_name; - let sa = rqctx.context(); - sa.create_zone_bundle(&zone_name) - .await - .map(HttpResponseCreated) - .map_err(HttpError::from) -} + async fn zone_bundle_list( + rqctx: RequestContext, + params: Path, + ) -> Result>, HttpError> { + let params = params.into_inner(); + let zone_name = params.zone_name; + let sa = rqctx.context(); + sa.list_zone_bundles(&zone_name) + .await + .map(HttpResponseOk) + .map_err(HttpError::from) + } -/// Fetch the binary content of a single zone bundle. -#[endpoint { - method = GET, - path = "/zones/bundles/{zone_name}/{bundle_id}", -}] -async fn zone_bundle_get( - rqctx: RequestContext, - params: Path, -) -> Result>, HttpError> { - let params = params.into_inner(); - let zone_name = params.zone_name; - let bundle_id = params.bundle_id; - let sa = rqctx.context(); - let Some(path) = sa - .get_zone_bundle_paths(&zone_name, &bundle_id) - .await - .map_err(HttpError::from)? - .into_iter() - .next() - else { - return Err(HttpError::for_not_found( - None, - format!( - "No zone bundle for zone '{}' with ID '{}'", - zone_name, bundle_id - ), - )); - }; - let f = tokio::fs::File::open(&path).await.map_err(|e| { - HttpError::for_internal_error(format!( - "failed to open zone bundle file at {}: {:?}", - path, e, - )) - })?; - let stream = hyper_staticfile::FileBytesStream::new(f); - let body = FreeformBody(stream.into_body()); - let mut response = HttpResponseHeaders::new_unnamed(HttpResponseOk(body)); - response.headers_mut().append( - http::header::CONTENT_TYPE, - "application/gzip".try_into().unwrap(), - ); - Ok(response) -} + async fn zone_bundle_create( + rqctx: RequestContext, + params: Path, + ) -> Result, HttpError> { + let params = params.into_inner(); + let zone_name = params.zone_name; + let sa = rqctx.context(); + sa.create_zone_bundle(&zone_name) + .await + .map(HttpResponseCreated) + .map_err(HttpError::from) + } -/// Delete a zone bundle. 
-#[endpoint { - method = DELETE, - path = "/zones/bundles/{zone_name}/{bundle_id}", -}] -async fn zone_bundle_delete( - rqctx: RequestContext, - params: Path, -) -> Result { - let params = params.into_inner(); - let zone_name = params.zone_name; - let bundle_id = params.bundle_id; - let sa = rqctx.context(); - let paths = sa - .get_zone_bundle_paths(&zone_name, &bundle_id) - .await - .map_err(HttpError::from)?; - if paths.is_empty() { - return Err(HttpError::for_not_found( - None, - format!( - "No zone bundle for zone '{}' with ID '{}'", - zone_name, bundle_id - ), - )); - }; - for path in paths.into_iter() { - tokio::fs::remove_file(&path).await.map_err(|e| { + async fn zone_bundle_get( + rqctx: RequestContext, + params: Path, + ) -> Result>, HttpError> + { + let params = params.into_inner(); + let zone_name = params.zone_name; + let bundle_id = params.bundle_id; + let sa = rqctx.context(); + let Some(path) = sa + .get_zone_bundle_paths(&zone_name, &bundle_id) + .await + .map_err(HttpError::from)? + .into_iter() + .next() + else { + return Err(HttpError::for_not_found( + None, + format!( + "No zone bundle for zone '{}' with ID '{}'", + zone_name, bundle_id + ), + )); + }; + let f = tokio::fs::File::open(&path).await.map_err(|e| { HttpError::for_internal_error(format!( - "Failed to delete zone bundle: {e}" + "failed to open zone bundle file at {}: {:?}", + path, e, )) })?; + let stream = hyper_staticfile::FileBytesStream::new(f); + let body = FreeformBody(stream.into_body()); + let mut response = + HttpResponseHeaders::new_unnamed(HttpResponseOk(body)); + response.headers_mut().append( + http::header::CONTENT_TYPE, + "application/gzip".try_into().unwrap(), + ); + Ok(response) } - Ok(HttpResponseDeleted()) -} -/// Return utilization information about all zone bundles. -#[endpoint { - method = GET, - path = "/zones/bundle-cleanup/utilization", -}] -async fn zone_bundle_utilization( - rqctx: RequestContext, -) -> Result< - HttpResponseOk>, - HttpError, -> { - let sa = rqctx.context(); - sa.zone_bundle_utilization() - .await - .map(HttpResponseOk) - .map_err(HttpError::from) -} + async fn zone_bundle_delete( + rqctx: RequestContext, + params: Path, + ) -> Result { + let params = params.into_inner(); + let zone_name = params.zone_name; + let bundle_id = params.bundle_id; + let sa = rqctx.context(); + let paths = sa + .get_zone_bundle_paths(&zone_name, &bundle_id) + .await + .map_err(HttpError::from)?; + if paths.is_empty() { + return Err(HttpError::for_not_found( + None, + format!( + "No zone bundle for zone '{}' with ID '{}'", + zone_name, bundle_id + ), + )); + }; + for path in paths.into_iter() { + tokio::fs::remove_file(&path).await.map_err(|e| { + HttpError::for_internal_error(format!( + "Failed to delete zone bundle: {e}" + )) + })?; + } + Ok(HttpResponseDeleted()) + } -/// Return context used by the zone-bundle cleanup task. 
-#[endpoint { - method = GET, - path = "/zones/bundle-cleanup/context", -}] -async fn zone_bundle_cleanup_context( - rqctx: RequestContext, -) -> Result, HttpError> { - let sa = rqctx.context(); - Ok(HttpResponseOk(sa.zone_bundle_cleanup_context().await)) -} + async fn zone_bundle_utilization( + rqctx: RequestContext, + ) -> Result< + HttpResponseOk>, + HttpError, + > { + let sa = rqctx.context(); + sa.zone_bundle_utilization() + .await + .map(HttpResponseOk) + .map_err(HttpError::from) + } + + async fn zone_bundle_cleanup_context( + rqctx: RequestContext, + ) -> Result, HttpError> { + let sa = rqctx.context(); + Ok(HttpResponseOk(sa.zone_bundle_cleanup_context().await)) + } -/// Update context used by the zone-bundle cleanup task. -#[endpoint { - method = PUT, - path = "/zones/bundle-cleanup/context", -}] -async fn zone_bundle_cleanup_context_update( - rqctx: RequestContext, - body: TypedBody, -) -> Result { - let sa = rqctx.context(); - let params = body.into_inner(); - let new_period = params - .period - .map(zone_bundle::CleanupPeriod::new) - .transpose() - .map_err(|e| HttpError::from(SledAgentError::from(e)))?; - let new_priority = params.priority; - let new_limit = params - .storage_limit - .map(zone_bundle::StorageLimit::new) - .transpose() - .map_err(|e| HttpError::from(SledAgentError::from(e)))?; - sa.update_zone_bundle_cleanup_context(new_period, new_limit, new_priority) + async fn zone_bundle_cleanup_context_update( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let params = body.into_inner(); + let new_period = + params.period.map(CleanupPeriod::new).transpose().map_err(|e| { + HttpError::from(SledAgentError::from(BundleError::from(e))) + })?; + let new_priority = params.priority; + let new_limit = + params.storage_limit.map(StorageLimit::new).transpose().map_err( + |e| HttpError::from(SledAgentError::from(BundleError::from(e))), + )?; + sa.update_zone_bundle_cleanup_context( + new_period, + new_limit, + new_priority, + ) .await .map(|_| HttpResponseUpdatedNoContent()) .map_err(HttpError::from) -} + } -/// Trigger a zone bundle cleanup. -#[endpoint { - method = POST, - path = "/zones/bundle-cleanup", -}] -async fn zone_bundle_cleanup( - rqctx: RequestContext, -) -> Result< - HttpResponseOk>, - HttpError, -> { - let sa = rqctx.context(); - sa.zone_bundle_cleanup().await.map(HttpResponseOk).map_err(HttpError::from) -} + async fn zone_bundle_cleanup( + rqctx: RequestContext, + ) -> Result>, HttpError> + { + let sa = rqctx.context(); + sa.zone_bundle_cleanup() + .await + .map(HttpResponseOk) + .map_err(HttpError::from) + } -/// List the zones that are currently managed by the sled agent. 
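// Editor's aside (sketch only). The `map(CleanupPeriod::new).transpose()`
// calls above turn `Option<raw>` into `Result<Option<validated>, _>`: `None`
// passes through, and `Some(raw)` survives only if validation succeeds. The
// same pattern standalone, with a made-up range check:

fn checked_percent(opt: Option<u32>) -> Result<Option<u32>, String> {
    opt.map(|v| {
        // Validation step: reject values over 100.
        if v <= 100 {
            Ok(v)
        } else {
            Err(format!("{v} is out of range for a percentage"))
        }
    })
    .transpose()
}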
-#[endpoint { - method = GET, - path = "/zones", -}] -async fn zones_list( - rqctx: RequestContext, -) -> Result>, HttpError> { - let sa = rqctx.context(); - sa.zones_list().await.map(HttpResponseOk).map_err(HttpError::from) -} + async fn zones_list( + rqctx: RequestContext, + ) -> Result>, HttpError> { + let sa = rqctx.context(); + sa.zones_list().await.map(HttpResponseOk).map_err(HttpError::from) + } -#[endpoint { - method = GET, - path = "/omicron-zones", -}] -async fn omicron_zones_get( - rqctx: RequestContext, -) -> Result, HttpError> { - let sa = rqctx.context(); - Ok(HttpResponseOk(sa.omicron_zones_list().await?)) -} + async fn omicron_zones_get( + rqctx: RequestContext, + ) -> Result, HttpError> { + let sa = rqctx.context(); + Ok(HttpResponseOk(sa.omicron_zones_list().await?)) + } -#[endpoint { - method = PUT, - path = "/omicron-physical-disks", -}] -async fn omicron_physical_disks_put( - rqctx: RequestContext, - body: TypedBody, -) -> Result, HttpError> { - let sa = rqctx.context(); - let body_args = body.into_inner(); - let result = sa.omicron_physical_disks_ensure(body_args).await?; - Ok(HttpResponseOk(result)) -} + async fn omicron_zones_put( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); + sa.omicron_zones_ensure(body_args).await?; + Ok(HttpResponseUpdatedNoContent()) + } -#[endpoint { - method = GET, - path = "/omicron-physical-disks", -}] -async fn omicron_physical_disks_get( - rqctx: RequestContext, -) -> Result, HttpError> { - let sa = rqctx.context(); - Ok(HttpResponseOk(sa.omicron_physical_disks_list().await?)) -} + async fn omicron_physical_disks_get( + rqctx: RequestContext, + ) -> Result, HttpError> { + let sa = rqctx.context(); + Ok(HttpResponseOk(sa.omicron_physical_disks_list().await?)) + } -#[endpoint { - method = PUT, - path = "/omicron-zones", -}] -async fn omicron_zones_put( - rqctx: RequestContext, - body: TypedBody, -) -> Result { - let sa = rqctx.context(); - let body_args = body.into_inner(); - sa.omicron_zones_ensure(body_args).await?; - Ok(HttpResponseUpdatedNoContent()) -} + async fn omicron_physical_disks_put( + rqctx: RequestContext, + body: TypedBody, + ) -> Result, HttpError> { + let sa = rqctx.context(); + let body_args = body.into_inner(); + let result = sa.omicron_physical_disks_ensure(body_args).await?; + Ok(HttpResponseOk(result)) + } -#[endpoint { - method = GET, - path = "/zpools", -}] -async fn zpools_get( - rqctx: RequestContext, -) -> Result>, HttpError> { - let sa = rqctx.context(); - Ok(HttpResponseOk(sa.zpools_get().await)) -} + async fn zpools_get( + rqctx: RequestContext, + ) -> Result>, HttpError> { + let sa = rqctx.context(); + Ok(HttpResponseOk(sa.zpools_get().await)) + } -#[endpoint { - method = GET, - path = "/sled-role", -}] -async fn sled_role_get( - rqctx: RequestContext, -) -> Result, HttpError> { - let sa = rqctx.context(); - Ok(HttpResponseOk(sa.get_role())) -} + async fn sled_role_get( + rqctx: RequestContext, + ) -> Result, HttpError> { + let sa = rqctx.context(); + Ok(HttpResponseOk(sa.get_role())) + } -/// Initializes a CockroachDB cluster -#[endpoint { - method = POST, - path = "/cockroachdb", -}] -async fn cockroachdb_init( - rqctx: RequestContext, -) -> Result { - let sa = rqctx.context(); - sa.cockroachdb_initialize().await?; - Ok(HttpResponseUpdatedNoContent()) -} + async fn cockroachdb_init( + rqctx: RequestContext, + ) -> Result { + let sa = rqctx.context(); + sa.cockroachdb_initialize().await?; + Ok(HttpResponseUpdatedNoContent()) + } 
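These hunks are all the same mechanical change: free functions annotated with #[endpoint { method, path }] become methods on an API trait, the method/path metadata moves to the trait definition in the shared sled-agent-api crate, and the server state moves behind an associated Context type. The real wiring relies on Dropshot's API-trait support; the sketch below shows only the shape, with plain std types standing in for the HTTP response and error types:

    // a minimal sketch, assuming Rust 1.75+ (async fn in traits) and tokio
    trait SledApi {
        type Context;
        async fn zones_list(ctx: &Self::Context) -> Result<Vec<String>, String>;
    }

    struct SimSled;
    impl SledApi for SimSled {
        // stand-in for the real server context type
        type Context = Vec<String>;
        async fn zones_list(ctx: &Self::Context) -> Result<Vec<String>, String> {
            Ok(ctx.clone())
        }
    }

    #[tokio::main]
    async fn main() {
        let zones = SimSled::zones_list(&vec!["oxz_ntp".to_string()]).await.unwrap();
        println!("{zones:?}");
    }

One payoff of this shape is that the real and simulated sled agents can implement the same trait against different Context types, so their HTTP surfaces cannot drift apart.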
-/// Path parameters for Instance requests (sled agent API) -#[derive(Deserialize, JsonSchema)] -struct InstancePathParam { - instance_id: InstanceUuid, -} + async fn instance_register( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result, HttpError> { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + let body_args = body.into_inner(); + Ok(HttpResponseOk( + sa.instance_ensure_registered( + instance_id, + body_args.propolis_id, + body_args.hardware, + body_args.instance_runtime, + body_args.vmm_runtime, + body_args.propolis_addr, + body_args.metadata, + ) + .await?, + )) + } -#[endpoint { - method = PUT, - path = "/instances/{instance_id}", -}] -async fn instance_register( - rqctx: RequestContext, - path_params: Path, - body: TypedBody, -) -> Result, HttpError> { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - let body_args = body.into_inner(); - Ok(HttpResponseOk( - sa.instance_ensure_registered( - instance_id, - body_args.propolis_id, - body_args.hardware, - body_args.instance_runtime, - body_args.vmm_runtime, - body_args.propolis_addr, - body_args.metadata, - ) - .await?, - )) -} + async fn instance_unregister( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError> { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + Ok(HttpResponseOk(sa.instance_ensure_unregistered(instance_id).await?)) + } -#[endpoint { - method = DELETE, - path = "/instances/{instance_id}", -}] -async fn instance_unregister( - rqctx: RequestContext, - path_params: Path, -) -> Result, HttpError> { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - Ok(HttpResponseOk(sa.instance_ensure_unregistered(instance_id).await?)) -} + async fn instance_put_state( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result, HttpError> { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + let body_args = body.into_inner(); + Ok(HttpResponseOk( + sa.instance_ensure_state(instance_id, body_args.state).await?, + )) + } -#[endpoint { - method = PUT, - path = "/instances/{instance_id}/state", -}] -async fn instance_put_state( - rqctx: RequestContext, - path_params: Path, - body: TypedBody, -) -> Result, HttpError> { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - let body_args = body.into_inner(); - Ok(HttpResponseOk( - sa.instance_ensure_state(instance_id, body_args.state).await?, - )) -} + async fn instance_get_state( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError> { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + Ok(HttpResponseOk(sa.instance_get_state(instance_id).await?)) + } -#[endpoint { - method = GET, - path = "/instances/{instance_id}/state", -}] -async fn instance_get_state( - rqctx: RequestContext, - path_params: Path, -) -> Result, HttpError> { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - Ok(HttpResponseOk(sa.instance_get_state(instance_id).await?)) -} + async fn instance_put_external_ip( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + let body_args = body.into_inner(); + sa.instance_put_external_ip(instance_id, &body_args).await?; + Ok(HttpResponseUpdatedNoContent()) + } -#[endpoint { - method = PUT, - path = 
"/instances/{instance_id}/external-ip", -}] -async fn instance_put_external_ip( - rqctx: RequestContext, - path_params: Path, - body: TypedBody, -) -> Result { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - let body_args = body.into_inner(); - sa.instance_put_external_ip(instance_id, &body_args).await?; - Ok(HttpResponseUpdatedNoContent()) -} + async fn instance_delete_external_ip( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + let body_args = body.into_inner(); + sa.instance_delete_external_ip(instance_id, &body_args).await?; + Ok(HttpResponseUpdatedNoContent()) + } -#[endpoint { - method = DELETE, - path = "/instances/{instance_id}/external-ip", -}] -async fn instance_delete_external_ip( - rqctx: RequestContext, - path_params: Path, - body: TypedBody, -) -> Result { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - let body_args = body.into_inner(); - sa.instance_delete_external_ip(instance_id, &body_args).await?; - Ok(HttpResponseUpdatedNoContent()) -} + async fn disk_put( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result, HttpError> { + let sa = rqctx.context(); + let disk_id = path_params.into_inner().disk_id; + let body_args = body.into_inner(); + Ok(HttpResponseOk( + sa.disk_ensure( + disk_id, + body_args.initial_runtime.clone(), + body_args.target.clone(), + ) + .await + .map_err(|e| Error::from(e))?, + )) + } -/// Path parameters for Disk requests (sled agent API) -#[derive(Deserialize, JsonSchema)] -struct DiskPathParam { - disk_id: Uuid, -} + async fn update_artifact( + rqctx: RequestContext, + artifact: TypedBody, + ) -> Result { + let sa = rqctx.context(); + sa.update_artifact(artifact.into_inner()).await.map_err(Error::from)?; + Ok(HttpResponseUpdatedNoContent()) + } -#[endpoint { - method = PUT, - path = "/disks/{disk_id}", -}] -async fn disk_put( - rqctx: RequestContext, - path_params: Path, - body: TypedBody, -) -> Result, HttpError> { - let sa = rqctx.context(); - let disk_id = path_params.into_inner().disk_id; - let body_args = body.into_inner(); - Ok(HttpResponseOk( - sa.disk_ensure( - disk_id, - body_args.initial_runtime.clone(), - body_args.target.clone(), + async fn instance_issue_disk_snapshot_request( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result< + HttpResponseOk, + HttpError, + > { + let sa = rqctx.context(); + let path_params = path_params.into_inner(); + let body = body.into_inner(); + + sa.instance_issue_disk_snapshot_request( + InstanceUuid::from_untyped_uuid(path_params.instance_id), + path_params.disk_id, + body.snapshot_id, ) - .await - .map_err(|e| Error::from(e))?, - )) -} + .await?; -#[endpoint { - method = POST, - path = "/update" -}] -async fn update_artifact( - rqctx: RequestContext, - artifact: TypedBody, -) -> Result { - let sa = rqctx.context(); - sa.update_artifact(artifact.into_inner()).await.map_err(Error::from)?; - Ok(HttpResponseUpdatedNoContent()) -} + Ok(HttpResponseOk(InstanceIssueDiskSnapshotRequestResponse { + snapshot_id: body.snapshot_id, + })) + } -#[derive(Deserialize, JsonSchema)] -pub struct InstanceIssueDiskSnapshotRequestPathParam { - instance_id: Uuid, - disk_id: Uuid, -} + async fn vpc_firewall_rules_put( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let _vpc_id = path_params.into_inner().vpc_id; + let 
body_args = body.into_inner(); -#[derive(Deserialize, JsonSchema)] -pub struct InstanceIssueDiskSnapshotRequestBody { - snapshot_id: Uuid, -} + sa.firewall_rules_ensure(body_args.vni, &body_args.rules[..]) + .await + .map_err(Error::from)?; -#[derive(Serialize, JsonSchema)] -pub struct InstanceIssueDiskSnapshotRequestResponse { - snapshot_id: Uuid, -} + Ok(HttpResponseUpdatedNoContent()) + } -/// Take a snapshot of a disk that is attached to an instance -#[endpoint { - method = POST, - path = "/instances/{instance_id}/disks/{disk_id}/snapshot", -}] -async fn instance_issue_disk_snapshot_request( - rqctx: RequestContext, - path_params: Path, - body: TypedBody, -) -> Result, HttpError> -{ - let sa = rqctx.context(); - let path_params = path_params.into_inner(); - let body = body.into_inner(); - - sa.instance_issue_disk_snapshot_request( - InstanceUuid::from_untyped_uuid(path_params.instance_id), - path_params.disk_id, - body.snapshot_id, - ) - .await?; - - Ok(HttpResponseOk(InstanceIssueDiskSnapshotRequestResponse { - snapshot_id: body.snapshot_id, - })) -} + async fn set_v2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); -/// Path parameters for VPC requests (sled agent API) -#[derive(Deserialize, JsonSchema)] -struct VpcPathParam { - vpc_id: Uuid, -} + sa.set_virtual_nic_host(&body_args).await.map_err(Error::from)?; -#[endpoint { - method = PUT, - path = "/vpc/{vpc_id}/firewall/rules", -}] -async fn vpc_firewall_rules_put( - rqctx: RequestContext, - path_params: Path, - body: TypedBody, -) -> Result { - let sa = rqctx.context(); - let _vpc_id = path_params.into_inner().vpc_id; - let body_args = body.into_inner(); - - sa.firewall_rules_ensure(body_args.vni, &body_args.rules[..]) - .await - .map_err(Error::from)?; - - Ok(HttpResponseUpdatedNoContent()) -} + Ok(HttpResponseUpdatedNoContent()) + } -/// Create a mapping from a virtual NIC to a physical host -// Keep interface_id to maintain parity with the simulated sled agent, which -// requires interface_id on the path. -#[endpoint { - method = PUT, - path = "/v2p/", -}] -async fn set_v2p( - rqctx: RequestContext, - body: TypedBody, -) -> Result { - let sa = rqctx.context(); - let body_args = body.into_inner(); - - sa.set_virtual_nic_host(&body_args).await.map_err(Error::from)?; - - Ok(HttpResponseUpdatedNoContent()) -} + async fn del_v2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); -/// Delete a mapping from a virtual NIC to a physical host -// Keep interface_id to maintain parity with the simulated sled agent, which -// requires interface_id on the path. 
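instance_issue_disk_snapshot_request above upgrades the untyped Uuid path parameter with InstanceUuid::from_untyped_uuid. Typed UUID wrappers like this make it a compile-time error to hand a disk ID to an API that wants an instance ID. A minimal, hypothetical version of the pattern (the real kinds live in the omicron-uuid-kinds crate):

    // assumes the uuid crate with the "v4" feature
    use uuid::Uuid;

    /// Hypothetical typed wrapper: same bytes as Uuid, distinct type.
    #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
    struct InstanceUuid(Uuid);

    impl InstanceUuid {
        fn from_untyped_uuid(id: Uuid) -> Self {
            Self(id)
        }
        fn into_untyped_uuid(self) -> Uuid {
            self.0
        }
    }

    fn main() {
        let raw = Uuid::new_v4();
        let typed = InstanceUuid::from_untyped_uuid(raw);
        // round-trips losslessly, but won't unify with other ID newtypes
        assert_eq!(typed.into_untyped_uuid(), raw);
    }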
-#[endpoint { - method = DELETE, - path = "/v2p/", -}] -async fn del_v2p( - rqctx: RequestContext, - body: TypedBody, -) -> Result { - let sa = rqctx.context(); - let body_args = body.into_inner(); - - sa.unset_virtual_nic_host(&body_args).await.map_err(Error::from)?; - - Ok(HttpResponseUpdatedNoContent()) -} + sa.unset_virtual_nic_host(&body_args).await.map_err(Error::from)?; -/// List v2p mappings present on sled -// Used by nexus background task -#[endpoint { - method = GET, - path = "/v2p/", -}] -async fn list_v2p( - rqctx: RequestContext, -) -> Result>, HttpError> { - let sa = rqctx.context(); + Ok(HttpResponseUpdatedNoContent()) + } - let vnics = sa.list_virtual_nics().await.map_err(Error::from)?; + async fn list_v2p( + rqctx: RequestContext, + ) -> Result>, HttpError> + { + let sa = rqctx.context(); - Ok(HttpResponseOk(vnics)) -} + let vnics = sa.list_virtual_nics().await.map_err(Error::from)?; -#[endpoint { - method = GET, - path = "/timesync", -}] -async fn timesync_get( - rqctx: RequestContext, -) -> Result, HttpError> { - let sa = rqctx.context(); - Ok(HttpResponseOk(sa.timesync_get().await.map_err(|e| Error::from(e))?)) -} + Ok(HttpResponseOk(vnics)) + } -#[endpoint { - method = POST, - path = "/switch-ports", -}] -async fn uplink_ensure( - rqctx: RequestContext, - body: TypedBody, -) -> Result { - let sa = rqctx.context(); - sa.ensure_scrimlet_host_ports(body.into_inner().uplinks).await?; - Ok(HttpResponseUpdatedNoContent()) -} + async fn timesync_get( + rqctx: RequestContext, + ) -> Result, HttpError> { + let sa = rqctx.context(); + Ok(HttpResponseOk(sa.timesync_get().await.map_err(|e| Error::from(e))?)) + } -/// This API endpoint is only reading the local sled agent's view of the -/// bootstore. The boostore is a distributed data store that is eventually -/// consistent. Reads from individual nodes may not represent the latest state. 
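set_v2p, del_v2p, and list_v2p above are thin HTTP wrappers over the sled agent's virtual-to-physical NIC mapping state. A hypothetical sketch of the sort of table such handlers sit on top of; the key and value types here are illustrative, not omicron's:

    use std::collections::HashMap;
    use std::net::Ipv6Addr;
    use tokio::sync::Mutex;

    struct V2PTable {
        // illustrative: virtual NIC id -> physical sled underlay address
        mappings: Mutex<HashMap<u64, Ipv6Addr>>,
    }

    impl V2PTable {
        async fn set(&self, nic: u64, host: Ipv6Addr) {
            self.mappings.lock().await.insert(nic, host);
        }
        async fn unset(&self, nic: u64) {
            self.mappings.lock().await.remove(&nic);
        }
        async fn list(&self) -> Vec<(u64, Ipv6Addr)> {
            self.mappings.lock().await.iter().map(|(k, v)| (*k, *v)).collect()
        }
    }

    #[tokio::main]
    async fn main() {
        let t = V2PTable { mappings: Mutex::new(HashMap::new()) };
        t.set(7, Ipv6Addr::LOCALHOST).await;
        assert_eq!(t.list().await.len(), 1);
        t.unset(7).await;
        assert!(t.list().await.is_empty());
    }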
-#[endpoint { - method = GET, - path = "/network-bootstore-config", -}] -async fn read_network_bootstore_config_cache( - rqctx: RequestContext, -) -> Result, HttpError> { - let sa = rqctx.context(); - let bs = sa.bootstore(); - - let config = bs.get_network_config().await.map_err(|e| { - HttpError::for_internal_error(format!("failed to get bootstore: {e}")) - })?; - - let config = match config { - Some(config) => EarlyNetworkConfig::deserialize_bootstore_config( - &rqctx.log, &config, - ) - .map_err(|e| { - HttpError::for_internal_error(format!( - "deserialize early network config: {e}" - )) - })?, - None => { - return Err(HttpError::for_unavail( - None, - "early network config does not exist yet".into(), - )); - } - }; + async fn uplink_ensure( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + sa.ensure_scrimlet_host_ports(body.into_inner().uplinks).await?; + Ok(HttpResponseUpdatedNoContent()) + } - Ok(HttpResponseOk(config)) -} + async fn read_network_bootstore_config_cache( + rqctx: RequestContext, + ) -> Result, HttpError> { + let sa = rqctx.context(); + let bs = sa.bootstore(); -#[endpoint { - method = PUT, - path = "/network-bootstore-config", -}] -async fn write_network_bootstore_config( - rqctx: RequestContext, - body: TypedBody, -) -> Result { - let sa = rqctx.context(); - let bs = sa.bootstore(); - let config = body.into_inner(); - - bs.update_network_config(NetworkConfig::from(config)).await.map_err( - |e| { + let config = bs.get_network_config().await.map_err(|e| { HttpError::for_internal_error(format!( - "failed to write updated config to boot store: {e}" + "failed to get bootstore: {e}" )) - }, - )?; - - Ok(HttpResponseUpdatedNoContent()) -} + })?; -/// Add a sled to a rack that was already initialized via RSS -#[endpoint { - method = PUT, - path = "/sleds" -}] -async fn sled_add( - rqctx: RequestContext, - body: TypedBody, -) -> Result { - let sa = rqctx.context(); - let request = body.into_inner(); - - // Perform some minimal validation - if request.start_request.body.use_trust_quorum - && !request.start_request.body.is_lrtq_learner - { - return Err(HttpError::for_bad_request( - None, - "New sleds must be LRTQ learners if trust quorum is in use" - .to_string(), - )); + let config = match config { + Some(config) => EarlyNetworkConfig::deserialize_bootstore_config( + &rqctx.log, &config, + ) + .map_err(|e| { + HttpError::for_internal_error(format!( + "deserialize early network config: {e}" + )) + })?, + None => { + return Err(HttpError::for_unavail( + None, + "early network config does not exist yet".into(), + )); + } + }; + + Ok(HttpResponseOk(config)) } - crate::sled_agent::sled_add( - sa.logger().clone(), - request.sled_id, - request.start_request, - ) - .await - .map_err(|e| { - let message = format!("Failed to add sled to rack cluster: {e}"); - HttpError { - status_code: http::StatusCode::INTERNAL_SERVER_ERROR, - error_code: None, - external_message: message.clone(), - internal_message: message, - } - })?; - Ok(HttpResponseUpdatedNoContent()) -} - -#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, Serialize)] -pub struct BootDiskPathParams { - pub boot_disk: M2Slot, -} - -#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, Serialize)] -pub struct BootDiskUpdatePathParams { - pub boot_disk: M2Slot, - pub update_id: Uuid, -} - -#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, Serialize)] -pub struct BootDiskWriteStartQueryParams { - pub update_id: Uuid, - // TODO do we already have sha2-256 hashes of the OS images, and if 
so - // should we use that instead? Another option is to use the external API - // `Digest` type, although it predates `serde_human_bytes` so just stores - // the hash as a `String`. - #[serde(with = "serde_human_bytes::hex_array")] - #[schemars(schema_with = "omicron_common::hex_schema::<32>")] - pub sha3_256_digest: [u8; 32], -} - -/// Write a new host OS image to the specified boot disk -#[endpoint { - method = POST, - path = "/boot-disk/{boot_disk}/os/write", -}] -async fn host_os_write_start( - request_context: RequestContext, - path_params: Path, - query_params: Query, - body: StreamingBody, -) -> Result { - let sa = request_context.context(); - let boot_disk = path_params.into_inner().boot_disk; - - // Find our corresponding disk. - let maybe_disk_path = - sa.storage().get_latest_disks().await.iter_managed().find_map( - |(_identity, disk)| { - // Synthetic disks panic if asked for their `slot()`, so filter - // them out first; additionally, filter out any non-M2 disks. - if disk.is_synthetic() || disk.variant() != DiskVariant::M2 { - return None; - } - - // Convert this M2 disk's slot to an M2Slot, and skip any that - // don't match the requested boot_disk. - let Ok(slot) = M2Slot::try_from(disk.slot()) else { - return None; - }; - if slot != boot_disk { - return None; - } - - let raw_devs_path = true; - Some(disk.boot_image_devfs_path(raw_devs_path)) + async fn write_network_bootstore_config( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let bs = sa.bootstore(); + let config = body.into_inner(); + + bs.update_network_config(NetworkConfig::from(config)).await.map_err( + |e| { + HttpError::for_internal_error(format!( + "failed to write updated config to boot store: {e}" + )) }, - ); + )?; - let disk_path = match maybe_disk_path { - Some(Ok(path)) => path, - Some(Err(err)) => { - let message = format!( - "failed to find devfs path for {boot_disk:?}: {}", - DisplayErrorChain::new(&err) - ); - return Err(HttpError { - status_code: http::StatusCode::SERVICE_UNAVAILABLE, - error_code: None, - external_message: message.clone(), - internal_message: message, - }); + Ok(HttpResponseUpdatedNoContent()) + } + + async fn sled_add( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let request = body.into_inner(); + + // Perform some minimal validation + if request.start_request.body.use_trust_quorum + && !request.start_request.body.is_lrtq_learner + { + return Err(HttpError::for_bad_request( + None, + "New sleds must be LRTQ learners if trust quorum is in use" + .to_string(), + )); } - None => { - let message = format!("no disk found for slot {boot_disk:?}",); - return Err(HttpError { - status_code: http::StatusCode::SERVICE_UNAVAILABLE, + + crate::sled_agent::sled_add( + sa.logger().clone(), + request.sled_id, + request.start_request, + ) + .await + .map_err(|e| { + let message = format!("Failed to add sled to rack cluster: {e}"); + HttpError { + status_code: http::StatusCode::INTERNAL_SERVER_ERROR, error_code: None, external_message: message.clone(), internal_message: message, - }); - } - }; - - let BootDiskWriteStartQueryParams { update_id, sha3_256_digest } = - query_params.into_inner(); - sa.boot_disk_os_writer() - .start_update( - boot_disk, - disk_path, - update_id, - sha3_256_digest, - body.into_stream(), - ) - .await - .map_err(|err| HttpError::from(&*err))?; - Ok(HttpResponseUpdatedNoContent()) -} + } + })?; + Ok(HttpResponseUpdatedNoContent()) + } -/// Current progress of an OS image being written to 
disk. -#[derive( - Debug, Clone, Copy, PartialEq, Eq, Deserialize, JsonSchema, Serialize, -)] -#[serde(tag = "state", rename_all = "snake_case")] -pub enum BootDiskOsWriteProgress { - /// The image is still being uploaded. - ReceivingUploadedImage { bytes_received: usize }, - /// The image is being written to disk. - WritingImageToDisk { bytes_written: usize }, - /// The image is being read back from disk for validation. - ValidatingWrittenImage { bytes_read: usize }, -} + async fn host_os_write_start( + request_context: RequestContext, + path_params: Path, + query_params: Query, + body: StreamingBody, + ) -> Result { + let sa = request_context.context(); + let boot_disk = path_params.into_inner().boot_disk; + + // Find our corresponding disk. + let maybe_disk_path = + sa.storage().get_latest_disks().await.iter_managed().find_map( + |(_identity, disk)| { + // Synthetic disks panic if asked for their `slot()`, so filter + // them out first; additionally, filter out any non-M2 disks. + if disk.is_synthetic() || disk.variant() != DiskVariant::M2 + { + return None; + } + + // Convert this M2 disk's slot to an M2Slot, and skip any that + // don't match the requested boot_disk. + let Ok(slot) = M2Slot::try_from(disk.slot()) else { + return None; + }; + if slot != boot_disk { + return None; + } + + let raw_devs_path = true; + Some(disk.boot_image_devfs_path(raw_devs_path)) + }, + ); -/// Status of an update to a boot disk OS. -#[derive(Debug, Clone, Deserialize, JsonSchema, Serialize)] -#[serde(tag = "status", rename_all = "snake_case")] -pub enum BootDiskOsWriteStatus { - /// No update has been started for this disk, or any previously-started - /// update has completed and had its status cleared. - NoUpdateStarted, - /// An update is currently running. - InProgress { update_id: Uuid, progress: BootDiskOsWriteProgress }, - /// The most recent update completed successfully. - Complete { update_id: Uuid }, - /// The most recent update failed. 
- Failed { update_id: Uuid, message: String }, -} + let disk_path = match maybe_disk_path { + Some(Ok(path)) => path, + Some(Err(err)) => { + let message = format!( + "failed to find devfs path for {boot_disk:?}: {}", + DisplayErrorChain::new(&err) + ); + return Err(HttpError { + status_code: http::StatusCode::SERVICE_UNAVAILABLE, + error_code: None, + external_message: message.clone(), + internal_message: message, + }); + } + None => { + let message = format!("no disk found for slot {boot_disk:?}",); + return Err(HttpError { + status_code: http::StatusCode::SERVICE_UNAVAILABLE, + error_code: None, + external_message: message.clone(), + internal_message: message, + }); + } + }; + + let BootDiskWriteStartQueryParams { update_id, sha3_256_digest } = + query_params.into_inner(); + sa.boot_disk_os_writer() + .start_update( + boot_disk, + disk_path, + update_id, + sha3_256_digest, + body.into_stream(), + ) + .await + .map_err(|err| HttpError::from(&*err))?; + Ok(HttpResponseUpdatedNoContent()) + } -/// Get the status of writing a new host OS -#[endpoint { - method = GET, - path = "/boot-disk/{boot_disk}/os/write/status", -}] -async fn host_os_write_status_get( - request_context: RequestContext, - path_params: Path, -) -> Result, HttpError> { - let sa = request_context.context(); - let boot_disk = path_params.into_inner().boot_disk; - let status = sa.boot_disk_os_writer().status(boot_disk); - Ok(HttpResponseOk(status)) -} + async fn host_os_write_status_get( + request_context: RequestContext, + path_params: Path, + ) -> Result, HttpError> { + let sa = request_context.context(); + let boot_disk = path_params.into_inner().boot_disk; + let status = sa.boot_disk_os_writer().status(boot_disk); + Ok(HttpResponseOk(status)) + } -/// Clear the status of a completed write of a new host OS -#[endpoint { - method = DELETE, - path = "/boot-disk/{boot_disk}/os/write/status/{update_id}", -}] -async fn host_os_write_status_delete( - request_context: RequestContext, - path_params: Path, -) -> Result { - let sa = request_context.context(); - let BootDiskUpdatePathParams { boot_disk, update_id } = - path_params.into_inner(); - sa.boot_disk_os_writer() - .clear_terminal_status(boot_disk, update_id) - .map_err(|err| HttpError::from(&err))?; - Ok(HttpResponseUpdatedNoContent()) -} + async fn host_os_write_status_delete( + request_context: RequestContext, + path_params: Path, + ) -> Result { + let sa = request_context.context(); + let BootDiskUpdatePathParams { boot_disk, update_id } = + path_params.into_inner(); + sa.boot_disk_os_writer() + .clear_terminal_status(boot_disk, update_id) + .map_err(|err| HttpError::from(&err))?; + Ok(HttpResponseUpdatedNoContent()) + } -/// Fetch basic information about this sled -#[endpoint { - method = GET, - path = "/inventory", -}] -async fn inventory( - request_context: RequestContext, -) -> Result, HttpError> { - let sa = request_context.context(); - Ok(HttpResponseOk(sa.inventory().await?)) -} + async fn inventory( + request_context: RequestContext, + ) -> Result, HttpError> { + let sa = request_context.context(); + Ok(HttpResponseOk(sa.inventory().await?)) + } -/// Fetch sled identifiers -#[endpoint { - method = GET, - path = "/sled-identifiers", -}] -async fn sled_identifiers( - request_context: RequestContext, -) -> Result, HttpError> { - Ok(HttpResponseOk(request_context.context().sled_identifiers())) -} + async fn sled_identifiers( + request_context: RequestContext, + ) -> Result, HttpError> { + Ok(HttpResponseOk(request_context.context().sled_identifiers())) + } -/// Get 
the internal state of the local bootstore node -#[endpoint { - method = GET, - path = "/bootstore/status", -}] -async fn bootstore_status( - request_context: RequestContext, -) -> Result, HttpError> { - let sa = request_context.context(); - let bootstore = sa.bootstore(); - let status = bootstore - .get_status() - .await - .map_err(|e| { - HttpError::from(omicron_common::api::external::Error::from(e)) - })? - .into(); - Ok(HttpResponseOk(status)) -} + async fn bootstore_status( + request_context: RequestContext, + ) -> Result, HttpError> { + let sa = request_context.context(); + let bootstore = sa.bootstore(); + let status = bootstore + .get_status() + .await + .map_err(|e| { + HttpError::from(omicron_common::api::external::Error::from(e)) + })? + .into(); + Ok(HttpResponseOk(status)) + } -/// Get the current versions of VPC routing rules. -#[endpoint { - method = GET, - path = "/vpc-routes", -}] -async fn list_vpc_routes( - request_context: RequestContext, -) -> Result>, HttpError> { - let sa = request_context.context(); - Ok(HttpResponseOk(sa.list_vpc_routes())) -} + async fn list_vpc_routes( + request_context: RequestContext, + ) -> Result>, HttpError> { + let sa = request_context.context(); + Ok(HttpResponseOk(sa.list_vpc_routes())) + } -/// Update VPC routing rules. -#[endpoint { - method = PUT, - path = "/vpc-routes", -}] -async fn set_vpc_routes( - request_context: RequestContext, - body: TypedBody>, -) -> Result { - let sa = request_context.context(); - sa.set_vpc_routes(body.into_inner())?; - Ok(HttpResponseUpdatedNoContent()) + async fn set_vpc_routes( + request_context: RequestContext, + body: TypedBody>, + ) -> Result { + let sa = request_context.context(); + sa.set_vpc_routes(body.into_inner())?; + Ok(HttpResponseUpdatedNoContent()) + } } diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 5dc4e1e6a2..0bcbc97fd2 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -13,13 +13,6 @@ use crate::instance_manager::{ }; use crate::metrics::MetricsRequestQueue; use crate::nexus::NexusClient; -use crate::params::ZoneBundleMetadata; -use crate::params::{InstanceExternalIpBody, ZoneBundleCause}; -use crate::params::{ - InstanceHardware, InstanceMetadata, InstanceMigrationTargetParams, - InstancePutStateResponse, InstanceStateRequested, - InstanceUnregisterResponse, VpcFirewallRule, -}; use crate::profile::*; use crate::zone_bundle::BundleError; use crate::zone_bundle::ZoneBundler; @@ -36,7 +29,7 @@ use omicron_common::api::internal::nexus::{ SledInstanceState, VmmRuntimeState, }; use omicron_common::api::internal::shared::{ - NetworkInterface, SledIdentifiers, SourceNatConfig, + NetworkInterface, ResolvedVpcFirewallRule, SledIdentifiers, SourceNatConfig, }; use omicron_common::backoff; use omicron_common::zpool_name::ZpoolName; @@ -44,6 +37,8 @@ use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid}; use propolis_client::Client as PropolisClient; use rand::prelude::IteratorRandom; use rand::SeedableRng; +use sled_agent_types::instance::*; +use sled_agent_types::zone_bundle::{ZoneBundleCause, ZoneBundleMetadata}; use sled_storage::dataset::ZONE_DATASET; use sled_storage::manager::StorageHandle; use slog::Logger; @@ -225,7 +220,7 @@ enum InstanceRequest { tx: oneshot::Sender, }, PutState { - state: crate::params::InstanceStateRequested, + state: InstanceStateRequested, tx: oneshot::Sender>, }, Terminate { @@ -337,7 +332,7 @@ struct InstanceRunner { source_nat: SourceNatConfig, ephemeral_ip: Option, floating_ips: Vec, - 
firewall_rules: Vec, + firewall_rules: Vec, dhcp_config: DhcpCfg, // Disk related properties @@ -1158,7 +1153,7 @@ impl Instance { pub async fn put_state( &self, tx: oneshot::Sender>, - state: crate::params::InstanceStateRequested, + state: InstanceStateRequested, ) -> Result<(), Error> { self.tx .send(InstanceRequest::PutState { state, tx }) @@ -1305,7 +1300,7 @@ impl InstanceRunner { async fn put_state( &mut self, - state: crate::params::InstanceStateRequested, + state: InstanceStateRequested, ) -> Result { use propolis_client::types::InstanceStateRequested as PropolisRequest; let (propolis_state, next_published) = match state { @@ -1569,14 +1564,12 @@ mod tests { use crate::metrics; use crate::nexus::make_nexus_client_with_port; use crate::vmm_reservoir::VmmReservoirManagerHandle; - use crate::zone_bundle::CleanupContext; use camino_tempfile::Utf8TempDir; use dns_server::TransientServer; use dropshot::HttpServer; use illumos_utils::dladm::MockDladm; use illumos_utils::dladm::__mock_MockDladm::__create_vnic::Context as MockDladmCreateVnicContext; use illumos_utils::dladm::__mock_MockDladm::__delete_vnic::Context as MockDladmDeleteVnicContext; - use illumos_utils::opte::params::DhcpConfig; use illumos_utils::svc::__wait_for_service::Context as MockWaitForServiceContext; use illumos_utils::zone::MockZones; use illumos_utils::zone::__mock_MockZones::__boot::Context as MockZonesBootContext; @@ -1588,8 +1581,9 @@ mod tests { use omicron_common::api::internal::nexus::{ InstanceProperties, InstanceRuntimeState, VmmState, }; - use omicron_common::api::internal::shared::SledIdentifiers; + use omicron_common::api::internal::shared::{DhcpConfig, SledIdentifiers}; use omicron_common::FileKv; + use sled_agent_types::zone_bundle::CleanupContext; use sled_storage::manager_test_harness::StorageManagerTestHarness; use std::net::Ipv6Addr; use std::net::SocketAddrV6; diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index fe070464ad..63164ed290 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -8,13 +8,6 @@ use crate::instance::propolis_zone_name; use crate::instance::Instance; use crate::metrics::MetricsRequestQueue; use crate::nexus::NexusClient; -use crate::params::InstanceExternalIpBody; -use crate::params::InstanceMetadata; -use crate::params::ZoneBundleMetadata; -use crate::params::{ - InstanceHardware, InstancePutStateResponse, InstanceStateRequested, - InstanceUnregisterResponse, -}; use crate::vmm_reservoir::VmmReservoirManagerHandle; use crate::zone_bundle::BundleError; use crate::zone_bundle::ZoneBundler; @@ -32,6 +25,8 @@ use omicron_common::api::internal::nexus::VmmRuntimeState; use omicron_common::api::internal::shared::SledIdentifiers; use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PropolisUuid; +use sled_agent_types::instance::*; +use sled_agent_types::zone_bundle::ZoneBundleMetadata; use sled_storage::manager::StorageHandle; use sled_storage::resources::AllDisks; use slog::Logger; diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index e920ffc3fc..68389ccf43 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -21,9 +21,10 @@ use crate::hardware_monitor::HardwareMonitor; use crate::services::ServiceManager; use crate::sled_agent::SledAgent; use crate::storage_monitor::{StorageMonitor, StorageMonitorHandle}; -use crate::zone_bundle::{CleanupContext, ZoneBundler}; +use crate::zone_bundle::ZoneBundler; use 
bootstore::schemes::v0 as bootstore; use key_manager::{KeyManager, StorageKeyRequester}; +use sled_agent_types::zone_bundle::CleanupContext; use sled_hardware::{HardwareManager, SledMode, UnparsedDisk}; use sled_storage::config::MountConfig; use sled_storage::disk::RawSyntheticDisk; diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index aa5e8fd26f..419e897d75 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -2,234 +2,11 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use crate::zone_bundle::PriorityOrder; -pub use crate::zone_bundle::ZoneBundleCause; -pub use crate::zone_bundle::ZoneBundleId; -pub use crate::zone_bundle::ZoneBundleMetadata; -pub use illumos_utils::opte::params::DhcpConfig; -pub use illumos_utils::opte::params::VpcFirewallRule; -pub use illumos_utils::opte::params::VpcFirewallRulesEnsureBody; use nexus_sled_agent_shared::inventory::{OmicronZoneConfig, OmicronZoneType}; -use omicron_common::api::internal::nexus::{ - DiskRuntimeState, InstanceProperties, InstanceRuntimeState, - SledInstanceState, VmmRuntimeState, -}; -use omicron_common::api::internal::shared::{ - NetworkInterface, SourceNatConfig, -}; -use omicron_common::disk::DiskVariant; -use omicron_uuid_kinds::PropolisUuid; -use omicron_uuid_kinds::ZpoolUuid; -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; pub use sled_hardware::DendriteAsic; -use sled_hardware_types::Baseboard; use sled_storage::dataset::DatasetName; use sled_storage::dataset::DatasetType; -use std::collections::BTreeSet; -use std::fmt::{Debug, Display, Formatter, Result as FormatResult}; -use std::net::{IpAddr, SocketAddr, SocketAddrV6}; -use std::time::Duration; -use uuid::Uuid; - -/// Used to request a Disk state change -#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize, JsonSchema)] -#[serde(rename_all = "lowercase", tag = "state", content = "instance")] -pub enum DiskStateRequested { - Detached, - Attached(Uuid), - Destroyed, - Faulted, -} - -impl DiskStateRequested { - /// Returns whether the requested state is attached to an Instance or not. - pub fn is_attached(&self) -> bool { - match self { - DiskStateRequested::Detached => false, - DiskStateRequested::Destroyed => false, - DiskStateRequested::Faulted => false, - - DiskStateRequested::Attached(_) => true, - } - } -} - -/// Sent from to a sled agent to establish the runtime state of a Disk -#[derive(Serialize, Deserialize, JsonSchema)] -pub struct DiskEnsureBody { - /// Last runtime state of the Disk known to Nexus (used if the agent has - /// never seen this Disk before). - pub initial_runtime: DiskRuntimeState, - /// requested runtime state of the Disk - pub target: DiskStateRequested, -} - -/// Describes the instance hardware. -#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] -pub struct InstanceHardware { - pub properties: InstanceProperties, - pub nics: Vec, - pub source_nat: SourceNatConfig, - /// Zero or more external IP addresses (either floating or ephemeral), - /// provided to an instance to allow inbound connectivity. - pub ephemeral_ip: Option, - pub floating_ips: Vec, - pub firewall_rules: Vec, - pub dhcp_config: DhcpConfig, - // TODO: replace `propolis_client::*` with locally-modeled request type - pub disks: Vec, - pub cloud_init_bytes: Option, -} - -/// Metadata used to track statistics about an instance. 
-/// -// NOTE: The instance ID is not here, since it's already provided in other -// pieces of the instance-related requests. It is pulled from there when -// publishing metrics for the instance. -#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] -pub struct InstanceMetadata { - pub silo_id: Uuid, - pub project_id: Uuid, -} - -/// The body of a request to ensure that a instance and VMM are known to a sled -/// agent. -#[derive(Serialize, Deserialize, JsonSchema)] -pub struct InstanceEnsureBody { - /// A description of the instance's virtual hardware and the initial runtime - /// state this sled agent should store for this incarnation of the instance. - pub hardware: InstanceHardware, - - /// The instance runtime state for the instance being registered. - pub instance_runtime: InstanceRuntimeState, - - /// The initial VMM runtime state for the VMM being registered. - pub vmm_runtime: VmmRuntimeState, - - /// The ID of the VMM being registered. This may not be the active VMM ID in - /// the instance runtime state (e.g. if the new VMM is going to be a - /// migration target). - pub propolis_id: PropolisUuid, - - /// The address at which this VMM should serve a Propolis server API. - pub propolis_addr: SocketAddr, - - /// Metadata used to track instance statistics. - pub metadata: InstanceMetadata, -} - -/// The body of a request to move a previously-ensured instance into a specific -/// runtime state. -#[derive(Serialize, Deserialize, JsonSchema)] -pub struct InstancePutStateBody { - /// The state into which the instance should be driven. - pub state: InstanceStateRequested, -} - -/// The response sent from a request to move an instance into a specific runtime -/// state. -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct InstancePutStateResponse { - /// The current runtime state of the instance after handling the request to - /// change its state. If the instance's state did not change, this field is - /// `None`. - pub updated_runtime: Option, -} - -/// The response sent from a request to unregister an instance. -#[derive(Serialize, Deserialize, JsonSchema)] -pub struct InstanceUnregisterResponse { - /// The current state of the instance after handling the request to - /// unregister it. If the instance's state did not change, this field is - /// `None`. - pub updated_runtime: Option, -} - -/// Parameters used when directing Propolis to initialize itself via live -/// migration. -#[derive(Copy, Clone, Debug, Deserialize, Serialize, JsonSchema)] -pub struct InstanceMigrationTargetParams { - /// The Propolis ID of the migration source. - pub src_propolis_id: Uuid, - - /// The address of the Propolis server that will serve as the migration - /// source. - pub src_propolis_addr: SocketAddr, -} - -/// Requestable running state of an Instance. -/// -/// A subset of [`omicron_common::api::external::InstanceState`]. -#[derive(Copy, Clone, Debug, Deserialize, Serialize, JsonSchema)] -#[serde(rename_all = "snake_case", tag = "type", content = "value")] -pub enum InstanceStateRequested { - /// Run this instance by migrating in from a previous running incarnation of - /// the instance. - MigrationTarget(InstanceMigrationTargetParams), - /// Start the instance if it is not already running. - Running, - /// Stop the instance. - Stopped, - /// Immediately reset the instance, as though it had stopped and immediately - /// began to run again. 
-    Reboot,
-}
-
-impl Display for InstanceStateRequested {
-    fn fmt(&self, f: &mut Formatter) -> FormatResult {
-        write!(f, "{}", self.label())
-    }
-}
-
-impl InstanceStateRequested {
-    fn label(&self) -> &str {
-        match self {
-            InstanceStateRequested::MigrationTarget(_) => "migrating in",
-            InstanceStateRequested::Running => "running",
-            InstanceStateRequested::Stopped => "stopped",
-            InstanceStateRequested::Reboot => "reboot",
-        }
-    }
-
-    /// Returns true if the state represents a stopped Instance.
-    pub fn is_stopped(&self) -> bool {
-        match self {
-            InstanceStateRequested::MigrationTarget(_) => false,
-            InstanceStateRequested::Running => false,
-            InstanceStateRequested::Stopped => true,
-            InstanceStateRequested::Reboot => false,
-        }
-    }
-}
-
-/// Instance runtime state to update for a migration.
-#[derive(Copy, Clone, Debug, Deserialize, Serialize, JsonSchema)]
-pub struct InstanceMigrationSourceParams {
-    pub migration_id: Uuid,
-    pub dst_propolis_id: PropolisUuid,
-}
-
-#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)]
-pub enum DiskType {
-    U2,
-    M2,
-}
-
-impl From<DiskVariant> for DiskType {
-    fn from(v: DiskVariant) -> Self {
-        match v {
-            DiskVariant::U2 => Self::U2,
-            DiskVariant::M2 => Self::M2,
-        }
-    }
-}
-
-#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)]
-pub struct Zpool {
-    pub id: ZpoolUuid,
-    pub disk_type: DiskType,
-}
+use std::net::SocketAddrV6;
 
 /// Extension trait for `OmicronZoneConfig`.
 ///
@@ -312,88 +89,3 @@ impl OmicronZoneTypeExt for OmicronZoneConfig {
         &self.zone_type
     }
 }
-
-#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)]
-pub struct TimeSync {
-    /// The synchronization state of the sled, true when the system clock
-    /// and the NTP clock are in sync (to within a small window).
-    pub sync: bool,
-    /// The NTP reference ID.
-    pub ref_id: u32,
-    /// The NTP reference IP address.
-    pub ip_addr: IpAddr,
-    /// The NTP stratum (our upstream's stratum plus one).
-    pub stratum: u8,
-    /// The NTP reference time (i.e. what chrony thinks the current time is, not
-    /// necessarily the current system time).
-    pub ref_time: f64,
-    // This could be f32, but there is a problem with progenitor/typify
-    // where, although the f32 correctly becomes "float" (and not "double") in
-    // the API spec, that "float" gets converted back to f64 when generating
-    // the client.
-    /// The current offset between the NTP clock and system clock.
-    pub correction: f64,
-}
-
-/// Parameters used to update the zone bundle cleanup context.
-#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)]
-pub struct CleanupContextUpdate {
-    /// The new period on which automatic cleanups are run.
-    pub period: Option<Duration>,
-    /// The priority ordering for preserving old zone bundles.
-    pub priority: Option<PriorityOrder>,
-    /// The new limit on the underlying dataset quota allowed for bundles.
-    pub storage_limit: Option<u8>,
-}
-
-/// Used to dynamically update external IPs attached to an instance.
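Two serde representations appear among the types being relocated in this patch: InstanceStateRequested (above) is adjacently tagged (tag = "type", content = "value"), while BootDiskOsWriteProgress earlier in this diff is internally tagged (tag = "state"). A runnable sketch contrasting the two wire formats, with a String standing in for the migration-target params:

    // assumes the serde (with derive) and serde_json crates
    use serde::{Deserialize, Serialize};

    // Adjacent tagging, as on InstanceStateRequested: the payload moves to
    // a separate "value" key; unit variants serialize as just the tag.
    #[derive(Debug, PartialEq, Serialize, Deserialize)]
    #[serde(rename_all = "snake_case", tag = "type", content = "value")]
    enum StateRequested {
        MigrationTarget(String), // String stands in for the params struct
        Running,
    }

    // Internal tagging, as on BootDiskOsWriteProgress: the tag is folded
    // into the same object as the variant's fields.
    #[derive(Debug, PartialEq, Serialize, Deserialize)]
    #[serde(tag = "state", rename_all = "snake_case")]
    enum Progress {
        WritingImageToDisk { bytes_written: usize },
    }

    fn main() {
        let s = serde_json::to_string(&StateRequested::Running).unwrap();
        assert_eq!(s, r#"{"type":"running"}"#);
        let m = serde_json::to_string(&StateRequested::MigrationTarget("src".into())).unwrap();
        assert_eq!(m, r#"{"type":"migration_target","value":"src"}"#);
        let p = serde_json::to_string(&Progress::WritingImageToDisk { bytes_written: 4096 }).unwrap();
        assert_eq!(p, r#"{"state":"writing_image_to_disk","bytes_written":4096}"#);
    }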
-#[derive( - Copy, Clone, Debug, Eq, PartialEq, Hash, Deserialize, JsonSchema, Serialize, -)] -#[serde(rename_all = "snake_case", tag = "type", content = "value")] -pub enum InstanceExternalIpBody { - Ephemeral(IpAddr), - Floating(IpAddr), -} - -#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] -pub struct EstablishedConnection { - baseboard: Baseboard, - addr: SocketAddrV6, -} - -impl From<(Baseboard, SocketAddrV6)> for EstablishedConnection { - fn from(value: (Baseboard, SocketAddrV6)) -> Self { - EstablishedConnection { baseboard: value.0, addr: value.1 } - } -} - -#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] -pub struct BootstoreStatus { - pub fsm_ledger_generation: u64, - pub network_config_ledger_generation: Option, - pub fsm_state: String, - pub peers: BTreeSet, - pub established_connections: Vec, - pub accepted_connections: BTreeSet, - pub negotiating_connections: BTreeSet, -} - -impl From for BootstoreStatus { - fn from(value: bootstore::schemes::v0::Status) -> Self { - BootstoreStatus { - fsm_ledger_generation: value.fsm_ledger_generation, - network_config_ledger_generation: value - .network_config_ledger_generation, - fsm_state: value.fsm_state.to_string(), - peers: value.peers, - established_connections: value - .connections - .into_iter() - .map(EstablishedConnection::from) - .collect(), - accepted_connections: value.accepted_connections, - negotiating_connections: value.negotiating_connections, - } - } -} diff --git a/sled-agent/src/probe_manager.rs b/sled-agent/src/probe_manager.rs index 529ef392b7..42186f66e9 100644 --- a/sled-agent/src/probe_manager.rs +++ b/sled-agent/src/probe_manager.rs @@ -3,7 +3,6 @@ use crate::nexus::NexusClient; use anyhow::{anyhow, Result}; use illumos_utils::dladm::Etherstub; use illumos_utils::link::VnicAllocator; -use illumos_utils::opte::params::VpcFirewallRule; use illumos_utils::opte::{DhcpCfg, PortCreateParams, PortManager}; use illumos_utils::running_zone::{RunningZone, ZoneBuilderFactory}; use illumos_utils::zone::Zones; @@ -14,7 +13,9 @@ use omicron_common::api::external::{ Generation, VpcFirewallRuleAction, VpcFirewallRuleDirection, VpcFirewallRulePriority, VpcFirewallRuleStatus, }; -use omicron_common::api::internal::shared::NetworkInterface; +use omicron_common::api::internal::shared::{ + NetworkInterface, ResolvedVpcFirewallRule, +}; use rand::prelude::IteratorRandom; use rand::SeedableRng; use sled_storage::dataset::ZONE_DATASET; @@ -308,7 +309,7 @@ impl ProbeManagerInner { source_nat: None, ephemeral_ip: Some(eip.ip), floating_ips: &[], - firewall_rules: &[VpcFirewallRule { + firewall_rules: &[ResolvedVpcFirewallRule { status: VpcFirewallRuleStatus::Enabled, direction: VpcFirewallRuleDirection::Inbound, targets: vec![nic.clone()], diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 471717989a..ff137f131f 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -4,7 +4,6 @@ //! Plan generation for "where should services be initialized". 
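Back in the instance.rs hunk, InstanceRequest::PutState carries a oneshot::Sender alongside the requested state: the instance is an actor, with one runner task owning the mutable state while callers enqueue requests over a channel and await a reply on the oneshot. A stripped-down, runnable sketch of that request/reply plumbing (types are placeholders, not omicron's):

    use tokio::sync::{mpsc, oneshot};

    enum Request {
        // each request carries a oneshot sender for its reply
        PutState { state: String, tx: oneshot::Sender<Result<String, String>> },
    }

    #[tokio::main]
    async fn main() {
        let (tx, mut rx) = mpsc::channel::<Request>(8);

        // the runner task owns the state and services requests in order
        let runner = tokio::spawn(async move {
            while let Some(Request::PutState { state, tx }) = rx.recv().await {
                let _ = tx.send(Ok(format!("now {state}")));
            }
        });

        let (reply_tx, reply_rx) = oneshot::channel();
        tx.send(Request::PutState { state: "running".into(), tx: reply_tx })
            .await
            .unwrap();
        println!("{:?}", reply_rx.await.unwrap());

        // closing the channel lets the runner exit cleanly
        drop(tx);
        runner.await.unwrap();
    }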
-use crate::bootstrap::params::StartSledAgentRequest; use camino::Utf8PathBuf; use dns_service_client::types::DnsConfigParams; use illumos_utils::zpool::ZpoolName; @@ -40,6 +39,7 @@ use sled_agent_client::{ types as SledAgentTypes, Client as SledAgentClient, Error as SledAgentError, }; use sled_agent_types::rack_init::RackInitializeRequest as Config; +use sled_agent_types::sled::StartSledAgentRequest; use sled_storage::dataset::{DatasetName, DatasetType, CONFIG_DATASET}; use sled_storage::manager::StorageHandle; use slog::Logger; diff --git a/sled-agent/src/rack_setup/plan/sled.rs b/sled-agent/src/rack_setup/plan/sled.rs index 3d5b90a22d..c511cf1447 100644 --- a/sled-agent/src/rack_setup/plan/sled.rs +++ b/sled-agent/src/rack_setup/plan/sled.rs @@ -4,16 +4,15 @@ //! Plan generation for "how should sleds be initialized". -use crate::bootstrap::params::StartSledAgentRequestBody; -use crate::bootstrap::{ - config::BOOTSTRAP_AGENT_RACK_INIT_PORT, params::StartSledAgentRequest, -}; +use crate::bootstrap::config::BOOTSTRAP_AGENT_RACK_INIT_PORT; use camino::Utf8PathBuf; use omicron_common::ledger::{self, Ledger, Ledgerable}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use sled_agent_types::rack_init::back_compat::RackInitializeRequestV1 as ConfigV1; use sled_agent_types::rack_init::RackInitializeRequest as Config; +use sled_agent_types::sled::StartSledAgentRequest; +use sled_agent_types::sled::StartSledAgentRequestBody; use sled_storage::dataset::CONFIG_DATASET; use sled_storage::manager::StorageHandle; use slog::Logger; diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index ac51912fe6..bead95be80 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -69,10 +69,9 @@ use crate::bootstrap::config::BOOTSTRAP_AGENT_HTTP_PORT; use crate::bootstrap::early_networking::{ EarlyNetworkSetup, EarlyNetworkSetupError, }; -use crate::bootstrap::params::StartSledAgentRequest; use crate::bootstrap::rss_handle::BootstrapAgentHandle; use crate::nexus::d2n_params; -use crate::params::{OmicronZoneTypeExt, TimeSync}; +use crate::params::OmicronZoneTypeExt; use crate::rack_setup::plan::service::{ Plan as ServicePlan, PlanError as ServicePlanError, }; @@ -120,6 +119,8 @@ use sled_agent_types::early_networking::{ use sled_agent_types::rack_init::{ BootstrapAddressDiscovery, RackInitializeRequest as Config, }; +use sled_agent_types::sled::StartSledAgentRequest; +use sled_agent_types::time_sync::TimeSync; use sled_hardware_types::underlay::BootstrapInterface; use sled_storage::dataset::CONFIG_DATASET; use sled_storage::manager::StorageHandle; diff --git a/sled-agent/src/server.rs b/sled-agent/src/server.rs index 43fb64914f..c4f3e1008f 100644 --- a/sled-agent/src/server.rs +++ b/sled-agent/src/server.rs @@ -7,11 +7,11 @@ use super::config::Config; use super::http_entrypoints::api as http_api; use super::sled_agent::SledAgent; -use crate::bootstrap::params::StartSledAgentRequest; use crate::long_running_tasks::LongRunningTaskHandles; use crate::nexus::make_nexus_client; use crate::services::ServiceManager; use internal_dns::resolver::Resolver; +use sled_agent_types::sled::StartSledAgentRequest; use slog::Logger; use std::net::SocketAddr; use std::sync::Arc; @@ -99,14 +99,3 @@ impl Server { self.http_server.close().await } } - -/// Runs the OpenAPI generator, emitting the spec to stdout. 
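The import churn in these rack-setup and server hunks is the other half of the params.rs removals above: request and response types now live in the shared sled-agent-types crate, so the real sled agent, the simulator, and RSS all consume one definition instead of three drifting copies. A toy sketch of that dependency shape, using modules to stand in for crates and an illustrative TimeSync:

    // modules stand in for crates; field set is trimmed for illustration
    mod sled_agent_types {
        pub mod time_sync {
            #[derive(Clone, Debug, PartialEq)]
            pub struct TimeSync {
                pub sync: bool,
                pub stratum: u8,
            }
        }
    }

    mod sled_agent {
        // the service re-exports the shared type instead of defining its own
        pub use crate::sled_agent_types::time_sync::TimeSync;
    }

    mod sim {
        // the simulator consumes the same definition, keeping wire formats equal
        pub use crate::sled_agent_types::time_sync::TimeSync;
    }

    fn main() {
        let t = sled_agent::TimeSync { sync: true, stratum: 3 };
        // same underlying type, so values move freely between consumers
        let u: sim::TimeSync = t.clone();
        assert_eq!(t, u);
    }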
-pub fn run_openapi() -> Result<(), String> { - http_api() - .openapi("Oxide Sled Agent API", "0.0.1") - .description("API for interacting with individual sleds") - .contact_url("https://oxide.computer") - .contact_email("api@oxide.computer") - .write(&mut std::io::stdout()) - .map_err(|e| e.to_string()) -} diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index e319b3fa15..b822ae2963 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -31,10 +31,7 @@ use crate::bootstrap::early_networking::{ use crate::bootstrap::BootstrapNetworking; use crate::config::SidecarRevision; use crate::metrics::MetricsRequestQueue; -use crate::params::{ - DendriteAsic, OmicronZoneConfigExt, OmicronZoneTypeExt, TimeSync, - ZoneBundleCause, ZoneBundleMetadata, -}; +use crate::params::{DendriteAsic, OmicronZoneConfigExt, OmicronZoneTypeExt}; use crate::profile::*; use crate::zone_bundle::BundleError; use crate::zone_bundle::ZoneBundler; @@ -96,6 +93,10 @@ use omicron_common::ledger::{self, Ledger, Ledgerable}; use omicron_ddm_admin_client::{Client as DdmAdminClient, DdmError}; use once_cell::sync::OnceCell; use rand::prelude::SliceRandom; +use sled_agent_types::{ + time_sync::TimeSync, + zone_bundle::{ZoneBundleCause, ZoneBundleMetadata}, +}; use sled_hardware::is_gimlet; use sled_hardware::underlay; use sled_hardware::SledMode; diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index ffb7327ce7..6057d03f70 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -410,7 +410,6 @@ impl SimCollection { #[cfg(test)] mod test { - use crate::params::{DiskStateRequested, InstanceStateRequested}; use crate::sim::collection::SimObject; use crate::sim::disk::SimDisk; use crate::sim::instance::SimInstance; @@ -427,6 +426,8 @@ mod test { use omicron_common::api::internal::nexus::VmmState; use omicron_test_utils::dev::test_setup_log; use omicron_uuid_kinds::PropolisUuid; + use sled_agent_types::disk::DiskStateRequested; + use sled_agent_types::instance::InstanceStateRequested; fn make_instance( logctx: &LogContext, diff --git a/sled-agent/src/sim/disk.rs b/sled-agent/src/sim/disk.rs index 284e424ebf..9661b1949b 100644 --- a/sled-agent/src/sim/disk.rs +++ b/sled-agent/src/sim/disk.rs @@ -5,7 +5,6 @@ //! Simulated sled agent implementation use crate::nexus::NexusClient; -use crate::params::DiskStateRequested; use crate::sim::simulatable::Simulatable; use async_trait::async_trait; use dropshot::ConfigLogging; @@ -20,6 +19,7 @@ use omicron_common::api::internal::nexus::ProducerKind; use oximeter_producer::LogConfig; use oximeter_producer::Server as ProducerServer; use propolis_client::types::DiskAttachmentState as PropolisDiskState; +use sled_agent_types::disk::DiskStateRequested; use std::net::{Ipv6Addr, SocketAddr}; use std::sync::Arc; use std::time::Duration; diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index d042e19814..c219a747ce 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -5,12 +5,6 @@ //! 
HTTP entrypoint functions for the sled agent's exposed API use super::collection::PokeMode; -use crate::bootstrap::params::AddSledRequest; -use crate::params::{ - DiskEnsureBody, InstanceEnsureBody, InstanceExternalIpBody, - InstancePutStateBody, InstancePutStateResponse, InstanceUnregisterResponse, - VpcFirewallRulesEnsureBody, -}; use dropshot::ApiDescription; use dropshot::HttpError; use dropshot::HttpResponseOk; @@ -19,20 +13,28 @@ use dropshot::Path; use dropshot::RequestContext; use dropshot::TypedBody; use dropshot::{endpoint, ApiDescriptionRegisterError}; -use illumos_utils::opte::params::VirtualNetworkInterfaceHost; use nexus_sled_agent_shared::inventory::{Inventory, OmicronZonesConfig}; use omicron_common::api::internal::nexus::DiskRuntimeState; use omicron_common::api::internal::nexus::SledInstanceState; use omicron_common::api::internal::nexus::UpdateArtifactId; +use omicron_common::api::internal::shared::VirtualNetworkInterfaceHost; use omicron_common::api::internal::shared::{ ResolvedVpcRouteSet, ResolvedVpcRouteState, SwitchPorts, }; +use omicron_common::disk::DisksManagementResult; use omicron_common::disk::OmicronPhysicalDisksConfig; use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use sled_agent_types::disk::DiskEnsureBody; use sled_agent_types::early_networking::EarlyNetworkConfig; -use sled_storage::resources::DisksManagementResult; +use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody; +use sled_agent_types::instance::InstanceEnsureBody; +use sled_agent_types::instance::InstanceExternalIpBody; +use sled_agent_types::instance::InstancePutStateBody; +use sled_agent_types::instance::InstancePutStateResponse; +use sled_agent_types::instance::InstanceUnregisterResponse; +use sled_agent_types::sled::AddSledRequest; use std::sync::Arc; use uuid::Uuid; diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index 8ee0130262..33bc1c40c1 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -8,7 +8,6 @@ use super::simulatable::Simulatable; use crate::common::instance::{ObservedPropolisState, PublishedVmmState}; use crate::nexus::NexusClient; -use crate::params::InstanceStateRequested; use async_trait::async_trait; use chrono::Utc; use nexus_client; @@ -21,6 +20,7 @@ use propolis_client::types::{ InstanceMigrationStatus as PropolisMigrationStatus, InstanceState as PropolisInstanceState, InstanceStateMonitorResponse, }; +use sled_agent_types::instance::InstanceStateRequested; use std::collections::VecDeque; use std::sync::Arc; use std::sync::Mutex; diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 79d57a42e6..10536c8c80 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -11,18 +11,12 @@ use super::instance::{self, SimInstance}; use super::storage::CrucibleData; use super::storage::Storage; use crate::nexus::NexusClient; -use crate::params::{ - DiskStateRequested, InstanceExternalIpBody, InstanceHardware, - InstanceMetadata, InstancePutStateResponse, InstanceStateRequested, - InstanceUnregisterResponse, -}; use crate::sim::simulatable::Simulatable; use crate::updates::UpdateManager; use anyhow::bail; use anyhow::Context; use dropshot::{HttpError, HttpServer}; use futures::lock::Mutex; -use illumos_utils::opte::params::VirtualNetworkInterfaceHost; use nexus_sled_agent_shared::inventory::{ Inventory, InventoryDisk, InventoryZpool, OmicronZonesConfig, SledRole, }; @@ -38,9 +32,11 @@ 
use omicron_common::api::internal::nexus::{ use omicron_common::api::internal::shared::{ RackNetworkConfig, ResolvedVpcRoute, ResolvedVpcRouteSet, ResolvedVpcRouteState, RouterId, RouterKind, RouterVersion, + VirtualNetworkInterfaceHost, }; use omicron_common::disk::{ - DiskIdentity, DiskVariant, OmicronPhysicalDisksConfig, + DiskIdentity, DiskVariant, DisksManagementResult, + OmicronPhysicalDisksConfig, }; use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid, ZpoolUuid}; use oxnet::Ipv6Net; @@ -48,10 +44,15 @@ use propolis_client::{ types::VolumeConstructionRequest, Client as PropolisClient, }; use propolis_mock_server::Context as PropolisContext; +use sled_agent_types::disk::DiskStateRequested; use sled_agent_types::early_networking::{ EarlyNetworkConfig, EarlyNetworkConfigBody, }; -use sled_storage::resources::DisksManagementResult; +use sled_agent_types::instance::{ + InstanceExternalIpBody, InstanceHardware, InstanceMetadata, + InstancePutStateResponse, InstanceStateRequested, + InstanceUnregisterResponse, +}; use slog::Logger; use std::collections::{HashMap, HashSet, VecDeque}; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr}; diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index 948ac96bcd..556388ce93 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -19,15 +19,15 @@ use dropshot::HandlerTaskMode; use dropshot::HttpError; use futures::lock::Mutex; use omicron_common::disk::DiskIdentity; +use omicron_common::disk::DiskManagementStatus; use omicron_common::disk::DiskVariant; +use omicron_common::disk::DisksManagementResult; use omicron_common::disk::OmicronPhysicalDisksConfig; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::ZpoolUuid; use propolis_client::types::VolumeConstructionRequest; -use sled_storage::resources::DiskManagementStatus; -use sled_storage::resources::DisksManagementResult; use slog::Logger; use std::collections::HashMap; use std::collections::HashSet; diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index d87df0d7c5..50e5611027 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -7,7 +7,6 @@ use crate::boot_disk_os_writer::BootDiskOsWriter; use crate::bootstrap::config::BOOTSTRAP_AGENT_RACK_INIT_PORT; use crate::bootstrap::early_networking::EarlyNetworkSetupError; -use crate::bootstrap::params::{BaseboardId, StartSledAgentRequest}; use crate::config::Config; use crate::instance_manager::InstanceManager; use crate::long_running_tasks::LongRunningTaskHandles; @@ -15,12 +14,7 @@ use crate::metrics::MetricsManager; use crate::nexus::{ NexusClient, NexusNotifierHandle, NexusNotifierInput, NexusNotifierTask, }; -use crate::params::{ - DiskStateRequested, InstanceExternalIpBody, InstanceHardware, - InstanceMetadata, InstancePutStateResponse, InstanceStateRequested, - InstanceUnregisterResponse, OmicronZoneTypeExt, TimeSync, VpcFirewallRule, - ZoneBundleMetadata, Zpool, -}; +use crate::params::OmicronZoneTypeExt; use crate::probe_manager::ProbeManager; use crate::services::{self, ServiceManager}; use crate::storage_monitor::StorageMonitorHandle; @@ -34,7 +28,6 @@ use derive_more::From; use dropshot::HttpError; use futures::stream::FuturesUnordered; use futures::StreamExt; -use illumos_utils::opte::params::VirtualNetworkInterfaceHost; use illumos_utils::opte::PortManager; use illumos_utils::zone::PROPOLIS_ZONE_PREFIX; use illumos_utils::zone::ZONE_PREFIX; @@ 
-49,8 +42,9 @@ use omicron_common::api::internal::nexus::{
     SledInstanceState, VmmRuntimeState,
 };
 use omicron_common::api::internal::shared::{
-    HostPortConfig, RackNetworkConfig, ResolvedVpcRouteSet,
-    ResolvedVpcRouteState, SledIdentifiers,
+    HostPortConfig, RackNetworkConfig, ResolvedVpcFirewallRule,
+    ResolvedVpcRouteSet, ResolvedVpcRouteState, SledIdentifiers,
+    VirtualNetworkInterfaceHost,
 };
 use omicron_common::api::{
     internal::nexus::DiskRuntimeState, internal::nexus::InstanceRuntimeState,
@@ -59,15 +53,27 @@ use omicron_common::api::{
 use omicron_common::backoff::{
     retry_notify, retry_policy_internal_service_aggressive, BackoffError,
 };
-use omicron_common::disk::OmicronPhysicalDisksConfig;
+use omicron_common::disk::{DisksManagementResult, OmicronPhysicalDisksConfig};
 use omicron_ddm_admin_client::Client as DdmAdminClient;
 use omicron_uuid_kinds::{InstanceUuid, PropolisUuid};
+use sled_agent_api::Zpool;
+use sled_agent_types::disk::DiskStateRequested;
 use sled_agent_types::early_networking::EarlyNetworkConfig;
+use sled_agent_types::instance::{
+    InstanceExternalIpBody, InstanceHardware, InstanceMetadata,
+    InstancePutStateResponse, InstanceStateRequested,
+    InstanceUnregisterResponse,
+};
+use sled_agent_types::sled::{BaseboardId, StartSledAgentRequest};
+use sled_agent_types::time_sync::TimeSync;
+use sled_agent_types::zone_bundle::{
+    BundleUtilization, CleanupContext, CleanupCount, CleanupPeriod,
+    PriorityOrder, StorageLimit, ZoneBundleMetadata,
+};
 use sled_hardware::{underlay, HardwareManager};
 use sled_hardware_types::underlay::BootstrapInterface;
 use sled_hardware_types::Baseboard;
 use sled_storage::manager::StorageHandle;
-use sled_storage::resources::DisksManagementResult;
 use slog::Logger;
 use std::collections::BTreeMap;
 use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6};
@@ -233,8 +239,9 @@ impl From<Error> for dropshot::HttpError {
             BundleError::NoSuchZone { .. } => {
                 HttpError::for_not_found(None, inner.to_string())
             }
-            BundleError::InvalidStorageLimit
-            | BundleError::InvalidCleanupPeriod => {
+            BundleError::StorageLimitCreate(_)
+            | BundleError::CleanupPeriodCreate(_)
+            | BundleError::PriorityOrderCreate(_) => {
                 HttpError::for_bad_request(None, inner.to_string())
             }
             BundleError::InstanceTerminating => {
@@ -772,18 +779,16 @@ impl SledAgent {
     }
 
     /// Fetch the zone bundle cleanup context.
-    pub async fn zone_bundle_cleanup_context(
-        &self,
-    ) -> zone_bundle::CleanupContext {
+    pub async fn zone_bundle_cleanup_context(&self) -> CleanupContext {
         self.inner.zone_bundler.cleanup_context().await
     }
 
     /// Update the zone bundle cleanup context.
     pub async fn update_zone_bundle_cleanup_context(
         &self,
-        period: Option<zone_bundle::CleanupPeriod>,
-        storage_limit: Option<zone_bundle::StorageLimit>,
-        priority: Option<zone_bundle::PriorityOrder>,
+        period: Option<CleanupPeriod>,
+        storage_limit: Option<StorageLimit>,
+        priority: Option<PriorityOrder>,
     ) -> Result<(), Error> {
         self.inner
             .zone_bundler
@@ -795,15 +800,14 @@ impl SledAgent {
     /// Fetch the current utilization of the relevant datasets for zone bundles.
     pub async fn zone_bundle_utilization(
         &self,
-    ) -> Result<BTreeMap<Utf8PathBuf, zone_bundle::BundleUtilization>, Error>
-    {
+    ) -> Result<BTreeMap<Utf8PathBuf, BundleUtilization>, Error> {
         self.inner.zone_bundler.utilization().await.map_err(Error::from)
     }
 
     /// Trigger an explicit request to cleanup old zone bundles.
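The cleanup-context accessors above now traffic in the validating newtypes from `sled_agent_types::zone_bundle` rather than bare integers and durations. A rough usage sketch, assuming only the constructors introduced later in this series (the `anyhow` plumbing is illustrative):

```rust
use std::time::Duration;

use sled_agent_types::zone_bundle::{
    CleanupContext, CleanupPeriod, PriorityDimension, PriorityOrder,
    StorageLimit,
};

fn build_context() -> anyhow::Result<CleanupContext> {
    // Each constructor rejects out-of-range input, so an invalid value
    // never reaches the cleanup task.
    Ok(CleanupContext {
        period: CleanupPeriod::new(Duration::from_secs(300))?,
        storage_limit: StorageLimit::new(10)?,
        priority: PriorityOrder::new(&[
            PriorityDimension::Cause,
            PriorityDimension::Time,
        ])?,
    })
}
```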
pub async fn zone_bundle_cleanup( &self, - ) -> Result, Error> { + ) -> Result, Error> { self.inner.zone_bundler.cleanup().await.map_err(Error::from) } @@ -1098,7 +1102,7 @@ impl SledAgent { pub async fn firewall_rules_ensure( &self, vpc_vni: Vni, - rules: &[VpcFirewallRule], + rules: &[ResolvedVpcFirewallRule], ) -> Result<(), Error> { self.inner .port_manager diff --git a/sled-agent/src/zone_bundle.rs b/sled-agent/src/zone_bundle.rs index 4062016597..46cee1c415 100644 --- a/sled-agent/src/zone_bundle.rs +++ b/sled-agent/src/zone_bundle.rs @@ -11,8 +11,6 @@ use anyhow::Context; use camino::FromPathBufError; use camino::Utf8Path; use camino::Utf8PathBuf; -use chrono::DateTime; -use chrono::Utc; use flate2::bufread::GzDecoder; use illumos_utils::running_zone::is_oxide_smf_log_file; use illumos_utils::running_zone::RunningZone; @@ -29,18 +27,12 @@ use illumos_utils::zfs::Snapshot; use illumos_utils::zfs::Zfs; use illumos_utils::zfs::ZFS; use illumos_utils::zone::AdmError; -use schemars::JsonSchema; -use serde::Deserialize; -use serde::Serialize; +use sled_agent_types::zone_bundle::*; use sled_storage::dataset::U2_DEBUG_DATASET; use sled_storage::manager::StorageHandle; use slog::Logger; -use std::cmp::Ord; -use std::cmp::Ordering; -use std::cmp::PartialOrd; use std::collections::BTreeMap; use std::collections::BTreeSet; -use std::collections::HashSet; use std::io::Cursor; use std::sync::Arc; use std::time::Duration; @@ -55,104 +47,6 @@ use tokio::time::sleep; use tokio::time::Instant; use uuid::Uuid; -/// An identifier for a zone bundle. -#[derive( - Clone, - Debug, - Deserialize, - Eq, - Hash, - JsonSchema, - Ord, - PartialEq, - PartialOrd, - Serialize, -)] -pub struct ZoneBundleId { - /// The name of the zone this bundle is derived from. - pub zone_name: String, - /// The ID for this bundle itself. - pub bundle_id: Uuid, -} - -/// The reason or cause for a zone bundle, i.e., why it was created. -// -// NOTE: The ordering of the enum variants is important, and should not be -// changed without careful consideration. -// -// The ordering is used when deciding which bundles to remove automatically. In -// addition to time, the cause is used to sort bundles, so changing the variant -// order will change that priority. -#[derive( - Clone, - Copy, - Debug, - Default, - Deserialize, - Eq, - Hash, - JsonSchema, - Ord, - PartialEq, - PartialOrd, - Serialize, -)] -#[serde(rename_all = "snake_case")] -#[non_exhaustive] -pub enum ZoneBundleCause { - /// Some other, unspecified reason. - #[default] - Other, - /// A zone bundle taken when a sled agent finds a zone that it does not - /// expect to be running. - UnexpectedZone, - /// An instance zone was terminated. - TerminatedInstance, - /// Generated in response to an explicit request to the sled agent. - ExplicitRequest, -} - -/// Metadata about a zone bundle. -#[derive( - Clone, - Debug, - Deserialize, - Eq, - Hash, - JsonSchema, - Ord, - PartialEq, - PartialOrd, - Serialize, -)] -pub struct ZoneBundleMetadata { - /// Identifier for this zone bundle - pub id: ZoneBundleId, - /// The time at which this zone bundle was created. - pub time_created: DateTime, - /// A version number for this zone bundle. - pub version: u8, - /// The reason or cause a bundle was created. - pub cause: ZoneBundleCause, -} - -impl ZoneBundleMetadata { - const VERSION: u8 = 0; - - /// Create a new set of metadata for the provided zone. 
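The constructor removed here moves to `sled_agent_types::zone_bundle` and becomes public (see the new crate later in this patch). A minimal sketch of what it produces, using a hypothetical zone name:

```rust
use sled_agent_types::zone_bundle::{ZoneBundleCause, ZoneBundleMetadata};

fn main() {
    // The bundle ID, creation time, and version are filled in by the
    // constructor; callers only choose the zone and the cause.
    let md =
        ZoneBundleMetadata::new("oxz_switch", ZoneBundleCause::ExplicitRequest);
    println!("{}", serde_json::to_string_pretty(&md).unwrap());
}
```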
- pub(crate) fn new(zone_name: &str, cause: ZoneBundleCause) -> Self { - Self { - id: ZoneBundleId { - zone_name: zone_name.to_string(), - bundle_id: Uuid::new_v4(), - }, - time_created: Utc::now(), - version: Self::VERSION, - cause, - } - } -} - // The name of the snapshot created from the zone root filesystem. const ZONE_ROOT_SNAPSHOT_NAME: &'static str = "zone-root"; @@ -650,20 +544,14 @@ pub enum BundleError { #[error("Zone '{name}' cannot currently be bundled")] Unavailable { name: String }, - #[error("Storage limit must be expressed as a percentage in (0, 100]")] - InvalidStorageLimit, + #[error(transparent)] + StorageLimitCreate(#[from] StorageLimitCreateError), - #[error( - "Cleanup period must be between {min:?} and {max:?}, inclusive", - min = CleanupPeriod::MIN, - max = CleanupPeriod::MAX, - )] - InvalidCleanupPeriod, + #[error(transparent)] + CleanupPeriodCreate(#[from] CleanupPeriodCreateError), - #[error( - "Invalid priority ordering. Each element must appear exactly once." - )] - InvalidPriorityOrder, + #[error(transparent)] + PriorityOrderCreate(#[from] PriorityOrderCreateError), #[error("Cleanup failed")] Cleanup(#[source] anyhow::Error), @@ -1484,29 +1372,6 @@ async fn get_zone_bundle_paths( Ok(out) } -/// The portion of a debug dataset used for zone bundles. -#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, Serialize)] -pub struct BundleUtilization { - /// The total dataset quota, in bytes. - pub dataset_quota: u64, - /// The total number of bytes available for zone bundles. - /// - /// This is `dataset_quota` multiplied by the context's storage limit. - pub bytes_available: u64, - /// Total bundle usage, in bytes. - pub bytes_used: u64, -} - -#[derive(Clone, Debug, PartialEq)] -struct ZoneBundleInfo { - // The raw metadata for the bundle - metadata: ZoneBundleMetadata, - // The full path to the bundle - path: Utf8PathBuf, - // The number of bytes consumed on disk by the bundle - bytes: u64, -} - // Enumerate all zone bundles under the provided directory. async fn enumerate_zone_bundles( log: &Logger, @@ -1577,15 +1442,6 @@ async fn enumerate_zone_bundles( Ok(out) } -/// The count of bundles / bytes removed during a cleanup operation. -#[derive(Clone, Copy, Debug, Default, Deserialize, JsonSchema, Serialize)] -pub struct CleanupCount { - /// The number of bundles removed. - bundles: u64, - /// The number of bytes removed. - bytes: u64, -} - // Run a cleanup, removing old bundles according to the strategy. // // Return the number of bundles removed and the new usage. @@ -1687,19 +1543,6 @@ async fn compute_bundle_utilization( Ok(out) } -/// Context provided for the zone bundle cleanup task. -#[derive( - Clone, Copy, Debug, Default, Deserialize, JsonSchema, PartialEq, Serialize, -)] -pub struct CleanupContext { - /// The period on which automatic checks and cleanup is performed. - pub period: CleanupPeriod, - /// The limit on the dataset quota available for zone bundles. - pub storage_limit: StorageLimit, - /// The priority ordering for keeping old bundles. - pub priority: PriorityOrder, -} - // Return the number of bytes occupied by the provided directory. // // This returns an error if: @@ -1814,258 +1657,10 @@ async fn zfs_quota(path: &Utf8PathBuf) -> Result { } } -/// The limit on space allowed for zone bundles, as a percentage of the overall -/// dataset's quota. 
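The three bespoke validation variants collapse into transparent wrappers, so the constructor errors defined in sled-agent-types flow through `?` untouched. A sketch of the resulting ergonomics inside the sled-agent crate (the function itself is hypothetical):

```rust
use sled_agent_types::zone_bundle::StorageLimit;

use crate::zone_bundle::BundleError; // sled-agent's error type, as above

fn parse_limit(pct: u8) -> Result<StorageLimit, BundleError> {
    // `#[error(transparent)]` plus `#[from]` converts the
    // StorageLimitCreateError produced by the constructor automatically.
    Ok(StorageLimit::new(pct)?)
}
```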
-#[derive( - Clone, - Copy, - Debug, - Deserialize, - JsonSchema, - PartialEq, - PartialOrd, - Serialize, -)] -pub struct StorageLimit(u8); - -impl std::fmt::Display for StorageLimit { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}%", self.as_u8()) - } -} - -impl Default for StorageLimit { - fn default() -> Self { - StorageLimit(25) - } -} - -impl StorageLimit { - /// Minimum percentage of dataset quota supported. - pub const MIN: Self = Self(0); - - /// Maximum percentage of dataset quota supported. - pub const MAX: Self = Self(50); - - /// Construct a new limit allowed for zone bundles. - /// - /// This should be expressed as a percentage, in the range (Self::MIN, - /// Self::MAX]. - pub const fn new(percentage: u8) -> Result { - if percentage > Self::MIN.0 && percentage <= Self::MAX.0 { - Ok(Self(percentage)) - } else { - Err(BundleError::InvalidStorageLimit) - } - } - - /// Return the contained quota percentage. - pub const fn as_u8(&self) -> u8 { - self.0 - } - - // Compute the number of bytes available from a dataset quota, in bytes. - const fn bytes_available(&self, dataset_quota: u64) -> u64 { - (dataset_quota * self.as_u8() as u64) / 100 - } -} - -/// A dimension along with bundles can be sorted, to determine priority. -#[derive( - Clone, - Copy, - Debug, - Deserialize, - Eq, - Hash, - JsonSchema, - Serialize, - Ord, - PartialEq, - PartialOrd, -)] -#[serde(rename_all = "snake_case")] -pub enum PriorityDimension { - /// Sorting by time, with older bundles with lower priority. - Time, - /// Sorting by the cause for creating the bundle. - Cause, - // TODO-completeness: Support zone or zone type (e.g., service vs instance)? -} - -/// The priority order for bundles during cleanup. -/// -/// Bundles are sorted along the dimensions in [`PriorityDimension`], with each -/// dimension appearing exactly once. During cleanup, lesser-priority bundles -/// are pruned first, to maintain the dataset quota. Note that bundles are -/// sorted by each dimension in the order in which they appear, with each -/// dimension having higher priority than the next. -#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] -pub struct PriorityOrder([PriorityDimension; PriorityOrder::EXPECTED_SIZE]); - -impl std::ops::Deref for PriorityOrder { - type Target = [PriorityDimension; PriorityOrder::EXPECTED_SIZE]; - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl Default for PriorityOrder { - fn default() -> Self { - Self::DEFAULT - } -} - -impl PriorityOrder { - // NOTE: Must match the number of variants in `PriorityDimension`. - const EXPECTED_SIZE: usize = 2; - const DEFAULT: Self = - Self([PriorityDimension::Cause, PriorityDimension::Time]); - - /// Construct a new priority order. - /// - /// This requires that each dimension appear exactly once. - pub fn new(dims: &[PriorityDimension]) -> Result { - if dims.len() != Self::EXPECTED_SIZE { - return Err(BundleError::InvalidPriorityOrder); - } - let mut seen = HashSet::new(); - for dim in dims.iter() { - if !seen.insert(dim) { - return Err(BundleError::InvalidPriorityOrder); - } - } - Ok(Self(dims.try_into().unwrap())) - } - - // Order zone bundle info according to the contained priority. - // - // We sort the info by each dimension, in the order in which it appears. - // That means earlier dimensions have higher priority than later ones. 
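For concreteness, the quota math that `bytes_available` implements (and that the unit tests below exercise) is plain truncating integer arithmetic; a standalone restatement:

```rust
// 25% of a 32768-byte quota is 8192 bytes; 99% of 1 byte truncates to 0.
fn bytes_available(dataset_quota: u64, limit_pct: u8) -> u64 {
    (dataset_quota * limit_pct as u64) / 100
}

fn main() {
    assert_eq!(bytes_available(32768, 25), 8192);
    assert_eq!(bytes_available(1, 99), 0);
}
```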
- fn compare_bundles( - &self, - lhs: &ZoneBundleInfo, - rhs: &ZoneBundleInfo, - ) -> Ordering { - for dim in self.0.iter() { - let ord = match dim { - PriorityDimension::Cause => { - lhs.metadata.cause.cmp(&rhs.metadata.cause) - } - PriorityDimension::Time => { - lhs.metadata.time_created.cmp(&rhs.metadata.time_created) - } - }; - if matches!(ord, Ordering::Equal) { - continue; - } - return ord; - } - Ordering::Equal - } -} - -/// A period on which bundles are automatically cleaned up. -#[derive( - Clone, Copy, Deserialize, JsonSchema, PartialEq, PartialOrd, Serialize, -)] -pub struct CleanupPeriod(Duration); - -impl Default for CleanupPeriod { - fn default() -> Self { - Self(Duration::from_secs(600)) - } -} - -impl CleanupPeriod { - /// The minimum supported cleanup period. - pub const MIN: Self = Self(Duration::from_secs(60)); - - /// The maximum supported cleanup period. - pub const MAX: Self = Self(Duration::from_secs(60 * 60 * 24)); - - /// Construct a new cleanup period, checking that it's valid. - pub fn new(duration: Duration) -> Result { - if duration >= Self::MIN.as_duration() - && duration <= Self::MAX.as_duration() - { - Ok(Self(duration)) - } else { - Err(BundleError::InvalidCleanupPeriod) - } - } - - /// Return the period as a duration. - pub const fn as_duration(&self) -> Duration { - self.0 - } -} - -impl TryFrom for CleanupPeriod { - type Error = BundleError; - - fn try_from(duration: Duration) -> Result { - Self::new(duration) - } -} - -impl std::fmt::Debug for CleanupPeriod { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - self.0.fmt(f) - } -} - #[cfg(test)] mod tests { use super::disk_usage; - use super::PriorityDimension; - use super::PriorityOrder; - use super::StorageLimit; use super::Utf8PathBuf; - use super::ZoneBundleCause; - use super::ZoneBundleId; - use super::ZoneBundleInfo; - use super::ZoneBundleMetadata; - use chrono::TimeZone; - use chrono::Utc; - - #[test] - fn test_sort_zone_bundle_cause() { - use ZoneBundleCause::*; - let mut original = - [ExplicitRequest, Other, TerminatedInstance, UnexpectedZone]; - let expected = - [Other, UnexpectedZone, TerminatedInstance, ExplicitRequest]; - original.sort(); - assert_eq!(original, expected); - } - - #[test] - fn test_priority_dimension() { - assert!(PriorityOrder::new(&[]).is_err()); - assert!(PriorityOrder::new(&[PriorityDimension::Cause]).is_err()); - assert!(PriorityOrder::new(&[ - PriorityDimension::Cause, - PriorityDimension::Cause - ]) - .is_err()); - assert!(PriorityOrder::new(&[ - PriorityDimension::Cause, - PriorityDimension::Cause, - PriorityDimension::Time - ]) - .is_err()); - - assert!(PriorityOrder::new(&[ - PriorityDimension::Cause, - PriorityDimension::Time - ]) - .is_ok()); - assert_eq!( - PriorityOrder::new(&PriorityOrder::default().0).unwrap(), - PriorityOrder::default() - ); - } #[tokio::test] async fn test_disk_usage() { @@ -2081,95 +1676,6 @@ mod tests { let path = Utf8PathBuf::from("/some/nonexistent/path"); assert!(disk_usage(&path).await.is_err()); } - - #[test] - fn test_storage_limit_bytes_available() { - let pct = StorageLimit(1); - assert_eq!(pct.bytes_available(100), 1); - assert_eq!(pct.bytes_available(1000), 10); - - let pct = StorageLimit(100); - assert_eq!(pct.bytes_available(100), 100); - assert_eq!(pct.bytes_available(1000), 1000); - - let pct = StorageLimit(100); - assert_eq!(pct.bytes_available(99), 99); - - let pct = StorageLimit(99); - assert_eq!(pct.bytes_available(1), 0); - - // Test non-power of 10. 
- let pct = StorageLimit(25); - assert_eq!(pct.bytes_available(32768), 8192); - } - - #[test] - fn test_compare_bundles() { - use PriorityDimension::*; - let time_first = PriorityOrder([Time, Cause]); - let cause_first = PriorityOrder([Cause, Time]); - - fn make_info( - year: i32, - month: u32, - day: u32, - cause: ZoneBundleCause, - ) -> ZoneBundleInfo { - ZoneBundleInfo { - metadata: ZoneBundleMetadata { - id: ZoneBundleId { - zone_name: String::from("oxz_whatever"), - bundle_id: uuid::Uuid::new_v4(), - }, - time_created: Utc - .with_ymd_and_hms(year, month, day, 0, 0, 0) - .single() - .unwrap(), - cause, - version: 0, - }, - path: Utf8PathBuf::from("/some/path"), - bytes: 0, - } - } - - let info = [ - make_info(2020, 1, 2, ZoneBundleCause::TerminatedInstance), - make_info(2020, 1, 2, ZoneBundleCause::ExplicitRequest), - make_info(2020, 1, 1, ZoneBundleCause::TerminatedInstance), - make_info(2020, 1, 1, ZoneBundleCause::ExplicitRequest), - ]; - - let mut sorted = info.clone(); - sorted.sort_by(|lhs, rhs| time_first.compare_bundles(lhs, rhs)); - // Low -> high priority - // [old/terminated, old/explicit, new/terminated, new/explicit] - let expected = [ - info[2].clone(), - info[3].clone(), - info[0].clone(), - info[1].clone(), - ]; - assert_eq!( - sorted, expected, - "sorting zone bundles by time-then-cause failed" - ); - - let mut sorted = info.clone(); - sorted.sort_by(|lhs, rhs| cause_first.compare_bundles(lhs, rhs)); - // Low -> high priority - // [old/terminated, new/terminated, old/explicit, new/explicit] - let expected = [ - info[2].clone(), - info[0].clone(), - info[3].clone(), - info[1].clone(), - ]; - assert_eq!( - sorted, expected, - "sorting zone bundles by cause-then-time failed" - ); - } } #[cfg(all(target_os = "illumos", test))] @@ -2347,7 +1853,10 @@ mod illumos_tests { let new_context = CleanupContext { period: CleanupPeriod::new(ctx.context.period.as_duration() / 2) .unwrap(), - storage_limit: StorageLimit(ctx.context.storage_limit.as_u8() / 2), + storage_limit: StorageLimit::new( + ctx.context.storage_limit.as_u8() / 2, + ) + .unwrap(), priority: PriorityOrder::new( &ctx.context.priority.iter().copied().rev().collect::>(), ) @@ -2529,7 +2038,11 @@ mod illumos_tests { // First, reduce the storage limit, so that we only need to add a few // bundles. 
ctx.bundler - .update_cleanup_context(None, Some(StorageLimit(2)), None) + .update_cleanup_context( + None, + Some(StorageLimit::new(2).unwrap()), + None, + ) .await .context("failed to update cleanup context")?; diff --git a/sled-agent/tests/integration_tests/commands.rs b/sled-agent/tests/integration_tests/commands.rs index 26c82e488e..8a5b355770 100644 --- a/sled-agent/tests/integration_tests/commands.rs +++ b/sled-agent/tests/integration_tests/commands.rs @@ -13,9 +13,7 @@ use expectorate::assert_contents; use omicron_test_utils::dev::test_cmds::assert_exit_code; use omicron_test_utils::dev::test_cmds::path_to_executable; use omicron_test_utils::dev::test_cmds::run_command; -use omicron_test_utils::dev::test_cmds::EXIT_SUCCESS; use omicron_test_utils::dev::test_cmds::EXIT_USAGE; -use openapiv3::OpenAPI; use subprocess::Exec; /// name of the "sled-agent-sim" executable @@ -56,26 +54,3 @@ fn test_sled_agent_no_args() { assert_contents("tests/output/cmd-sled-agent-noargs-stdout", &stdout_text); assert_contents("tests/output/cmd-sled-agent-noargs-stderr", &stderr_text); } - -#[test] -fn test_sled_agent_openapi_sled() { - let exec = Exec::cmd(path_to_sled_agent()).arg("openapi").arg("sled"); - let (exit_status, stdout_text, stderr_text) = run_command(exec); - assert_exit_code(exit_status, EXIT_SUCCESS, &stderr_text); - assert_contents( - "tests/output/cmd-sled-agent-openapi-sled-stderr", - &stderr_text, - ); - - let spec: OpenAPI = serde_json::from_str(&stdout_text) - .expect("stdout was not valid OpenAPI"); - - // Check for lint errors. - let errors = openapi_lint::validate(&spec); - assert!(errors.is_empty(), "{}", errors.join("\n\n")); - - // Confirm that the output hasn't changed. It's expected that we'll change - // this file as the API evolves, but pay attention to the diffs to ensure - // that the changes match your expectations. - assert_contents("../openapi/sled-agent.json", &stdout_text); -} diff --git a/sled-agent/tests/output/cmd-sled-agent-noargs-stderr b/sled-agent/tests/output/cmd-sled-agent-noargs-stderr index ee397c0ef7..409d1ec0d8 100644 --- a/sled-agent/tests/output/cmd-sled-agent-noargs-stderr +++ b/sled-agent/tests/output/cmd-sled-agent-noargs-stderr @@ -3,9 +3,8 @@ See README.adoc for more information Usage: sled-agent Commands: - openapi Generates the OpenAPI specification - run Runs the Sled Agent server - help Print this message or the help of the given subcommand(s) + run Runs the Sled Agent server + help Print this message or the help of the given subcommand(s) Options: -h, --help Print help diff --git a/sled-agent/tests/output/cmd-sled-agent-openapi-bootstrap-stderr b/sled-agent/tests/output/cmd-sled-agent-openapi-bootstrap-stderr deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/sled-agent/tests/output/cmd-sled-agent-openapi-sled-stderr b/sled-agent/tests/output/cmd-sled-agent-openapi-sled-stderr deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/sled-agent/types/Cargo.toml b/sled-agent/types/Cargo.toml index a9ed8fcb22..e01d40db28 100644 --- a/sled-agent/types/Cargo.toml +++ b/sled-agent/types/Cargo.toml @@ -9,8 +9,10 @@ workspace = true [dependencies] anyhow.workspace = true +async-trait.workspace = true bootstore.workspace = true camino.workspace = true +chrono.workspace = true nexus-sled-agent-shared.workspace = true # Note: we're trying to avoid a dependency from sled-agent-types to nexus-types # because the correct direction of dependency is unclear. 
If there are types @@ -19,11 +21,13 @@ omicron-common.workspace = true omicron-uuid-kinds.workspace = true omicron-workspace-hack.workspace = true oxnet.workspace = true +propolis-client.workspace = true schemars.workspace = true serde.workspace = true +serde_human_bytes.workspace = true serde_json.workspace = true +sha3.workspace = true sled-hardware-types.workspace = true -sled-storage.workspace = true slog.workspace = true thiserror.workspace = true toml.workspace = true diff --git a/sled-agent/types/src/boot_disk.rs b/sled-agent/types/src/boot_disk.rs new file mode 100644 index 0000000000..30129d6c7e --- /dev/null +++ b/sled-agent/types/src/boot_disk.rs @@ -0,0 +1,62 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Common types related to boot disks. + +use omicron_common::disk::M2Slot; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, Serialize)] +pub struct BootDiskPathParams { + pub boot_disk: M2Slot, +} + +#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, Serialize)] +pub struct BootDiskUpdatePathParams { + pub boot_disk: M2Slot, + pub update_id: Uuid, +} + +#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, Serialize)] +pub struct BootDiskWriteStartQueryParams { + pub update_id: Uuid, + // TODO do we already have sha2-256 hashes of the OS images, and if so + // should we use that instead? Another option is to use the external API + // `Digest` type, although it predates `serde_human_bytes` so just stores + // the hash as a `String`. + #[serde(with = "serde_human_bytes::hex_array")] + #[schemars(schema_with = "omicron_common::hex_schema::<32>")] + pub sha3_256_digest: [u8; 32], +} + +/// Current progress of an OS image being written to disk. +#[derive( + Debug, Clone, Copy, PartialEq, Eq, Deserialize, JsonSchema, Serialize, +)] +#[serde(tag = "state", rename_all = "snake_case")] +pub enum BootDiskOsWriteProgress { + /// The image is still being uploaded. + ReceivingUploadedImage { bytes_received: usize }, + /// The image is being written to disk. + WritingImageToDisk { bytes_written: usize }, + /// The image is being read back from disk for validation. + ValidatingWrittenImage { bytes_read: usize }, +} + +/// Status of an update to a boot disk OS. +#[derive(Debug, Clone, Deserialize, JsonSchema, Serialize)] +#[serde(tag = "status", rename_all = "snake_case")] +pub enum BootDiskOsWriteStatus { + /// No update has been started for this disk, or any previously-started + /// update has completed and had its status cleared. + NoUpdateStarted, + /// An update is currently running. + InProgress { update_id: Uuid, progress: BootDiskOsWriteProgress }, + /// The most recent update completed successfully. + Complete { update_id: Uuid }, + /// The most recent update failed. + Failed { update_id: Uuid, message: String }, +} diff --git a/sled-agent/types/src/bootstore.rs b/sled-agent/types/src/bootstore.rs new file mode 100644 index 0000000000..9c9e8257a4 --- /dev/null +++ b/sled-agent/types/src/bootstore.rs @@ -0,0 +1,51 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
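The boot_disk.rs types above model the write-status API as internally tagged enums; a quick serialization sketch (the nil UUID is purely for illustration) showing the wire shape a polling client would see:

```rust
use sled_agent_types::boot_disk::{
    BootDiskOsWriteProgress, BootDiskOsWriteStatus,
};
use uuid::Uuid;

fn main() {
    let status = BootDiskOsWriteStatus::InProgress {
        update_id: Uuid::nil(),
        progress: BootDiskOsWriteProgress::WritingImageToDisk {
            bytes_written: 1 << 20,
        },
    };
    // Prints, modulo whitespace:
    // {"status":"in_progress","update_id":"00000000-...",
    //  "progress":{"state":"writing_image_to_disk","bytes_written":1048576}}
    println!("{}", serde_json::to_string(&status).unwrap());
}
```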
+
+use std::{collections::BTreeSet, net::SocketAddrV6};
+
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+use sled_hardware_types::Baseboard;
+
+#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)]
+pub struct BootstoreStatus {
+    pub fsm_ledger_generation: u64,
+    pub network_config_ledger_generation: Option<u64>,
+    pub fsm_state: String,
+    pub peers: BTreeSet<SocketAddrV6>,
+    pub established_connections: Vec<EstablishedConnection>,
+    pub accepted_connections: BTreeSet<SocketAddrV6>,
+    pub negotiating_connections: BTreeSet<SocketAddrV6>,
+}
+
+impl From<bootstore::schemes::v0::Status> for BootstoreStatus {
+    fn from(value: bootstore::schemes::v0::Status) -> Self {
+        BootstoreStatus {
+            fsm_ledger_generation: value.fsm_ledger_generation,
+            network_config_ledger_generation: value
+                .network_config_ledger_generation,
+            fsm_state: value.fsm_state.to_string(),
+            peers: value.peers,
+            established_connections: value
+                .connections
+                .into_iter()
+                .map(EstablishedConnection::from)
+                .collect(),
+            accepted_connections: value.accepted_connections,
+            negotiating_connections: value.negotiating_connections,
+        }
+    }
+}
+
+#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)]
+pub struct EstablishedConnection {
+    pub baseboard: Baseboard,
+    pub addr: SocketAddrV6,
+}
+
+impl From<(Baseboard, SocketAddrV6)> for EstablishedConnection {
+    fn from(value: (Baseboard, SocketAddrV6)) -> Self {
+        EstablishedConnection { baseboard: value.0, addr: value.1 }
+    }
+}
diff --git a/sled-agent/types/src/disk.rs b/sled-agent/types/src/disk.rs
new file mode 100644
index 0000000000..332f1a0c5c
--- /dev/null
+++ b/sled-agent/types/src/disk.rs
@@ -0,0 +1,41 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+use omicron_common::api::internal::nexus::DiskRuntimeState;
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+use uuid::Uuid;
+
+/// Sent to a sled agent to establish the runtime state of a Disk
+#[derive(Serialize, Deserialize, JsonSchema)]
+pub struct DiskEnsureBody {
+    /// Last runtime state of the Disk known to Nexus (used if the agent has
+    /// never seen this Disk before).
+    pub initial_runtime: DiskRuntimeState,
+    /// requested runtime state of the Disk
+    pub target: DiskStateRequested,
+}
+
+/// Used to request a Disk state change
+#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize, JsonSchema)]
+#[serde(rename_all = "lowercase", tag = "state", content = "instance")]
+pub enum DiskStateRequested {
+    Detached,
+    Attached(Uuid),
+    Destroyed,
+    Faulted,
+}
+
+impl DiskStateRequested {
+    /// Returns whether the requested state is attached to an Instance or not.
+    pub fn is_attached(&self) -> bool {
+        match self {
+            DiskStateRequested::Detached => false,
+            DiskStateRequested::Destroyed => false,
+            DiskStateRequested::Faulted => false,
+
+            DiskStateRequested::Attached(_) => true,
+        }
+    }
+}
diff --git a/sled-agent/types/src/firewall_rules.rs b/sled-agent/types/src/firewall_rules.rs
new file mode 100644
index 0000000000..d7cb22f976
--- /dev/null
+++ b/sled-agent/types/src/firewall_rules.rs
@@ -0,0 +1,16 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
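The move of disk.rs keeps the adjacently tagged wire format (`state` plus `instance`), so the JSON the sled agent accepts should be unchanged; a small sketch of that shape:

```rust
use sled_agent_types::disk::DiskStateRequested;
use uuid::Uuid;

fn main() {
    let req = DiskStateRequested::Attached(Uuid::nil());
    assert!(req.is_attached());
    // The attached instance ID travels in the `instance` field.
    assert_eq!(
        serde_json::to_string(&req).unwrap(),
        r#"{"state":"attached","instance":"00000000-0000-0000-0000-000000000000"}"#
    );
}
```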
+ +use omicron_common::api::{ + external, internal::shared::ResolvedVpcFirewallRule, +}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +/// Update firewall rules for a VPC +#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] +pub struct VpcFirewallRulesEnsureBody { + pub vni: external::Vni, + pub rules: Vec, +} diff --git a/sled-agent/types/src/instance.rs b/sled-agent/types/src/instance.rs new file mode 100644 index 0000000000..0753e273dc --- /dev/null +++ b/sled-agent/types/src/instance.rs @@ -0,0 +1,172 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Common instance-related types. + +use std::{ + fmt, + net::{IpAddr, SocketAddr}, +}; + +use omicron_common::api::internal::{ + nexus::{ + InstanceProperties, InstanceRuntimeState, SledInstanceState, + VmmRuntimeState, + }, + shared::{ + DhcpConfig, NetworkInterface, ResolvedVpcFirewallRule, SourceNatConfig, + }, +}; +use omicron_uuid_kinds::PropolisUuid; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +/// The body of a request to ensure that a instance and VMM are known to a sled +/// agent. +#[derive(Serialize, Deserialize, JsonSchema)] +pub struct InstanceEnsureBody { + /// A description of the instance's virtual hardware and the initial runtime + /// state this sled agent should store for this incarnation of the instance. + pub hardware: InstanceHardware, + + /// The instance runtime state for the instance being registered. + pub instance_runtime: InstanceRuntimeState, + + /// The initial VMM runtime state for the VMM being registered. + pub vmm_runtime: VmmRuntimeState, + + /// The ID of the VMM being registered. This may not be the active VMM ID in + /// the instance runtime state (e.g. if the new VMM is going to be a + /// migration target). + pub propolis_id: PropolisUuid, + + /// The address at which this VMM should serve a Propolis server API. + pub propolis_addr: SocketAddr, + + /// Metadata used to track instance statistics. + pub metadata: InstanceMetadata, +} + +/// Describes the instance hardware. +#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] +pub struct InstanceHardware { + pub properties: InstanceProperties, + pub nics: Vec, + pub source_nat: SourceNatConfig, + /// Zero or more external IP addresses (either floating or ephemeral), + /// provided to an instance to allow inbound connectivity. + pub ephemeral_ip: Option, + pub floating_ips: Vec, + pub firewall_rules: Vec, + pub dhcp_config: DhcpConfig, + // TODO: replace `propolis_client::*` with locally-modeled request type + pub disks: Vec, + pub cloud_init_bytes: Option, +} + +/// Metadata used to track statistics about an instance. +/// +// NOTE: The instance ID is not here, since it's already provided in other +// pieces of the instance-related requests. It is pulled from there when +// publishing metrics for the instance. +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct InstanceMetadata { + pub silo_id: Uuid, + pub project_id: Uuid, +} + +/// The body of a request to move a previously-ensured instance into a specific +/// runtime state. +#[derive(Serialize, Deserialize, JsonSchema)] +pub struct InstancePutStateBody { + /// The state into which the instance should be driven. 
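Returning to the firewall body defined above: a minimal construction sketch, assuming `Vni`'s `TryFrom<u32>` impl from omicron-common; an empty `rules` vector is valid and would leave the VPC with no resolved rules on this sled:

```rust
use omicron_common::api::external::Vni;
use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody;

fn example_body() -> VpcFirewallRulesEnsureBody {
    let vni = Vni::try_from(100).expect("100 is within the VNI range");
    VpcFirewallRulesEnsureBody { vni, rules: Vec::new() }
}
```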
+ pub state: InstanceStateRequested, +} + +/// The response sent from a request to move an instance into a specific runtime +/// state. +#[derive(Debug, Serialize, Deserialize, JsonSchema)] +pub struct InstancePutStateResponse { + /// The current runtime state of the instance after handling the request to + /// change its state. If the instance's state did not change, this field is + /// `None`. + pub updated_runtime: Option, +} + +/// Requestable running state of an Instance. +/// +/// A subset of [`omicron_common::api::external::InstanceState`]. +#[derive(Copy, Clone, Debug, Deserialize, Serialize, JsonSchema)] +#[serde(rename_all = "snake_case", tag = "type", content = "value")] +pub enum InstanceStateRequested { + /// Run this instance by migrating in from a previous running incarnation of + /// the instance. + MigrationTarget(InstanceMigrationTargetParams), + /// Start the instance if it is not already running. + Running, + /// Stop the instance. + Stopped, + /// Immediately reset the instance, as though it had stopped and immediately + /// began to run again. + Reboot, +} + +impl fmt::Display for InstanceStateRequested { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.label()) + } +} + +impl InstanceStateRequested { + fn label(&self) -> &str { + match self { + InstanceStateRequested::MigrationTarget(_) => "migrating in", + InstanceStateRequested::Running => "running", + InstanceStateRequested::Stopped => "stopped", + InstanceStateRequested::Reboot => "reboot", + } + } + + /// Returns true if the state represents a stopped Instance. + pub fn is_stopped(&self) -> bool { + match self { + InstanceStateRequested::MigrationTarget(_) => false, + InstanceStateRequested::Running => false, + InstanceStateRequested::Stopped => true, + InstanceStateRequested::Reboot => false, + } + } +} + +/// The response sent from a request to unregister an instance. +#[derive(Serialize, Deserialize, JsonSchema)] +pub struct InstanceUnregisterResponse { + /// The current state of the instance after handling the request to + /// unregister it. If the instance's state did not change, this field is + /// `None`. + pub updated_runtime: Option, +} + +/// Parameters used when directing Propolis to initialize itself via live +/// migration. +#[derive(Copy, Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct InstanceMigrationTargetParams { + /// The Propolis ID of the migration source. + pub src_propolis_id: Uuid, + + /// The address of the Propolis server that will serve as the migration + /// source. + pub src_propolis_addr: SocketAddr, +} + +/// Used to dynamically update external IPs attached to an instance. +#[derive( + Copy, Clone, Debug, Eq, PartialEq, Hash, Deserialize, JsonSchema, Serialize, +)] +#[serde(rename_all = "snake_case", tag = "type", content = "value")] +pub enum InstanceExternalIpBody { + Ephemeral(IpAddr), + Floating(IpAddr), +} diff --git a/sled-agent/types/src/lib.rs b/sled-agent/types/src/lib.rs index 12e8f049f9..47e1535ade 100644 --- a/sled-agent/types/src/lib.rs +++ b/sled-agent/types/src/lib.rs @@ -4,6 +4,14 @@ //! Common types for sled-agent. 
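Back on the instance types above: `InstanceStateRequested` pairs a human-readable `Display` label with an adjacently tagged serde form. A sketch of both representations:

```rust
use sled_agent_types::instance::InstanceStateRequested;

fn main() {
    let req = InstanceStateRequested::Running;
    assert!(!req.is_stopped());
    assert_eq!(req.to_string(), "running");
    // Adjacent tagging (`tag = "type"`, `content = "value"`) serializes
    // unit variants as just the tag:
    assert_eq!(
        serde_json::to_string(&req).unwrap(),
        r#"{"type":"running"}"#
    );
}
```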
+pub mod boot_disk; +pub mod bootstore; +pub mod disk; pub mod early_networking; +pub mod firewall_rules; +pub mod instance; pub mod rack_init; pub mod rack_ops; +pub mod sled; +pub mod time_sync; +pub mod zone_bundle; diff --git a/sled-agent/types/src/sled.rs b/sled-agent/types/src/sled.rs new file mode 100644 index 0000000000..37a064bdc9 --- /dev/null +++ b/sled-agent/types/src/sled.rs @@ -0,0 +1,219 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Types related to operating on sleds. + +use std::net::{IpAddr, Ipv6Addr, SocketAddrV6}; + +use async_trait::async_trait; +use omicron_common::{ + address::{self, Ipv6Subnet, SLED_PREFIX}, + ledger::Ledgerable, +}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use sha3::{Digest, Sha3_256}; +use uuid::Uuid; + +/// A representation of a Baseboard ID as used in the inventory subsystem +/// This type is essentially the same as a `Baseboard` except it doesn't have a +/// revision or HW type (Gimlet, PC, Unknown). +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] +pub struct BaseboardId { + /// Oxide Part Number + pub part_number: String, + /// Serial number (unique for a given part number) + pub serial_number: String, +} + +/// A request to Add a given sled after rack initialization has occurred +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] +pub struct AddSledRequest { + pub sled_id: BaseboardId, + pub start_request: StartSledAgentRequest, +} + +/// Configuration information for launching a Sled Agent. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] +pub struct StartSledAgentRequest { + /// The current generation number of data as stored in CRDB. + /// + /// The initial generation is set during RSS time and then only mutated + /// by Nexus. For now, we don't actually anticipate mutating this data, + /// but we leave open the possiblity. + pub generation: u64, + + // Which version of the data structure do we have. This is to help with + // deserialization and conversion in future updates. + pub schema_version: u32, + + // The actual configuration details + pub body: StartSledAgentRequestBody, +} + +impl StartSledAgentRequest { + pub fn sled_address(&self) -> SocketAddrV6 { + address::get_sled_address(self.body.subnet) + } + + pub fn switch_zone_ip(&self) -> Ipv6Addr { + address::get_switch_zone_address(self.body.subnet) + } + + /// Compute the sha3_256 digest of `self.rack_id` to use as a `salt` + /// for disk encryption. We don't want to include other values that are + /// consistent across sleds as it would prevent us from moving drives + /// between sleds. + pub fn hash_rack_id(&self) -> [u8; 32] { + // We know the unwrap succeeds as a Sha3_256 digest is 32 bytes + Sha3_256::digest(self.body.rack_id.as_bytes()) + .as_slice() + .try_into() + .unwrap() + } +} + +#[async_trait] +impl Ledgerable for StartSledAgentRequest { + fn is_newer_than(&self, other: &Self) -> bool { + self.generation > other.generation + } + + fn generation_bump(&mut self) { + // DO NOTHING! + // + // Generation bumps must only ever come from nexus and will be encoded + // in the struct itself + } + + // Attempt to deserialize the v1 or v0 version and return + // the v1 version. + fn deserialize( + s: &str, + ) -> Result { + // Try to deserialize the latest version of the data structure (v1). 
If + // that succeeds we are done. + if let Ok(val) = serde_json::from_str::(s) { + return Ok(val); + } + + // We don't have the latest version. Try to deserialize v0 and then + // convert it to the latest version. + let v0 = serde_json::from_str::(s)?.request; + Ok(v0.into()) + } +} + +/// This is the actual app level data of `StartSledAgentRequest` +/// +/// We nest it below the "header" of `generation` and `schema_version` so that +/// we can perform partial deserialization of `EarlyNetworkConfig` to only read +/// the header and defer deserialization of the body once we know the schema +/// version. This is possible via the use of [`serde_json::value::RawValue`] in +/// future (post-v1) deserialization paths. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] +pub struct StartSledAgentRequestBody { + /// Uuid of the Sled Agent to be created. + pub id: Uuid, + + /// Uuid of the rack to which this sled agent belongs. + pub rack_id: Uuid, + + /// Use trust quorum for key generation + pub use_trust_quorum: bool, + + /// Is this node an LRTQ learner node? + /// + /// We only put the node into learner mode if `use_trust_quorum` is also + /// true. + pub is_lrtq_learner: bool, + + /// Portion of the IP space to be managed by the Sled Agent. + pub subnet: Ipv6Subnet, +} + +/// The version of `StartSledAgentRequest` we originally shipped with. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] +pub struct StartSledAgentRequestV0 { + /// Uuid of the Sled Agent to be created. + pub id: Uuid, + + /// Uuid of the rack to which this sled agent belongs. + pub rack_id: Uuid, + + /// The external NTP servers to use + pub ntp_servers: Vec, + + /// The external DNS servers to use + pub dns_servers: Vec, + + /// Use trust quorum for key generation + pub use_trust_quorum: bool, + + // Note: The order of these fields is load bearing, because we serialize + // `SledAgentRequest`s as toml. `subnet` serializes as a TOML table, so it + // must come after non-table fields. + /// Portion of the IP space to be managed by the Sled Agent. + pub subnet: Ipv6Subnet, +} + +impl From for StartSledAgentRequest { + fn from(v0: StartSledAgentRequestV0) -> Self { + StartSledAgentRequest { + generation: 0, + schema_version: 1, + body: StartSledAgentRequestBody { + id: v0.id, + rack_id: v0.rack_id, + use_trust_quorum: v0.use_trust_quorum, + is_lrtq_learner: false, + subnet: v0.subnet, + }, + } + } +} + +// A wrapper around StartSledAgentRequestV0 that was used +// for the ledger format. 
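One detail of `hash_rack_id` above worth restating: the salt depends only on the rack UUID, so it stays stable when a drive moves between sleds in the same rack. A standalone equivalent:

```rust
use sha3::{Digest, Sha3_256};
use uuid::Uuid;

// SHA3-256 of the rack UUID's 16 raw bytes; the digest is always 32 bytes,
// so the try_into cannot fail.
fn rack_salt(rack_id: Uuid) -> [u8; 32] {
    Sha3_256::digest(rack_id.as_bytes()).as_slice().try_into().unwrap()
}
```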
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)] +struct PersistentSledAgentRequest { + request: StartSledAgentRequestV0, +} + +#[cfg(test)] +mod tests { + use std::net::Ipv6Addr; + + use super::*; + + #[test] + fn serialize_start_sled_agent_v0_deserialize_v1() { + let v0 = PersistentSledAgentRequest { + request: StartSledAgentRequestV0 { + id: Uuid::new_v4(), + rack_id: Uuid::new_v4(), + ntp_servers: vec![String::from("test.pool.example.com")], + dns_servers: vec!["1.1.1.1".parse().unwrap()], + use_trust_quorum: false, + subnet: Ipv6Subnet::new(Ipv6Addr::LOCALHOST), + }, + }; + let serialized = serde_json::to_string(&v0).unwrap(); + let expected = StartSledAgentRequest { + generation: 0, + schema_version: 1, + body: StartSledAgentRequestBody { + id: v0.request.id, + rack_id: v0.request.rack_id, + use_trust_quorum: v0.request.use_trust_quorum, + is_lrtq_learner: false, + subnet: v0.request.subnet, + }, + }; + + let actual: StartSledAgentRequest = + Ledgerable::deserialize(&serialized).unwrap(); + assert_eq!(expected, actual); + } +} diff --git a/sled-agent/types/src/time_sync.rs b/sled-agent/types/src/time_sync.rs new file mode 100644 index 0000000000..7ac9ded636 --- /dev/null +++ b/sled-agent/types/src/time_sync.rs @@ -0,0 +1,30 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use std::net::IpAddr; + +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] +pub struct TimeSync { + /// The synchronization state of the sled, true when the system clock + /// and the NTP clock are in sync (to within a small window). + pub sync: bool, + /// The NTP reference ID. + pub ref_id: u32, + /// The NTP reference IP address. + pub ip_addr: IpAddr, + /// The NTP stratum (our upstream's stratum plus one). + pub stratum: u8, + /// The NTP reference time (i.e. what chrony thinks the current time is, not + /// necessarily the current system time). + pub ref_time: f64, + // This could be f32, but there is a problem with progenitor/typify + // where, although the f32 correctly becomes "float" (and not "double") in + // the API spec, that "float" gets converted back to f64 when generating + // the client. + /// The current offset between the NTP clock and system clock. + pub correction: f64, +} diff --git a/sled-agent/types/src/zone_bundle.rs b/sled-agent/types/src/zone_bundle.rs new file mode 100644 index 0000000000..f7a388771d --- /dev/null +++ b/sled-agent/types/src/zone_bundle.rs @@ -0,0 +1,529 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Types related to zone bundles. + +use std::{cmp::Ordering, collections::HashSet, time::Duration}; + +use camino::Utf8PathBuf; +use chrono::{DateTime, Utc}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use thiserror::Error; +use uuid::Uuid; + +/// An identifier for a zone bundle. +#[derive( + Clone, + Debug, + Deserialize, + Eq, + Hash, + JsonSchema, + Ord, + PartialEq, + PartialOrd, + Serialize, +)] +pub struct ZoneBundleId { + /// The name of the zone this bundle is derived from. + pub zone_name: String, + /// The ID for this bundle itself. 
+ pub bundle_id: Uuid, +} + +/// The reason or cause for a zone bundle, i.e., why it was created. +// +// NOTE: The ordering of the enum variants is important, and should not be +// changed without careful consideration. +// +// The ordering is used when deciding which bundles to remove automatically. In +// addition to time, the cause is used to sort bundles, so changing the variant +// order will change that priority. +#[derive( + Clone, + Copy, + Debug, + Default, + Deserialize, + Eq, + Hash, + JsonSchema, + Ord, + PartialEq, + PartialOrd, + Serialize, +)] +#[serde(rename_all = "snake_case")] +#[non_exhaustive] +pub enum ZoneBundleCause { + /// Some other, unspecified reason. + #[default] + Other, + /// A zone bundle taken when a sled agent finds a zone that it does not + /// expect to be running. + UnexpectedZone, + /// An instance zone was terminated. + TerminatedInstance, + /// Generated in response to an explicit request to the sled agent. + ExplicitRequest, +} + +/// Metadata about a zone bundle. +#[derive( + Clone, + Debug, + Deserialize, + Eq, + Hash, + JsonSchema, + Ord, + PartialEq, + PartialOrd, + Serialize, +)] +pub struct ZoneBundleMetadata { + /// Identifier for this zone bundle + pub id: ZoneBundleId, + /// The time at which this zone bundle was created. + pub time_created: DateTime, + /// A version number for this zone bundle. + pub version: u8, + /// The reason or cause a bundle was created. + pub cause: ZoneBundleCause, +} + +impl ZoneBundleMetadata { + pub const VERSION: u8 = 0; + + /// Create a new set of metadata for the provided zone. + pub fn new(zone_name: &str, cause: ZoneBundleCause) -> Self { + Self { + id: ZoneBundleId { + zone_name: zone_name.to_string(), + bundle_id: Uuid::new_v4(), + }, + time_created: Utc::now(), + version: Self::VERSION, + cause, + } + } +} + +/// A dimension along with bundles can be sorted, to determine priority. +#[derive( + Clone, + Copy, + Debug, + Deserialize, + Eq, + Hash, + JsonSchema, + Serialize, + Ord, + PartialEq, + PartialOrd, +)] +#[serde(rename_all = "snake_case")] +pub enum PriorityDimension { + /// Sorting by time, with older bundles with lower priority. + Time, + /// Sorting by the cause for creating the bundle. + Cause, + // TODO-completeness: Support zone or zone type (e.g., service vs instance)? +} + +/// The priority order for bundles during cleanup. +/// +/// Bundles are sorted along the dimensions in [`PriorityDimension`], with each +/// dimension appearing exactly once. During cleanup, lesser-priority bundles +/// are pruned first, to maintain the dataset quota. Note that bundles are +/// sorted by each dimension in the order in which they appear, with each +/// dimension having higher priority than the next. +#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] +pub struct PriorityOrder([PriorityDimension; PriorityOrder::EXPECTED_SIZE]); + +impl std::ops::Deref for PriorityOrder { + type Target = [PriorityDimension; PriorityOrder::EXPECTED_SIZE]; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl Default for PriorityOrder { + fn default() -> Self { + Self::DEFAULT + } +} + +impl PriorityOrder { + // NOTE: Must match the number of variants in `PriorityDimension`. + const EXPECTED_SIZE: usize = 2; + const DEFAULT: Self = + Self([PriorityDimension::Cause, PriorityDimension::Time]); + + /// Construct a new priority order. + /// + /// This requires that each dimension appear exactly once. 
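A sketch of how a cleanup pass might consume this ordering, sorting candidates so the lowest-priority bundle sits at index 0 (the helper name is illustrative):

```rust
use sled_agent_types::zone_bundle::{PriorityOrder, ZoneBundleInfo};

fn prune_order(
    mut bundles: Vec<ZoneBundleInfo>,
    order: &PriorityOrder,
) -> Vec<ZoneBundleInfo> {
    // Ascending priority: earlier elements are deleted first when the
    // dataset quota is exceeded.
    bundles.sort_by(|lhs, rhs| order.compare_bundles(lhs, rhs));
    bundles
}
```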
+ pub fn new( + dims: &[PriorityDimension], + ) -> Result { + if dims.len() != Self::EXPECTED_SIZE { + return Err(PriorityOrderCreateError::WrongDimensionCount( + dims.len(), + )); + } + let mut seen = HashSet::new(); + for dim in dims.iter() { + if !seen.insert(dim) { + return Err(PriorityOrderCreateError::DuplicateFound(*dim)); + } + } + Ok(Self(dims.try_into().unwrap())) + } + + /// Get the priority order as a slice. + pub fn as_slice(&self) -> &[PriorityDimension] { + &self.0 + } + + /// Order zone bundle info according to the contained priority. + /// + /// We sort the info by each dimension, in the order in which it appears. + /// That means earlier dimensions have higher priority than later ones. + pub fn compare_bundles( + &self, + lhs: &ZoneBundleInfo, + rhs: &ZoneBundleInfo, + ) -> Ordering { + for dim in self.0.iter() { + let ord = match dim { + PriorityDimension::Cause => { + lhs.metadata.cause.cmp(&rhs.metadata.cause) + } + PriorityDimension::Time => { + lhs.metadata.time_created.cmp(&rhs.metadata.time_created) + } + }; + if matches!(ord, Ordering::Equal) { + continue; + } + return ord; + } + Ordering::Equal + } +} + +/// A period on which bundles are automatically cleaned up. +#[derive( + Clone, Copy, Deserialize, JsonSchema, PartialEq, PartialOrd, Serialize, +)] +pub struct CleanupPeriod(Duration); + +impl Default for CleanupPeriod { + fn default() -> Self { + Self(Duration::from_secs(600)) + } +} + +impl CleanupPeriod { + /// The minimum supported cleanup period. + pub const MIN: Self = Self(Duration::from_secs(60)); + + /// The maximum supported cleanup period. + pub const MAX: Self = Self(Duration::from_secs(60 * 60 * 24)); + + /// Construct a new cleanup period, checking that it's valid. + pub fn new(duration: Duration) -> Result { + if duration >= Self::MIN.as_duration() + && duration <= Self::MAX.as_duration() + { + Ok(Self(duration)) + } else { + Err(CleanupPeriodCreateError::OutOfBounds(duration)) + } + } + + /// Return the period as a duration. + pub const fn as_duration(&self) -> Duration { + self.0 + } +} + +impl TryFrom for CleanupPeriod { + type Error = CleanupPeriodCreateError; + + fn try_from(duration: Duration) -> Result { + Self::new(duration) + } +} + +impl std::fmt::Debug for CleanupPeriod { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + self.0.fmt(f) + } +} + +#[derive(Clone, Debug, PartialEq)] +pub struct ZoneBundleInfo { + /// The raw metadata for the bundle + pub metadata: ZoneBundleMetadata, + /// The full path to the bundle + pub path: Utf8PathBuf, + /// The number of bytes consumed on disk by the bundle + pub bytes: u64, +} + +/// The portion of a debug dataset used for zone bundles. +#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, Serialize)] +pub struct BundleUtilization { + /// The total dataset quota, in bytes. + pub dataset_quota: u64, + /// The total number of bytes available for zone bundles. + /// + /// This is `dataset_quota` multiplied by the context's storage limit. + pub bytes_available: u64, + /// Total bundle usage, in bytes. + pub bytes_used: u64, +} + +/// Context provided for the zone bundle cleanup task. +#[derive( + Clone, Copy, Debug, Default, Deserialize, JsonSchema, PartialEq, Serialize, +)] +pub struct CleanupContext { + /// The period on which automatic checks and cleanup is performed. + pub period: CleanupPeriod, + /// The limit on the dataset quota available for zone bundles. + pub storage_limit: StorageLimit, + /// The priority ordering for keeping old bundles. 
+ pub priority: PriorityOrder, +} + +/// The count of bundles / bytes removed during a cleanup operation. +#[derive(Clone, Copy, Debug, Default, Deserialize, JsonSchema, Serialize)] +pub struct CleanupCount { + /// The number of bundles removed. + pub bundles: u64, + /// The number of bytes removed. + pub bytes: u64, +} + +/// The limit on space allowed for zone bundles, as a percentage of the overall +/// dataset's quota. +#[derive( + Clone, + Copy, + Debug, + Deserialize, + JsonSchema, + PartialEq, + PartialOrd, + Serialize, +)] +pub struct StorageLimit(u8); + +impl std::fmt::Display for StorageLimit { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}%", self.as_u8()) + } +} + +impl Default for StorageLimit { + fn default() -> Self { + StorageLimit(25) + } +} + +impl StorageLimit { + /// Minimum percentage of dataset quota supported. + pub const MIN: Self = Self(0); + + /// Maximum percentage of dataset quota supported. + pub const MAX: Self = Self(50); + + /// Construct a new limit allowed for zone bundles. + /// + /// This should be expressed as a percentage, in the range (Self::MIN, + /// Self::MAX]. + pub const fn new(percentage: u8) -> Result { + if percentage > Self::MIN.0 && percentage <= Self::MAX.0 { + Ok(Self(percentage)) + } else { + Err(StorageLimitCreateError::OutOfBounds(percentage)) + } + } + + /// Return the contained quota percentage. + pub const fn as_u8(&self) -> u8 { + self.0 + } + + // Compute the number of bytes available from a dataset quota, in bytes. + pub const fn bytes_available(&self, dataset_quota: u64) -> u64 { + (dataset_quota * self.as_u8() as u64) / 100 + } +} + +#[derive(Debug, Error)] +pub enum PriorityOrderCreateError { + #[error("expected exactly {n} dimensions, found {0}", n = PriorityOrder::EXPECTED_SIZE)] + WrongDimensionCount(usize), + #[error("duplicate element found in priority ordering: {0:?}")] + DuplicateFound(PriorityDimension), +} + +#[derive(Debug, Error)] +pub enum CleanupPeriodCreateError { + #[error( + "invalid cleanup period ({0:?}): must be \ + between {min:?} and {max:?}, inclusive", + min = CleanupPeriod::MIN, + max = CleanupPeriod::MAX, + )] + OutOfBounds(Duration), +} + +#[derive(Debug, Error)] +pub enum StorageLimitCreateError { + #[error("invalid storage limit ({0}): must be expressed as a percentage in ({min}, {max}]", + min = StorageLimit::MIN.0, + max = StorageLimit::MAX.0, + )] + OutOfBounds(u8), +} + +#[cfg(test)] +mod tests { + use chrono::TimeZone; + + use super::*; + + #[test] + fn test_sort_zone_bundle_cause() { + use ZoneBundleCause::*; + let mut original = + [ExplicitRequest, Other, TerminatedInstance, UnexpectedZone]; + let expected = + [Other, UnexpectedZone, TerminatedInstance, ExplicitRequest]; + original.sort(); + assert_eq!(original, expected); + } + + #[test] + fn test_priority_dimension() { + assert!(PriorityOrder::new(&[]).is_err()); + assert!(PriorityOrder::new(&[PriorityDimension::Cause]).is_err()); + assert!(PriorityOrder::new(&[ + PriorityDimension::Cause, + PriorityDimension::Cause + ]) + .is_err()); + assert!(PriorityOrder::new(&[ + PriorityDimension::Cause, + PriorityDimension::Cause, + PriorityDimension::Time + ]) + .is_err()); + + assert!(PriorityOrder::new(&[ + PriorityDimension::Cause, + PriorityDimension::Time + ]) + .is_ok()); + assert_eq!( + PriorityOrder::new(PriorityOrder::default().as_slice()).unwrap(), + PriorityOrder::default() + ); + } + + #[test] + fn test_storage_limit_bytes_available() { + let pct = StorageLimit(1); + assert_eq!(pct.bytes_available(100), 
1); + assert_eq!(pct.bytes_available(1000), 10); + + let pct = StorageLimit(100); + assert_eq!(pct.bytes_available(100), 100); + assert_eq!(pct.bytes_available(1000), 1000); + + let pct = StorageLimit(100); + assert_eq!(pct.bytes_available(99), 99); + + let pct = StorageLimit(99); + assert_eq!(pct.bytes_available(1), 0); + + // Test non-power of 10. + let pct = StorageLimit(25); + assert_eq!(pct.bytes_available(32768), 8192); + } + + #[test] + fn test_compare_bundles() { + use PriorityDimension::*; + let time_first = PriorityOrder([Time, Cause]); + let cause_first = PriorityOrder([Cause, Time]); + + fn make_info( + year: i32, + month: u32, + day: u32, + cause: ZoneBundleCause, + ) -> ZoneBundleInfo { + ZoneBundleInfo { + metadata: ZoneBundleMetadata { + id: ZoneBundleId { + zone_name: String::from("oxz_whatever"), + bundle_id: uuid::Uuid::new_v4(), + }, + time_created: Utc + .with_ymd_and_hms(year, month, day, 0, 0, 0) + .single() + .unwrap(), + cause, + version: 0, + }, + path: Utf8PathBuf::from("/some/path"), + bytes: 0, + } + } + + let info = [ + make_info(2020, 1, 2, ZoneBundleCause::TerminatedInstance), + make_info(2020, 1, 2, ZoneBundleCause::ExplicitRequest), + make_info(2020, 1, 1, ZoneBundleCause::TerminatedInstance), + make_info(2020, 1, 1, ZoneBundleCause::ExplicitRequest), + ]; + + let mut sorted = info.clone(); + sorted.sort_by(|lhs, rhs| time_first.compare_bundles(lhs, rhs)); + // Low -> high priority + // [old/terminated, old/explicit, new/terminated, new/explicit] + let expected = [ + info[2].clone(), + info[3].clone(), + info[0].clone(), + info[1].clone(), + ]; + assert_eq!( + sorted, expected, + "sorting zone bundles by time-then-cause failed" + ); + + let mut sorted = info.clone(); + sorted.sort_by(|lhs, rhs| cause_first.compare_bundles(lhs, rhs)); + // Low -> high priority + // [old/terminated, new/terminated, old/explicit, new/explicit] + let expected = [ + info[2].clone(), + info[0].clone(), + info[3].clone(), + info[1].clone(), + ]; + assert_eq!( + sorted, expected, + "sorting zone bundles by cause-then-time failed" + ); + } +} diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 8168f32cea..3cbf00530a 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -10,7 +10,7 @@ use crate::config::MountConfig; use crate::dataset::{DatasetName, CONFIG_DATASET}; use crate::disk::RawDisk; use crate::error::Error; -use crate::resources::{AllDisks, DisksManagementResult, StorageResources}; +use crate::resources::{AllDisks, StorageResources}; use camino::Utf8PathBuf; use debug_ignore::DebugIgnore; use futures::future::FutureExt; @@ -18,7 +18,8 @@ use illumos_utils::zfs::{Mountpoint, Zfs}; use illumos_utils::zpool::ZpoolName; use key_manager::StorageKeyRequester; use omicron_common::disk::{ - DiskIdentity, DiskVariant, OmicronPhysicalDisksConfig, + DiskIdentity, DiskVariant, DisksManagementResult, + OmicronPhysicalDisksConfig, }; use omicron_common::ledger::Ledger; use slog::{info, o, warn, Logger}; @@ -826,10 +827,10 @@ mod tests { use crate::dataset::DatasetType; use crate::disk::RawSyntheticDisk; use crate::manager_test_harness::StorageManagerTestHarness; - use crate::resources::DiskManagementError; use super::*; use camino_tempfile::tempdir_in; + use omicron_common::disk::DiskManagementError; use omicron_common::ledger; use omicron_test_utils::dev::test_setup_log; use sled_hardware::DiskFirmware; diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index 98d6398d8b..425aafb12d 100644 --- 
a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -14,12 +14,10 @@ use illumos_utils::zpool::{PathInPool, ZpoolName}; use key_manager::StorageKeyRequester; use omicron_common::api::external::Generation; use omicron_common::disk::{ - DiskIdentity, DiskVariant, OmicronPhysicalDiskConfig, + DiskIdentity, DiskManagementError, DiskManagementStatus, DiskVariant, + DisksManagementResult, OmicronPhysicalDiskConfig, OmicronPhysicalDisksConfig, }; -use omicron_uuid_kinds::ZpoolUuid; -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; use sled_hardware::DiskFirmware; use slog::{info, o, warn, Logger}; use std::collections::BTreeMap; @@ -32,76 +30,6 @@ const BUNDLE_DIRECTORY: &str = "bundle"; // The directory for zone bundles. const ZONE_BUNDLE_DIRECTORY: &str = "zone"; -#[derive(Debug, thiserror::Error, JsonSchema, Serialize, Deserialize)] -#[serde(rename_all = "snake_case", tag = "type", content = "value")] -pub enum DiskManagementError { - #[error("Disk requested by control plane, but not found on device")] - NotFound, - - #[error("Expected zpool UUID of {expected}, but saw {observed}")] - ZpoolUuidMismatch { expected: ZpoolUuid, observed: ZpoolUuid }, - - #[error("Failed to access keys necessary to unlock storage. This error may be transient.")] - KeyManager(String), - - #[error("Other error starting disk management: {0}")] - Other(String), -} - -impl DiskManagementError { - fn retryable(&self) -> bool { - match self { - DiskManagementError::KeyManager(_) => true, - _ => false, - } - } -} - -/// Identifies how a single disk management operation may have succeeded or -/// failed. -#[derive(Debug, JsonSchema, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -pub struct DiskManagementStatus { - pub identity: DiskIdentity, - pub err: Option<DiskManagementError>, -} - -/// The result from attempting to manage underlying disks. -/// -/// This is more complex than a simple "Error" type because it's possible -/// for some disks to be initialized correctly, while others can fail. -/// -/// This structure provides a mechanism for callers to learn about partial -/// failures, and handle them appropriately on a per-disk basis. -#[derive(Default, Debug, JsonSchema, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -#[must_use = "this `DiskManagementResult` may contain errors, which should be handled"] -pub struct DisksManagementResult { - pub status: Vec<DiskManagementStatus>, -} - -impl DisksManagementResult { - pub fn has_error(&self) -> bool { - for status in &self.status { - if status.err.is_some() { - return true; - } - } - false - } - - pub fn has_retryable_error(&self) -> bool { - for status in &self.status { - if let Some(err) = &status.err { - if err.retryable() { - return true; - } - } - } - false - } -} - // The Sled Agent is responsible for both observing disks and managing them at // the request of the broader control plane. This enum encompasses that duality, // by representing all disks that can exist, managed or not.
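These per-disk result types keep their semantics after the move into `omicron_common::disk`. A minimal sketch of caller-side handling, assuming the helpers shown in the removed block survived the move unchanged; `should_retry` and its logging hook are hypothetical:

```rust
use omicron_common::disk::DisksManagementResult;

/// Hypothetical triage helper: log every per-disk failure, then report
/// whether a retry is worthwhile. `log_line` stands in for any log sink.
fn should_retry(
    result: &DisksManagementResult,
    mut log_line: impl FnMut(String),
) -> bool {
    for status in &result.status {
        if let Some(err) = &status.err {
            // DiskIdentity carries the vendor/serial/model triple.
            log_line(format!("disk {} failed: {err}", status.identity.serial));
        }
    }
    // True when at least one failure is transient (e.g. the key manager
    // was not ready yet), per the `retryable` logic above.
    result.has_retryable_error()
}
```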
diff --git a/wicketd/src/installinator_progress.rs b/wicketd/src/installinator_progress.rs index 7d076e7b0e..8f8465652e 100644 --- a/wicketd/src/installinator_progress.rs +++ b/wicketd/src/installinator_progress.rs @@ -295,10 +295,10 @@ mod tests { use installinator_common::{ InstallinatorCompletionMetadata, InstallinatorComponent, - InstallinatorSpec, InstallinatorStepId, M2Slot, StepEvent, - StepEventKind, StepInfo, StepInfoWithMetadata, StepOutcome, - WriteOutput, + InstallinatorSpec, InstallinatorStepId, StepEvent, StepEventKind, + StepInfo, StepInfoWithMetadata, StepOutcome, WriteOutput, }; + use omicron_common::disk::M2Slot; use omicron_test_utils::dev::test_setup_log; use schemars::JsonSchema; use update_engine::ExecutionId; diff --git a/wicketd/src/update_tracker.rs b/wicketd/src/update_tracker.rs index dee22f70c0..9980359253 100644 --- a/wicketd/src/update_tracker.rs +++ b/wicketd/src/update_tracker.rs @@ -35,9 +35,9 @@ use gateway_messages::ROT_PAGE_SIZE; use hubtools::RawHubrisArchive; use installinator_common::InstallinatorCompletionMetadata; use installinator_common::InstallinatorSpec; -use installinator_common::M2Slot; use installinator_common::WriteOutput; use omicron_common::api::external::SemverVersion; +use omicron_common::disk::M2Slot; use omicron_common::update::ArtifactHash; use slog::error; use slog::info; From 4a6be3a40f6c9f876ff53308532f6c37f030b13f Mon Sep 17 00:00:00 2001 From: Rain Date: Tue, 13 Aug 2024 15:54:25 -0700 Subject: [PATCH 29/91] [meta] update to mio 1.0.2 (#6317) mio 1.0.1 is busted on illumos, so ensure that we pick up an update. --- Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b588f2738d..ee3d2a49f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1481,7 +1481,7 @@ dependencies = [ "bitflags 2.6.0", "crossterm_winapi", "futures-core", - "mio 1.0.1", + "mio 1.0.2", "parking_lot 0.12.2", "rustix", "signal-hook", @@ -4551,9 +4551,9 @@ dependencies = [ [[package]] name = "mio" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4569e456d394deccd22ce1c1913e6ea0e54519f577285001215d33557431afe4" +checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" dependencies = [ "hermit-abi 0.3.9", "libc", @@ -9048,7 +9048,7 @@ checksum = "34db1a06d485c9142248b7a054f034b349b212551f3dfd19c94d45a754a217cd" dependencies = [ "libc", "mio 0.8.11", - "mio 1.0.1", + "mio 1.0.2", "signal-hook", ] From 9d8078bc34a78778a3a7d82a58f90a1fe55677df Mon Sep 17 00:00:00 2001 From: Levon Tarver <11586085+internet-diglett@users.noreply.github.com> Date: Tue, 13 Aug 2024 19:34:49 -0500 Subject: [PATCH 30/91] BUGFIX: 6316 - be more specific when checking `allow_export_list_active` (#6318) This change updates the queries to look up peers by `switch_port_settings_id` AND peer `address` when checking the `allow_export_list_active` column.
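For readability, the corrected lookup as it reads after this change, with the fragments of the two hunks below joined; `peer_dsl`, `db_peer`, `addr`, and `conn` are the aliases and bindings already in scope in `nexus/db-queries/src/db/datastore/bgp.rs`, and the trailing error handling is elided:

```rust
let active = peer_dsl::switch_port_settings_bgp_peer_config
    .filter(db_peer::port_settings_id.eq(port_settings_id))
    // The new filter: without it, `.limit(1)` could return the flag of a
    // different peer sharing the same switch port settings.
    .filter(db_peer::addr.eq(addr))
    .select(db_peer::allow_export_list_active)
    .limit(1)
    .first_async::<bool>(&conn)
    .await?;
```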
Related --- #6316 --- nexus/db-model/src/switch_port.rs | 8 ++++++++ nexus/db-queries/src/db/datastore/bgp.rs | 2 ++ 2 files changed, 10 insertions(+) diff --git a/nexus/db-model/src/switch_port.rs b/nexus/db-model/src/switch_port.rs index 48afd7b52a..f790d7d527 100644 --- a/nexus/db-model/src/switch_port.rs +++ b/nexus/db-model/src/switch_port.rs @@ -642,9 +642,13 @@ pub struct SwitchPortBgpPeerConfigCommunity { )] #[diesel(table_name = switch_port_settings_bgp_peer_config_allow_export)] pub struct SwitchPortBgpPeerConfigAllowExport { + /// Parent switch port configuration pub port_settings_id: Uuid, + /// Interface the peer is reachable on pub interface_name: String, + /// Peer address pub addr: IpNetwork, + /// Allowed prefix pub prefix: IpNetwork, } @@ -660,9 +664,13 @@ pub struct SwitchPortBgpPeerConfigAllowExport { )] #[diesel(table_name = switch_port_settings_bgp_peer_config_allow_import)] pub struct SwitchPortBgpPeerConfigAllowImport { + /// Parent switch port configuration pub port_settings_id: Uuid, + /// Interface the peer is reachable on pub interface_name: String, + /// Peer address pub addr: IpNetwork, + /// Allowed prefix pub prefix: IpNetwork, } diff --git a/nexus/db-queries/src/db/datastore/bgp.rs b/nexus/db-queries/src/db/datastore/bgp.rs index 1244184c1d..f4bea0f605 100644 --- a/nexus/db-queries/src/db/datastore/bgp.rs +++ b/nexus/db-queries/src/db/datastore/bgp.rs @@ -606,6 +606,7 @@ impl DataStore { .transaction(&conn, |conn| async move { let active = peer_dsl::switch_port_settings_bgp_peer_config .filter(db_peer::port_settings_id.eq(port_settings_id)) + .filter(db_peer::addr.eq(addr)) .select(db_peer::allow_export_list_active) .limit(1) .first_async::<bool>(&conn) .await @@ -652,6 +653,7 @@ impl DataStore { .transaction(&conn, |conn| async move { let active = peer_dsl::switch_port_settings_bgp_peer_config .filter(db_peer::port_settings_id.eq(port_settings_id)) + .filter(db_peer::addr.eq(addr)) .select(db_peer::allow_import_list_active) .limit(1) .first_async::<bool>(&conn) From c61b43edfd881a0d1a082a89438ee44857f9cfea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karen=20C=C3=A1rcamo?= Date: Wed, 14 Aug 2024 14:06:19 +1200 Subject: [PATCH 31/91] [docs] Small fix to README (#6322) I was creating a new OpenAPI managed service and at first it wasn't clear to me that this was the doc I was looking for. --- README.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.adoc b/README.adoc index 449bd3e5ea..d48a5c9736 100644 --- a/README.adoc +++ b/README.adoc @@ -223,7 +223,7 @@ Note that Omicron contains a nominally circular dependency: We effectively "break" this circular dependency by virtue of the OpenAPI documents being checked in. -==== Updating Managed Services +==== Updating or Creating New Managed Services See the documentation in link:./dev-tools/openapi-manager[`dev-tools/openapi-manager`] for more information.
From 5ccb386efa5ff61966c867f74c73c3ff02a168b8 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Wed, 14 Aug 2024 04:38:12 +0000 Subject: [PATCH 32/91] Update taiki-e/install-action digest to 4f13fb6 (#6324) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [taiki-e/install-action](https://togithub.com/taiki-e/install-action) | action | digest | [`7f737c1` -> `4f13fb6`](https://togithub.com/taiki-e/install-action/compare/7f737c1...4f13fb6) | --- ### Configuration 📅 **Schedule**: Branch creation - "after 8pm,before 6am" in timezone America/Los_Angeles, Automerge - "after 8pm,before 6am" in timezone America/Los_Angeles. 🚦 **Automerge**: Enabled. ♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [Renovate Bot](https://togithub.com/renovatebot/renovate). Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- .github/workflows/hakari.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index e310c011e7..ddc4ffc021 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -24,7 +24,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@7f737c1056bae14d45b3daec1a2d26ad480e50f7 # v2 + uses: taiki-e/install-action@4f13fb62448d53782828736cd5b0fd395b5f0c06 # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date From 823f8ab5499a8a6b8969caf79b050fb288850f4a Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 14 Aug 2024 13:52:49 -0400 Subject: [PATCH 33/91] [#5333 3/6] Region snapshot replacement start (#6294) This commit adds a "region snapshot replacement start" background task that will: 1) check for snapshots that are stored on disks that were expunged, and insert region snapshot replacement requests for them. 2) check if there are any region snapshot replacement requests in the `Requested` state and run the new "region snapshot replacement start" saga for them. This background task will also pick up manually requested region snapshot replacements. Also in this commit is the "region snapshot replacement start saga", which will transition a region snapshot replacement request from `Requested` to `Allocating` and: - allocate a new Region (with source set to Some so that a clone occurs) - create a blank Volume that will be used to stash the snapshot to delete later - swap the snapshot being replaced with the new region that was cloned from the others in the read-only region set (see the sketch below) - update the region snapshot replacement request's state to "ReplacementDone" and clear the operating saga id This represents the first step to be taken after a snapshot goes away: allocate the replacement, and swap it into the affected snapshot volume. Once this is done, anything that uses the snapshot volume as a read-only parent will no longer reference the expunged snapshot, and will work without any issues.
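The swap itself goes through `volume_replace_snapshot`, whose newtype parameters are re-exported in this commit. A rough sketch of the call, assuming the `nexus-db-queries` API as it appears in this diff; the helper, the ids, and the addresses are illustrative, and the detailed result of the call is discarded:

```rust
use std::net::SocketAddrV6;

use nexus_db_queries::db::datastore::{
    ExistingTarget, ReplacementTarget, VolumeToDelete, VolumeWithTarget,
};
use nexus_db_queries::db::DataStore;
use omicron_common::api::external::Error;
use uuid::Uuid;

async fn swap_read_only_target(
    datastore: &DataStore,
    snapshot_volume_id: Uuid, // volume referencing the expunged snapshot
    blank_volume_id: Uuid,    // blank volume that stashes the old target
    old: SocketAddrV6,        // target address of the expunged snapshot
    new: SocketAddrV6,        // address of the freshly cloned region
) -> Result<(), Error> {
    datastore
        .volume_replace_snapshot(
            VolumeWithTarget(snapshot_volume_id),
            ExistingTarget(old),
            ReplacementTarget(new),
            VolumeToDelete(blank_volume_id),
        )
        .await
        // Keep only success/failure; the Ok payload is not needed here.
        .map(|_| ())
}
```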
Existing constructed Volumes running in a propolis or pantry context will remain unmodified: a future commit will contain the saga that takes care of performing the necessary read-only target replacements. --- Cargo.lock | 162 ++- Cargo.toml | 1 + dev-tools/omdb/src/bin/omdb/nexus.rs | 33 + dev-tools/omdb/tests/env.out | 12 + dev-tools/omdb/tests/successes.out | 11 + nexus-config/src/nexus_config.rs | 16 + nexus/Cargo.toml | 1 + nexus/db-queries/src/db/datastore/mod.rs | 4 + nexus/db-queries/src/db/datastore/volume.rs | 8 +- nexus/examples/config-second.toml | 1 + nexus/examples/config.toml | 1 + nexus/src/app/background/init.rs | 21 +- nexus/src/app/background/tasks/mod.rs | 1 + .../region_snapshot_replacement_start.rs | 512 ++++++++ nexus/src/app/crucible.rs | 2 +- nexus/src/app/sagas/common_storage.rs | 31 + nexus/src/app/sagas/mod.rs | 4 + .../src/app/sagas/region_replacement_start.rs | 39 +- .../region_snapshot_replacement_start.rs | 1134 +++++++++++++++++ nexus/test-utils/src/resource_helpers.rs | 22 + nexus/tests/config.test.toml | 1 + nexus/types/src/internal_api/background.rs | 9 + smf/nexus/multi-sled/config-partial.toml | 1 + smf/nexus/single-sled/config-partial.toml | 1 + workspace-hack/Cargo.toml | 9 + 25 files changed, 1994 insertions(+), 43 deletions(-) create mode 100644 nexus/src/app/background/tasks/region_snapshot_replacement_start.rs create mode 100644 nexus/src/app/sagas/region_snapshot_replacement_start.rs diff --git a/Cargo.lock b/Cargo.lock index ee3d2a49f1..cd12b9de9a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -913,6 +913,11 @@ name = "cc" version = "1.0.97" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "099a5357d84c4c61eb35fc8eafa9a79a902c2f76911e5747ced4e032edd8d9b4" +dependencies = [ + "jobserver", + "libc", + "once_cell", +] [[package]] name = "cert-dev" @@ -1514,6 +1519,36 @@ dependencies = [ "serde_json", ] +[[package]] +name = "crucible-common" +version = "0.0.1" +source = "git+https://github.com/oxidecomputer/crucible?rev=e58ca3693cb9ce0438947beba10e97ee38a0966b#e58ca3693cb9ce0438947beba10e97ee38a0966b" +dependencies = [ + "anyhow", + "atty", + "crucible-workspace-hack", + "dropshot", + "nix 0.28.0", + "rusqlite", + "rustls-pemfile 1.0.4", + "schemars", + "serde", + "serde_json", + "slog", + "slog-async", + "slog-bunyan", + "slog-dtrace", + "slog-term", + "tempfile", + "thiserror", + "tokio", + "tokio-rustls 0.24.1", + "toml 0.8.19", + "twox-hash", + "uuid", + "vergen", +] + [[package]] name = "crucible-pantry-client" version = "0.0.1" @@ -2484,6 +2519,18 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" +[[package]] +name = "fallible-iterator" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + [[package]] name = "fancy-regex" version = "0.13.0" @@ -3004,6 +3051,19 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +[[package]] +name = "git2" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b903b73e45dc0c6c596f2d37eccece7c1c8bb6e4407b001096387c63d0d93724" +dependencies = [ + "bitflags 2.6.0", + "libc", + "libgit2-sys", + "log", + "url", +] + [[package]] name = "glob" version = "0.3.1" @@ -3150,6 +3210,15 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "hashlink" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af" +dependencies = [ + "hashbrown 0.14.5", +] + [[package]] name = "headers" version = "0.3.9" @@ -4024,6 +4093,15 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jobserver" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] + [[package]] name = "js-sys" version = "0.3.69" @@ -4183,6 +4261,18 @@ dependencies = [ "zone 0.1.8", ] +[[package]] +name = "libgit2-sys" +version = "0.17.0+1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10472326a8a6477c3c20a64547b0059e4b0d086869eee31e6d7da728a8eb7224" +dependencies = [ + "cc", + "libc", + "libz-sys", + "pkg-config", +] + [[package]] name = "libloading" version = "0.8.3" @@ -4266,6 +4356,16 @@ dependencies = [ "redox_syscall 0.5.1", ] +[[package]] +name = "libsqlite3-sys" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c10584274047cb335c23d3e61bcef8e323adae7c5c8c760540f73610177fc3f" +dependencies = [ + "pkg-config", + "vcpkg", +] + [[package]] name = "libsw" version = "3.3.1" @@ -4298,6 +4398,18 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "libz-sys" +version = "1.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e143b5e666b2695d28f6bca6497720813f699c9602dd7f5cac91008b8ada7f9" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "linear-map" version = "1.2.0" @@ -5756,6 +5868,7 @@ dependencies = [ "cockroach-admin-client", "criterion", "crucible-agent-client", + "crucible-common", "crucible-pantry-client", "diesel", "display-error-chain", @@ -6186,6 +6299,7 @@ dependencies = [ "bstr 1.9.1", "byteorder", "bytes", + "cc", "chrono", "cipher", "clap", @@ -6230,6 +6344,7 @@ dependencies = [ "managed", "memchr", "mio 0.8.11", + "nix 0.28.0", "nom", "num-bigint", "num-integer", @@ -7336,7 +7451,7 @@ dependencies = [ "base64 0.22.1", "byteorder", "bytes", - "fallible-iterator", + "fallible-iterator 0.2.0", "hmac", "md-5", "memchr", @@ -7353,7 +7468,7 @@ checksum = "02048d9e032fb3cc3413bbf7b83a15d84a5d419778e2628751896d856498eee9" dependencies = [ "bytes", "chrono", - "fallible-iterator", + "fallible-iterator 0.2.0", "postgres-protocol", "serde", "serde_json", @@ -8321,6 +8436,20 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "rusqlite" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b838eba278d213a8beaf485bd313fd580ca4505a00d5871caeb1457c55322cae" +dependencies = [ + "bitflags 2.6.0", + "fallible-iterator 0.3.0", + "fallible-streaming-iterator", + "hashlink", + "libsqlite3-sys", + "smallvec 1.13.2", +] + [[package]] name = "russh" version = "0.43.0" @@ -10238,7 +10367,7 @@ dependencies = [ "async-trait", "byteorder", "bytes", - "fallible-iterator", + "fallible-iterator 0.2.0", "futures-channel", 
"futures-util", "log", @@ -10723,6 +10852,17 @@ dependencies = [ "utf-8", ] +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "rand 0.7.3", + "static_assertions", +] + [[package]] name = "typed-path" version = "0.7.1" @@ -11080,6 +11220,22 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "vergen" +version = "8.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2990d9ea5967266ea0ccf413a4aa5c42a93dbcfda9cb49a97de6931726b12566" +dependencies = [ + "anyhow", + "cargo_metadata", + "cfg-if", + "git2", + "regex", + "rustc_version 0.4.0", + "rustversion", + "time", +] + [[package]] name = "version_check" version = "0.9.5" diff --git a/Cargo.toml b/Cargo.toml index 962bfb82de..3dd5e61236 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -307,6 +307,7 @@ crossterm = { version = "0.28.1", features = ["event-stream"] } crucible-agent-client = { git = "https://github.com/oxidecomputer/crucible", rev = "e58ca3693cb9ce0438947beba10e97ee38a0966b" } crucible-pantry-client = { git = "https://github.com/oxidecomputer/crucible", rev = "e58ca3693cb9ce0438947beba10e97ee38a0966b" } crucible-smf = { git = "https://github.com/oxidecomputer/crucible", rev = "e58ca3693cb9ce0438947beba10e97ee38a0966b" } +crucible-common = { git = "https://github.com/oxidecomputer/crucible", rev = "e58ca3693cb9ce0438947beba10e97ee38a0966b" } csv = "1.3.0" curve25519-dalek = "4" datatest-stable = "0.2.9" diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 9aae6b2205..1b6d2469f4 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -33,6 +33,7 @@ use nexus_saga_recovery::LastPass; use nexus_types::deployment::Blueprint; use nexus_types::internal_api::background::LookupRegionPortStatus; use nexus_types::internal_api::background::RegionReplacementDriverStatus; +use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus; use nexus_types::inventory::BaseboardId; use omicron_uuid_kinds::CollectionUuid; use omicron_uuid_kinds::DemoSagaUuid; @@ -1394,6 +1395,38 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { } } }; + } else if name == "region_snapshot_replacement" { + match serde_json::from_value::( + details.clone(), + ) { + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ), + + Ok(status) => { + println!( + " total requests created ok: {}", + status.requests_created_ok.len(), + ); + for line in &status.requests_created_ok { + println!(" > {line}"); + } + + println!( + " total start saga invoked ok: {}", + status.start_invoked_ok.len(), + ); + for line in &status.start_invoked_ok { + println!(" > {line}"); + } + + println!(" errors: {}", status.errors.len()); + for line in &status.errors { + println!(" > {line}"); + } + } + } } else { println!( "warning: unknown background task: {:?} \ diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 5755df9488..59d9310b57 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -127,6 +127,10 @@ task: "region_replacement_driver" drive region replacements forward to completion +task: "region_snapshot_replacement_start" + detect if 
region snapshots need replacement and begin the process + + task: "saga_recovery" recovers sagas assigned to this Nexus @@ -276,6 +280,10 @@ task: "region_replacement_driver" drive region replacements forward to completion +task: "region_snapshot_replacement_start" + detect if region snapshots need replacement and begin the process + + task: "saga_recovery" recovers sagas assigned to this Nexus @@ -412,6 +420,10 @@ task: "region_replacement_driver" drive region replacements forward to completion +task: "region_snapshot_replacement_start" + detect if region snapshots need replacement and begin the process + + task: "saga_recovery" recovers sagas assigned to this Nexus diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 66f07cb2f0..166936da9c 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -328,6 +328,10 @@ task: "region_replacement_driver" drive region replacements forward to completion +task: "region_snapshot_replacement_start" + detect if region snapshots need replacement and begin the process + + task: "saga_recovery" recovers sagas assigned to this Nexus @@ -566,6 +570,13 @@ task: "region_replacement_driver" number of region replacement finish sagas started ok: 0 number of errors: 0 +task: "region_snapshot_replacement_start" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms +warning: unknown background task: "region_snapshot_replacement_start" (don't know how to interpret details: Object {"errors": Array [], "requests_created_ok": Array [], "start_invoked_ok": Array []}) + task: "saga_recovery" configured period: every 10m currently executing: no diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 9d8bf1ac9b..b222ebd23b 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -391,6 +391,8 @@ pub struct BackgroundTaskConfig { pub saga_recovery: SagaRecoveryConfig, /// configuration for lookup region port task pub lookup_region_port: LookupRegionPortConfig, + /// configuration for region snapshot replacement starter task + pub region_snapshot_replacement_start: RegionSnapshotReplacementStartConfig, } #[serde_as] @@ -627,6 +629,14 @@ pub struct LookupRegionPortConfig { pub period_secs: Duration, } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct RegionSnapshotReplacementStartConfig { + /// period (in seconds) for periodic activations of this background task + #[serde_as(as = "DurationSeconds")] + pub period_secs: Duration, +} + /// Configuration for a nexus server #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct PackageConfig { @@ -874,6 +884,7 @@ mod test { abandoned_vmm_reaper.period_secs = 60 saga_recovery.period_secs = 60 lookup_region_port.period_secs = 60 + region_snapshot_replacement_start.period_secs = 30 [default_region_allocation_strategy] type = "random" seed = 0 @@ -1036,6 +1047,10 @@ mod test { lookup_region_port: LookupRegionPortConfig { period_secs: Duration::from_secs(60), }, + region_snapshot_replacement_start: + RegionSnapshotReplacementStartConfig { + period_secs: Duration::from_secs(30), + }, }, default_region_allocation_strategy: crate::nexus_config::RegionAllocationStrategy::Random { @@ -1112,6 +1127,7 @@ mod test { abandoned_vmm_reaper.period_secs = 60 saga_recovery.period_secs = 60 lookup_region_port.period_secs = 60 + 
region_snapshot_replacement_start.period_secs = 30 [default_region_allocation_strategy] type = "random" "##, diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 86d9abc460..1128cd8f0f 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -25,6 +25,7 @@ chrono.workspace = true cockroach-admin-client.workspace = true crucible-agent-client.workspace = true crucible-pantry-client.workspace = true +crucible-common.workspace = true dns-service-client.workspace = true dpd-client.workspace = true mg-admin-client.workspace = true diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 58259be7ee..13c3708e4a 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -127,8 +127,12 @@ pub use vmm::VmmStateUpdateResult; pub use volume::read_only_resources_associated_with_volume; pub use volume::CrucibleResources; pub use volume::CrucibleTargets; +pub use volume::ExistingTarget; +pub use volume::ReplacementTarget; pub use volume::VolumeCheckoutReason; pub use volume::VolumeReplacementParams; +pub use volume::VolumeToDelete; +pub use volume::VolumeWithTarget; // Number of unique datasets required to back a region. // TODO: This should likely turn into a configuration option. diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index f777384b7b..f5c1f121e4 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -1795,16 +1795,16 @@ pub struct VolumeReplacementParams { // parameters #[derive(Debug, Clone, Copy)] -pub struct VolumeWithTarget(Uuid); +pub struct VolumeWithTarget(pub Uuid); #[derive(Debug, Clone, Copy)] -pub struct ExistingTarget(SocketAddrV6); +pub struct ExistingTarget(pub SocketAddrV6); #[derive(Debug, Clone, Copy)] -pub struct ReplacementTarget(SocketAddrV6); +pub struct ReplacementTarget(pub SocketAddrV6); #[derive(Debug, Clone, Copy)] -pub struct VolumeToDelete(Uuid); +pub struct VolumeToDelete(pub Uuid); impl DataStore { /// Replace a read-write region in a Volume with a new region. diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index 754f37c064..572de807d7 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -139,6 +139,7 @@ v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 +region_snapshot_replacement_start.period_secs = 30 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index bd50e846bd..3aebe35152 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -125,6 +125,7 @@ v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 +region_snapshot_replacement_start.period_secs = 30 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. 
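As a cross-check of the config plumbing above, the new knob is a plain integer number of seconds that `serde_with` turns into a `std::time::Duration`. A minimal, self-contained sketch; the `Demo` struct is hypothetical and mirrors `RegionSnapshotReplacementStartConfig`, assuming `serde`, `serde_with`, and `toml` as dependencies:

```rust
use std::time::Duration;

use serde::Deserialize;
use serde_with::{serde_as, DurationSeconds};

#[serde_as]
#[derive(Deserialize)]
struct Demo {
    // serde_with converts the integer seconds into a Duration.
    #[serde_as(as = "DurationSeconds<u64>")]
    period_secs: Duration,
}

fn main() {
    let demo: Demo = toml::from_str("period_secs = 30").unwrap();
    assert_eq!(demo.period_secs, Duration::from_secs(30));
}
```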
diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 850e63443a..cc42a8f302 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -108,6 +108,7 @@ use super::tasks::phantom_disks; use super::tasks::physical_disk_adoption; use super::tasks::region_replacement; use super::tasks::region_replacement_driver; +use super::tasks::region_snapshot_replacement_start::*; use super::tasks::saga_recovery; use super::tasks::service_firewall_rules; use super::tasks::sync_service_zone_nat::ServiceZoneNatTracker; @@ -161,6 +162,7 @@ pub struct BackgroundTasks { pub task_vpc_route_manager: Activator, pub task_saga_recovery: Activator, pub task_lookup_region_port: Activator, + pub task_region_snapshot_replacement_start: Activator, // Handles to activate background tasks that do not get used by Nexus // at-large. These background tasks are implementation details as far as @@ -242,6 +244,7 @@ impl BackgroundTasksInitializer { task_vpc_route_manager: Activator::new(), task_saga_recovery: Activator::new(), task_lookup_region_port: Activator::new(), + task_region_snapshot_replacement_start: Activator::new(), task_internal_dns_propagation: Activator::new(), task_external_dns_propagation: Activator::new(), @@ -303,6 +306,7 @@ impl BackgroundTasksInitializer { task_vpc_route_manager, task_saga_recovery, task_lookup_region_port, + task_region_snapshot_replacement_start, // Add new background tasks here. Be sure to use this binding in a // call to `Driver::register()` below. That's what actually wires // up the Activator to the corresponding background task. @@ -721,13 +725,28 @@ impl BackgroundTasksInitializer { description: "fill in missing ports for region records", period: config.lookup_region_port.period_secs, task_impl: Box::new(lookup_region_port::LookupRegionPort::new( - datastore, + datastore.clone(), )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], activator: task_lookup_region_port, }); + driver.register(TaskDefinition { + name: "region_snapshot_replacement_start", + description: + "detect if region snapshots need replacement and begin the \ + process", + period: config.region_snapshot_replacement_start.period_secs, + task_impl: Box::new(RegionSnapshotReplacementDetector::new( + datastore, + sagas.clone(), + )), + opctx: opctx.child(BTreeMap::new()), + watchers: vec![], + activator: task_region_snapshot_replacement_start, + }); + driver } } diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index fe041a6daa..b0281afd9f 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -25,6 +25,7 @@ pub mod phantom_disks; pub mod physical_disk_adoption; pub mod region_replacement; pub mod region_replacement_driver; +pub mod region_snapshot_replacement_start; pub mod saga_recovery; pub mod service_firewall_rules; pub mod sync_service_zone_nat; diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs new file mode 100644 index 0000000000..9bc66d48c8 --- /dev/null +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs @@ -0,0 +1,512 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background task for detecting region snapshots that need replacing and +//! beginning that process +//! +//! 
This task's responsibility is to create region snapshot replacement requests +//! when physical disks are expunged, and trigger the region snapshot +//! replacement start saga for any requests that are in state "Requested". See +//! the documentation in that saga's docstring for more information. + +use crate::app::authn; +use crate::app::background::BackgroundTask; +use crate::app::saga::StartSaga; +use crate::app::sagas; +use crate::app::sagas::region_snapshot_replacement_start::*; +use crate::app::sagas::NexusSaga; +use crate::app::RegionAllocationStrategy; +use futures::future::BoxFuture; +use futures::FutureExt; +use nexus_db_model::RegionSnapshotReplacement; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus; +use serde_json::json; +use std::sync::Arc; + +pub struct RegionSnapshotReplacementDetector { + datastore: Arc<DataStore>, + sagas: Arc<dyn StartSaga>, +} + +impl RegionSnapshotReplacementDetector { + pub fn new(datastore: Arc<DataStore>, sagas: Arc<dyn StartSaga>) -> Self { + RegionSnapshotReplacementDetector { datastore, sagas } + } + + async fn send_start_request( + &self, + serialized_authn: authn::saga::Serialized, + request: RegionSnapshotReplacement, + ) -> Result<(), omicron_common::api::external::Error> { + let params = sagas::region_snapshot_replacement_start::Params { + serialized_authn, + request, + allocation_strategy: + RegionAllocationStrategy::RandomWithDistinctSleds { seed: None }, + }; + + let saga_dag = SagaRegionSnapshotReplacementStart::prepare(&params)?; + self.sagas.saga_start(saga_dag).await + } + + /// Find region snapshots on expunged physical disks and create region + /// snapshot replacement requests for them. + async fn create_requests_for_region_snapshots_on_expunged_disks( + &self, + opctx: &OpContext, + status: &mut RegionSnapshotReplacementStartStatus, + ) { + let log = &opctx.log; + + // Find region snapshots on expunged physical disks + let region_snapshots_to_be_replaced = match self + .datastore + .find_region_snapshots_on_expunged_physical_disks(opctx) + .await + { + Ok(region_snapshots) => region_snapshots, + + Err(e) => { + let s = format!( + "find_region_snapshots_on_expunged_physical_disks \ failed: {e}", + ); + + error!(&log, "{s}"); + status.errors.push(s); + return; + } + }; + + for region_snapshot in region_snapshots_to_be_replaced { + // If no request exists yet, create one.
+ let existing_request = match self + .datastore + .lookup_region_snapshot_replacement_request( + opctx, + &region_snapshot, + ) + .await + { + Ok(existing_request) => existing_request, + + Err(e) => { + let s = + format!("error looking up replacement request: {e}"); + + error!( + &log, + "{s}"; + "snapshot_id" => %region_snapshot.snapshot_id, + "region_id" => %region_snapshot.region_id, + "dataset_id" => %region_snapshot.dataset_id, + ); + status.errors.push(s); + continue; + } + }; + + if existing_request.is_none() { + match self + .datastore + .create_region_snapshot_replacement_request( + opctx, + &region_snapshot, + ) + .await + { + Ok(request_id) => { + let s = format!( + "created region snapshot replacement request \ {request_id}" + ); + + info!( + &log, + "{s}"; + "snapshot_id" => %region_snapshot.snapshot_id, + "region_id" => %region_snapshot.region_id, + "dataset_id" => %region_snapshot.dataset_id, + ); + status.requests_created_ok.push(s); + } + + Err(e) => { + let s = + format!("error creating replacement request: {e}"); + + error!( + &log, + "{s}"; + "snapshot_id" => %region_snapshot.snapshot_id, + "region_id" => %region_snapshot.region_id, + "dataset_id" => %region_snapshot.dataset_id, + ); + status.errors.push(s); + } + } + } + } + } + + /// For each region snapshot replacement request in state "Requested", run + /// the start saga. + async fn start_requested_region_snapshot_replacements( + &self, + opctx: &OpContext, + status: &mut RegionSnapshotReplacementStartStatus, + ) { + let log = &opctx.log; + + let requests = match self + .datastore + .get_requested_region_snapshot_replacements(opctx) + .await + { + Ok(requests) => requests, + + Err(e) => { + let s = format!( + "query for region snapshot replacement requests failed: {e}" + ); + + error!(&log, "{s}"); + status.errors.push(s); + return; + } + }; + + for request in requests { + let request_id = request.id; + + let result = self + .send_start_request( + authn::saga::Serialized::for_opctx(opctx), + request.clone(), + ) + .await; + + match result { + Ok(()) => { + let s = format!( + "region snapshot replacement start invoked ok for \ {request_id}" + ); + + info!( + &log, + "{s}"; + "request.snapshot_id" => %request.old_snapshot_id, + "request.region_id" => %request.old_region_id, + "request.dataset_id" => %request.old_dataset_id, + ); + status.start_invoked_ok.push(s); + } + + Err(e) => { + let s = format!( + "invoking region snapshot replacement start for \ {request_id} failed: {e}", + ); + + error!( + &log, + "{s}"; + "request.snapshot_id" => %request.old_snapshot_id, + "request.region_id" => %request.old_region_id, + "request.dataset_id" => %request.old_dataset_id, + ); + status.errors.push(s); + } + } + } + } +} + +impl BackgroundTask for RegionSnapshotReplacementDetector { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async { + let log = &opctx.log; + info!(&log, "region snapshot replacement start task started"); + + let mut status = RegionSnapshotReplacementStartStatus::default(); + + self.create_requests_for_region_snapshots_on_expunged_disks( + opctx, + &mut status, + ) + .await; + + self.start_requested_region_snapshot_replacements( + opctx, + &mut status, + ) + .await; + + info!(&log, "region snapshot replacement start task done"); + + json!(status) + } + .boxed() + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::app::background::init::test::NoopStartSaga; + use crate::app::MIN_DISK_SIZE_BYTES; + use chrono::Utc; + use
nexus_db_model::BlockSize; + use nexus_db_model::Generation; + use nexus_db_model::PhysicalDiskPolicy; + use nexus_db_model::RegionSnapshot; + use nexus_db_model::RegionSnapshotReplacement; + use nexus_db_model::Snapshot; + use nexus_db_model::SnapshotIdentity; + use nexus_db_model::SnapshotState; + use nexus_db_queries::authz; + use nexus_db_queries::db::lookup::LookupPath; + use nexus_test_utils::resource_helpers::create_project; + use nexus_test_utils_macros::nexus_test; + use omicron_common::api::external; + use omicron_uuid_kinds::GenericUuid; + use std::collections::BTreeMap; + use uuid::Uuid; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext<crate::Server>; + type DiskTest<'a> = + nexus_test_utils::resource_helpers::DiskTest<'a, crate::Server>; + + #[nexus_test(server = crate::Server)] + async fn test_add_region_snapshot_replacement_causes_start( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let starter = Arc::new(NoopStartSaga::new()); + let mut task = RegionSnapshotReplacementDetector::new( + datastore.clone(), + starter.clone(), + ); + + // Noop test + let result: RegionSnapshotReplacementStartStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + assert_eq!(result, RegionSnapshotReplacementStartStatus::default()); + assert_eq!(starter.count_reset(), 0); + + // Add a region snapshot replacement request for a fake region snapshot + + let request = RegionSnapshotReplacement::new( + Uuid::new_v4(), // dataset id + Uuid::new_v4(), // region id + Uuid::new_v4(), // snapshot id + ); + + let request_id = request.id; + + datastore + .insert_region_snapshot_replacement_request_with_volume_id( + &opctx, + request, + Uuid::new_v4(), + ) + .await + .unwrap(); + + // Activate the task - it should pick that up and try to run the + // region snapshot replacement start saga + let result: RegionSnapshotReplacementStartStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + + assert_eq!( + result, + RegionSnapshotReplacementStartStatus { + requests_created_ok: vec![], + start_invoked_ok: vec![format!( + "region snapshot replacement start invoked ok for \ {request_id}" + )], + errors: vec![], + }, + ); + + assert_eq!(starter.count_reset(), 1); + } + + #[nexus_test(server = crate::Server)] + async fn test_expunge_disk_causes_region_snapshot_replacement_start( + cptestctx: &ControlPlaneTestContext, + ) { + let disk_test = DiskTest::new(cptestctx).await; + + let client = &cptestctx.external_client; + let project = create_project(&client, "testing").await; + let project_id = project.identity.id; + + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let starter = Arc::new(NoopStartSaga::new()); + let mut task = RegionSnapshotReplacementDetector::new( + datastore.clone(), + starter.clone(), + ); + + // Noop test + let result: RegionSnapshotReplacementStartStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + assert_eq!(result, RegionSnapshotReplacementStartStatus::default()); + assert_eq!(starter.count_reset(), 0); + + // Add three region snapshots for each dataset + + let region_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + let mut dataset_to_zpool: BTreeMap<String, String> = + BTreeMap::default(); + + for
zpool in disk_test.zpools() { + for dataset in &zpool.datasets { + dataset_to_zpool + .insert(zpool.id.to_string(), dataset.id.to_string()); + + datastore + .region_snapshot_create(RegionSnapshot::new( + dataset.id, + region_id, + snapshot_id, + String::from("[fd00:1122:3344::101]:12345"), + )) + .await + .unwrap(); + } + } + + // Create the fake snapshot + + let (.., authz_project) = LookupPath::new(&opctx, &datastore) + .project_id(project_id) + .lookup_for(authz::Action::CreateChild) + .await + .unwrap(); + + datastore + .project_ensure_snapshot( + &opctx, + &authz_project, + Snapshot { + identity: SnapshotIdentity { + id: snapshot_id, + name: external::Name::try_from("snapshot".to_string()) + .unwrap() + .into(), + description: "snapshot".into(), + + time_created: Utc::now(), + time_modified: Utc::now(), + time_deleted: None, + }, + + project_id, + disk_id: Uuid::new_v4(), + volume_id: Uuid::new_v4(), + destination_volume_id: Uuid::new_v4(), + + gen: Generation::new(), + state: SnapshotState::Creating, + block_size: BlockSize::AdvancedFormat, + + size: external::ByteCount::try_from(MIN_DISK_SIZE_BYTES) + .unwrap() + .into(), + }, + ) + .await + .unwrap(); + + // Expunge one of the physical disks + + let first_zpool = + disk_test.zpools().next().expect("Expected at least one zpool"); + + let (_, db_zpool) = LookupPath::new(&opctx, datastore) + .zpool_id(first_zpool.id.into_untyped_uuid()) + .fetch() + .await + .unwrap(); + + datastore + .physical_disk_update_policy( + &opctx, + db_zpool.physical_disk_id, + PhysicalDiskPolicy::Expunged, + ) + .await + .unwrap(); + + // Activate the task - it should pick that up and try to run the region + // snapshot replacement start saga for the region snapshot on that + // expunged disk + + let result: RegionSnapshotReplacementStartStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + + eprintln!("{:?}", &result); + + assert_eq!(result.requests_created_ok.len(), 1); + assert_eq!(result.start_invoked_ok.len(), 1); + assert!(result.errors.is_empty()); + + // The last part of the message is the region snapshot replacement + // request id + let request_created_uuid: Uuid = result.requests_created_ok[0] + .split(" ") + .last() + .unwrap() + .parse() + .unwrap(); + let request_started_uuid: Uuid = result.start_invoked_ok[0] + .split(" ") + .last() + .unwrap() + .parse() + .unwrap(); + + assert_eq!(request_created_uuid, request_started_uuid); + + assert_eq!(starter.count_reset(), 1); + + let request = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, + request_created_uuid, + ) + .await + .unwrap(); + + assert_eq!(request.old_snapshot_id, snapshot_id); + assert_eq!(request.old_region_id, region_id); + + let dataset_id = + dataset_to_zpool.get(&first_zpool.id.to_string()).unwrap(); + assert_eq!(&request.old_dataset_id.to_string(), dataset_id); + } +} diff --git a/nexus/src/app/crucible.rs b/nexus/src/app/crucible.rs index b8fca26c14..86de328355 100644 --- a/nexus/src/app/crucible.rs +++ b/nexus/src/app/crucible.rs @@ -150,7 +150,7 @@ impl super::Nexus { } /// Call out to Crucible agent and perform region creation. Optionally, - /// supply a read-only source to invoke a clone. + /// supply a read-only source's repair address to invoke a clone.
pub async fn ensure_region_in_dataset( &self, log: &Logger, diff --git a/nexus/src/app/sagas/common_storage.rs b/nexus/src/app/sagas/common_storage.rs index 592463f5bb..d37370506c 100644 --- a/nexus/src/app/sagas/common_storage.rs +++ b/nexus/src/app/sagas/common_storage.rs @@ -15,6 +15,7 @@ use nexus_db_queries::db; use nexus_db_queries::db::lookup::LookupPath; use omicron_common::api::external::Error; use omicron_common::retry_until_known_result; +use slog::Logger; use std::net::SocketAddrV6; // Common Pantry operations @@ -107,3 +108,33 @@ pub(crate) async fn call_pantry_detach_for_disk( Ok(()) } + +pub(crate) fn find_only_new_region( + log: &Logger, + existing_datasets_and_regions: Vec<(db::model::Dataset, db::model::Region)>, + new_datasets_and_regions: Vec<(db::model::Dataset, db::model::Region)>, +) -> Option<(db::model::Dataset, db::model::Region)> { + // Only filter on whether or not a Region is in the existing list! Datasets + // can change values (like size_used) if this saga interleaves with other + // saga runs of the same type. + let mut dataset_and_region: Vec<(db::model::Dataset, db::model::Region)> = + new_datasets_and_regions + .into_iter() + .filter(|(_, r)| { + !existing_datasets_and_regions.iter().any(|(_, er)| er == r) + }) + .collect(); + + if dataset_and_region.len() != 1 { + error!( + log, + "find_only_new_region saw dataset_and_region len {}: {:?}", + dataset_and_region.len(), + dataset_and_region, + ); + + None + } else { + dataset_and_region.pop() + } +} diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index b944fb4d2b..471118a5cb 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -39,6 +39,7 @@ pub mod project_create; pub mod region_replacement_drive; pub mod region_replacement_finish; pub mod region_replacement_start; +pub mod region_snapshot_replacement_start; pub mod snapshot_create; pub mod snapshot_delete; pub mod test_saga; @@ -190,6 +191,9 @@ fn make_action_registry() -> ActionRegistry { <region_replacement_finish::SagaRegionReplacementFinish as NexusSaga>::register_actions( &mut registry, ); + <region_snapshot_replacement_start::SagaRegionSnapshotReplacementStart as NexusSaga>::register_actions( + &mut registry, + ); #[cfg(test)] <test_saga::SagaTest as NexusSaga>::register_actions(&mut registry); diff --git a/nexus/src/app/sagas/region_replacement_start.rs b/nexus/src/app/sagas/region_replacement_start.rs index d4d455f927..86aab2ac22 100644 --- a/nexus/src/app/sagas/region_replacement_start.rs +++ b/nexus/src/app/sagas/region_replacement_start.rs @@ -26,12 +26,13 @@ //! ``` //! //! The first thing this saga does is set itself as the "operating saga" for the -//! request, and change the state to "Allocating". Then, it performs the following -//! steps: +//! request, and change the state to "Allocating". Then, it performs the +//! following steps: //! //! 1. Allocate a new region //! -//! 2. For the affected Volume, swap the region being replaced with the new region. +//! 2. For the affected Volume, swap the region being replaced with the new +//! region. //! //! 3. Create a fake volume that can be later deleted with the region being //! replaced.
@@ -48,6 +49,7 @@ use super::{ ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, ACTION_GENERATE_ID, }; +use crate::app::sagas::common_storage::find_only_new_region; use crate::app::sagas::declare_saga_actions; use crate::app::RegionAllocationStrategy; use crate::app::{authn, db}; @@ -57,7 +59,6 @@ use serde::Deserialize; use serde::Serialize; use sled_agent_client::types::CrucibleOpts; use sled_agent_client::types::VolumeConstructionRequest; -use slog::Logger; use std::net::SocketAddrV6; use steno::ActionError; use steno::Node; @@ -285,36 +286,6 @@ async fn srrs_alloc_new_region( Ok(datasets_and_regions) } -fn find_only_new_region( - log: &Logger, - existing_datasets_and_regions: Vec<(db::model::Dataset, db::model::Region)>, - new_datasets_and_regions: Vec<(db::model::Dataset, db::model::Region)>, -) -> Option<(db::model::Dataset, db::model::Region)> { - // Only filter on whether or not a Region is in the existing list! Datasets - // can change values (like size_used) if this saga interleaves with other - // saga runs of the same type. - let mut dataset_and_region: Vec<(db::model::Dataset, db::model::Region)> = - new_datasets_and_regions - .into_iter() - .filter(|(_, r)| { - !existing_datasets_and_regions.iter().any(|(_, er)| er == r) - }) - .collect(); - - if dataset_and_region.len() != 1 { - error!( - log, - "find_only_new_region saw dataset_and_region len {}: {:?}", - dataset_and_region.len(), - dataset_and_region, - ); - - None - } else { - dataset_and_region.pop() - } -} - async fn srrs_alloc_new_region_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs new file mode 100644 index 0000000000..941899d862 --- /dev/null +++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs @@ -0,0 +1,1134 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! In the same way that read/write regions need to be replaced when a physical +//! disk is expunged, read-only regions need to be replaced too: Volumes are in +//! a similarly degraded state when the read-only Downstairs have gone away, and +//! remain in this degraded state until a new Region replaces the one that is +//! gone. +//! +//! It's this saga's responsibility to start that replacement process. This saga +//! handles the following region snapshot replacement request state transitions: +//! +//! ```text +//! Requested <-- +//! | +//! | | +//! v | +//! | +//! Allocating -- +//! +//! | +//! v +//! +//! ReplacementDone +//! ``` +//! +//! The first thing this saga does is set itself as the "operating saga" for the +//! request, and change the state to "Allocating". Then, it performs the +//! following steps: +//! +//! 1. Allocate a new region +//! +//! 2. Create a blank volume that can be later deleted to stash the snapshot +//! being replaced. This is populated in the `volume_replace_snapshot` +//! transaction so that `volume_references` for the corresponding region +//! snapshot remains accurate. +//! +//! 3. For the affected Volume, swap the snapshot being replaced with the new +//! region. +//! +//! 4. Update the region snapshot replacement request by clearing the operating +//! saga id and changing the state to "ReplacementDone". +//! +//! Any unwind will place the state back into Requested. +//! +//! 
See the documentation for the "region snapshot replacement garbage collect" +//! saga for the next step in the process. + +use super::{ + ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, + ACTION_GENERATE_ID, +}; +use crate::app::db::datastore::ExistingTarget; +use crate::app::db::datastore::RegionAllocationFor; +use crate::app::db::datastore::RegionAllocationParameters; +use crate::app::db::datastore::ReplacementTarget; +use crate::app::db::datastore::VolumeToDelete; +use crate::app::db::datastore::VolumeWithTarget; +use crate::app::db::lookup::LookupPath; +use crate::app::sagas::common_storage::find_only_new_region; +use crate::app::sagas::declare_saga_actions; +use crate::app::RegionAllocationStrategy; +use crate::app::{authn, db}; +use nexus_types::identity::Asset; +use nexus_types::identity::Resource; +use omicron_common::api::external::Error; +use serde::Deserialize; +use serde::Serialize; +use sled_agent_client::types::CrucibleOpts; +use sled_agent_client::types::VolumeConstructionRequest; +use std::net::SocketAddrV6; +use steno::ActionError; +use steno::Node; +use uuid::Uuid; + +// region snapshot replacement start saga: input parameters + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct Params { + pub serialized_authn: authn::saga::Serialized, + pub request: db::model::RegionSnapshotReplacement, + pub allocation_strategy: RegionAllocationStrategy, +} + +// region snapshot replacement start saga: actions + +declare_saga_actions! { + region_snapshot_replacement_start; + SET_SAGA_ID -> "unused_1" { + + rsrss_set_saga_id + - rsrss_set_saga_id_undo + } + GET_ALLOC_REGION_PARAMS -> "alloc_region_params" { + + rsrss_get_alloc_region_params + } + ALLOC_NEW_REGION -> "new_datasets_and_regions" { + + rsrss_alloc_new_region + - rsrss_alloc_new_region_undo + } + FIND_NEW_REGION -> "new_dataset_and_region" { + + rsrss_find_new_region + } + NEW_REGION_ENSURE -> "ensured_dataset_and_region" { + + rsrss_new_region_ensure + - rsrss_new_region_ensure_undo + } + GET_OLD_SNAPSHOT_VOLUME_ID -> "old_snapshot_volume_id" { + + rsrss_get_old_snapshot_volume_id + } + CREATE_FAKE_VOLUME -> "unused_2" { + + rsrss_create_fake_volume + - rsrss_create_fake_volume_undo + } + REPLACE_SNAPSHOT_IN_VOLUME -> "unused_3" { + + rsrss_replace_snapshot_in_volume + - rsrss_replace_snapshot_in_volume_undo + } + UPDATE_REQUEST_RECORD -> "unused_4" { + + rsrss_update_request_record + } +} + +// region snapshot replacement start saga: definition + +#[derive(Debug)] +pub(crate) struct SagaRegionSnapshotReplacementStart; +impl NexusSaga for SagaRegionSnapshotReplacementStart { + const NAME: &'static str = "region-snapshot-replacement-start"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + region_snapshot_replacement_start_register_actions(registry); + } + + fn make_saga_dag( + _params: &Self::Params, + mut builder: steno::DagBuilder, + ) -> Result<steno::Dag, SagaInitError> { + builder.append(Node::action( + "saga_id", + "GenerateSagaId", + ACTION_GENERATE_ID.as_ref(), + )); + + builder.append(Node::action( + "new_volume_id", + "GenerateNewVolumeId", + ACTION_GENERATE_ID.as_ref(), + )); + + builder.append(set_saga_id_action()); + builder.append(get_alloc_region_params_action()); + builder.append(alloc_new_region_action()); + builder.append(find_new_region_action()); + builder.append(new_region_ensure_action()); + builder.append(get_old_snapshot_volume_id_action()); + builder.append(create_fake_volume_action()); + builder.append(replace_snapshot_in_volume_action()); +
builder.append(update_request_record_action()); + + Ok(builder.build()?) + } +} + +// region snapshot replacement start saga: action implementations + +async fn rsrss_set_saga_id( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + // Change the request record here to an intermediate "allocating" state to + // block out other sagas that will be triggered for the same request. This + // avoids Nexus allocating a bunch of replacement read-only regions only to + // unwind all but one. + osagactx + .datastore() + .set_region_snapshot_replacement_allocating( + &opctx, + params.request.id, + saga_id, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +async fn rsrss_set_saga_id_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + osagactx + .datastore() + .undo_set_region_snapshot_replacement_allocating( + &opctx, + params.request.id, + saga_id, + ) + .await?; + + Ok(()) +} + +#[derive(Debug, Deserialize, Serialize)] +struct AllocRegionParams { + block_size: u64, + blocks_per_extent: u64, + extent_count: u64, + current_allocated_regions: Vec<(db::model::Dataset, db::model::Region)>, + snapshot_id: Uuid, + snapshot_volume_id: Uuid, +} + +async fn rsrss_get_alloc_region_params( + sagactx: NexusActionContext, +) -> Result { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + // Look up the existing snapshot + let (.., db_snapshot) = LookupPath::new(&opctx, &osagactx.datastore()) + .snapshot_id(params.request.old_snapshot_id) + .fetch() + .await + .map_err(ActionError::action_failed)?; + + // Find the region to replace + let db_region = osagactx + .datastore() + .get_region(params.request.old_region_id) + .await + .map_err(ActionError::action_failed)?; + + let current_allocated_regions = osagactx + .datastore() + .get_allocated_regions(db_snapshot.volume_id) + .await + .map_err(ActionError::action_failed)?; + + Ok(AllocRegionParams { + block_size: db_region.block_size().to_bytes(), + blocks_per_extent: db_region.blocks_per_extent(), + extent_count: db_region.extent_count(), + current_allocated_regions, + snapshot_id: db_snapshot.id(), + snapshot_volume_id: db_snapshot.volume_id, + }) +} + +async fn rsrss_alloc_new_region( + sagactx: NexusActionContext, +) -> Result, ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let alloc_region_params = + sagactx.lookup::("alloc_region_params")?; + + // Request an additional region for this snapshot volume. It's important + // _not_ to delete the existing snapshot first, as (if it's still there) + // then the Crucible agent could reuse the allocated port and cause trouble. 
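+    //
+    // Note: the redundancy passed to `arbitrary_region_allocate` below is the
+    // current number of allocated regions plus one. Asking for one more region
+    // than already exists is what yields exactly one new region for this
+    // volume, and re-running the action with the same count should allocate
+    // nothing further.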
+
+    let datasets_and_regions = osagactx
+        .datastore()
+        .arbitrary_region_allocate(
+            &opctx,
+            RegionAllocationFor::SnapshotVolume {
+                volume_id: alloc_region_params.snapshot_volume_id,
+                snapshot_id: alloc_region_params.snapshot_id,
+            },
+            RegionAllocationParameters::FromRaw {
+                block_size: alloc_region_params.block_size,
+                blocks_per_extent: alloc_region_params.blocks_per_extent,
+                extent_count: alloc_region_params.extent_count,
+            },
+            &params.allocation_strategy,
+            alloc_region_params.current_allocated_regions.len() + 1,
+        )
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    Ok(datasets_and_regions)
+}
+
+async fn rsrss_alloc_new_region_undo(
+    sagactx: NexusActionContext,
+) -> Result<(), anyhow::Error> {
+    let osagactx = sagactx.user_data();
+    let log = osagactx.log();
+
+    let alloc_region_params =
+        sagactx.lookup::<AllocRegionParams>("alloc_region_params")?;
+
+    let maybe_dataset_and_region = find_only_new_region(
+        log,
+        alloc_region_params.current_allocated_regions,
+        sagactx.lookup::<Vec<(db::model::Dataset, db::model::Region)>>(
+            "new_datasets_and_regions",
+        )?,
+    );
+
+    // It should be guaranteed that if rsrss_alloc_new_region succeeded then it
+    // would have bumped the region redundancy, so we should see something here.
+    // Guard against the case anyway.
+    if let Some(dataset_and_region) = maybe_dataset_and_region {
+        let (_, region) = dataset_and_region;
+        osagactx
+            .datastore()
+            .regions_hard_delete(log, vec![region.id()])
+            .await?;
+    } else {
+        warn!(&log, "maybe_dataset_and_region is None!");
+    }
+
+    Ok(())
+}
+
+async fn rsrss_find_new_region(
+    sagactx: NexusActionContext,
+) -> Result<(db::model::Dataset, db::model::Region), ActionError> {
+    let osagactx = sagactx.user_data();
+    let log = osagactx.log();
+
+    let alloc_region_params =
+        sagactx.lookup::<AllocRegionParams>("alloc_region_params")?;
+
+    let maybe_dataset_and_region = find_only_new_region(
+        log,
+        alloc_region_params.current_allocated_regions,
+        sagactx.lookup::<Vec<(db::model::Dataset, db::model::Region)>>(
+            "new_datasets_and_regions",
+        )?,
+    );
+
+    let Some(dataset_and_region) = maybe_dataset_and_region else {
+        return Err(ActionError::action_failed(Error::internal_error(
+            &format!(
+                "expected dataset and region, saw {:?}!",
+                maybe_dataset_and_region,
+            ),
+        )));
+    };
+
+    Ok(dataset_and_region)
+}
+
+async fn rsrss_new_region_ensure(
+    sagactx: NexusActionContext,
+) -> Result<
+    (nexus_db_model::Dataset, crucible_agent_client::types::Region),
+    ActionError,
+> {
+    let params = sagactx.saga_params::<Params>()?;
+    let osagactx = sagactx.user_data();
+    let log = osagactx.log();
+
+    // With a list of datasets and regions to ensure, other sagas need to have
+    // a separate no-op forward step for the undo action to ensure that the
+    // undo step occurs in the case that the ensure partially fails. Here this
+    // is not required: there's only one dataset and region.
+    let new_dataset_and_region = sagactx
+        .lookup::<(db::model::Dataset, db::model::Region)>(
+            "new_dataset_and_region",
+        )?;
+
+    let region_snapshot = osagactx
+        .datastore()
+        .region_snapshot_get(
+            params.request.old_dataset_id,
+            params.request.old_region_id,
+            params.request.old_snapshot_id,
+        )
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    let Some(region_snapshot) = region_snapshot else {
+        return Err(ActionError::action_failed(format!(
+            "region snapshot {} {} {} deleted!",
+            params.request.old_dataset_id,
+            params.request.old_region_id,
+            params.request.old_snapshot_id,
+        )));
+    };
+
+    let (new_dataset, new_region) = new_dataset_and_region;
+
+    // Currently, the repair port is set using a fixed offset above the
+    // downstairs port. Once this goes away, Nexus will require a way to query
+    // for the repair port!
+
+    let mut source_repair_addr: SocketAddrV6 =
+        match region_snapshot.snapshot_addr.parse() {
+            Ok(addr) => addr,
+
+            Err(e) => {
+                return Err(ActionError::action_failed(format!(
+                    "error parsing region_snapshot.snapshot_addr: {e}"
+                )));
+            }
+        };
+
+    source_repair_addr.set_port(
+        source_repair_addr.port() + crucible_common::REPAIR_PORT_OFFSET,
+    );
+
+    let ensured_region = osagactx
+        .nexus()
+        .ensure_region_in_dataset(
+            log,
+            &new_dataset,
+            &new_region,
+            Some(source_repair_addr.to_string()),
+        )
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    Ok((new_dataset, ensured_region))
+}
+
+async fn rsrss_new_region_ensure_undo(
+    sagactx: NexusActionContext,
+) -> Result<(), anyhow::Error> {
+    let log = sagactx.user_data().log();
+    let osagactx = sagactx.user_data();
+
+    warn!(log, "rsrss_new_region_ensure_undo: Deleting crucible regions");
+
+    let new_dataset_and_region = sagactx
+        .lookup::<(db::model::Dataset, db::model::Region)>(
+            "new_dataset_and_region",
+        )?;
+
+    osagactx
+        .nexus()
+        .delete_crucible_regions(log, vec![new_dataset_and_region])
+        .await?;
+
+    Ok(())
+}
+
+async fn rsrss_get_old_snapshot_volume_id(
+    sagactx: NexusActionContext,
+) -> Result<Uuid, ActionError> {
+    // Save the snapshot's original volume ID, because we'll be altering it and
+    // need the original
+
+    let osagactx = sagactx.user_data();
+    let params = sagactx.saga_params::<Params>()?;
+
+    let opctx = crate::context::op_context_for_saga_action(
+        &sagactx,
+        &params.serialized_authn,
+    );
+
+    let (.., db_snapshot) = LookupPath::new(&opctx, &osagactx.datastore())
+        .snapshot_id(params.request.old_snapshot_id)
+        .fetch()
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    Ok(db_snapshot.volume_id)
+}
+
+async fn rsrss_create_fake_volume(
+    sagactx: NexusActionContext,
+) -> Result<(), ActionError> {
+    let osagactx = sagactx.user_data();
+
+    let new_volume_id = sagactx.lookup::<Uuid>("new_volume_id")?;
+
+    // Create a fake volume record for the old snapshot target. This will be
+    // deleted after snapshot replacement has finished. It can be completely
+    // blank here, it will be replaced by `volume_replace_snapshot`.
+
+    let volume_construction_request = VolumeConstructionRequest::Volume {
+        id: new_volume_id,
+        block_size: 0,
+        sub_volumes: vec![VolumeConstructionRequest::Region {
+            block_size: 0,
+            blocks_per_extent: 0,
+            extent_count: 0,
+            gen: 0,
+            opts: CrucibleOpts {
+                id: new_volume_id,
+                target: vec![],
+                lossy: false,
+                flush_timeout: None,
+                key: None,
+                cert_pem: None,
+                key_pem: None,
+                root_cert_pem: None,
+                control: None,
+                read_only: true,
+            },
+        }],
+        read_only_parent: None,
+    };
+
+    let volume_data = serde_json::to_string(&volume_construction_request)
+        .map_err(|e| {
+            ActionError::action_failed(Error::internal_error(&e.to_string()))
+        })?;
+
+    let volume = db::model::Volume::new(new_volume_id, volume_data);
+
+    osagactx
+        .datastore()
+        .volume_create(volume)
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    Ok(())
+}
+
+async fn rsrss_create_fake_volume_undo(
+    sagactx: NexusActionContext,
+) -> Result<(), anyhow::Error> {
+    let osagactx = sagactx.user_data();
+
+    // Delete the fake volume.
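+    //
+    // A hard delete of the volume record is safe during unwind: this volume
+    // only ever acts as a stash, and the new region that may have been swapped
+    // into it is torn down separately by rsrss_alloc_new_region_undo.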
+
+    let new_volume_id = sagactx.lookup::<Uuid>("new_volume_id")?;
+    osagactx.datastore().volume_hard_delete(new_volume_id).await?;
+
+    Ok(())
+}
+
+#[derive(Debug)]
+struct ReplaceParams {
+    old_volume_id: Uuid,
+    old_snapshot_address: SocketAddrV6,
+    new_region_address: SocketAddrV6,
+    new_volume_id: Uuid,
+}
+
+async fn get_replace_params(
+    sagactx: &NexusActionContext,
+) -> Result<ReplaceParams, ActionError> {
+    let osagactx = sagactx.user_data();
+    let params = sagactx.saga_params::<Params>()?;
+
+    let new_volume_id = sagactx.lookup::<Uuid>("new_volume_id")?;
+
+    let region_snapshot = osagactx
+        .datastore()
+        .region_snapshot_get(
+            params.request.old_dataset_id,
+            params.request.old_region_id,
+            params.request.old_snapshot_id,
+        )
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    let Some(region_snapshot) = region_snapshot else {
+        return Err(ActionError::action_failed(format!(
+            "region snapshot {} {} {} deleted!",
+            params.request.old_dataset_id,
+            params.request.old_region_id,
+            params.request.old_snapshot_id,
+        )));
+    };
+
+    let old_snapshot_address: SocketAddrV6 =
+        match region_snapshot.snapshot_addr.parse() {
+            Ok(addr) => addr,
+
+            Err(e) => {
+                return Err(ActionError::action_failed(format!(
+                    "parsing {} as SocketAddrV6 failed: {e}",
+                    region_snapshot.snapshot_addr,
+                )));
+            }
+        };
+
+    let (new_dataset, ensured_region) = sagactx.lookup::<(
+        db::model::Dataset,
+        crucible_agent_client::types::Region,
+    )>(
+        "ensured_dataset_and_region",
+    )?;
+
+    let Some(new_dataset_address) = new_dataset.address() else {
+        return Err(ActionError::action_failed(format!(
+            "dataset {} does not have an address!",
+            new_dataset.id(),
+        )));
+    };
+
+    let new_region_address = SocketAddrV6::new(
+        *new_dataset_address.ip(),
+        ensured_region.port_number,
+        0,
+        0,
+    );
+
+    let old_volume_id = sagactx.lookup::<Uuid>("old_snapshot_volume_id")?;
+
+    // Return the replacement parameters for the forward action case - the undo
+    // will swap the existing and replacement target
+    Ok(ReplaceParams {
+        old_volume_id,
+        old_snapshot_address,
+        new_region_address,
+        new_volume_id,
+    })
+}
+
+async fn rsrss_replace_snapshot_in_volume(
+    sagactx: NexusActionContext,
+) -> Result<(), ActionError> {
+    let log = sagactx.user_data().log();
+    let osagactx = sagactx.user_data();
+
+    let replacement_params = get_replace_params(&sagactx).await?;
+
+    info!(
+        log,
+        "replacing {} with {} in volume {}",
+        replacement_params.old_snapshot_address,
+        replacement_params.new_region_address,
+        replacement_params.old_volume_id,
+    );
+
+    // `volume_replace_snapshot` will swap the old snapshot for the new region.
+    // No repair or reconciliation needs to occur after this.
+    osagactx
+        .datastore()
+        .volume_replace_snapshot(
+            VolumeWithTarget(replacement_params.old_volume_id),
+            ExistingTarget(replacement_params.old_snapshot_address),
+            ReplacementTarget(replacement_params.new_region_address),
+            VolumeToDelete(replacement_params.new_volume_id),
+        )
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    Ok(())
+}
+
+async fn rsrss_replace_snapshot_in_volume_undo(
+    sagactx: NexusActionContext,
+) -> Result<(), anyhow::Error> {
+    // Undo the forward action's volume_replace_snapshot call by swapping the
+    // existing target and replacement target parameters.
+
+    let log = sagactx.user_data().log();
+    let osagactx = sagactx.user_data();
+
+    let replacement_params = get_replace_params(&sagactx).await?;
+
+    // Note the old and new are _not_ swapped in this log message! 
The intention + // is that someone reviewing the logs could search for "replacing UUID with + // UUID in volume UUID" and get (in the case of no re-execution) two + // results. + info!( + log, + "undo: replacing {} with {} in volume {}", + replacement_params.old_snapshot_address, + replacement_params.new_region_address, + replacement_params.old_volume_id, + ); + + osagactx + .datastore() + .volume_replace_snapshot( + VolumeWithTarget(replacement_params.old_volume_id), + ExistingTarget(replacement_params.new_region_address), + ReplacementTarget(replacement_params.old_snapshot_address), + VolumeToDelete(replacement_params.new_volume_id), + ) + .await?; + + Ok(()) +} + +async fn rsrss_update_request_record( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let params = sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + let new_dataset_and_region = sagactx + .lookup::<(db::model::Dataset, db::model::Region)>( + "new_dataset_and_region", + )?; + + let new_region_id = new_dataset_and_region.1.id(); + + let old_region_volume_id = sagactx.lookup::("new_volume_id")?; + + // Now that the region has been ensured and the construction request has + // been updated, update the replacement request record to 'ReplacementDone' + // and clear the operating saga id. There is no undo step for this, it + // should succeed idempotently. + datastore + .set_region_snapshot_replacement_replacement_done( + &opctx, + params.request.id, + saga_id, + new_region_id, + old_region_volume_id, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +#[cfg(test)] +pub(crate) mod test { + use crate::{ + app::db::lookup::LookupPath, app::db::DataStore, + app::saga::create_saga_dag, + app::sagas::region_snapshot_replacement_start::*, + app::sagas::test_helpers::test_opctx, app::RegionAllocationStrategy, + }; + use nexus_db_model::RegionSnapshotReplacement; + use nexus_db_model::RegionSnapshotReplacementState; + use nexus_db_model::Volume; + use nexus_db_queries::authn::saga::Serialized; + use nexus_db_queries::context::OpContext; + use nexus_test_utils::resource_helpers::create_disk; + use nexus_test_utils::resource_helpers::create_project; + use nexus_test_utils::resource_helpers::create_snapshot; + use nexus_test_utils::resource_helpers::DiskTest; + use nexus_test_utils_macros::nexus_test; + use nexus_types::external_api::views; + use nexus_types::identity::Asset; + use sled_agent_client::types::VolumeConstructionRequest; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + const DISK_NAME: &str = "my-disk"; + const SNAPSHOT_NAME: &str = "my-snap"; + const PROJECT_NAME: &str = "springfield-squidport"; + + async fn prepare_for_test( + cptestctx: &ControlPlaneTestContext, + ) -> PrepareResult { + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = test_opctx(cptestctx); + + assert_eq!(region_allocations(&datastore).await, 0); + + let mut disk_test = DiskTest::new(cptestctx).await; + disk_test.add_zpool_with_dataset(cptestctx.first_sled()).await; + + assert_eq!(region_allocations(&datastore).await, 0); + + let _project_id = + create_project(&client, PROJECT_NAME).await.identity.id; + + assert_eq!(region_allocations(&datastore).await, 0); + + // Create a disk + let disk = 
create_disk(&client, PROJECT_NAME, DISK_NAME).await; + + assert_eq!(region_allocations(&datastore).await, 3); + + let disk_id = disk.identity.id; + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_id) + .fetch() + .await + .unwrap_or_else(|_| panic!("test disk {:?} should exist", disk_id)); + + // Create a snapshot + let snapshot = + create_snapshot(&client, PROJECT_NAME, DISK_NAME, SNAPSHOT_NAME) + .await; + + assert_eq!(region_allocations(&datastore).await, 6); + + let snapshot_id = snapshot.identity.id; + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot_id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("test snapshot {:?} should exist", snapshot_id) + }); + + PrepareResult { db_disk, snapshot, db_snapshot } + } + + struct PrepareResult { + db_disk: nexus_db_model::Disk, + snapshot: views::Snapshot, + db_snapshot: nexus_db_model::Snapshot, + } + + #[nexus_test(server = crate::Server)] + async fn test_region_snapshot_replacement_start_saga( + cptestctx: &ControlPlaneTestContext, + ) { + let PrepareResult { db_disk, snapshot, db_snapshot } = + prepare_for_test(cptestctx).await; + + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = test_opctx(cptestctx); + + // Assert disk has three allocated regions + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + assert_eq!(disk_allocated_regions.len(), 3); + + // Assert the snapshot has zero allocated regions + let snapshot_id = snapshot.identity.id; + + let snapshot_allocated_regions = datastore + .get_allocated_regions(db_snapshot.volume_id) + .await + .unwrap(); + assert_eq!(snapshot_allocated_regions.len(), 0); + + // Replace one of the snapshot's targets + let region: &nexus_db_model::Region = &disk_allocated_regions[0].1; + + let region_snapshot = datastore + .region_snapshot_get(region.dataset_id(), region.id(), snapshot_id) + .await + .unwrap() + .unwrap(); + + // Manually insert the region snapshot replacement request + let request = + RegionSnapshotReplacement::for_region_snapshot(®ion_snapshot); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request.clone()) + .await + .unwrap(); + + // Run the region snapshot replacement start saga + let dag = + create_saga_dag::(Params { + serialized_authn: Serialized::for_opctx(&opctx), + request: request.clone(), + allocation_strategy: RegionAllocationStrategy::Random { + seed: None, + }, + }) + .unwrap(); + + let runnable_saga = nexus.sagas.saga_prepare(dag).await.unwrap(); + + // Actually run the saga + runnable_saga.run_to_completion().await.unwrap(); + + // Validate the state transition + let result = datastore + .get_region_snapshot_replacement_request_by_id(&opctx, request.id) + .await + .unwrap(); + + assert_eq!( + result.replacement_state, + RegionSnapshotReplacementState::ReplacementDone + ); + assert!(result.new_region_id.is_some()); + assert!(result.operating_saga_id.is_none()); + + // Validate number of regions for disk didn't change + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + assert_eq!(disk_allocated_regions.len(), 3); + + // Validate that the snapshot now has one allocated region + let snapshot_allocated_datasets_and_regions = datastore + .get_allocated_regions(db_snapshot.volume_id) + .await + .unwrap(); + + assert_eq!(snapshot_allocated_datasets_and_regions.len(), 1); + + let (_, snapshot_allocated_region) = + 
&snapshot_allocated_datasets_and_regions[0]; + + // Validate that the snapshot's volume contains this newly allocated + // region + + let new_region_addr = datastore + .region_addr(snapshot_allocated_region.id()) + .await + .unwrap() + .unwrap(); + + let volumes = datastore + .find_volumes_referencing_socket_addr( + &opctx, + new_region_addr.into(), + ) + .await + .unwrap(); + + assert_eq!(volumes.len(), 1); + assert_eq!(volumes[0].id(), db_snapshot.volume_id); + } + + fn new_test_params( + opctx: &OpContext, + request: &RegionSnapshotReplacement, + ) -> Params { + Params { + serialized_authn: Serialized::for_opctx(opctx), + request: request.clone(), + allocation_strategy: RegionAllocationStrategy::Random { + seed: None, + }, + } + } + + pub(crate) async fn verify_clean_slate( + cptestctx: &ControlPlaneTestContext, + request: &RegionSnapshotReplacement, + affected_volume_original: &Volume, + ) { + let datastore = cptestctx.server.server_context().nexus.datastore(); + + crate::app::sagas::test_helpers::assert_no_failed_undo_steps( + &cptestctx.logctx.log, + datastore, + ) + .await; + + // For these tests, six provisioned regions exist: three for the + // original disk, and three for the (currently unused) snapshot + // destination volume + assert_eq!(region_allocations(&datastore).await, 6); + assert_region_snapshot_replacement_request_untouched( + cptestctx, &datastore, &request, + ) + .await; + assert_volume_untouched(&datastore, &affected_volume_original).await; + } + + async fn region_allocations(datastore: &DataStore) -> usize { + use async_bb8_diesel::AsyncConnection; + use async_bb8_diesel::AsyncRunQueryDsl; + use async_bb8_diesel::AsyncSimpleConnection; + use diesel::QueryDsl; + use nexus_db_queries::db::queries::ALLOW_FULL_TABLE_SCAN_SQL; + use nexus_db_queries::db::schema::region::dsl; + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + conn.transaction_async(|conn| async move { + // Selecting all regions requires a full table scan + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await.unwrap(); + + dsl::region + .count() + .get_result_async(&conn) + .await + .map(|x: i64| x as usize) + }) + .await + .unwrap() + } + + async fn assert_region_snapshot_replacement_request_untouched( + cptestctx: &ControlPlaneTestContext, + datastore: &DataStore, + request: &RegionSnapshotReplacement, + ) { + let opctx = test_opctx(cptestctx); + let db_request = datastore + .get_region_snapshot_replacement_request_by_id(&opctx, request.id) + .await + .unwrap(); + + assert_eq!(db_request.new_region_id, None); + assert_eq!( + db_request.replacement_state, + RegionSnapshotReplacementState::Requested + ); + assert_eq!(db_request.operating_saga_id, None); + } + + async fn assert_volume_untouched( + datastore: &DataStore, + affected_volume_original: &Volume, + ) { + let affected_volume = datastore + .volume_get(affected_volume_original.id()) + .await + .unwrap() + .unwrap(); + + let actual: VolumeConstructionRequest = + serde_json::from_str(&affected_volume.data()).unwrap(); + + let expected: VolumeConstructionRequest = + serde_json::from_str(&affected_volume_original.data()).unwrap(); + + assert_eq!(actual, expected); + } + + #[nexus_test(server = crate::Server)] + async fn test_action_failure_can_unwind_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let PrepareResult { db_disk, snapshot, db_snapshot } = + prepare_for_test(cptestctx).await; + + let log = &cptestctx.logctx.log; + let nexus = &cptestctx.server.server_context().nexus; + let datastore = 
nexus.datastore(); + let opctx = test_opctx(cptestctx); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + assert_eq!(disk_allocated_regions.len(), 3); + + let region: &nexus_db_model::Region = &disk_allocated_regions[0].1; + let snapshot_id = snapshot.identity.id; + + let region_snapshot = datastore + .region_snapshot_get(region.dataset_id(), region.id(), snapshot_id) + .await + .unwrap() + .unwrap(); + + let request = + RegionSnapshotReplacement::for_region_snapshot(®ion_snapshot); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request.clone()) + .await + .unwrap(); + + let affected_volume_original = + datastore.volume_get(db_snapshot.volume_id).await.unwrap().unwrap(); + + verify_clean_slate(&cptestctx, &request, &affected_volume_original) + .await; + + crate::app::sagas::test_helpers::action_failure_can_unwind_idempotently::< + SagaRegionSnapshotReplacementStart, + _, + _ + >( + nexus, + || Box::pin(async { new_test_params(&opctx, &request) }), + || Box::pin(async { + verify_clean_slate( + &cptestctx, + &request, + &affected_volume_original, + ).await; + }), + log + ).await; + } + + #[nexus_test(server = crate::Server)] + async fn test_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let PrepareResult { db_disk, snapshot, db_snapshot: _ } = + prepare_for_test(cptestctx).await; + + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = test_opctx(cptestctx); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + assert_eq!(disk_allocated_regions.len(), 3); + + let region: &nexus_db_model::Region = &disk_allocated_regions[0].1; + let snapshot_id = snapshot.identity.id; + + let region_snapshot = datastore + .region_snapshot_get(region.dataset_id(), region.id(), snapshot_id) + .await + .unwrap() + .unwrap(); + + let request = + RegionSnapshotReplacement::for_region_snapshot(®ion_snapshot); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request.clone()) + .await + .unwrap(); + + // Build the saga DAG with the provided test parameters + let params = new_test_params(&opctx, &request); + let dag = create_saga_dag::(params) + .unwrap(); + crate::app::sagas::test_helpers::actions_succeed_idempotently( + nexus, dag, + ) + .await; + } +} diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index ac7188f232..14180459ab 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -432,6 +432,28 @@ pub async fn create_disk( .await } +pub async fn create_snapshot( + client: &ClientTestContext, + project_name: &str, + disk_name: &str, + snapshot_name: &str, +) -> views::Snapshot { + let snapshots_url = format!("/v1/snapshots?project={}", project_name); + + object_create( + client, + &snapshots_url, + ¶ms::SnapshotCreate { + identity: IdentityMetadataCreateParams { + name: snapshot_name.parse().unwrap(), + description: format!("snapshot {:?}", snapshot_name), + }, + disk: disk_name.to_string().try_into().unwrap(), + }, + ) + .await +} + pub async fn delete_disk( client: &ClientTestContext, project_name: &str, diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 8f65a73204..35b55184b9 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -137,6 +137,7 @@ lookup_region_port.period_secs = 60 # Therefore, disable the background task during tests. 
 instance_updater.disable = true
 instance_updater.period_secs = 60
+region_snapshot_replacement_start.period_secs = 30
 
 [default_region_allocation_strategy]
 # we only have one sled in the test environment, so we need to use the
diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs
index 6463aa8ab6..2f8a411cf7 100644
--- a/nexus/types/src/internal_api/background.rs
+++ b/nexus/types/src/internal_api/background.rs
@@ -19,3 +19,12 @@ pub struct LookupRegionPortStatus {
     pub found_port_ok: Vec<String>,
     pub errors: Vec<String>,
 }
+
+/// The status of a `region_snapshot_replacement_start` background task
+/// activation
+#[derive(Serialize, Deserialize, Default, Debug, PartialEq, Eq)]
+pub struct RegionSnapshotReplacementStartStatus {
+    pub requests_created_ok: Vec<String>,
+    pub start_invoked_ok: Vec<String>,
+    pub errors: Vec<String>,
+}
diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml
index c502c20b1b..437615938f 100644
--- a/smf/nexus/multi-sled/config-partial.toml
+++ b/smf/nexus/multi-sled/config-partial.toml
@@ -65,6 +65,7 @@ abandoned_vmm_reaper.period_secs = 60
 saga_recovery.period_secs = 600
 lookup_region_port.period_secs = 60
 instance_updater.period_secs = 30
+region_snapshot_replacement_start.period_secs = 30
 
 [default_region_allocation_strategy]
 # by default, allocate across 3 distinct sleds
diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml
index 30a0243122..95dcca14ae 100644
--- a/smf/nexus/single-sled/config-partial.toml
+++ b/smf/nexus/single-sled/config-partial.toml
@@ -65,6 +65,7 @@ abandoned_vmm_reaper.period_secs = 60
 saga_recovery.period_secs = 600
 lookup_region_port.period_secs = 60
 instance_updater.period_secs = 30
+region_snapshot_replacement_start.period_secs = 30
 
 [default_region_allocation_strategy]
 # by default, allocate without requirement for distinct sleds.
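Note: the `RegionSnapshotReplacementStartStatus` type added above is what the
new background task reports as its activation detail. A minimal sketch of how
a task might populate it (the message text here is invented; the field names,
`Vec<String>` types, and `Default` derive come from the struct in this patch):

    use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus;

    // Accumulate results as the task processes replacement requests...
    let mut status = RegionSnapshotReplacementStartStatus::default();
    status
        .requests_created_ok
        .push(String::from("created replacement request for region snapshot ..."));

    // ...then serialize the whole struct as the activation's details.
    let details = serde_json::to_value(&status).unwrap();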
diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 854a020167..eff58519a3 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -133,6 +133,7 @@ bstr-6f8ce4dd05d13bba = { package = "bstr", version = "0.2.17" } bstr-dff4ba8e3ae991db = { package = "bstr", version = "1.9.1" } byteorder = { version = "1.5.0" } bytes = { version = "1.7.1", features = ["serde"] } +cc = { version = "1.0.97", default-features = false, features = ["parallel"] } chrono = { version = "0.4.38", features = ["serde"] } cipher = { version = "0.4.4", default-features = false, features = ["block-padding", "zeroize"] } clap = { version = "4.5.15", features = ["cargo", "derive", "env", "wrap_help"] } @@ -229,6 +230,7 @@ zeroize = { version = "1.7.0", features = ["std", "zeroize_derive"] } dof = { version = "0.3.0", default-features = false, features = ["des"] } linux-raw-sys = { version = "0.4.13", default-features = false, features = ["elf", "errno", "general", "ioctl", "no_std", "std", "system"] } mio = { version = "0.8.11", features = ["net", "os-ext"] } +nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } @@ -237,30 +239,35 @@ signal-hook-mio = { version = "0.2.4", default-features = false, features = ["su dof = { version = "0.3.0", default-features = false, features = ["des"] } linux-raw-sys = { version = "0.4.13", default-features = false, features = ["elf", "errno", "general", "ioctl", "no_std", "std", "system"] } mio = { version = "0.8.11", features = ["net", "os-ext"] } +nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.x86_64-apple-darwin.dependencies] mio = { version = "0.8.11", features = ["net", "os-ext"] } +nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.x86_64-apple-darwin.build-dependencies] mio = { version = "0.8.11", features = ["net", "os-ext"] } +nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.aarch64-apple-darwin.dependencies] mio = { version = "0.8.11", features = ["net", "os-ext"] } +nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.aarch64-apple-darwin.build-dependencies] mio = { version = "0.8.11", features = ["net", "os-ext"] } +nix = { version = "0.28.0", 
features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } @@ -268,6 +275,7 @@ signal-hook-mio = { version = "0.2.4", default-features = false, features = ["su [target.x86_64-unknown-illumos.dependencies] dof = { version = "0.3.0", default-features = false, features = ["des"] } mio = { version = "0.8.11", features = ["net", "os-ext"] } +nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } @@ -277,6 +285,7 @@ toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", featu [target.x86_64-unknown-illumos.build-dependencies] dof = { version = "0.3.0", default-features = false, features = ["des"] } mio = { version = "0.8.11", features = ["net", "os-ext"] } +nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } From eeb723c6a0d727f016ca947541ac85c029801c06 Mon Sep 17 00:00:00 2001 From: "oxide-reflector-bot[bot]" <130185838+oxide-reflector-bot[bot]@users.noreply.github.com> Date: Wed, 14 Aug 2024 18:24:54 +0000 Subject: [PATCH 34/91] Update dendrite to 21b1656 (#6329) Updated dendrite to commit 21b1656. --------- Co-authored-by: reflector[bot] <130185838+reflector[bot]@users.noreply.github.com> --- package-manifest.toml | 12 ++++++------ tools/dendrite_openapi_version | 2 +- tools/dendrite_stub_checksums | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/package-manifest.toml b/package-manifest.toml index 2c68257050..e6cd464404 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -660,8 +660,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. source.type = "prebuilt" source.repo = "dendrite" -source.commit = "8293f28df659c070b48e13f87a51b836238b406e" -source.sha256 = "7400e4b0942b33af64a9aad1a429b0e2446e126f58a780328cf10eb46c63b7f8" +source.commit = "21b16567f28e103f145cd18d53fac6958429c4ff" +source.sha256 = "3771671f0069b33143774e560eb258db99253dba9b78fa3ca974f02a8e1145b4" output.type = "zone" output.intermediate_only = true @@ -687,8 +687,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. source.type = "prebuilt" source.repo = "dendrite" -source.commit = "8293f28df659c070b48e13f87a51b836238b406e" -source.sha256 = "68bf16452a3159529fb1bd11f43adfb002020d086e0f64f48bd766bf47843ae9" +source.commit = "21b16567f28e103f145cd18d53fac6958429c4ff" +source.sha256 = "ad02632713a57fe8c5371316320309e1fad52f0ce2f7e6f768859aa94dfbb1d9" output.type = "zone" output.intermediate_only = true @@ -707,8 +707,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. 
source.type = "prebuilt" source.repo = "dendrite" -source.commit = "8293f28df659c070b48e13f87a51b836238b406e" -source.sha256 = "b7d6a1a20f302ded9c6e4bbba66b9432bec5edda593edfcdbb9429a95201655a" +source.commit = "21b16567f28e103f145cd18d53fac6958429c4ff" +source.sha256 = "23bca3873cdb0441cd18c0cf071b86d49755be06837479661876ac95d2f10f27" output.type = "zone" output.intermediate_only = true diff --git a/tools/dendrite_openapi_version b/tools/dendrite_openapi_version index 652ebc31eb..2d0f4d4887 100755 --- a/tools/dendrite_openapi_version +++ b/tools/dendrite_openapi_version @@ -1,2 +1,2 @@ -COMMIT="8293f28df659c070b48e13f87a51b836238b406e" +COMMIT="21b16567f28e103f145cd18d53fac6958429c4ff" SHA2="3a54305ab4b1270c9a5fb0603f481fce199f3767c174a03559ff642f7f44687e" diff --git a/tools/dendrite_stub_checksums b/tools/dendrite_stub_checksums index cd8eb65a3e..e3d16d779c 100644 --- a/tools/dendrite_stub_checksums +++ b/tools/dendrite_stub_checksums @@ -1,3 +1,3 @@ -CIDL_SHA256_ILLUMOS="7400e4b0942b33af64a9aad1a429b0e2446e126f58a780328cf10eb46c63b7f8" -CIDL_SHA256_LINUX_DPD="290edfc4076d31d6f70aa7cc16ce758e10d14777d8542b688fa2880fdfde398c" +CIDL_SHA256_ILLUMOS="3771671f0069b33143774e560eb258db99253dba9b78fa3ca974f02a8e1145b4" +CIDL_SHA256_LINUX_DPD="6aa070ab0590aca7458f2555012acc5571e61b3b1523de862d4bbb04b9d34135" CIDL_SHA256_LINUX_SWADM="e1e35784538a4fdd76dc257cc636ac3f43f7ef2842dabfe981f17f8ce6b8e1a2" From 3d6f21348ff0e328b6e9f1512fc04f30faf30b18 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Wed, 14 Aug 2024 15:38:15 -0700 Subject: [PATCH 35/91] move instance migration to internal API (#6311) Move instance migration from the external API to the internal API per the determinations in RFD 494. This is almost a purely mechanical change (I have not, for example, changed any API signatures or return values); the only unusual bit is one small adjustment to `Nexus::instance_ensure_registered` to deal with the fact that migration no longer runs in a context where the caller is acting on behalf of a user in a silo. Tests: cargo test; spun up a dev cluster and verified that (1) the migrate endpoint is gone from the external API, (2) it's present on the internal API and reachable from the switch zone, and (3) it behaves as it did before (at least to the extent I could test migration's behavior on a single-machine dev cluster). 
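For reference, requesting a migration now looks roughly like this from the
integration tests touched below (the URL and request body come straight from
this diff; the authn and response-parsing boilerplate follows the tests'
pre-existing pattern and is unchanged here):

    let migrate_url = format!("/instances/{}/migrate", instance_id);
    let instance = NexusRequest::new(
        RequestBuilder::new(internal_client, Method::POST, &migrate_url)
            .body(Some(&InstanceMigrateRequest {
                dst_sled_id: dst_sled_id.into_untyped_uuid(),
            }))
            .expect_status(Some(StatusCode::OK)),
    )
    .authn_as(AuthnMode::PrivilegedUser)
    .execute()
    .await
    .unwrap()
    .parsed_body::<Instance>()
    .unwrap();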
--- nexus/internal-api/src/lib.rs | 16 +- nexus/src/app/instance.rs | 44 ++--- nexus/src/app/sagas/instance_migrate.rs | 30 +-- nexus/src/app/sagas/instance_start.rs | 23 ++- nexus/src/app/sagas/instance_update/mod.rs | 3 +- nexus/src/external_api/http_entrypoints.rs | 43 ---- nexus/src/internal_api/http_entrypoints.rs | 29 +++ nexus/tests/integration_tests/endpoints.rs | 18 -- nexus/tests/integration_tests/instances.rs | 22 ++- nexus/tests/output/nexus_tags.txt | 1 - nexus/types/src/external_api/params.rs | 6 - nexus/types/src/internal_api/params.rs | 7 + openapi/nexus-internal.json | 217 +++++++++++++++++++++ openapi/nexus.json | 69 ------- 14 files changed, 336 insertions(+), 192 deletions(-) diff --git a/nexus/internal-api/src/lib.rs b/nexus/internal-api/src/lib.rs index 6a98c44614..7ac3e42f57 100644 --- a/nexus/internal-api/src/lib.rs +++ b/nexus/internal-api/src/lib.rs @@ -20,15 +20,15 @@ use nexus_types::{ }, internal_api::{ params::{ - OximeterInfo, RackInitializationRequest, SledAgentInfo, - SwitchPutRequest, SwitchPutResponse, + InstanceMigrateRequest, OximeterInfo, RackInitializationRequest, + SledAgentInfo, SwitchPutRequest, SwitchPutResponse, }, views::{BackgroundTask, DemoSaga, Ipv4NatEntryView, Saga}, }, }; use omicron_common::{ api::{ - external::http_pagination::PaginatedById, + external::{http_pagination::PaginatedById, Instance}, internal::nexus::{ DiskRuntimeState, DownstairsClientStopRequest, DownstairsClientStopped, ProducerEndpoint, @@ -119,6 +119,16 @@ pub trait NexusInternalApi { new_runtime_state: TypedBody, ) -> Result; + #[endpoint { + method = POST, + path = "/instances/{instance_id}/migrate", + }] + async fn instance_migrate( + rqctx: RequestContext, + path_params: Path, + migrate_params: TypedBody, + ) -> Result, HttpError>; + /// Report updated state for a disk. #[endpoint { method = PUT, diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 344d2688f7..3106ab9f2a 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -191,6 +191,14 @@ enum InstanceStartDisposition { AlreadyStarted, } +/// The set of API resources needed when ensuring that an instance is registered +/// on a sled. 
+pub(crate) struct InstanceEnsureRegisteredApiResources { + pub(crate) authz_silo: nexus_auth::authz::Silo, + pub(crate) authz_project: nexus_auth::authz::Project, + pub(crate) authz_instance: nexus_auth::authz::Instance, +} + impl super::Nexus { pub fn instance_lookup<'a>( &'a self, @@ -473,14 +481,16 @@ impl super::Nexus { Ok(()) } - pub(crate) async fn project_instance_migrate( + pub(crate) async fn instance_migrate( self: &Arc, opctx: &OpContext, - instance_lookup: &lookup::Instance<'_>, - params: params::InstanceMigrate, + id: InstanceUuid, + params: nexus_types::internal_api::params::InstanceMigrateRequest, ) -> UpdateResult { - let (.., authz_instance) = - instance_lookup.lookup_for(authz::Action::Modify).await?; + let (.., authz_instance) = LookupPath::new(&opctx, &self.db_datastore) + .instance_id(id.into_untyped_uuid()) + .lookup_for(authz::Action::Modify) + .await?; let state = self .db_datastore @@ -867,7 +877,11 @@ impl super::Nexus { pub(crate) async fn instance_ensure_registered( &self, opctx: &OpContext, - authz_instance: &authz::Instance, + InstanceEnsureRegisteredApiResources { + authz_silo, + authz_project, + authz_instance, + }: &InstanceEnsureRegisteredApiResources, db_instance: &db::model::Instance, propolis_id: &PropolisUuid, initial_vmm: &db::model::Vmm, @@ -1067,23 +1081,9 @@ impl super::Nexus { let ssh_keys: Vec = ssh_keys.map(|ssh_key| ssh_key.public_key).collect(); - // Construct instance metadata used to track its statistics. - // - // This requires another fetch on the silo and project, to extract their - // IDs. - let (.., db_project) = self - .project_lookup( - opctx, - params::ProjectSelector { - project: NameOrId::Id(db_instance.project_id), - }, - )? - .fetch() - .await?; - let (_, db_silo) = self.current_silo_lookup(opctx)?.fetch().await?; let metadata = sled_agent_client::types::InstanceMetadata { - silo_id: db_silo.id(), - project_id: db_project.id(), + silo_id: authz_silo.id(), + project_id: authz_project.id(), }; // Ask the sled agent to begin the state change. 
Then update the diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index bb4bf282e4..19bef2f046 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -4,15 +4,15 @@ use super::{NexusActionContext, NexusSaga, ACTION_GENERATE_ID}; use crate::app::instance::{ - InstanceRegisterReason, InstanceStateChangeError, - InstanceStateChangeRequest, + InstanceEnsureRegisteredApiResources, InstanceRegisterReason, + InstanceStateChangeError, InstanceStateChangeRequest, }; use crate::app::sagas::{ declare_saga_actions, instance_common::allocate_vmm_ipv6, }; -use crate::external_api::params; use nexus_db_queries::db::{identity::Resource, lookup::LookupPath}; use nexus_db_queries::{authn, authz, db}; +use nexus_types::internal_api::params::InstanceMigrateRequest; use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid, SledUuid}; use serde::Deserialize; use serde::Serialize; @@ -30,7 +30,7 @@ pub struct Params { pub serialized_authn: authn::saga::Serialized, pub instance: db::model::Instance, pub src_vmm: db::model::Vmm, - pub migrate_params: params::InstanceMigrate, + pub migrate_params: InstanceMigrateRequest, } // The migration saga is similar to the instance start saga: get a destination @@ -401,11 +401,12 @@ async fn sim_ensure_destination_propolis( "dst_propolis_id" => %vmm.id, "dst_vmm_state" => ?vmm); - let (.., authz_instance) = LookupPath::new(&opctx, &osagactx.datastore()) - .instance_id(db_instance.id()) - .lookup_for(authz::Action::Modify) - .await - .map_err(ActionError::action_failed)?; + let (authz_silo, authz_project, authz_instance) = + LookupPath::new(&opctx, &osagactx.datastore()) + .instance_id(db_instance.id()) + .lookup_for(authz::Action::Modify) + .await + .map_err(ActionError::action_failed)?; let src_propolis_id = PropolisUuid::from_untyped_uuid(params.src_vmm.id); let dst_propolis_id = PropolisUuid::from_untyped_uuid(vmm.id); @@ -413,7 +414,11 @@ async fn sim_ensure_destination_propolis( .nexus() .instance_ensure_registered( &opctx, - &authz_instance, + &InstanceEnsureRegisteredApiResources { + authz_silo, + authz_project, + authz_instance, + }, &db_instance, &dst_propolis_id, &vmm, @@ -565,6 +570,7 @@ async fn sim_instance_migrate( mod tests { use super::*; use crate::app::sagas::test_helpers; + use crate::external_api::params; use dropshot::test_util::ClientTestContext; use nexus_test_utils::resource_helpers::{ create_default_ip_pool, create_project, object_create, @@ -637,7 +643,7 @@ mod tests { serialized_authn: authn::saga::Serialized::for_opctx(&opctx), instance: state.instance().clone(), src_vmm: vmm.clone(), - migrate_params: params::InstanceMigrate { + migrate_params: InstanceMigrateRequest { dst_sled_id: dst_sled_id.into_untyped_uuid(), }, }; @@ -706,7 +712,7 @@ mod tests { ), instance: old_instance.clone(), src_vmm: old_vmm.clone(), - migrate_params: params::InstanceMigrate { + migrate_params: InstanceMigrateRequest { dst_sled_id: dst_sled_id.into_untyped_uuid(), }, } diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index 9e4e010eea..55fc312ae7 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -10,8 +10,10 @@ use super::{ instance_common::allocate_vmm_ipv6, NexusActionContext, NexusSaga, SagaInitError, }; -use crate::app::instance::InstanceRegisterReason; -use crate::app::instance::InstanceStateChangeError; +use crate::app::instance::{ + InstanceEnsureRegisteredApiResources, 
InstanceRegisterReason, + InstanceStateChangeError, +}; use crate::app::sagas::declare_saga_actions; use chrono::Utc; use nexus_db_queries::db::{identity::Resource, lookup::LookupPath}; @@ -502,17 +504,22 @@ async fn sis_ensure_registered( "instance_id" => %instance_id, "sled_id" => %sled_id); - let (.., authz_instance) = LookupPath::new(&opctx, &osagactx.datastore()) - .instance_id(instance_id) - .lookup_for(authz::Action::Modify) - .await - .map_err(ActionError::action_failed)?; + let (authz_silo, authz_project, authz_instance) = + LookupPath::new(&opctx, &osagactx.datastore()) + .instance_id(instance_id) + .lookup_for(authz::Action::Modify) + .await + .map_err(ActionError::action_failed)?; osagactx .nexus() .instance_ensure_registered( &opctx, - &authz_instance, + &InstanceEnsureRegisteredApiResources { + authz_silo, + authz_project, + authz_instance, + }, &db_instance, &propolis_id, &vmm_record, diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 71abe63bbd..5f226480b8 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -1403,6 +1403,7 @@ mod test { create_default_ip_pool, create_project, object_create, }; use nexus_test_utils_macros::nexus_test; + use nexus_types::internal_api::params::InstanceMigrateRequest; use omicron_common::api::internal::nexus::{ MigrationRuntimeState, MigrationState, Migrations, }; @@ -2358,7 +2359,7 @@ mod test { serialized_authn: authn::saga::Serialized::for_opctx(&opctx), instance: state.instance().clone(), src_vmm: vmm.clone(), - migrate_params: params::InstanceMigrate { + migrate_params: InstanceMigrateRequest { dst_sled_id: dst_sled_id.into_untyped_uuid(), }, }; diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index a87bdd834d..8e8b63229b 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -167,7 +167,6 @@ pub(crate) fn external_api() -> NexusApiDescription { api.register(instance_view)?; api.register(instance_create)?; api.register(instance_delete)?; - api.register(instance_migrate)?; api.register(instance_reboot)?; api.register(instance_start)?; api.register(instance_stop)?; @@ -2866,48 +2865,6 @@ async fn instance_delete( .await } -// TODO should this be in the public API? 
-/// Migrate an instance -#[endpoint { - method = POST, - path = "/v1/instances/{instance}/migrate", - tags = ["instances"], -}] -async fn instance_migrate( - rqctx: RequestContext, - query_params: Query, - path_params: Path, - migrate_params: TypedBody, -) -> Result, HttpError> { - let apictx = rqctx.context(); - let nexus = &apictx.context.nexus; - let path = path_params.into_inner(); - let query = query_params.into_inner(); - let migrate_instance_params = migrate_params.into_inner(); - let instance_selector = params::InstanceSelector { - project: query.project, - instance: path.instance, - }; - let handler = async { - let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - let instance_lookup = - nexus.instance_lookup(&opctx, instance_selector)?; - let instance = nexus - .project_instance_migrate( - &opctx, - &instance_lookup, - migrate_instance_params, - ) - .await?; - Ok(HttpResponseOk(instance.into())) - }; - apictx - .context - .external_latencies - .instrument_dropshot_handler(&rqctx, handler) - .await -} - /// Reboot an instance #[endpoint { method = POST, diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index c5322e3930..9965b6e21e 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -30,6 +30,7 @@ use nexus_types::external_api::params::UninitializedSledId; use nexus_types::external_api::shared::ProbeInfo; use nexus_types::external_api::shared::UninitializedSled; use nexus_types::external_api::views::SledPolicy; +use nexus_types::internal_api::params::InstanceMigrateRequest; use nexus_types::internal_api::params::SledAgentInfo; use nexus_types::internal_api::params::SwitchPutRequest; use nexus_types::internal_api::params::SwitchPutResponse; @@ -42,6 +43,7 @@ use omicron_common::api::external::http_pagination::data_page_params_for; use omicron_common::api::external::http_pagination::PaginatedById; use omicron_common::api::external::http_pagination::ScanById; use omicron_common::api::external::http_pagination::ScanParams; +use omicron_common::api::external::Instance; use omicron_common::api::internal::nexus::DiskRuntimeState; use omicron_common::api::internal::nexus::DownstairsClientStopRequest; use omicron_common::api::internal::nexus::DownstairsClientStopped; @@ -190,6 +192,33 @@ impl NexusInternalApi for NexusInternalApiImpl { .await } + async fn instance_migrate( + rqctx: RequestContext, + path_params: Path, + migrate_params: TypedBody, + ) -> Result, HttpError> { + let apictx = &rqctx.context().context; + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + let migrate = migrate_params.into_inner(); + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let instance = nexus + .instance_migrate( + &opctx, + InstanceUuid::from_untyped_uuid(path.instance_id), + migrate, + ) + .await?; + Ok(HttpResponseOk(instance.into())) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + async fn cpapi_disks_put( rqctx: RequestContext, path_params: Path, diff --git a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index 6e4e59688a..9097082a20 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -359,12 +359,6 @@ pub static DEMO_INSTANCE_REBOOT_URL: Lazy = Lazy::new(|| { *DEMO_INSTANCE_NAME, *DEMO_PROJECT_SELECTOR ) }); -pub static DEMO_INSTANCE_MIGRATE_URL: Lazy = 
Lazy::new(|| { - format!( - "/v1/instances/{}/migrate?{}", - *DEMO_INSTANCE_NAME, *DEMO_PROJECT_SELECTOR - ) -}); pub static DEMO_INSTANCE_SERIAL_URL: Lazy = Lazy::new(|| { format!( "/v1/instances/{}/serial-console?{}", @@ -1823,18 +1817,6 @@ pub static VERIFY_ENDPOINTS: Lazy> = Lazy::new(|| { AllowedMethod::Post(serde_json::Value::Null) ], }, - VerifyEndpoint { - url: &DEMO_INSTANCE_MIGRATE_URL, - visibility: Visibility::Protected, - unprivileged_access: UnprivilegedAccess::None, - allowed_methods: vec![ - AllowedMethod::Post(serde_json::to_value( - params::InstanceMigrate { - dst_sled_id: uuid::Uuid::new_v4(), - } - ).unwrap()), - ], - }, VerifyEndpoint { url: &DEMO_INSTANCE_SERIAL_URL, visibility: Visibility::Protected, diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 2e41fac3a4..eb3c88eb38 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -48,6 +48,7 @@ use nexus_types::external_api::shared::SiloIdentityMode; use nexus_types::external_api::views::SshKey; use nexus_types::external_api::{params, views}; use nexus_types::identity::Resource; +use nexus_types::internal_api::params::InstanceMigrateRequest; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Disk; use omicron_common::api::external::DiskState; @@ -737,6 +738,7 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { } let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; let apictx = &cptestctx.server.server_context(); let nexus = &apictx.nexus; let instance_name = "bird-ecology"; @@ -791,10 +793,10 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { }; let migrate_url = - format!("/v1/instances/{}/migrate", &instance_id.to_string()); + format!("/instances/{}/migrate", &instance_id.to_string()); let instance = NexusRequest::new( - RequestBuilder::new(client, Method::POST, &migrate_url) - .body(Some(¶ms::InstanceMigrate { + RequestBuilder::new(internal_client, Method::POST, &migrate_url) + .body(Some(&InstanceMigrateRequest { dst_sled_id: dst_sled_id.into_untyped_uuid(), })) .expect_status(Some(StatusCode::OK)), @@ -907,6 +909,7 @@ async fn test_instance_migrate_v2p_and_routes( cptestctx: &ControlPlaneTestContext, ) { let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; let apictx = &cptestctx.server.server_context(); let nexus = &apictx.nexus; let datastore = nexus.datastore(); @@ -997,10 +1000,10 @@ async fn test_instance_migrate_v2p_and_routes( // Kick off migration and simulate its completion on the target. 
let migrate_url = - format!("/v1/instances/{}/migrate", &instance_id.to_string()); + format!("/instances/{}/migrate", &instance_id.to_string()); let _ = NexusRequest::new( - RequestBuilder::new(client, Method::POST, &migrate_url) - .body(Some(¶ms::InstanceMigrate { + RequestBuilder::new(internal_client, Method::POST, &migrate_url) + .body(Some(&InstanceMigrateRequest { dst_sled_id: dst_sled_id.into_untyped_uuid(), })) .expect_status(Some(StatusCode::OK)), @@ -1293,6 +1296,7 @@ async fn test_instance_metrics_with_migration( cptestctx: &ControlPlaneTestContext, ) { let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; let apictx = &cptestctx.server.server_context(); let nexus = &apictx.nexus; let instance_name = "bird-ecology"; @@ -1381,10 +1385,10 @@ async fn test_instance_metrics_with_migration( }; let migrate_url = - format!("/v1/instances/{}/migrate", &instance_id.to_string()); + format!("/instances/{}/migrate", &instance_id.to_string()); let _ = NexusRequest::new( - RequestBuilder::new(client, Method::POST, &migrate_url) - .body(Some(¶ms::InstanceMigrate { + RequestBuilder::new(internal_client, Method::POST, &migrate_url) + .body(Some(&InstanceMigrateRequest { dst_sled_id: dst_sled_id.into_untyped_uuid(), })) .expect_status(Some(StatusCode::OK)), diff --git a/nexus/tests/output/nexus_tags.txt b/nexus/tests/output/nexus_tags.txt index 4af018c5af..340d72569b 100644 --- a/nexus/tests/output/nexus_tags.txt +++ b/nexus/tests/output/nexus_tags.txt @@ -51,7 +51,6 @@ instance_ephemeral_ip_attach POST /v1/instances/{instance}/exter instance_ephemeral_ip_detach DELETE /v1/instances/{instance}/external-ips/ephemeral instance_external_ip_list GET /v1/instances/{instance}/external-ips instance_list GET /v1/instances -instance_migrate POST /v1/instances/{instance}/migrate instance_network_interface_create POST /v1/network-interfaces instance_network_interface_delete DELETE /v1/network-interfaces/{interface} instance_network_interface_list GET /v1/network-interfaces diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index 8dcce913b3..a7dd0a72cc 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -1093,12 +1093,6 @@ impl JsonSchema for UserData { } } -/// Migration parameters for an `Instance` -#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] -pub struct InstanceMigrate { - pub dst_sled_id: Uuid, -} - /// Forwarded to a propolis server to request the contents of an Instance's serial console. #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] pub struct InstanceSerialConsoleRequest { diff --git a/nexus/types/src/internal_api/params.rs b/nexus/types/src/internal_api/params.rs index 3a26dde4ba..c803f003f1 100644 --- a/nexus/types/src/internal_api/params.rs +++ b/nexus/types/src/internal_api/params.rs @@ -207,3 +207,10 @@ pub struct OximeterInfo { /// The address on which this oximeter instance listens for requests pub address: SocketAddr, } + +/// Parameters used when migrating an instance. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct InstanceMigrateRequest { + /// The ID of the sled to which to migrate the target instance. 
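+    /// As the commit message notes, the internal API does not act on behalf
+    /// of a silo user, so this is a raw sled UUID with no project or silo
+    /// scoping applied.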
+ pub dst_sled_id: Uuid, +} diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 5dd7d3dea3..d054591f3a 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -784,6 +784,50 @@ } } }, + "/instances/{instance_id}/migrate": { + "post": { + "operationId": "instance_migrate", + "parameters": [ + { + "in": "path", + "name": "instance_id", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InstanceMigrateRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Instance" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/metrics/collectors": { "post": { "summary": "Accept a notification of a new oximeter collection server.", @@ -3300,6 +3344,179 @@ } ] }, + "Instance": { + "description": "View of an Instance", + "type": "object", + "properties": { + "description": { + "description": "human-readable free-form text about a resource", + "type": "string" + }, + "hostname": { + "description": "RFC1035-compliant hostname for the Instance.", + "type": "string" + }, + "id": { + "description": "unique, immutable, system-controlled identifier for each resource", + "type": "string", + "format": "uuid" + }, + "memory": { + "description": "memory allocated for this Instance", + "allOf": [ + { + "$ref": "#/components/schemas/ByteCount" + } + ] + }, + "name": { + "description": "unique, mutable, user-controlled identifier for each resource", + "allOf": [ + { + "$ref": "#/components/schemas/Name" + } + ] + }, + "ncpus": { + "description": "number of CPUs allocated for this Instance", + "allOf": [ + { + "$ref": "#/components/schemas/InstanceCpuCount" + } + ] + }, + "project_id": { + "description": "id for the project containing this Instance", + "type": "string", + "format": "uuid" + }, + "run_state": { + "$ref": "#/components/schemas/InstanceState" + }, + "time_created": { + "description": "timestamp when this resource was created", + "type": "string", + "format": "date-time" + }, + "time_modified": { + "description": "timestamp when this resource was last modified", + "type": "string", + "format": "date-time" + }, + "time_run_state_updated": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "description", + "hostname", + "id", + "memory", + "name", + "ncpus", + "project_id", + "run_state", + "time_created", + "time_modified", + "time_run_state_updated" + ] + }, + "InstanceCpuCount": { + "description": "The number of CPUs in an Instance", + "type": "integer", + "format": "uint16", + "minimum": 0 + }, + "InstanceMigrateRequest": { + "description": "Parameters used when migrating an instance.", + "type": "object", + "properties": { + "dst_sled_id": { + "description": "The ID of the sled to which to migrate the target instance.", + "type": "string", + "format": "uuid" + } + }, + "required": [ + "dst_sled_id" + ] + }, + "InstanceState": { + "description": "Running state of an Instance (primarily: booted or stopped)\n\nThis typically reflects whether it's starting, running, stopping, or stopped, but also includes states related to the Instance's lifecycle", + "oneOf": [ + { + "description": "The instance is being created.", + "type": "string", + "enum": [ + "creating" + ] + }, + 
{ + "description": "The instance is currently starting up.", + "type": "string", + "enum": [ + "starting" + ] + }, + { + "description": "The instance is currently running.", + "type": "string", + "enum": [ + "running" + ] + }, + { + "description": "The instance has been requested to stop and a transition to \"Stopped\" is imminent.", + "type": "string", + "enum": [ + "stopping" + ] + }, + { + "description": "The instance is currently stopped.", + "type": "string", + "enum": [ + "stopped" + ] + }, + { + "description": "The instance is in the process of rebooting - it will remain in the \"rebooting\" state until the VM is starting once more.", + "type": "string", + "enum": [ + "rebooting" + ] + }, + { + "description": "The instance is in the process of migrating - it will remain in the \"migrating\" state until the migration process is complete and the destination propolis is ready to continue execution.", + "type": "string", + "enum": [ + "migrating" + ] + }, + { + "description": "The instance is attempting to recover from a failure.", + "type": "string", + "enum": [ + "repairing" + ] + }, + { + "description": "The instance has encountered a failure.", + "type": "string", + "enum": [ + "failed" + ] + }, + { + "description": "The instance has been deleted.", + "type": "string", + "enum": [ + "destroyed" + ] + } + ] + }, "IpNet": { "x-rust-type": { "crate": "oxnet", diff --git a/openapi/nexus.json b/openapi/nexus.json index da77eec2a8..27e2870b6e 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -2276,62 +2276,6 @@ } } }, - "/v1/instances/{instance}/migrate": { - "post": { - "tags": [ - "instances" - ], - "summary": "Migrate an instance", - "operationId": "instance_migrate", - "parameters": [ - { - "in": "query", - "name": "project", - "description": "Name or ID of the project", - "schema": { - "$ref": "#/components/schemas/NameOrId" - } - }, - { - "in": "path", - "name": "instance", - "description": "Name or ID of the instance", - "required": true, - "schema": { - "$ref": "#/components/schemas/NameOrId" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InstanceMigrate" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Instance" - } - } - } - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, "/v1/instances/{instance}/reboot": { "post": { "tags": [ @@ -15271,19 +15215,6 @@ } ] }, - "InstanceMigrate": { - "description": "Migration parameters for an `Instance`", - "type": "object", - "properties": { - "dst_sled_id": { - "type": "string", - "format": "uuid" - } - }, - "required": [ - "dst_sled_id" - ] - }, "InstanceNetworkInterface": { "description": "An `InstanceNetworkInterface` represents a virtual network interface device attached to an instance.", "type": "object", From 77230c7ac07b4a0516bf77146d0b0c8401351f3a Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Wed, 14 Aug 2024 22:49:42 +0000 Subject: [PATCH 36/91] Update Rust crate russh to 0.44.1 (#6335) --- Cargo.lock | 147 ++++++++++++++++++++++++++++++++---- end-to-end-tests/Cargo.toml | 4 +- workspace-hack/Cargo.toml | 14 +++- 3 files changed, 146 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cd12b9de9a..f827582501 100644 --- a/Cargo.lock +++ 
b/Cargo.lock @@ -6291,6 +6291,7 @@ dependencies = [ "anyhow", "base16ct", "base64 0.22.1", + "base64ct", "bit-set", "bit-vec", "bitflags 1.3.2", @@ -6346,7 +6347,7 @@ dependencies = [ "mio 0.8.11", "nix 0.28.0", "nom", - "num-bigint", + "num-bigint-dig", "num-integer", "num-iter", "num-traits", @@ -6355,6 +6356,7 @@ dependencies = [ "peg-runtime", "pem-rfc7468", "petgraph", + "pkcs8", "postgres-types", "predicates", "proc-macro2", @@ -6363,12 +6365,14 @@ dependencies = [ "regex-syntax 0.8.4", "reqwest", "ring 0.17.8", + "rsa", "rustix", "schemars", "scopeguard", "semver 1.0.23", "serde", "serde_json", + "sha1", "sha2", "signal-hook-mio", "similar", @@ -6397,7 +6401,6 @@ dependencies = [ "usdt", "usdt-impl", "uuid", - "yasna", "zerocopy 0.7.34", "zeroize", ] @@ -6941,6 +6944,18 @@ dependencies = [ "sha2", ] +[[package]] +name = "p384" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70786f51bcc69f6a4c0360e063a4cac5419ef7c5cd5b3c99ad70f3be5ba79209" +dependencies = [ + "ecdsa", + "elliptic-curve", + "primeorder", + "sha2", +] + [[package]] name = "p521" version = "0.13.3" @@ -7141,6 +7156,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2" dependencies = [ "digest", + "hmac", ] [[package]] @@ -7322,6 +7338,21 @@ dependencies = [ "spki", ] +[[package]] +name = "pkcs5" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e847e2c91a18bfa887dd028ec33f2fe6f25db77db3619024764914affe8b69a6" +dependencies = [ + "aes", + "cbc", + "der", + "pbkdf2 0.12.2", + "scrypt", + "sha2", + "spki", +] + [[package]] name = "pkcs8" version = "0.10.2" @@ -7329,6 +7360,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" dependencies = [ "der", + "pkcs5", + "rand_core 0.6.4", "spki", ] @@ -8452,19 +8485,21 @@ dependencies = [ [[package]] name = "russh" -version = "0.43.0" +version = "0.44.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c9534703dc13be1eefc5708618f4c346da8e4f04f260218613f351ed5e94259" +checksum = "6500eedfaf8cd81597899d896908a4b9cd5cb566db875e843c04ccf92add2c16" dependencies = [ "aes", "aes-gcm", "async-trait", "bitflags 2.6.0", "byteorder", + "cbc", "chacha20", "ctr", "curve25519-dalek", "digest", + "elliptic-curve", "flate2", "futures", "generic-array", @@ -8473,16 +8508,21 @@ dependencies = [ "log", "num-bigint", "once_cell", + "p256", + "p384", + "p521", "poly1305", "rand 0.8.5", + "rand_core 0.6.4", "russh-cryptovec", "russh-keys", "sha1", "sha2", + "ssh-encoding", + "ssh-key", "subtle", "thiserror", "tokio", - "tokio-util", ] [[package]] @@ -8497,41 +8537,53 @@ dependencies = [ [[package]] name = "russh-keys" -version = "0.43.0" +version = "0.44.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa4a5afa2fab6fd49d0c470a3b75c3c70a4f363c38db32df5ae3b44a3abf5ab9" +checksum = "fb8c0bfe024d4edd242f65a2ac6c8bf38a892930050b9eb90909d8fc2c413c8d" dependencies = [ "aes", "async-trait", "bcrypt-pbkdf", - "bit-vec", "block-padding", "byteorder", "cbc", "ctr", "data-encoding", + "der", + "digest", "dirs", + "ecdsa", "ed25519-dalek", + "elliptic-curve", "futures", "hmac", "inout", "log", "md5", - "num-bigint", "num-integer", "p256", + "p384", "p521", "pbkdf2 0.11.0", - "rand 0.7.3", + "pkcs1", + "pkcs5", + "pkcs8", + "rand 0.8.5", "rand_core 0.6.4", + "rsa", 
"russh-cryptovec", + "sec1", "serde", "sha1", "sha2", + "spki", + "ssh-encoding", + "ssh-key", "thiserror", "tokio", "tokio-stream", - "yasna", + "typenum", + "zeroize", ] [[package]] @@ -8750,6 +8802,15 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +[[package]] +name = "salsa20" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97a22f5af31f73a954c10289c93e8a50cc23d971e80ee446f1f6f7137a088213" +dependencies = [ + "cipher", +] + [[package]] name = "samael" version = "0.0.15" @@ -8857,6 +8918,17 @@ dependencies = [ "syn 2.0.74", ] +[[package]] +name = "scrypt" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0516a385866c09368f0b5bcd1caff3366aace790fcd46e2bb032697bb172fd1f" +dependencies = [ + "pbkdf2 0.12.2", + "salsa20", + "sha2", +] + [[package]] name = "sct" version = "0.7.1" @@ -9718,6 +9790,57 @@ dependencies = [ "syn 2.0.74", ] +[[package]] +name = "ssh-cipher" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "caac132742f0d33c3af65bfcde7f6aa8f62f0e991d80db99149eb9d44708784f" +dependencies = [ + "aes", + "aes-gcm", + "cbc", + "chacha20", + "cipher", + "ctr", + "poly1305", + "ssh-encoding", + "subtle", +] + +[[package]] +name = "ssh-encoding" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb9242b9ef4108a78e8cd1a2c98e193ef372437f8c22be363075233321dd4a15" +dependencies = [ + "base64ct", + "pem-rfc7468", + "sha2", +] + +[[package]] +name = "ssh-key" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca9b366a80cf18bb6406f4cf4d10aebfb46140a8c0c33f666a144c5c76ecbafc" +dependencies = [ + "bcrypt-pbkdf", + "ed25519-dalek", + "num-bigint-dig", + "p256", + "p384", + "p521", + "rand_core 0.6.4", + "rsa", + "sec1", + "sha2", + "signature", + "ssh-cipher", + "ssh-encoding", + "subtle", + "zeroize", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -11998,8 +12121,6 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e17bb3549cc1321ae1296b9cdc2698e2b6cb1992adfa19a8c72e5b7a738f44cd" dependencies = [ - "bit-vec", - "num-bigint", "time", ] diff --git a/end-to-end-tests/Cargo.toml b/end-to-end-tests/Cargo.toml index eb7cd68812..781f3fb1c6 100644 --- a/end-to-end-tests/Cargo.toml +++ b/end-to-end-tests/Cargo.toml @@ -19,8 +19,8 @@ omicron-test-utils.workspace = true oxide-client.workspace = true rand.workspace = true reqwest = { workspace = true, features = ["cookies"] } -russh = "0.43.0" -russh-keys = "0.43.0" +russh = "0.44.1" +russh-keys = "0.44.0" serde.workspace = true serde_json.workspace = true sled-agent-types.workspace = true diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index eff58519a3..688e1a0921 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -22,6 +22,7 @@ aho-corasick = { version = "1.1.3" } anyhow = { version = "1.0.86", features = ["backtrace"] } base16ct = { version = "0.2.0", default-features = false, features = ["alloc"] } base64 = { version = "0.22.1" } +base64ct = { version = "1.6.0", default-features = false, features = ["std"] } bit-set = { version = "0.5.3" } bit-vec = { version = "0.6.3" } bitflags-dff4ba8e3ae991db = { package = "bitflags", version = "1.3.2" } @@ -72,7 +73,7 @@ log = { version = "0.4.21", 
default-features = false, features = ["std"] } managed = { version = "0.8.0", default-features = false, features = ["alloc", "map"] } memchr = { version = "2.7.2" } nom = { version = "7.1.3" } -num-bigint = { version = "0.4.5", features = ["rand"] } +num-bigint-dig = { version = "0.8.4", default-features = false, features = ["i128", "prime", "serde", "u64_digit", "zeroize"] } num-integer = { version = "0.1.46", features = ["i128"] } num-iter = { version = "0.1.45", default-features = false, features = ["i128"] } num-traits = { version = "0.2.19", features = ["i128", "libm"] } @@ -80,6 +81,7 @@ openapiv3 = { version = "2.0.0", default-features = false, features = ["skip_ser peg-runtime = { version = "0.8.3", default-features = false, features = ["std"] } pem-rfc7468 = { version = "0.7.0", default-features = false, features = ["std"] } petgraph = { version = "0.6.5", features = ["serde-1"] } +pkcs8 = { version = "0.10.2", default-features = false, features = ["encryption", "pem", "std"] } postgres-types = { version = "0.2.7", default-features = false, features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } predicates = { version = "3.1.2" } proc-macro2 = { version = "1.0.86" } @@ -88,11 +90,13 @@ regex-automata = { version = "0.4.6", default-features = false, features = ["dfa regex-syntax = { version = "0.8.4" } reqwest = { version = "0.11.27", features = ["blocking", "cookies", "json", "rustls-tls", "stream"] } ring = { version = "0.17.8", features = ["std"] } +rsa = { version = "0.9.6", features = ["serde", "sha2"] } schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } scopeguard = { version = "1.2.0" } semver = { version = "1.0.23", features = ["serde"] } serde = { version = "1.0.207", features = ["alloc", "derive", "rc"] } serde_json = { version = "1.0.124", features = ["raw_value", "unbounded_depth"] } +sha1 = { version = "0.10.6", features = ["oid"] } sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] } @@ -115,7 +119,6 @@ unicode-normalization = { version = "0.1.23" } usdt = { version = "0.5.0" } usdt-impl = { version = "0.5.0", default-features = false, features = ["asm", "des"] } uuid = { version = "1.10.0", features = ["serde", "v4"] } -yasna = { version = "0.5.2", features = ["bit-vec", "num-bigint", "std", "time"] } zerocopy = { version = "0.7.34", features = ["derive", "simd"] } zeroize = { version = "1.7.0", features = ["std", "zeroize_derive"] } @@ -125,6 +128,7 @@ aho-corasick = { version = "1.1.3" } anyhow = { version = "1.0.86", features = ["backtrace"] } base16ct = { version = "0.2.0", default-features = false, features = ["alloc"] } base64 = { version = "0.22.1" } +base64ct = { version = "1.6.0", default-features = false, features = ["std"] } bit-set = { version = "0.5.3" } bit-vec = { version = "0.6.3" } bitflags-dff4ba8e3ae991db = { package = "bitflags", version = "1.3.2" } @@ -176,7 +180,7 @@ log = { version = "0.4.21", default-features = false, features = ["std"] } managed = { version = "0.8.0", default-features = false, features = ["alloc", "map"] } memchr = { version = "2.7.2" } nom = { version = "7.1.3" } -num-bigint = { version = "0.4.5", features = ["rand"] } +num-bigint-dig = { version = "0.8.4", default-features = false, features = ["i128", "prime", "serde", "u64_digit", "zeroize"] } num-integer = { version = "0.1.46", features 
= ["i128"] } num-iter = { version = "0.1.45", default-features = false, features = ["i128"] } num-traits = { version = "0.2.19", features = ["i128", "libm"] } @@ -184,6 +188,7 @@ openapiv3 = { version = "2.0.0", default-features = false, features = ["skip_ser peg-runtime = { version = "0.8.3", default-features = false, features = ["std"] } pem-rfc7468 = { version = "0.7.0", default-features = false, features = ["std"] } petgraph = { version = "0.6.5", features = ["serde-1"] } +pkcs8 = { version = "0.10.2", default-features = false, features = ["encryption", "pem", "std"] } postgres-types = { version = "0.2.7", default-features = false, features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } predicates = { version = "3.1.2" } proc-macro2 = { version = "1.0.86" } @@ -192,11 +197,13 @@ regex-automata = { version = "0.4.6", default-features = false, features = ["dfa regex-syntax = { version = "0.8.4" } reqwest = { version = "0.11.27", features = ["blocking", "cookies", "json", "rustls-tls", "stream"] } ring = { version = "0.17.8", features = ["std"] } +rsa = { version = "0.9.6", features = ["serde", "sha2"] } schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } scopeguard = { version = "1.2.0" } semver = { version = "1.0.23", features = ["serde"] } serde = { version = "1.0.207", features = ["alloc", "derive", "rc"] } serde_json = { version = "1.0.124", features = ["raw_value", "unbounded_depth"] } +sha1 = { version = "0.10.6", features = ["oid"] } sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] } @@ -222,7 +229,6 @@ unicode-xid = { version = "0.2.4" } usdt = { version = "0.5.0" } usdt-impl = { version = "0.5.0", default-features = false, features = ["asm", "des"] } uuid = { version = "1.10.0", features = ["serde", "v4"] } -yasna = { version = "0.5.2", features = ["bit-vec", "num-bigint", "std", "time"] } zerocopy = { version = "0.7.34", features = ["derive", "simd"] } zeroize = { version = "1.7.0", features = ["std", "zeroize_derive"] } From 82d75b5895412665b0de6391cc21d70fe7bdef95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karen=20C=C3=A1rcamo?= Date: Thu, 15 Aug 2024 11:38:50 +1200 Subject: [PATCH 37/91] [reconfigurator] `clickhouse-admin` SMF service with dropshot server (#6304) ## Overview A new SMF service in the `clickhouse` and `clickhouse_keeper` zones that runs a dropshot server. The API contains a single `/node/address` endpoint to retrieve the node's listen address. Other endpoints will be added in future PRs. ## Purpose This server will be used to manage ClickHouse server and Keeper nodes. For now it performs a single basic action to keep the size of this PR small, but it will eventually perform other actions, such as generating the XML config files and retrieving the state of the node. ## Testing I've deployed locally with the following results: ```console root@oxz_switch:~# curl http://[fd00:1122:3344:101::e]:8888/node/address {"clickhouse_address":"[fd00:1122:3344:101::e]:8123"} ``` ```console root@oxz_clickhouse_2c213ff2:~# cat /var/svc/log/oxide-clickhouse-admin:default.log [ Aug 14 06:54:42 Enabled. ] [ Aug 14 06:54:42 Rereading configuration. ] [ Aug 14 06:54:45 Rereading configuration.
] [ Aug 14 06:54:46 Executing start method ("ctrun -l child -o noorphan,regent /opt/oxide/clickhouse-admin/bin/clickhouse-admin run -c /var/svc/manifest/site/clickhouse-admin/config.toml -a [fd00:1122:3344:101::e]:8123 -H [fd00:1122:3344:101::e]:8888 &"). ] [ Aug 14 06:54:46 Method "start" exited with status 0. ] note: configured to log to "/dev/stdout" {"msg":"listening","v":0,"name":"clickhouse-admin","level":30,"time":"2024-08-14T06:54:46.721122327Z","hostname":"oxz_clickhouse_2c213ff2-6544-4316-939f-b51749cf3222","pid":5169,"local_addr":"[fd00:1122:3344:101::e]:8888","component":"dropshot","file":"/home/coatlicue/.cargo/git/checkouts/dropshot-a4a923d29dccc492/52d900a/dropshot/src/server.rs:205"} {"msg":"accepted connection","v":0,"name":"clickhouse-admin","level":30,"time":"2024-08-14T06:56:17.908877036Z","hostname":"oxz_clickhouse_2c213ff2-6544-4316-939f-b51749cf3222","pid":5169,"local_addr":"[fd00:1122:3344:101::e]:8888","component":"dropshot","file":"/home/coatlicue/.cargo/git/checkouts/dropshot-a4a923d29dccc492/52d900a/dropshot/src/server.rs:775","remote_addr":"[fd00:1122:3344:101::2]:37268"} {"msg":"request completed","v":0,"name":"clickhouse-admin","level":30,"time":"2024-08-14T06:56:17.91734856Z","hostname":"oxz_clickhouse_2c213ff2-6544-4316-939f-b51749cf3222","pid":5169,"uri":"/node/address","method":"GET","req_id":"62a3d8fc-e37e-42aa-a715-52dbce8aa493","remote_addr":"[fd00:1122:3344:101::2]:37268","local_addr":"[fd00:1122:3344:101::e]:8888","component":"dropshot","file":"/home/coatlicue/.cargo/git/checkouts/dropshot-a4a923d29dccc492/52d900a/dropshot/src/server.rs:914","latency_us":3151,"response_code":"200"} ``` Related: https://github.com/oxidecomputer/omicron/issues/5999 --- Cargo.lock | 48 +++++++++++ Cargo.toml | 6 ++ clickhouse-admin/Cargo.toml | 42 ++++++++++ clickhouse-admin/api/Cargo.toml | 16 ++++ clickhouse-admin/api/src/lib.rs | 28 +++++++ clickhouse-admin/src/bin/clickhouse-admin.rs | 68 ++++++++++++++++ clickhouse-admin/src/clickward.rs | 51 ++++++++++++ clickhouse-admin/src/config.rs | 43 ++++++++++ clickhouse-admin/src/context.rs | 21 +++++ clickhouse-admin/src/http_entrypoints.rs | 31 ++++++++ clickhouse-admin/src/lib.rs | 70 ++++++++++++++++ common/src/address.rs | 1 + dev-tools/openapi-manager/Cargo.toml | 1 + dev-tools/openapi-manager/src/spec.rs | 11 +++ openapi/clickhouse-admin.json | 84 ++++++++++++++++++++ package-manifest.toml | 16 ++++ sled-agent/src/services.rs | 56 ++++++++++++- smf/clickhouse-admin/config.toml | 10 +++ smf/clickhouse-admin/manifest.xml | 45 +++++++++++ 19 files changed, 646 insertions(+), 2 deletions(-) create mode 100644 clickhouse-admin/Cargo.toml create mode 100644 clickhouse-admin/api/Cargo.toml create mode 100644 clickhouse-admin/api/src/lib.rs create mode 100644 clickhouse-admin/src/bin/clickhouse-admin.rs create mode 100644 clickhouse-admin/src/clickward.rs create mode 100644 clickhouse-admin/src/config.rs create mode 100644 clickhouse-admin/src/context.rs create mode 100644 clickhouse-admin/src/http_entrypoints.rs create mode 100644 clickhouse-admin/src/lib.rs create mode 100644 openapi/clickhouse-admin.json create mode 100644 smf/clickhouse-admin/config.toml create mode 100644 smf/clickhouse-admin/manifest.xml diff --git a/Cargo.lock b/Cargo.lock index f827582501..781785b8ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1113,6 +1113,18 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" +[[package]] +name = 
"clickhouse-admin-api" +version = "0.1.0" +dependencies = [ + "dropshot", + "omicron-common", + "omicron-uuid-kinds", + "omicron-workspace-hack", + "schemars", + "serde", +] + [[package]] name = "clickward" version = "0.1.0" @@ -5654,6 +5666,41 @@ dependencies = [ "thiserror", ] +[[package]] +name = "omicron-clickhouse-admin" +version = "0.1.0" +dependencies = [ + "anyhow", + "camino", + "chrono", + "clap", + "clickhouse-admin-api", + "dropshot", + "expectorate", + "http 0.2.12", + "illumos-utils", + "nexus-test-utils", + "omicron-common", + "omicron-test-utils", + "omicron-uuid-kinds", + "omicron-workspace-hack", + "openapi-lint", + "openapiv3", + "schemars", + "serde", + "serde_json", + "slog", + "slog-async", + "slog-dtrace", + "slog-error-chain", + "subprocess", + "thiserror", + "tokio", + "tokio-postgres", + "toml 0.8.19", + "url", +] + [[package]] name = "omicron-cockroach-admin" version = "0.1.0" @@ -6476,6 +6523,7 @@ dependencies = [ "bootstrap-agent-api", "camino", "clap", + "clickhouse-admin-api", "cockroach-admin-api", "dns-server-api", "dropshot", diff --git a/Cargo.toml b/Cargo.toml index 3dd5e61236..b7cf6f6fd1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,8 @@ members = [ "api_identity", "bootstore", "certificates", + "clickhouse-admin", + "clickhouse-admin/api", "clients/bootstrap-agent-client", "clients/cockroach-admin-client", "clients/ddm-admin-client", @@ -112,6 +114,8 @@ default-members = [ "api_identity", "bootstore", "certificates", + "clickhouse-admin", + "clickhouse-admin/api", "clients/bootstrap-agent-client", "clients/cockroach-admin-client", "clients/ddm-admin-client", @@ -294,6 +298,7 @@ cfg-if = "1.0" chrono = { version = "0.4", features = [ "serde" ] } ciborium = "0.2.2" clap = { version = "4.5", features = ["cargo", "derive", "env", "wrap_help"] } +clickhouse-admin-api = { path = "clickhouse-admin/api" } clickward = { git = "https://github.com/oxidecomputer/clickward", rev = "ceec762e6a87d2a22bf56792a3025e145caa095e" } cockroach-admin-api = { path = "cockroach-admin/api" } cockroach-admin-client = { path = "clients/cockroach-admin-client" } @@ -417,6 +422,7 @@ nexus-test-utils = { path = "nexus/test-utils" } nexus-types = { path = "nexus/types" } num-integer = "0.1.46" num = { version = "0.4.3", default-features = false, features = [ "libm" ] } +omicron-clickhouse-admin = { path = "clickhouse-admin" } omicron-certificates = { path = "certificates" } omicron-cockroach-admin = { path = "cockroach-admin" } omicron-common = { path = "common" } diff --git a/clickhouse-admin/Cargo.toml b/clickhouse-admin/Cargo.toml new file mode 100644 index 0000000000..033836dfe0 --- /dev/null +++ b/clickhouse-admin/Cargo.toml @@ -0,0 +1,42 @@ +[package] +name = "omicron-clickhouse-admin" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[dependencies] +anyhow.workspace = true +camino.workspace = true +chrono.workspace = true +clap.workspace = true +clickhouse-admin-api.workspace = true +dropshot.workspace = true +http.workspace = true +illumos-utils.workspace = true +omicron-common.workspace = true +omicron-uuid-kinds.workspace = true +schemars.workspace = true +slog.workspace = true +slog-async.workspace = true +slog-dtrace.workspace = true +slog-error-chain.workspace = true +serde.workspace = true +thiserror.workspace = true +tokio.workspace = true +tokio-postgres.workspace = true +toml.workspace = true + +omicron-workspace-hack.workspace = true + +[dev-dependencies] +expectorate.workspace = true +nexus-test-utils.workspace = true +omicron-test-utils.workspace 
= true +openapi-lint.workspace = true +openapiv3.workspace = true +serde_json.workspace = true +subprocess.workspace = true +url.workspace = true + +[lints] +workspace = true diff --git a/clickhouse-admin/api/Cargo.toml b/clickhouse-admin/api/Cargo.toml new file mode 100644 index 0000000000..ceec09f6c8 --- /dev/null +++ b/clickhouse-admin/api/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "clickhouse-admin-api" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +dropshot.workspace = true +omicron-common.workspace = true +omicron-uuid-kinds.workspace = true +omicron-workspace-hack.workspace = true +schemars.workspace = true +serde.workspace = true diff --git a/clickhouse-admin/api/src/lib.rs b/clickhouse-admin/api/src/lib.rs new file mode 100644 index 0000000000..9a011d4387 --- /dev/null +++ b/clickhouse-admin/api/src/lib.rs @@ -0,0 +1,28 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use dropshot::{HttpError, HttpResponseOk, RequestContext}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use std::net::SocketAddrV6; + +#[dropshot::api_description] +pub trait ClickhouseAdminApi { + type Context; + + /// Retrieve the address the ClickHouse server or keeper node is listening on + #[endpoint { + method = GET, + path = "/node/address", + }] + async fn clickhouse_address( + rqctx: RequestContext<Self::Context>, + ) -> Result<HttpResponseOk<ClickhouseAddress>, HttpError>; +} + +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +pub struct ClickhouseAddress { + pub clickhouse_address: SocketAddrV6, +} diff --git a/clickhouse-admin/src/bin/clickhouse-admin.rs b/clickhouse-admin/src/bin/clickhouse-admin.rs new file mode 100644 index 0000000000..6f28a82804 --- /dev/null +++ b/clickhouse-admin/src/bin/clickhouse-admin.rs @@ -0,0 +1,68 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Executable program to run the Omicron ClickHouse admin interface + +use anyhow::anyhow; +use camino::Utf8PathBuf; +use clap::Parser; +use omicron_clickhouse_admin::{Clickward, Config}; +use omicron_common::cmd::fatal; +use omicron_common::cmd::CmdError; +use std::net::{SocketAddr, SocketAddrV6}; + +#[derive(Debug, Parser)] +#[clap( + name = "clickhouse-admin", + about = "Omicron ClickHouse cluster admin server" +)] +enum Args { + /// Start the ClickHouse admin server + Run { + // TODO: This address is solely for testing now. We should remove it + // once we have more endpoints up and running.
+ /// Socket address for a running clickhouse server or keeper instance + #[clap(long, short = 'a', action)] + clickhouse_address: SocketAddrV6, + + /// Address on which this server should run + #[clap(long, short = 'H', action)] + http_address: SocketAddrV6, + + /// Path to the server configuration file + #[clap(long, short, action)] + config: Utf8PathBuf, + }, +} + +#[tokio::main] +async fn main() { + if let Err(err) = main_impl().await { + fatal(err); + } +} + +async fn main_impl() -> Result<(), CmdError> { + let args = Args::parse(); + + match args { + Args::Run { clickhouse_address, http_address, config } => { + let mut config = Config::from_file(&config) + .map_err(|err| CmdError::Failure(anyhow!(err)))?; + config.dropshot.bind_address = SocketAddr::V6(http_address); + + let clickward = Clickward::new(clickhouse_address); + + let server = + omicron_clickhouse_admin::start_server(clickward, config) + .await + .map_err(|err| CmdError::Failure(anyhow!(err)))?; + server.await.map_err(|err| { + CmdError::Failure(anyhow!( + "server failed after starting: {err}" + )) + }) + } + } +} diff --git a/clickhouse-admin/src/clickward.rs b/clickhouse-admin/src/clickward.rs new file mode 100644 index 0000000000..114201e44b --- /dev/null +++ b/clickhouse-admin/src/clickward.rs @@ -0,0 +1,51 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use clickhouse_admin_api::ClickhouseAddress; +use dropshot::HttpError; +use slog_error_chain::{InlineErrorChain, SlogInlineError}; +use std::io; +use std::net::SocketAddrV6; + +#[derive(Debug, thiserror::Error, SlogInlineError)] +pub enum ClickwardError { + #[error("clickward failure")] + Failure { + #[source] + err: io::Error, + }, +} + +impl From<ClickwardError> for HttpError { + fn from(err: ClickwardError) -> Self { + match err { + ClickwardError::Failure { .. } => { + let message = InlineErrorChain::new(&err).to_string(); + HttpError { + status_code: http::StatusCode::INTERNAL_SERVER_ERROR, + error_code: Some(String::from("Internal")), + external_message: message.clone(), + internal_message: message, + } + } + } + } +} + +#[derive(Debug)] +pub struct Clickward { + clickhouse_address: SocketAddrV6, +} + +impl Clickward { + pub fn new(clickhouse_address: SocketAddrV6) -> Self { + Self { clickhouse_address } + } + + pub fn clickhouse_address( + &self, + ) -> Result<ClickhouseAddress, ClickwardError> { + Ok(ClickhouseAddress { clickhouse_address: self.clickhouse_address }) + } +} diff --git a/clickhouse-admin/src/config.rs b/clickhouse-admin/src/config.rs new file mode 100644 index 0000000000..77a624835c --- /dev/null +++ b/clickhouse-admin/src/config.rs @@ -0,0 +1,43 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ +use camino::Utf8Path; +use camino::Utf8PathBuf; +use dropshot::ConfigDropshot; +use dropshot::ConfigLogging; +use serde::Deserialize; +use serde::Serialize; +use slog_error_chain::SlogInlineError; +use std::io; + +#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] +pub struct Config { + pub dropshot: ConfigDropshot, + pub log: ConfigLogging, +} +impl Config { + /// Load a `Config` from the given TOML file + pub fn from_file(path: &Utf8Path) -> Result<Self, LoadError> { + let contents = std::fs::read_to_string(path) + .map_err(|err| LoadError::Read { path: path.to_owned(), err })?; + toml::de::from_str(&contents) + .map_err(|err| LoadError::Parse { path: path.to_owned(), err }) + } +} + +#[derive(Debug, thiserror::Error, SlogInlineError)] +pub enum LoadError { + #[error("failed to read {path}")] + Read { + path: Utf8PathBuf, + #[source] + err: io::Error, + }, + #[error("failed to parse {path} as TOML")] + Parse { + path: Utf8PathBuf, + #[source] + err: toml::de::Error, + }, +} diff --git a/clickhouse-admin/src/context.rs b/clickhouse-admin/src/context.rs new file mode 100644 index 0000000000..cab875fe1d --- /dev/null +++ b/clickhouse-admin/src/context.rs @@ -0,0 +1,21 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use crate::Clickward; +use slog::Logger; + +pub struct ServerContext { + clickward: Clickward, + _log: Logger, +} + +impl ServerContext { + pub fn new(clickward: Clickward, _log: Logger) -> Self { + Self { clickward, _log } + } + + pub fn clickward(&self) -> &Clickward { + &self.clickward + } +} diff --git a/clickhouse-admin/src/http_entrypoints.rs b/clickhouse-admin/src/http_entrypoints.rs new file mode 100644 index 0000000000..05988a73b0 --- /dev/null +++ b/clickhouse-admin/src/http_entrypoints.rs @@ -0,0 +1,31 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use crate::context::ServerContext; +use clickhouse_admin_api::*; +use dropshot::HttpError; +use dropshot::HttpResponseOk; +use dropshot::RequestContext; +use std::sync::Arc; + +type ClickhouseApiDescription = dropshot::ApiDescription<Arc<ServerContext>>; + +pub fn api() -> ClickhouseApiDescription { + clickhouse_admin_api_mod::api_description::<ClickhouseAdminImpl>() + .expect("registered entrypoints") +} + +enum ClickhouseAdminImpl {} + +impl ClickhouseAdminApi for ClickhouseAdminImpl { + type Context = Arc<ServerContext>; + + async fn clickhouse_address( + rqctx: RequestContext<Self::Context>, + ) -> Result<HttpResponseOk<ClickhouseAddress>, HttpError> { + let ctx = rqctx.context(); + let output = ctx.clickward().clickhouse_address()?; + Ok(HttpResponseOk(output)) + } +} diff --git a/clickhouse-admin/src/lib.rs b/clickhouse-admin/src/lib.rs new file mode 100644 index 0000000000..a48588c544 --- /dev/null +++ b/clickhouse-admin/src/lib.rs @@ -0,0 +1,70 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ +use context::ServerContext; +use omicron_common::FileKv; +use slog::{debug, error, Drain}; +use slog_dtrace::ProbeRegistration; +use slog_error_chain::SlogInlineError; +use std::error::Error; +use std::io; +use std::sync::Arc; + +mod clickward; +mod config; +mod context; +mod http_entrypoints; + +pub use clickward::Clickward; +pub use config::Config; + +#[derive(Debug, thiserror::Error, SlogInlineError)] +pub enum StartError { + #[error("failed to initialize logger")] + InitializeLogger(#[source] io::Error), + #[error("failed to register dtrace probes: {0}")] + RegisterDtraceProbes(String), + #[error("failed to initialize HTTP server")] + InitializeHttpServer(#[source] Box<dyn Error + Send + Sync>), +} + +pub type Server = dropshot::HttpServer<Arc<ServerContext>>; + +/// Start the dropshot server +pub async fn start_server( + clickward: Clickward, + server_config: Config, +) -> Result<Server, StartError> { + let (drain, registration) = slog_dtrace::with_drain( + server_config + .log + .to_logger("clickhouse-admin") + .map_err(StartError::InitializeLogger)?, + ); + let log = slog::Logger::root(drain.fuse(), slog::o!(FileKv)); + match registration { + ProbeRegistration::Success => { + debug!(log, "registered DTrace probes"); + } + ProbeRegistration::Failed(err) => { + let err = StartError::RegisterDtraceProbes(err); + error!(log, "failed to register DTrace probes"; &err); + return Err(err); + } + } + + let context = ServerContext::new( + clickward, + log.new(slog::o!("component" => "ServerContext")), + ); + let http_server_starter = dropshot::HttpServerStarter::new( + &server_config.dropshot, + http_entrypoints::api(), + Arc::new(context), + &log.new(slog::o!("component" => "dropshot")), + ) + .map_err(StartError::InitializeHttpServer)?; + + Ok(http_server_starter.start()) +} diff --git a/common/src/address.rs b/common/src/address.rs index 5ed5689289..ba1193c7f0 100644 --- a/common/src/address.rs +++ b/common/src/address.rs @@ -59,6 +59,7 @@ pub const COCKROACH_ADMIN_PORT: u16 = 32222; pub const CRUCIBLE_PORT: u16 = 32345; pub const CLICKHOUSE_PORT: u16 = 8123; pub const CLICKHOUSE_KEEPER_PORT: u16 = 9181; +pub const CLICKHOUSE_ADMIN_PORT: u16 = 8888; pub const OXIMETER_PORT: u16 = 12223; pub const DENDRITE_PORT: u16 = 12224; pub const LLDP_PORT: u16 = 12230; diff --git a/dev-tools/openapi-manager/Cargo.toml b/dev-tools/openapi-manager/Cargo.toml index 85d27aaafd..fe90737d9e 100644 --- a/dev-tools/openapi-manager/Cargo.toml +++ b/dev-tools/openapi-manager/Cargo.toml @@ -12,6 +12,7 @@ anyhow.workspace = true atomicwrites.workspace = true bootstrap-agent-api.workspace = true camino.workspace = true +clickhouse-admin-api.workspace = true cockroach-admin-api.workspace = true clap.workspace = true dns-server-api.workspace = true diff --git a/dev-tools/openapi-manager/src/spec.rs b/dev-tools/openapi-manager/src/spec.rs index 37a657ee93..29601a63d6 100644 --- a/dev-tools/openapi-manager/src/spec.rs +++ b/dev-tools/openapi-manager/src/spec.rs @@ -24,6 +24,17 @@ pub fn all_apis() -> Vec<ApiSpec> { filename: "bootstrap-agent.json", extra_validation: None, }, + ApiSpec { + title: "ClickHouse Cluster Admin API", + version: "0.0.1", + description: "API for interacting with the Oxide \ control plane's ClickHouse cluster", + boundary: ApiBoundary::Internal, + api_description: + clickhouse_admin_api::clickhouse_admin_api_mod::stub_api_description, + filename: "clickhouse-admin.json", + extra_validation: None, + }, ApiSpec { title: "CockroachDB Cluster Admin API", version: "0.0.1", diff --git a/openapi/clickhouse-admin.json b/openapi/clickhouse-admin.json new file mode
100644 index 0000000000..6bb5367712 --- /dev/null +++ b/openapi/clickhouse-admin.json @@ -0,0 +1,84 @@ +{ + "openapi": "3.0.3", + "info": { + "title": "ClickHouse Cluster Admin API", + "description": "API for interacting with the Oxide control plane's ClickHouse cluster", + "contact": { + "url": "https://oxide.computer", + "email": "api@oxide.computer" + }, + "version": "0.0.1" + }, + "paths": { + "/node/address": { + "get": { + "summary": "Retrieve the address the ClickHouse server or keeper node is listening on", + "operationId": "clickhouse_address", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ClickhouseAddress" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + } + }, + "components": { + "schemas": { + "ClickhouseAddress": { + "type": "object", + "properties": { + "clickhouse_address": { + "type": "string" + } + }, + "required": [ + "clickhouse_address" + ] + }, + "Error": { + "description": "Error information from a response.", + "type": "object", + "properties": { + "error_code": { + "type": "string" + }, + "message": { + "type": "string" + }, + "request_id": { + "type": "string" + } + }, + "required": [ + "message", + "request_id" + ] + } + }, + "responses": { + "Error": { + "description": "Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Error" + } + } + } + } + } + } +} diff --git a/package-manifest.toml b/package-manifest.toml index e6cd464404..9189ed09a0 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -153,6 +153,9 @@ source.type = "composite" source.packages = [ "clickhouse_svc.tar.gz", "internal-dns-cli.tar.gz", + # TODO: This package is solely for testing purposes. + # Remove once replicated clickhouse is up and running.
+ "omicron-clickhouse-admin.tar.gz", "zone-setup.tar.gz", "zone-network-install.tar.gz" ] @@ -179,6 +182,7 @@ source.type = "composite" source.packages = [ "clickhouse_keeper_svc.tar.gz", "internal-dns-cli.tar.gz", + "omicron-clickhouse-admin.tar.gz", "zone-setup.tar.gz", "zone-network-install.tar.gz" ] @@ -198,6 +202,18 @@ output.type = "zone" output.intermediate_only = true setup_hint = "Run `cargo xtask download clickhouse` to download the necessary binaries" +[package.omicron-clickhouse-admin] +service_name = "clickhouse-admin" +only_for_targets.image = "standard" +source.type = "local" +source.rust.binary_names = ["clickhouse-admin"] +source.rust.release = true +source.paths = [ + { from = "smf/clickhouse-admin", to = "/var/svc/manifest/site/clickhouse-admin" }, +] +output.type = "zone" +output.intermediate_only = true + [package.cockroachdb] service_name = "cockroachdb" only_for_targets.image = "standard" diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index b822ae2963..32cf844e6d 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -65,6 +65,7 @@ use nexus_config::{ConfigDropshotWithTls, DeploymentConfig}; use nexus_sled_agent_shared::inventory::{ OmicronZoneConfig, OmicronZoneType, OmicronZonesConfig, ZoneKind, }; +use omicron_common::address::CLICKHOUSE_ADMIN_PORT; use omicron_common::address::CLICKHOUSE_KEEPER_PORT; use omicron_common::address::CLICKHOUSE_PORT; use omicron_common::address::COCKROACH_PORT; @@ -1573,12 +1574,37 @@ impl ServiceManager { .add_property_group(config), ); + let ch_address = + SocketAddr::new(IpAddr::V6(listen_addr), CLICKHOUSE_PORT) + .to_string(); + + let admin_address = SocketAddr::new( + IpAddr::V6(listen_addr), + CLICKHOUSE_ADMIN_PORT, + ) + .to_string(); + + let clickhouse_admin_config = + PropertyGroupBuilder::new("config") + .add_property( + "clickhouse_address", + "astring", + ch_address, + ) + .add_property("http_address", "astring", admin_address); + let clickhouse_admin_service = + ServiceBuilder::new("oxide/clickhouse-admin").add_instance( + ServiceInstanceBuilder::new("default") + .add_property_group(clickhouse_admin_config), + ); + let profile = ProfileBuilder::new("omicron") .add_service(nw_setup_service) .add_service(disabled_ssh_service) .add_service(clickhouse_service) .add_service(dns_service) - .add_service(enabled_dns_client_service); + .add_service(enabled_dns_client_service) + .add_service(clickhouse_admin_service); profile .add_to_zone(&self.inner.log, &installed_zone) .await @@ -1644,12 +1670,38 @@ impl ServiceManager { ServiceInstanceBuilder::new("default") .add_property_group(config), ); + + let ch_address = + SocketAddr::new(IpAddr::V6(listen_addr), CLICKHOUSE_PORT) + .to_string(); + + let admin_address = SocketAddr::new( + IpAddr::V6(listen_addr), + CLICKHOUSE_ADMIN_PORT, + ) + .to_string(); + + let clickhouse_admin_config = + PropertyGroupBuilder::new("config") + .add_property( + "clickhouse_address", + "astring", + ch_address, + ) + .add_property("http_address", "astring", admin_address); + let clickhouse_admin_service = + ServiceBuilder::new("oxide/clickhouse-admin").add_instance( + ServiceInstanceBuilder::new("default") + .add_property_group(clickhouse_admin_config), + ); + let profile = ProfileBuilder::new("omicron") .add_service(nw_setup_service) .add_service(disabled_ssh_service) .add_service(clickhouse_keeper_service) .add_service(dns_service) - .add_service(enabled_dns_client_service); + .add_service(enabled_dns_client_service) + .add_service(clickhouse_admin_service); 
profile .add_to_zone(&self.inner.log, &installed_zone) .await diff --git a/smf/clickhouse-admin/config.toml b/smf/clickhouse-admin/config.toml new file mode 100644 index 0000000000..86ee2c5d4b --- /dev/null +++ b/smf/clickhouse-admin/config.toml @@ -0,0 +1,10 @@ +[dropshot] +# 1 MiB; we don't expect any requests of more than nominal size. +request_body_max_bytes = 1048576 + +[log] +# Show log messages of this level and more severe +level = "info" +mode = "file" +path = "/dev/stdout" +if_exists = "append" diff --git a/smf/clickhouse-admin/manifest.xml b/smf/clickhouse-admin/manifest.xml new file mode 100644 index 0000000000..435f8a86ac --- /dev/null +++ b/smf/clickhouse-admin/manifest.xml @@ -0,0 +1,45 @@ + <!-- 45 lines of SMF manifest XML defining the oxide/clickhouse-admin service; element markup not preserved in this capture --> From 63397cc07ab04ee33d9d2c9abb410a110ba238e1 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Wed, 14 Aug 2024 17:52:32 -0700 Subject: [PATCH 38/91] Update Rust crate indexmap to 2.4.0 (#6325) --- Cargo.lock | 38 +++++++++++++++++++------------------- Cargo.toml | 2 +- workspace-hack/Cargo.toml | 4 ++-- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 781785b8ec..ad0dc13987 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2228,7 +2228,7 @@ dependencies = [ "hostname 0.4.0", "http 0.2.12", "hyper 0.14.30", - "indexmap 2.3.0", + "indexmap 2.4.0", "multer", "openapiv3", "paste", @@ -3130,7 +3130,7 @@ dependencies = [ "debug-ignore", "fixedbitset", "guppy-workspace-hack", - "indexmap 2.3.0", + "indexmap 2.4.0", "itertools 0.13.0", "nested", "once_cell", @@ -3162,7 +3162,7 @@ dependencies = [ "futures-sink", "futures-util", "http 0.2.12", - "indexmap 2.3.0", + "indexmap 2.4.0", "slab", "tokio", "tokio-util", @@ -3785,9 +3785,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.3.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3fc2e30ba82dd1b3911c8de1ffc143c74a914a14e99514d7637e3099df5ea0" +checksum = "93ead53efc7ea8ed3cfb0c79fc8023fbb782a5432b52830b6518941cebe6505c" dependencies = [ "equivalent", "hashbrown 0.14.5", @@ -5186,7 +5186,7 @@ dependencies = [ "debug-ignore", "expectorate", "gateway-client", - "indexmap 2.3.0", + "indexmap 2.4.0", "internal-dns", "ipnet", "maplit", @@ -6380,7 +6380,7 @@ dependencies = [ "hex", "hmac", "hyper 0.14.30", - "indexmap 2.3.0", + "indexmap 2.4.0", "inout", "itertools 0.10.5", "itertools 0.12.1", @@ -6508,7 +6508,7 @@ version = "0.4.0" source = "git+https://github.com/oxidecomputer/openapi-lint?branch=main#ef442ee4343e97b6d9c217d3e7533962fe7d7236" dependencies = [ "heck 0.4.1", - "indexmap 2.3.0", + "indexmap 2.4.0", "lazy_static", "openapiv3", "regex", @@ -6550,7 +6550,7 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc02deea53ffe807708244e5914f6b099ad7015a207ee24317c22112e17d9c5c" dependencies = [ - "indexmap 2.3.0", + "indexmap 2.4.0", "serde", "serde_json", ] @@ -6822,7 +6822,7 @@ dependencies = [ "expectorate", "futures", "highway", - "indexmap 2.3.0", + "indexmap 2.4.0", "itertools 0.13.0", "num", "omicron-common", @@ -7311,7 +7311,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" dependencies = [ "fixedbitset", - "indexmap 2.3.0", + "indexmap 2.4.0", "serde", "serde_derive", ] @@ -7729,7 +7729,7 @@ dependencies = [ "getopts", "heck 0.5.0", "http 0.2.12", - "indexmap
2.3.0", + "indexmap 2.4.0", "openapiv3", "proc-macro2", "quote", @@ -8145,7 +8145,7 @@ dependencies = [ "dropshot", "expectorate", "humantime", - "indexmap 2.3.0", + "indexmap 2.4.0", "nexus-client", "nexus-db-queries", "nexus-reconfigurator-execution", @@ -9202,7 +9202,7 @@ dependencies = [ "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.3.0", + "indexmap 2.4.0", "serde", "serde_derive", "serde_json", @@ -9228,7 +9228,7 @@ version = "0.9.34+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" dependencies = [ - "indexmap 2.3.0", + "indexmap 2.4.0", "itoa", "ryu", "serde", @@ -10672,7 +10672,7 @@ version = "0.19.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421" dependencies = [ - "indexmap 2.3.0", + "indexmap 2.4.0", "serde", "serde_spanned", "toml_datetime", @@ -10685,7 +10685,7 @@ version = "0.22.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "583c44c02ad26b0c3f3066fe629275e50627026c51ac2e595cca4c230ce1ce1d" dependencies = [ - "indexmap 2.3.0", + "indexmap 2.4.0", "serde", "serde_spanned", "toml_datetime", @@ -11257,7 +11257,7 @@ dependencies = [ "derive-where", "either", "futures", - "indexmap 2.3.0", + "indexmap 2.4.0", "indicatif", "libsw", "linear-map", @@ -11655,7 +11655,7 @@ dependencies = [ "expectorate", "futures", "humantime", - "indexmap 2.3.0", + "indexmap 2.4.0", "indicatif", "itertools 0.13.0", "maplit", diff --git a/Cargo.toml b/Cargo.toml index b7cf6f6fd1..a87c88eeac 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -373,7 +373,7 @@ hyper-rustls = "0.26.0" hyper-staticfile = "0.9.5" illumos-utils = { path = "illumos-utils" } indent_write = "2.2.0" -indexmap = "2.3.0" +indexmap = "2.4.0" indicatif = { version = "0.17.8", features = ["rayon"] } installinator = { path = "installinator" } installinator-api = { path = "installinator-api" } diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 688e1a0921..2e0520d82d 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -62,7 +62,7 @@ hashbrown = { version = "0.14.5", features = ["raw"] } hex = { version = "0.4.3", features = ["serde"] } hmac = { version = "0.12.1", default-features = false, features = ["reset"] } hyper = { version = "0.14.30", features = ["full"] } -indexmap = { version = "2.3.0", features = ["serde"] } +indexmap = { version = "2.4.0", features = ["serde"] } inout = { version = "0.1.3", default-features = false, features = ["std"] } itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12.1" } itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10.5" } @@ -169,7 +169,7 @@ hashbrown = { version = "0.14.5", features = ["raw"] } hex = { version = "0.4.3", features = ["serde"] } hmac = { version = "0.12.1", default-features = false, features = ["reset"] } hyper = { version = "0.14.30", features = ["full"] } -indexmap = { version = "2.3.0", features = ["serde"] } +indexmap = { version = "2.4.0", features = ["serde"] } inout = { version = "0.1.3", default-features = false, features = ["std"] } itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12.1" } itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10.5" } From 2331f7d62e8b48e94ebfd8d3f05ba64e1276a123 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 14 Aug 2024 18:05:24 -0700 Subject: [PATCH 39/91] `omdb nexus background-tasks show` could 
support filtering (#6327) --- Cargo.lock | 1 + dev-tools/omdb/Cargo.toml | 1 + dev-tools/omdb/src/bin/omdb/nexus.rs | 71 ++++- dev-tools/omdb/tests/successes.out | 391 ++++++++++++++++++++++++ dev-tools/omdb/tests/test_all_output.rs | 14 + dev-tools/omdb/tests/usage_errors.out | 40 +++ 6 files changed, 511 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ad0dc13987..f38eece4d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6055,6 +6055,7 @@ dependencies = [ "indicatif", "internal-dns", "ipnetwork", + "itertools 0.13.0", "multimap", "nexus-client", "nexus-config", diff --git a/dev-tools/omdb/Cargo.toml b/dev-tools/omdb/Cargo.toml index 0990fdb11c..a92de1b6a9 100644 --- a/dev-tools/omdb/Cargo.toml +++ b/dev-tools/omdb/Cargo.toml @@ -28,6 +28,7 @@ gateway-messages.workspace = true gateway-test-utils.workspace = true humantime.workspace = true internal-dns.workspace = true +itertools.workspace = true nexus-client.workspace = true nexus-config.workspace = true nexus-db-model.workspace = true diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 1b6d2469f4..67a4180dd2 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -19,6 +19,7 @@ use clap::Subcommand; use clap::ValueEnum; use futures::future::try_join; use futures::TryStreamExt; +use itertools::Itertools; use nexus_client::types::ActivationReason; use nexus_client::types::BackgroundTask; use nexus_client::types::BackgroundTasksActivateRequest; @@ -46,6 +47,7 @@ use reedline::Reedline; use serde::Deserialize; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; +use std::collections::BTreeSet; use std::str::FromStr; use tabled::Tabled; use uuid::Uuid; @@ -93,11 +95,21 @@ enum BackgroundTasksCommands { /// Print a summary of the status of all background tasks List, /// Print human-readable summary of the status of each background task - Show, + Show(BackgroundTasksShowArgs), /// Activate one or more background tasks Activate(BackgroundTasksActivateArgs), } +#[derive(Debug, Args)] +struct BackgroundTasksShowArgs { + /// Names of background tasks to show (default: all) + /// + /// You can use any background task name here or one of the special strings + /// "all", "dns_external", or "dns_internal". + #[clap(value_name = "TASK_NAME")] + tasks: Vec<String>, +} + #[derive(Debug, Args)] struct BackgroundTasksActivateArgs { /// Name of the background tasks to activate @@ -361,8 +373,8 @@ impl NexusArgs { command: BackgroundTasksCommands::List, }) => cmd_nexus_background_tasks_list(&client).await, NexusCommands::BackgroundTasks(BackgroundTasksArgs { - command: BackgroundTasksCommands::Show, - }) => cmd_nexus_background_tasks_show(&client).await, + command: BackgroundTasksCommands::Show(args), + }) => cmd_nexus_background_tasks_show(&client, args).await, NexusCommands::BackgroundTasks(BackgroundTasksArgs { command: BackgroundTasksCommands::Activate(args), }) => { @@ -523,7 +535,9 @@ async fn cmd_nexus_background_tasks_list( ) -> Result<(), anyhow::Error> { let response = client.bgtask_list().await.context("listing background tasks")?; - let tasks = response.into_inner(); + // Convert the HashMap to a BTreeMap because we want the keys in sorted + // order.
+ let tasks = response.into_inner().into_iter().collect::<BTreeMap<_, _>>(); let table_rows = tasks.values().map(BackgroundTaskStatusRow::from); let table = tabled::Table::new(table_rows) .with(tabled::settings::Style::empty()) @@ -536,6 +550,7 @@ async fn cmd_nexus_background_tasks_list( /// Runs `omdb nexus background-tasks show` async fn cmd_nexus_background_tasks_show( client: &nexus_client::Client, + args: &BackgroundTasksShowArgs, ) -> Result<(), anyhow::Error> { let response = client.bgtask_list().await.context("listing background tasks")?; @@ -544,8 +559,50 @@ let mut tasks = response.into_inner().into_iter().collect::<BTreeMap<_, _>>(); - // We want to pick the order that we print some tasks intentionally. Then - // we want to print anything else that we find. + // Now, pick out the tasks that the user selected. + // + // The set of user tasks may include: + // + // - nothing at all, in which case we include all tasks + // - individual task names + // - certain groups that we recognize, like "dns_external" for all the tasks + // related to external DNS propagation. "all" means "all tasks". + let selected_set: BTreeSet<_> = + args.tasks.iter().map(AsRef::as_ref).collect(); + let selected_all = selected_set.is_empty() || selected_set.contains("all"); + if !selected_all { + for s in &selected_set { + if !tasks.contains_key(*s) + && *s != "all" + && *s != "dns_external" + && *s != "dns_internal" + { + bail!( + "unknown task name: {:?} (known task names: all, \ dns_external, dns_internal, {})", + s, + tasks.keys().join(", ") + ); + } + } + + tasks.retain(|k, _| { + selected_set.contains(k.as_str()) + || selected_set.contains("all") + || (selected_set.contains("dns_external") + && k.starts_with("dns_") + && k.ends_with("_external")) + || (selected_set.contains("dns_internal") + && k.starts_with("dns_") + && k.ends_with("_internal")) + }); + } + + // Some tasks should be grouped and printed together in a certain order, + // even though their names aren't alphabetical. Notably, the DNS tasks + // logically go from config -> servers -> propagation, so we want to print + // them in that order. So we pick these out first and then print anything + // else that we find in alphabetical order.
for name in [ "dns_config_internal", "dns_servers_internal", @@ -559,7 +616,7 @@ async fn cmd_nexus_background_tasks_show( ] { if let Some(bgtask) = tasks.remove(name) { print_task(&bgtask); - } else { + } else if selected_all { eprintln!("warning: expected to find background task {:?}", name); } } diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 166936da9c..19c555ec96 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -632,6 +632,397 @@ task: "vpc_route_manager" started at (s ago) and ran for ms warning: unknown background task: "vpc_route_manager" (don't know how to interpret details: Object {}) +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +============================================= +EXECUTING COMMAND: omdb ["nexus", "background-tasks", "show", "saga_recovery"] +termination: Exited(0) +--------------------------------------------- +stdout: +task: "saga_recovery" + configured period: every 10m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + since Nexus started: + sagas recovered: 0 + sagas recovery errors: 0 + sagas observed started: 0 + sagas inferred finished: 0 + missing from SEC: 0 + bad state in SEC: 0 + last pass: + found sagas: 0 (in-progress, assigned to this Nexus) + recovered: 0 (successfully) + failed: 0 + skipped: 0 (already running) + removed: 0 (newly finished) + no recovered sagas + no saga recovery failures + +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +============================================= +EXECUTING COMMAND: omdb ["nexus", "background-tasks", "show", "blueprint_loader", "blueprint_executor"] +termination: Exited(0) +--------------------------------------------- +stdout: +task: "blueprint_loader" + configured period: every 1m s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: failed to read target blueprint: Internal Error: no target blueprint set + +task: "blueprint_executor" + configured period: every 10m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: no blueprint + +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +============================================= +EXECUTING COMMAND: omdb ["nexus", "background-tasks", "show", "dns_internal"] +termination: Exited(0) +--------------------------------------------- +stdout: +task: "dns_config_internal" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + last generation found: 1 + +task: "dns_servers_internal" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + servers found: 1 + + DNS_SERVER_ADDR + [::1]:REDACTED_PORT + +task: "dns_propagation_internal" + configured period: every 1m + currently executing: no + last completed activation: , triggered by a dependent task completing + started at (s ago) and ran for ms + attempt to propagate generation: 1 + + DNS_SERVER_ADDR LAST_RESULT + [::1]:REDACTED_PORT success + + 
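The three dns_*_internal tasks above are exactly what the "dns_internal" group
expansion in cmd_nexus_background_tasks_show keeps. A minimal, self-contained
sketch of that selection rule (toy task map; filter_tasks is a hypothetical
name used only for this illustration, not part of the patch):

    use std::collections::{BTreeMap, BTreeSet};

    fn filter_tasks(
        tasks: &mut BTreeMap<String, String>,
        selected: &BTreeSet<&str>,
    ) {
        // An empty selection or an explicit "all" keeps every task.
        if selected.is_empty() || selected.contains("all") {
            return;
        }
        tasks.retain(|name, _| {
            selected.contains(name.as_str())
                || (selected.contains("dns_external")
                    && name.starts_with("dns_")
                    && name.ends_with("_external"))
                || (selected.contains("dns_internal")
                    && name.starts_with("dns_")
                    && name.ends_with("_internal"))
        });
    }

    fn main() {
        let mut tasks: BTreeMap<String, String> =
            ["dns_config_internal", "dns_servers_internal", "saga_recovery"]
                .into_iter()
                .map(|name| (name.to_string(), String::new()))
                .collect();
        let selected: BTreeSet<&str> = BTreeSet::from(["dns_internal"]);
        filter_tasks(&mut tasks, &selected);
        // Only dns_config_internal and dns_servers_internal remain.
        assert_eq!(tasks.len(), 2);
    }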
+--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +============================================= +EXECUTING COMMAND: omdb ["nexus", "background-tasks", "show", "dns_external"] +termination: Exited(0) +--------------------------------------------- +stdout: +task: "dns_config_external" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + last generation found: 2 + +task: "dns_servers_external" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + servers found: 1 + + DNS_SERVER_ADDR + [::1]:REDACTED_PORT + +task: "dns_propagation_external" + configured period: every 1m + currently executing: no + last completed activation: , triggered by a dependent task completing + started at (s ago) and ran for ms + attempt to propagate generation: 2 + + DNS_SERVER_ADDR LAST_RESULT + [::1]:REDACTED_PORT success + + +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +============================================= +EXECUTING COMMAND: omdb ["nexus", "background-tasks", "show", "all"] +termination: Exited(0) +--------------------------------------------- +stdout: +task: "dns_config_internal" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + last generation found: 1 + +task: "dns_servers_internal" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + servers found: 1 + + DNS_SERVER_ADDR + [::1]:REDACTED_PORT + +task: "dns_propagation_internal" + configured period: every 1m + currently executing: no + last completed activation: , triggered by a dependent task completing + started at (s ago) and ran for ms + attempt to propagate generation: 1 + + DNS_SERVER_ADDR LAST_RESULT + [::1]:REDACTED_PORT success + + +task: "dns_config_external" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + last generation found: 2 + +task: "dns_servers_external" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + servers found: 1 + + DNS_SERVER_ADDR + [::1]:REDACTED_PORT + +task: "dns_propagation_external" + configured period: every 1m + currently executing: no + last completed activation: , triggered by a dependent task completing + started at (s ago) and ran for ms + attempt to propagate generation: 2 + + DNS_SERVER_ADDR LAST_RESULT + [::1]:REDACTED_PORT success + + +task: "nat_v4_garbage_collector" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: failed to resolve addresses for Dendrite services: no record found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } + +task: "blueprint_loader" + configured period: every 1m s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: failed to read 
target blueprint: Internal Error: no target blueprint set + +task: "blueprint_executor" + configured period: every 10m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: no blueprint + +task: "abandoned_vmm_reaper" + configured period: every 1m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + total abandoned VMMs found: 0 + VMM records deleted: 0 + VMM records already deleted by another Nexus: 0 + sled resource reservations deleted: 0 + +task: "bfd_manager" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: failed to resolve addresses for Dendrite services: no record found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN } + +task: "crdb_node_id_collector" + configured period: every 10m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: no blueprint + +task: "decommissioned_disk_cleaner" + configured period: every 1m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms +warning: unknown background task: "decommissioned_disk_cleaner" (don't know how to interpret details: Object {"deleted": Number(0), "error": Null, "error_count": Number(0), "found": Number(0), "not_ready_to_be_deleted": Number(0)}) + +task: "external_endpoints" + configured period: every 1m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + external API endpoints: 2 ('*' below marks default) + + SILO_ID DNS_NAME + ..................... default-silo.sys.oxide-dev.test + * ..................... test-suite-silo.sys.oxide-dev.test + + warnings: 2 + warning: silo ..................... with DNS name "default-silo.sys.oxide-dev.test" has no usable certificates + warning: silo ..................... with DNS name "test-suite-silo.sys.oxide-dev.test" has no usable certificates + + TLS certificates: 0 + +task: "instance_updater" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + total instances in need of updates: 0 + instances with destroyed active VMMs: 0 + instances with terminated active migrations: 0 + update sagas started: 0 + update sagas completed successfully: 0 + +task: "instance_watcher" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + total instances checked: 0 + checks completed: 0 + successful checks: 0 + update sagas queued: 0 + failed checks: 0 + checks that could not be completed: 0 + stale instance metrics pruned: 0 + +task: "inventory_collection" + configured period: every 10m + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms + last collection id: ..................... 
+ last collection started: + last collection done: + +task: "lookup_region_port" + configured period: every 1m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + total filled in ports: 0 + errors: 0 + +task: "metrics_producer_gc" + configured period: every 1m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms +warning: unknown background task: "metrics_producer_gc" (don't know how to interpret details: Object {"expiration": String(""), "pruned": Array []}) + +task: "phantom_disks" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + number of phantom disks deleted: 0 + number of phantom disk delete errors: 0 + +task: "physical_disk_adoption" + configured period: every s + currently executing: no + last completed activation: , triggered by a dependent task completing + started at (s ago) and ran for ms + last completion reported error: task disabled + +task: "region_replacement" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + number of region replacements started ok: 0 + number of region replacement start errors: 0 + +task: "region_replacement_driver" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + number of region replacement drive sagas started ok: 0 + number of region replacement finish sagas started ok: 0 + number of errors: 0 + +task: "saga_recovery" + configured period: every 10m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + since Nexus started: + sagas recovered: 0 + sagas recovery errors: 0 + sagas observed started: 0 + sagas inferred finished: 0 + missing from SEC: 0 + bad state in SEC: 0 + last pass: + found sagas: 0 (in-progress, assigned to this Nexus) + recovered: 0 (successfully) + failed: 0 + skipped: 0 (already running) + removed: 0 (newly finished) + no recovered sagas + no saga recovery failures + +task: "service_firewall_rule_propagation" + configured period: every 5m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + +task: "service_zone_nat_tracker" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + last completion reported error: inventory collection is None + +task: "switch_port_config_manager" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms +warning: unknown background task: "switch_port_config_manager" (don't know how to interpret details: Object {}) + +task: "v2p_manager" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms +warning: unknown background task: "v2p_manager" (don't know how to interpret details: Object {}) + +task: "vpc_route_manager" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and 
ran for ms +warning: unknown background task: "vpc_route_manager" (don't know how to interpret details: Object {}) + --------------------------------------------- stderr: note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index d0258aeaed..45492c14ce 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -80,6 +80,7 @@ async fn test_omdb_usage_errors() { &["mgs"], &["nexus"], &["nexus", "background-tasks"], + &["nexus", "background-tasks", "show", "--help"], &["nexus", "blueprints"], &["nexus", "sagas"], // Missing "--destructive" flag. The URL is bogus but just ensures that @@ -144,6 +145,19 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { &["mgs", "inventory"], &["nexus", "background-tasks", "doc"], &["nexus", "background-tasks", "show"], + // background tasks: test picking out specific names + &["nexus", "background-tasks", "show", "saga_recovery"], + &[ + "nexus", + "background-tasks", + "show", + "blueprint_loader", + "blueprint_executor", + ], + // background tasks: test recognized group names + &["nexus", "background-tasks", "show", "dns_internal"], + &["nexus", "background-tasks", "show", "dns_external"], + &["nexus", "background-tasks", "show", "all"], &["nexus", "sagas", "list"], &["--destructive", "nexus", "sagas", "demo-create"], &["nexus", "sagas", "list"], diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 1ee07410bf..55781136b6 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -491,6 +491,46 @@ Connection Options: Safety Options: -w, --destructive Allow potentially-destructive subcommands ============================================= +EXECUTING COMMAND: omdb ["nexus", "background-tasks", "show", "--help"] +termination: Exited(0) +--------------------------------------------- +stdout: +Print human-readable summary of the status of each background task + +Usage: omdb nexus background-tasks show [OPTIONS] [TASK_NAME]... + +Arguments: + [TASK_NAME]... + Names of background tasks to show (default: all) + + You can use any background task name here or one of the special strings "all", + "dns_external", or "dns_internal". 
+ +Options: + --log-level + log level filter + + [env: LOG_LEVEL=] + [default: warn] + + -h, --help + Print help (see a summary with '-h') + +Connection Options: + --nexus-internal-url + URL of the Nexus internal API + + [env: OMDB_NEXUS_URL=] + + --dns-server + [env: OMDB_DNS_SERVER=] + +Safety Options: + -w, --destructive + Allow potentially-destructive subcommands +--------------------------------------------- +stderr: +============================================= EXECUTING COMMAND: omdb ["nexus", "blueprints"] termination: Exited(2) --------------------------------------------- From 912038c1f8d46e24405d86491db1e29166716029 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 14 Aug 2024 23:12:31 -0700 Subject: [PATCH 40/91] fix auto-mismerge of #6294 and #6327 (#6344) --- dev-tools/omdb/tests/successes.out | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 19c555ec96..db1cb1da61 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -968,6 +968,13 @@ task: "region_replacement_driver" number of region replacement finish sagas started ok: 0 number of errors: 0 +task: "region_snapshot_replacement_start" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms +warning: unknown background task: "region_snapshot_replacement_start" (don't know how to interpret details: Object {"errors": Array [], "requests_created_ok": Array [], "start_invoked_ok": Array []}) + task: "saga_recovery" configured period: every 10m currently executing: no From 13ae33bdb8c5a7e7345b53e79fa44d635eeffff0 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Thu, 15 Aug 2024 07:59:15 +0000 Subject: [PATCH 41/91] Update Rust crate serde_tokenstream to v0.2.2 (#6341) --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f38eece4d5..097d3a3aaa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9171,9 +9171,9 @@ dependencies = [ [[package]] name = "serde_tokenstream" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8790a7c3fe883e443eaa2af6f705952bc5d6e8671a220b9335c8cae92c037e74" +checksum = "64060d864397305347a78851c51588fd283767e7e7589829e8121d65512340f1" dependencies = [ "proc-macro2", "quote", From 22e4de36e218b0a76b39088e3ba5004a2031d7f7 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Thu, 15 Aug 2024 09:30:08 +0000 Subject: [PATCH 42/91] Update Rust crate serde_json to 1.0.125 (#6345) --- Cargo.lock | 224 ++++++++++++++------------------------ Cargo.toml | 2 +- workspace-hack/Cargo.toml | 4 +- 3 files changed, 86 insertions(+), 144 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 097d3a3aaa..b6cadd5e69 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -59,7 +59,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", - "getrandom 0.2.14", + "getrandom", "once_cell", "version_check", "zerocopy 0.7.34", @@ -376,10 +376,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b62ddb9cb1ec0a098ad4bbf9344d0713fa193ae1a80af55febcff2627b6a00c1" dependencies = [ "futures-core", - "getrandom 0.2.14", + 
"getrandom", "instant", "pin-project-lite", - "rand 0.8.5", + "rand", "tokio", ] @@ -683,7 +683,7 @@ dependencies = [ "omicron-workspace-hack", "pq-sys", "proptest", - "rand 0.8.5", + "rand", "secrecy", "serde", "serde_with", @@ -1609,7 +1609,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" dependencies = [ "generic-array", - "rand_core 0.6.4", + "rand_core", "subtle", "zeroize", ] @@ -1621,7 +1621,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" dependencies = [ "generic-array", - "rand_core 0.6.4", + "rand_core", "typenum", ] @@ -1676,7 +1676,7 @@ dependencies = [ "curve25519-dalek-derive", "digest", "fiat-crypto", - "rand_core 0.6.4", + "rand_core", "rustc_version 0.4.0", "subtle", "zeroize", @@ -1934,7 +1934,7 @@ dependencies = [ "dhcproto-macros", "hex", "ipnet", - "rand 0.8.5", + "rand", "thiserror", "trust-dns-proto", "url", @@ -2198,7 +2198,7 @@ dependencies = [ "progenitor", "progenitor-client", "quote", - "rand 0.8.5", + "rand", "regress", "reqwest", "rustfmt-wrapper", @@ -2332,7 +2332,7 @@ checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871" dependencies = [ "curve25519-dalek", "ed25519", - "rand_core 0.6.4", + "rand_core", "serde", "sha2", "subtle", @@ -2360,7 +2360,7 @@ dependencies = [ "hkdf", "pem-rfc7468", "pkcs8", - "rand_core 0.6.4", + "rand_core", "sec1", "subtle", "zeroize", @@ -2418,7 +2418,7 @@ dependencies = [ "omicron-test-utils", "omicron-workspace-hack", "oxide-client", - "rand 0.8.5", + "rand", "reqwest", "russh", "russh-keys", @@ -2589,7 +2589,7 @@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ded41244b729663b1e574f1b4fb731469f69f79c17667b5d776b16cda0479449" dependencies = [ - "rand_core 0.6.4", + "rand_core", "subtle", ] @@ -2905,7 +2905,7 @@ dependencies = [ "gateway-messages", "omicron-workspace-hack", "progenitor", - "rand 0.8.5", + "rand", "reqwest", "schemars", "serde", @@ -3023,17 +3023,6 @@ dependencies = [ "unicode-width", ] -[[package]] -name = "getrandom" -version = "0.1.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" -dependencies = [ - "cfg-if", - "libc", - "wasi 0.9.0+wasi-snapshot-preview1", -] - [[package]] name = "getrandom" version = "0.2.14" @@ -3043,7 +3032,7 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", "wasm-bindgen", ] @@ -3113,7 +3102,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63" dependencies = [ "ff", - "rand_core 0.6.4", + "rand_core", "subtle", ] @@ -3603,7 +3592,7 @@ dependencies = [ "hyper 0.14.30", "mime_guess", "percent-encoding", - "rand 0.8.5", + "rand", "tokio", "url", "winapi", @@ -4254,7 +4243,7 @@ dependencies = [ "portpicker", "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=6dceb9ef69c217cb78a2018bbedafbc19f6ec1af)", "propolis-server-config", - "rand 0.8.5", + "rand", "regex", "reqwest", "ron 0.7.1", @@ -4669,7 +4658,7 @@ checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "log", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", "windows-sys 0.48.0", ] @@ -4682,7 +4671,7 @@ dependencies = [ "hermit-abi 0.3.9", "libc", "log", - 
"wasi 0.11.0+wasi-snapshot-preview1", + "wasi", "windows-sys 0.52.0", ] @@ -4744,7 +4733,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3" dependencies = [ - "getrandom 0.2.14", + "getrandom", ] [[package]] @@ -4932,7 +4921,7 @@ dependencies = [ "oxnet", "parse-display", "pq-sys", - "rand 0.8.5", + "rand", "ref-cast", "schemars", "semver 1.0.23", @@ -5001,7 +4990,7 @@ dependencies = [ "pq-sys", "predicates", "pretty_assertions", - "rand 0.8.5", + "rand", "rcgen", "ref-cast", "regex", @@ -5035,7 +5024,7 @@ dependencies = [ "omicron-workspace-hack", "once_cell", "oxnet", - "rand 0.8.5", + "rand", "serde_json", ] @@ -5200,7 +5189,7 @@ dependencies = [ "omicron-workspace-hack", "oxnet", "proptest", - "rand 0.8.5", + "rand", "sled-agent-client", "slog", "static_assertions", @@ -5466,7 +5455,7 @@ checksum = "c165a9ab64cf766f73521c0dd2cfdff64f488b8f0b3e621face3462d3db536d7" dependencies = [ "num-integer", "num-traits", - "rand 0.8.5", + "rand", ] [[package]] @@ -5481,7 +5470,7 @@ dependencies = [ "num-integer", "num-iter", "num-traits", - "rand 0.8.5", + "rand", "serde", "smallvec 1.13.2", "zeroize", @@ -5771,7 +5760,7 @@ dependencies = [ "progenitor", "progenitor-client", "proptest", - "rand 0.8.5", + "rand", "regress", "reqwest", "schemars", @@ -5989,7 +5978,7 @@ dependencies = [ "pretty_assertions", "progenitor-client", "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=24a74d0c76b6a63961ecef76acb1516b6e66c5c9)", - "rand 0.8.5", + "rand", "rcgen", "ref-cast", "regex", @@ -6136,7 +6125,7 @@ dependencies = [ "clap", "criterion", "omicron-workspace-hack", - "rand 0.8.5", + "rand", "rust-argon2", "schemars", "serde", @@ -6245,7 +6234,7 @@ dependencies = [ "pretty_assertions", "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=24a74d0c76b6a63961ecef76acb1516b6e66c5c9)", "propolis-mock-server", - "rand 0.8.5", + "rand", "rcgen", "reqwest", "schemars", @@ -6375,7 +6364,7 @@ dependencies = [ "futures-util", "gateway-messages", "generic-array", - "getrandom 0.2.14", + "getrandom", "group", "hashbrown 0.14.5", "hex", @@ -6692,7 +6681,7 @@ dependencies = [ "hyper 0.14.30", "omicron-workspace-hack", "progenitor", - "rand 0.8.5", + "rand", "regress", "reqwest", "serde", @@ -6787,7 +6776,7 @@ dependencies = [ "oximeter-api", "oximeter-client", "oximeter-db", - "rand 0.8.5", + "rand", "reqwest", "schemars", "serde", @@ -6870,7 +6859,7 @@ dependencies = [ "prettyplease", "proc-macro2", "quote", - "rand 0.8.5", + "rand", "rand_distr", "regex", "rstest", @@ -6900,7 +6889,7 @@ dependencies = [ "libc", "omicron-workspace-hack", "oximeter", - "rand 0.8.5", + "rand", "schemars", "serde", "slog", @@ -7015,7 +7004,7 @@ dependencies = [ "ecdsa", "elliptic-curve", "primeorder", - "rand_core 0.6.4", + "rand_core", "sha2", ] @@ -7150,7 +7139,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700" dependencies = [ "base64ct", - "rand_core 0.6.4", + "rand_core", "subtle", ] @@ -7161,7 +7150,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "346f04948ba92c43e8469c1ee6736c7563d71012b17d40745260fe106aac2166" dependencies = [ "base64ct", - "rand_core 0.6.4", + "rand_core", "subtle", ] @@ -7410,7 +7399,7 @@ checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" dependencies = [ "der", "pkcs5", - "rand_core 0.6.4", + 
"rand_core", "spki", ] @@ -7510,7 +7499,7 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be97d76faf1bfab666e1375477b23fde79eccf0276e9b63b92a39d676a889ba9" dependencies = [ - "rand 0.8.5", + "rand", ] [[package]] @@ -7537,7 +7526,7 @@ dependencies = [ "hmac", "md-5", "memchr", - "rand 0.8.5", + "rand", "sha2", "stringprep", ] @@ -7800,7 +7789,7 @@ dependencies = [ "base64 0.21.7", "futures", "progenitor", - "rand 0.8.5", + "rand", "reqwest", "schemars", "serde", @@ -7821,7 +7810,7 @@ dependencies = [ "base64 0.21.7", "futures", "progenitor", - "rand 0.8.5", + "rand", "reqwest", "schemars", "serde", @@ -7847,7 +7836,7 @@ dependencies = [ "hyper 0.14.30", "progenitor", "propolis_types 0.0.0 (git+https://github.com/oxidecomputer/propolis?rev=24a74d0c76b6a63961ecef76acb1516b6e66c5c9)", - "rand 0.8.5", + "rand", "reqwest", "schemars", "serde", @@ -7904,8 +7893,8 @@ dependencies = [ "bitflags 2.6.0", "lazy_static", "num-traits", - "rand 0.8.5", - "rand_chacha 0.3.1", + "rand", + "rand_chacha", "rand_xorshift", "regex-syntax 0.8.4", "rusty-fork", @@ -7981,19 +7970,6 @@ dependencies = [ "nibble_vec", ] -[[package]] -name = "rand" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" -dependencies = [ - "getrandom 0.1.16", - "libc", - "rand_chacha 0.2.2", - "rand_core 0.5.1", - "rand_hc", -] - [[package]] name = "rand" version = "0.8.5" @@ -8001,18 +7977,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha 0.3.1", - "rand_core 0.6.4", -] - -[[package]] -name = "rand_chacha" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" -dependencies = [ - "ppv-lite86", - "rand_core 0.5.1", + "rand_chacha", + "rand_core", ] [[package]] @@ -8022,16 +7988,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core 0.6.4", -] - -[[package]] -name = "rand_core" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" -dependencies = [ - "getrandom 0.1.16", + "rand_core", ] [[package]] @@ -8040,7 +7997,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom 0.2.14", + "getrandom", ] [[package]] @@ -8050,16 +8007,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" dependencies = [ "num-traits", - "rand 0.8.5", -] - -[[package]] -name = "rand_hc" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" -dependencies = [ - "rand_core 0.5.1", + "rand", ] [[package]] @@ -8068,7 +8016,7 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a9febe641d2842ffc76ee962668a17578767c4e01735e4802b21ed9a24b2e4e" dependencies = [ - "rand_core 0.6.4", + "rand_core", ] [[package]] @@ -8077,7 +8025,7 @@ version = 
"0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" dependencies = [ - "rand_core 0.6.4", + "rand_core", ] [[package]] @@ -8208,7 +8156,7 @@ version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891" dependencies = [ - "getrandom 0.2.14", + "getrandom", "libredox", "thiserror", ] @@ -8416,7 +8364,7 @@ checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" dependencies = [ "cc", "cfg-if", - "getrandom 0.2.14", + "getrandom", "libc", "spin 0.9.8", "untrusted 0.9.0", @@ -8470,7 +8418,7 @@ dependencies = [ "num-traits", "pkcs1", "pkcs8", - "rand_core 0.6.4", + "rand_core", "serde", "sha2", "signature", @@ -8561,8 +8509,8 @@ dependencies = [ "p384", "p521", "poly1305", - "rand 0.8.5", - "rand_core 0.6.4", + "rand", + "rand_core", "russh-cryptovec", "russh-keys", "sha1", @@ -8617,8 +8565,8 @@ dependencies = [ "pkcs1", "pkcs5", "pkcs8", - "rand 0.8.5", - "rand_core 0.6.4", + "rand", + "rand_core", "rsa", "russh-cryptovec", "sec1", @@ -8880,7 +8828,7 @@ dependencies = [ "openssl-sys", "pkg-config", "quick-xml", - "rand 0.8.5", + "rand", "serde", "thiserror", "url", @@ -9120,9 +9068,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.124" +version = "1.0.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66ad62847a56b3dba58cc891acd13884b9c61138d330c0d7b6181713d4fce38d" +checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed" dependencies = [ "itoa", "memchr", @@ -9330,7 +9278,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" dependencies = [ "digest", - "rand_core 0.6.4", + "rand_core", ] [[package]] @@ -9481,7 +9429,7 @@ dependencies = [ "omicron-test-utils", "omicron-uuid-kinds", "omicron-workspace-hack", - "rand 0.8.5", + "rand", "schemars", "serde", "sled-hardware-types", @@ -9525,7 +9473,7 @@ dependencies = [ "omicron-test-utils", "omicron-uuid-kinds", "omicron-workspace-hack", - "rand 0.8.5", + "rand", "schemars", "serde", "serde_json", @@ -9879,7 +9827,7 @@ dependencies = [ "p256", "p384", "p521", - "rand_core 0.6.4", + "rand_core", "rsa", "sec1", "sha2", @@ -10549,7 +10497,7 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "postgres-types", - "rand 0.8.5", + "rand", "socket2 0.5.7", "tokio", "tokio-util", @@ -10818,7 +10766,7 @@ dependencies = [ "futures-util", "lazy_static", "radix_trie", - "rand 0.8.5", + "rand", "thiserror", "time", "tokio", @@ -10842,7 +10790,7 @@ dependencies = [ "idna 0.2.3", "ipnet", "lazy_static", - "rand 0.8.5", + "rand", "smallvec 1.13.2", "thiserror", "tinyvec", @@ -10961,7 +10909,7 @@ dependencies = [ "omicron-test-utils", "omicron-workspace-hack", "parse-size", - "rand 0.8.5", + "rand", "ring 0.17.8", "serde", "serde_json", @@ -10998,7 +10946,7 @@ dependencies = [ "http 0.2.12", "httparse", "log", - "rand 0.8.5", + "rand", "sha1", "thiserror", "url", @@ -11017,7 +10965,7 @@ dependencies = [ "http 1.1.0", "httparse", "log", - "rand 0.8.5", + "rand", "sha1", "thiserror", "url", @@ -11031,7 +10979,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" dependencies = [ "cfg-if", - "rand 0.7.3", + "rand", "static_assertions", ] @@ -11047,8 +10995,8 @@ version = "0.1.0" 
dependencies = [ "newtype-uuid", "omicron-workspace-hack", - "rand 0.8.5", - "rand_core 0.6.4", + "rand", + "rand_core", "rand_seeder", "uuid", ] @@ -11232,7 +11180,7 @@ dependencies = [ "omicron-common", "omicron-test-utils", "omicron-workspace-hack", - "rand 0.8.5", + "rand", "sha2", "slog", "thiserror", @@ -11372,7 +11320,7 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" dependencies = [ - "getrandom 0.2.14", + "getrandom", "serde", ] @@ -11440,9 +11388,9 @@ dependencies = [ "curve25519-dalek", "elliptic-curve", "hex", - "rand 0.8.5", - "rand_chacha 0.3.1", - "rand_core 0.6.4", + "rand", + "rand_chacha", + "rand_core", "serde", "subtle", "thiserror-no-std", @@ -11506,12 +11454,6 @@ dependencies = [ "try-lock", ] -[[package]] -name = "wasi" -version = "0.9.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -11787,7 +11729,7 @@ dependencies = [ "openapi-lint", "openapiv3", "oxnet", - "rand 0.8.5", + "rand", "reqwest", "schemars", "serde", diff --git a/Cargo.toml b/Cargo.toml index a87c88eeac..a1ae9858ab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -507,7 +507,7 @@ secrecy = "0.8.0" semver = { version = "1.0.23", features = ["std", "serde"] } serde = { version = "1.0", default-features = false, features = [ "derive", "rc" ] } serde_human_bytes = { git = "https://github.com/oxidecomputer/serde_human_bytes", branch = "main" } -serde_json = "1.0.124" +serde_json = "1.0.125" serde_path_to_error = "0.1.16" serde_tokenstream = "0.2" serde_urlencoded = "0.7.1" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 2e0520d82d..35c266cdf3 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -95,7 +95,7 @@ schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } scopeguard = { version = "1.2.0" } semver = { version = "1.0.23", features = ["serde"] } serde = { version = "1.0.207", features = ["alloc", "derive", "rc"] } -serde_json = { version = "1.0.124", features = ["raw_value", "unbounded_depth"] } +serde_json = { version = "1.0.125", features = ["raw_value", "unbounded_depth"] } sha1 = { version = "0.10.6", features = ["oid"] } sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } @@ -202,7 +202,7 @@ schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } scopeguard = { version = "1.2.0" } semver = { version = "1.0.23", features = ["serde"] } serde = { version = "1.0.207", features = ["alloc", "derive", "rc"] } -serde_json = { version = "1.0.124", features = ["raw_value", "unbounded_depth"] } +serde_json = { version = "1.0.125", features = ["raw_value", "unbounded_depth"] } sha1 = { version = "0.10.6", features = ["oid"] } sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } From 8b3e948dd8afab54d5fe0c821beb4af5a83b7701 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Thu, 15 Aug 2024 06:38:01 -0400 Subject: [PATCH 43/91] SP bump to pick up dumping fixes (#6338) --- tools/permslip_staging | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/permslip_staging b/tools/permslip_staging index d886cc4246..d2ddc45f20 100644 --- a/tools/permslip_staging +++ 
b/tools/permslip_staging @@ -1,5 +1,5 @@ -34cf117633f82cc8f665dc3b6c78dc2aff61ca87d2b2687290605080265dda30 manifest-gimlet-v1.0.23.toml +6ea87b554882860f1a9b1cf97b2f4a9c61fadf3d69e6ea1bdcd781d306d6ca9c manifest-gimlet-v1.0.24.toml 85553dd164933a9b9e4f22409abd1190b1d632d192b5f7527129acaa778a671a manifest-oxide-rot-1-v1.0.13.toml -db995edfe91959df3cb20ea8156f75b9dcff5ec5e77f98a28766617a8ed2e0c5 manifest-psc-v1.0.22.toml -26b6096a377edb3d7da50b1b499af104e6195bc7c7c6eb1b2751b32434d7ac9e manifest-sidecar-v1.0.23.toml +11bc0684155119f494a6e21810e4dc97b9efadb8154d570f67143dae98a45060 manifest-psc-v1.0.23.toml +60205852109f1584d29e2b086eae5a72d7f61b2e1f64d958e6326312ed2b0d66 manifest-sidecar-v1.0.24.toml c0fecaefac7674138337f3bd4ce4ce5b884053dead5ec27b575701471631ea2f manifest-bootleby-v1.3.0.toml From b50fe3a792e0a1ef5f215acf7a6a644fbda8aff2 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 15 Aug 2024 12:28:45 -0400 Subject: [PATCH 44/91] [#5333 4/6] Region snapshot replacement GC (#6330) This commit adds a "snapshot replacement garbage collect" background task that will scan for region snapshot replacement requests in the ReplacementDone state and trigger a saga to delete the old snapshot volume populated in the snapshot replacement start saga. The "garbage collect" saga added here does nothing except change the state of the snapshot replacement request and invoke a volume delete sub-saga. At the end of this saga the record's state is changed to Running. --- dev-tools/omdb/src/bin/omdb/nexus.rs | 26 ++ dev-tools/omdb/tests/env.out | 12 + dev-tools/omdb/tests/successes.out | 20 ++ nexus-config/src/nexus_config.rs | 17 + nexus/examples/config-second.toml | 1 + nexus/examples/config.toml | 1 + nexus/src/app/background/init.rs | 23 +- nexus/src/app/background/tasks/mod.rs | 1 + ...on_snapshot_replacement_garbage_collect.rs | 265 ++++++++++++++ nexus/src/app/sagas/mod.rs | 4 + ...on_snapshot_replacement_garbage_collect.rs | 326 ++++++++++++++++++ nexus/tests/config.test.toml | 1 + nexus/types/src/internal_api/background.rs | 8 + smf/nexus/multi-sled/config-partial.toml | 1 + smf/nexus/single-sled/config-partial.toml | 1 + 15 files changed, 706 insertions(+), 1 deletion(-) create mode 100644 nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs create mode 100644 nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 67a4180dd2..0828cef892 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -34,6 +34,7 @@ use nexus_saga_recovery::LastPass; use nexus_types::deployment::Blueprint; use nexus_types::internal_api::background::LookupRegionPortStatus; use nexus_types::internal_api::background::RegionReplacementDriverStatus; +use nexus_types::internal_api::background::RegionSnapshotReplacementGarbageCollectStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus; use nexus_types::inventory::BaseboardId; use omicron_uuid_kinds::CollectionUuid; @@ -1478,6 +1479,31 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { println!(" > {line}"); } + println!(" errors: {}", status.errors.len()); + for line in &status.errors { + println!(" > {line}"); + } + } + } + } else if name == "region_snapshot_replacement_garbage_collection" { + match serde_json::from_value::< + RegionSnapshotReplacementGarbageCollectStatus, + >(details.clone()) + { + Err(error) => eprintln!( + "warning: 
failed to interpret task details: {:?}: {:?}", + error, details + ), + + Ok(status) => { + println!( + " total garbage collections requested: {}", + status.garbage_collect_requested.len(), + ); + for line in &status.garbage_collect_requested { + println!(" > {line}"); + } + println!(" errors: {}", status.errors.len()); for line in &status.errors { println!(" > {line}"); diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 59d9310b57..ec407cd123 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -127,6 +127,10 @@ task: "region_replacement_driver" drive region replacements forward to completion +task: "region_snapshot_replacement_garbage_collection" + clean up all region snapshot replacement step volumes + + task: "region_snapshot_replacement_start" detect if region snapshots need replacement and begin the process @@ -280,6 +284,10 @@ task: "region_replacement_driver" drive region replacements forward to completion +task: "region_snapshot_replacement_garbage_collection" + clean up all region snapshot replacement step volumes + + task: "region_snapshot_replacement_start" detect if region snapshots need replacement and begin the process @@ -420,6 +428,10 @@ task: "region_replacement_driver" drive region replacements forward to completion +task: "region_snapshot_replacement_garbage_collection" + clean up all region snapshot replacement step volumes + + task: "region_snapshot_replacement_start" detect if region snapshots need replacement and begin the process diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index db1cb1da61..41c5a15a1c 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -328,6 +328,10 @@ task: "region_replacement_driver" drive region replacements forward to completion +task: "region_snapshot_replacement_garbage_collection" + clean up all region snapshot replacement step volumes + + task: "region_snapshot_replacement_start" detect if region snapshots need replacement and begin the process @@ -570,6 +574,14 @@ task: "region_replacement_driver" number of region replacement finish sagas started ok: 0 number of errors: 0 +task: "region_snapshot_replacement_garbage_collection" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + total garbage collections requested: 0 + errors: 0 + task: "region_snapshot_replacement_start" configured period: every s currently executing: no @@ -968,6 +980,14 @@ task: "region_replacement_driver" number of region replacement finish sagas started ok: 0 number of errors: 0 +task: "region_snapshot_replacement_garbage_collection" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + total garbage collections requested: 0 + errors: 0 + task: "region_snapshot_replacement_start" configured period: every s currently executing: no diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index b222ebd23b..f6e60bb558 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -393,6 +393,9 @@ pub struct BackgroundTaskConfig { pub lookup_region_port: LookupRegionPortConfig, /// configuration for region snapshot replacement starter task pub region_snapshot_replacement_start: RegionSnapshotReplacementStartConfig, + /// configuration for region snapshot replacement garbage 
collection + pub region_snapshot_replacement_garbage_collection: + RegionSnapshotReplacementGarbageCollectionConfig, } #[serde_as] @@ -637,6 +640,14 @@ pub struct RegionSnapshotReplacementStartConfig { pub period_secs: Duration, } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct RegionSnapshotReplacementGarbageCollectionConfig { + /// period (in seconds) for periodic activations of this background task + #[serde_as(as = "DurationSeconds")] + pub period_secs: Duration, +} + /// Configuration for a nexus server #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct PackageConfig { @@ -885,6 +896,7 @@ mod test { saga_recovery.period_secs = 60 lookup_region_port.period_secs = 60 region_snapshot_replacement_start.period_secs = 30 + region_snapshot_replacement_garbage_collection.period_secs = 30 [default_region_allocation_strategy] type = "random" seed = 0 @@ -1051,6 +1063,10 @@ mod test { RegionSnapshotReplacementStartConfig { period_secs: Duration::from_secs(30), }, + region_snapshot_replacement_garbage_collection: + RegionSnapshotReplacementGarbageCollectionConfig { + period_secs: Duration::from_secs(30), + }, }, default_region_allocation_strategy: crate::nexus_config::RegionAllocationStrategy::Random { @@ -1128,6 +1144,7 @@ mod test { saga_recovery.period_secs = 60 lookup_region_port.period_secs = 60 region_snapshot_replacement_start.period_secs = 30 + region_snapshot_replacement_garbage_collection.period_secs = 30 [default_region_allocation_strategy] type = "random" "##, diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index 572de807d7..c87e1255b5 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -140,6 +140,7 @@ abandoned_vmm_reaper.period_secs = 60 saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 region_snapshot_replacement_start.period_secs = 30 +region_snapshot_replacement_garbage_collection.period_secs = 30 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index 3aebe35152..f844adccbe 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -126,6 +126,7 @@ abandoned_vmm_reaper.period_secs = 60 saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 region_snapshot_replacement_start.period_secs = 30 +region_snapshot_replacement_garbage_collection.period_secs = 30 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index cc42a8f302..6bd805a491 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -108,6 +108,7 @@ use super::tasks::phantom_disks; use super::tasks::physical_disk_adoption; use super::tasks::region_replacement; use super::tasks::region_replacement_driver; +use super::tasks::region_snapshot_replacement_garbage_collect::*; use super::tasks::region_snapshot_replacement_start::*; use super::tasks::saga_recovery; use super::tasks::service_firewall_rules; @@ -163,6 +164,7 @@ pub struct BackgroundTasks { pub task_saga_recovery: Activator, pub task_lookup_region_port: Activator, pub task_region_snapshot_replacement_start: Activator, + pub task_region_snapshot_replacement_garbage_collection: Activator, // Handles to activate background tasks that do not get used by Nexus // at-large. 
These background tasks are implementation details as far as @@ -245,6 +247,8 @@ impl BackgroundTasksInitializer { task_saga_recovery: Activator::new(), task_lookup_region_port: Activator::new(), task_region_snapshot_replacement_start: Activator::new(), + task_region_snapshot_replacement_garbage_collection: Activator::new( + ), task_internal_dns_propagation: Activator::new(), task_external_dns_propagation: Activator::new(), @@ -307,6 +311,7 @@ impl BackgroundTasksInitializer { task_saga_recovery, task_lookup_region_port, task_region_snapshot_replacement_start, + task_region_snapshot_replacement_garbage_collection, // Add new background tasks here. Be sure to use this binding in a // call to `Driver::register()` below. That's what actually wires // up the Activator to the corresponding background task. @@ -739,7 +744,7 @@ impl BackgroundTasksInitializer { process", period: config.region_snapshot_replacement_start.period_secs, task_impl: Box::new(RegionSnapshotReplacementDetector::new( - datastore, + datastore.clone(), sagas.clone(), )), opctx: opctx.child(BTreeMap::new()), @@ -747,6 +752,22 @@ impl BackgroundTasksInitializer { activator: task_region_snapshot_replacement_start, }); + driver.register(TaskDefinition { + name: "region_snapshot_replacement_garbage_collection", + description: + "clean up all region snapshot replacement step volumes", + period: config + .region_snapshot_replacement_garbage_collection + .period_secs, + task_impl: Box::new(RegionSnapshotReplacementGarbageCollect::new( + datastore, + sagas.clone(), + )), + opctx: opctx.child(BTreeMap::new()), + watchers: vec![], + activator: task_region_snapshot_replacement_garbage_collection, + }); + driver } } diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index b0281afd9f..7ba68d0b80 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -25,6 +25,7 @@ pub mod phantom_disks; pub mod physical_disk_adoption; pub mod region_replacement; pub mod region_replacement_driver; +pub mod region_snapshot_replacement_garbage_collect; pub mod region_snapshot_replacement_start; pub mod saga_recovery; pub mod service_firewall_rules; diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs new file mode 100644 index 0000000000..4c66c166ff --- /dev/null +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs @@ -0,0 +1,265 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Background task for deleting volumes that stash a replaced region snapshot
+
+use crate::app::authn;
+use crate::app::background::BackgroundTask;
+use crate::app::saga::StartSaga;
+use crate::app::sagas;
+use crate::app::sagas::region_snapshot_replacement_garbage_collect::*;
+use crate::app::sagas::NexusSaga;
+use futures::future::BoxFuture;
+use futures::FutureExt;
+use nexus_db_model::RegionSnapshotReplacement;
+use nexus_db_queries::context::OpContext;
+use nexus_db_queries::db::DataStore;
+use nexus_types::internal_api::background::RegionSnapshotReplacementGarbageCollectStatus;
+use serde_json::json;
+use std::sync::Arc;
+
+pub struct RegionSnapshotReplacementGarbageCollect {
+    datastore: Arc<DataStore>,
+    sagas: Arc<dyn StartSaga>,
+}
+
+impl RegionSnapshotReplacementGarbageCollect {
+    pub fn new(datastore: Arc<DataStore>, sagas: Arc<dyn StartSaga>) -> Self {
+        RegionSnapshotReplacementGarbageCollect { datastore, sagas }
+    }
+
+    async fn send_garbage_collect_request(
+        &self,
+        opctx: &OpContext,
+        request: RegionSnapshotReplacement,
+    ) -> Result<(), omicron_common::api::external::Error> {
+        let Some(old_snapshot_volume_id) = request.old_snapshot_volume_id
+        else {
+            // This state is illegal!
+            let s = format!(
+                "request {} old snapshot volume id is None!",
+                request.id,
+            );
+
+            return Err(omicron_common::api::external::Error::internal_error(
+                &s,
+            ));
+        };
+
+        let params =
+            sagas::region_snapshot_replacement_garbage_collect::Params {
+                serialized_authn: authn::saga::Serialized::for_opctx(opctx),
+                old_snapshot_volume_id,
+                request,
+            };
+
+        let saga_dag =
+            SagaRegionSnapshotReplacementGarbageCollect::prepare(&params)?;
+        self.sagas.saga_start(saga_dag).await
+    }
+
+    async fn clean_up_region_snapshot_replacement_volumes(
+        &self,
+        opctx: &OpContext,
+        status: &mut RegionSnapshotReplacementGarbageCollectStatus,
+    ) {
+        let log = &opctx.log;
+
+        let requests = match self
+            .datastore
+            .get_replacement_done_region_snapshot_replacements(opctx)
+            .await
+        {
+            Ok(requests) => requests,
+
+            Err(e) => {
+                let s = format!("querying for requests to collect failed!
{e}"); + error!(&log, "{s}"); + status.errors.push(s); + return; + } + }; + + for request in requests { + let request_id = request.id; + + let result = + self.send_garbage_collect_request(opctx, request.clone()).await; + + match result { + Ok(()) => { + let s = format!( + "region snapshot replacement garbage collect request \ + ok for {request_id}" + ); + + info!( + &log, + "{s}"; + "request.snapshot_id" => %request.old_snapshot_id, + "request.region_id" => %request.old_region_id, + "request.dataset_id" => %request.old_dataset_id, + ); + status.garbage_collect_requested.push(s); + } + + Err(e) => { + let s = format!( + "sending region snapshot replacement garbage collect \ + request failed: {e}", + ); + error!( + &log, + "{s}"; + "request.snapshot_id" => %request.old_snapshot_id, + "request.region_id" => %request.old_region_id, + "request.dataset_id" => %request.old_dataset_id, + ); + status.errors.push(s); + } + } + } + } +} + +impl BackgroundTask for RegionSnapshotReplacementGarbageCollect { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async move { + let log = &opctx.log; + info!( + &log, + "region snapshot replacement garbage collect task started", + ); + + let mut status = + RegionSnapshotReplacementGarbageCollectStatus::default(); + + self.clean_up_region_snapshot_replacement_volumes( + opctx, + &mut status, + ) + .await; + + info!( + &log, + "region snapshot replacement garbage collect task done" + ); + + json!(status) + } + .boxed() + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::app::background::init::test::NoopStartSaga; + use nexus_db_model::RegionSnapshotReplacement; + use nexus_db_model::RegionSnapshotReplacementState; + use nexus_test_utils_macros::nexus_test; + use uuid::Uuid; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + #[nexus_test(server = crate::Server)] + async fn test_region_snapshot_replacement_garbage_collect_task( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let starter = Arc::new(NoopStartSaga::new()); + let mut task = RegionSnapshotReplacementGarbageCollect::new( + datastore.clone(), + starter.clone(), + ); + + // Noop test + let result: RegionSnapshotReplacementGarbageCollectStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + assert_eq!( + result, + RegionSnapshotReplacementGarbageCollectStatus::default() + ); + assert_eq!(starter.count_reset(), 0); + + // Add two region snapshot requests that need garbage collection + + let mut request = RegionSnapshotReplacement::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + ); + request.replacement_state = + RegionSnapshotReplacementState::ReplacementDone; + request.old_snapshot_volume_id = Some(Uuid::new_v4()); + + let request_1_id = request.id; + + datastore + .insert_region_snapshot_replacement_request_with_volume_id( + &opctx, + request, + Uuid::new_v4(), + ) + .await + .unwrap(); + + let mut request = RegionSnapshotReplacement::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + ); + request.replacement_state = + RegionSnapshotReplacementState::ReplacementDone; + request.old_snapshot_volume_id = Some(Uuid::new_v4()); + + let request_2_id = request.id; + + datastore + .insert_region_snapshot_replacement_request_with_volume_id( + &opctx, + request, + Uuid::new_v4(), + ) + .await + 
.unwrap(); + + // Activate the task - it should pick up the two requests + + let result: RegionSnapshotReplacementGarbageCollectStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + + for error in &result.errors { + eprintln!("{error}"); + } + + assert_eq!(result.garbage_collect_requested.len(), 2); + + let s = format!( + "region snapshot replacement garbage collect request ok for \ + {request_1_id}" + ); + assert!(result.garbage_collect_requested.contains(&s)); + + let s = format!( + "region snapshot replacement garbage collect request ok for \ + {request_2_id}" + ); + assert!(result.garbage_collect_requested.contains(&s)); + + assert_eq!(result.errors.len(), 0); + + assert_eq!(starter.count_reset(), 2); + } +} diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index 471118a5cb..926b983460 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -39,6 +39,7 @@ pub mod project_create; pub mod region_replacement_drive; pub mod region_replacement_finish; pub mod region_replacement_start; +pub mod region_snapshot_replacement_garbage_collect; pub mod region_snapshot_replacement_start; pub mod snapshot_create; pub mod snapshot_delete; @@ -194,6 +195,9 @@ fn make_action_registry() -> ActionRegistry { ::register_actions( &mut registry, ); + ::register_actions( + &mut registry, + ); #[cfg(test)] ::register_actions(&mut registry); diff --git a/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs new file mode 100644 index 0000000000..e3c5143a68 --- /dev/null +++ b/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs @@ -0,0 +1,326 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Clean up the volume that stashes the target replaced during the region +//! snapshot replacement start saga. After that's done, change the region +//! snapshot replacement state to Running. This saga handles the following +//! region snapshot replacement request state transitions: +//! +//! ```text +//! ReplacementDone <-- +//! | +//! | | +//! v | +//! | +//! DeletingOldVolume -- +//! +//! | +//! v +//! +//! Running +//! ``` +//! +//! See the documentation for the "region snapshot replacement step" saga for +//! the next step(s) in the process. + +use super::{ + ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, + ACTION_GENERATE_ID, +}; +use crate::app::sagas::declare_saga_actions; +use crate::app::sagas::volume_delete; +use crate::app::{authn, db}; +use serde::Deserialize; +use serde::Serialize; +use steno::ActionError; +use steno::Node; +use uuid::Uuid; + +// region snapshot replacement garbage collect saga: input parameters + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct Params { + pub serialized_authn: authn::saga::Serialized, + /// The fake volume created for the snapshot that was replaced + // Note: this is only required in the params to build the volume-delete sub + // saga + pub old_snapshot_volume_id: Uuid, + pub request: db::model::RegionSnapshotReplacement, +} + +// region snapshot replacement garbage collect saga: actions + +declare_saga_actions! 
{ + region_snapshot_replacement_garbage_collect; + SET_SAGA_ID -> "unused_1" { + + rsrgs_set_saga_id + - rsrgs_set_saga_id_undo + } + UPDATE_REQUEST_RECORD -> "unused_2" { + + rsrgs_update_request_record + } +} + +// region snapshot replacement garbage collect saga: definition + +#[derive(Debug)] +pub(crate) struct SagaRegionSnapshotReplacementGarbageCollect; +impl NexusSaga for SagaRegionSnapshotReplacementGarbageCollect { + const NAME: &'static str = "region-snapshot-replacement-garbage-collect"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + region_snapshot_replacement_garbage_collect_register_actions(registry); + } + + fn make_saga_dag( + params: &Self::Params, + mut builder: steno::DagBuilder, + ) -> Result { + builder.append(Node::action( + "saga_id", + "GenerateSagaId", + ACTION_GENERATE_ID.as_ref(), + )); + + builder.append(set_saga_id_action()); + + let subsaga_params = volume_delete::Params { + serialized_authn: params.serialized_authn.clone(), + volume_id: params.old_snapshot_volume_id, + }; + + let subsaga_dag = { + let subsaga_builder = steno::DagBuilder::new(steno::SagaName::new( + volume_delete::SagaVolumeDelete::NAME, + )); + volume_delete::SagaVolumeDelete::make_saga_dag( + &subsaga_params, + subsaga_builder, + )? + }; + + builder.append(Node::constant( + "params_for_volume_delete_subsaga", + serde_json::to_value(&subsaga_params).map_err(|e| { + SagaInitError::SerializeError( + "params_for_volume_delete_subsaga".to_string(), + e, + ) + })?, + )); + + builder.append(Node::subsaga( + "volume_delete_subsaga_no_result", + subsaga_dag, + "params_for_volume_delete_subsaga", + )); + + builder.append(update_request_record_action()); + + Ok(builder.build()?) + } +} + +// region snapshot replacement garbage collect saga: action implementations + +async fn rsrgs_set_saga_id( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + // Change the request record here to an intermediate "deleting old volume" + // state to block out other sagas that will be triggered for the same + // request. + osagactx + .datastore() + .set_region_snapshot_replacement_deleting_old_volume( + &opctx, + params.request.id, + saga_id, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +async fn rsrgs_set_saga_id_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + osagactx + .datastore() + .undo_set_region_snapshot_replacement_deleting_old_volume( + &opctx, + params.request.id, + saga_id, + ) + .await?; + + Ok(()) +} + +async fn rsrgs_update_request_record( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let params = sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + // Now that the snapshot volume has been deleted, update the replacement + // request record to 'Running'. There is no undo step for this, it should + // succeed idempotently. 
+ + datastore + .set_region_snapshot_replacement_running( + &opctx, + params.request.id, + saga_id, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +#[cfg(test)] +pub(crate) mod test { + use crate::app::sagas::region_snapshot_replacement_garbage_collect::{ + Params, SagaRegionSnapshotReplacementGarbageCollect, + }; + use nexus_db_model::RegionSnapshotReplacement; + use nexus_db_model::RegionSnapshotReplacementState; + use nexus_db_model::Volume; + use nexus_db_queries::authn::saga::Serialized; + use nexus_db_queries::context::OpContext; + use nexus_test_utils_macros::nexus_test; + use sled_agent_client::types::CrucibleOpts; + use sled_agent_client::types::VolumeConstructionRequest; + use uuid::Uuid; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + #[nexus_test(server = crate::Server)] + async fn test_region_snapshot_replacement_garbage_collect_saga( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + // Manually insert required records + let old_snapshot_volume_id = Uuid::new_v4(); + + let volume_construction_request = VolumeConstructionRequest::Volume { + id: old_snapshot_volume_id, + block_size: 0, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 0, + blocks_per_extent: 0, + extent_count: 0, + gen: 0, + opts: CrucibleOpts { + id: old_snapshot_volume_id, + target: vec![ + // XXX if you put something here, you'll need a + // synthetic dataset record + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: false, + }, + }], + read_only_parent: None, + }; + + let volume_data = + serde_json::to_string(&volume_construction_request).unwrap(); + + datastore + .volume_create(Volume::new(old_snapshot_volume_id, volume_data)) + .await + .unwrap(); + + let mut request = RegionSnapshotReplacement::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + ); + request.replacement_state = + RegionSnapshotReplacementState::ReplacementDone; + request.old_snapshot_volume_id = Some(old_snapshot_volume_id); + + datastore + .insert_region_snapshot_replacement_request_with_volume_id( + &opctx, + request.clone(), + Uuid::new_v4(), + ) + .await + .unwrap(); + + // Run the saga + let params = Params { + serialized_authn: Serialized::for_opctx(&opctx), + old_snapshot_volume_id, + request: request.clone(), + }; + + let _output = nexus + .sagas + .saga_execute::(params) + .await + .unwrap(); + + // Validate the state transition + let result = datastore + .get_region_snapshot_replacement_request_by_id(&opctx, request.id) + .await + .unwrap(); + + assert_eq!( + result.replacement_state, + RegionSnapshotReplacementState::Running + ); + + // Validate the Volume was deleted + assert!(datastore + .volume_get(old_snapshot_volume_id) + .await + .unwrap() + .is_none()); + } +} diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 35b55184b9..d9cbb5eb34 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -138,6 +138,7 @@ lookup_region_port.period_secs = 60 instance_updater.disable = true instance_updater.period_secs = 60 region_snapshot_replacement_start.period_secs = 30 +region_snapshot_replacement_garbage_collection.period_secs = 30 [default_region_allocation_strategy] # we only have one sled in the test environment, so we 
need to use the diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index 2f8a411cf7..8e4b6b3013 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -28,3 +28,11 @@ pub struct RegionSnapshotReplacementStartStatus { pub start_invoked_ok: Vec, pub errors: Vec, } + +/// The status of a `region_snapshot_replacement_garbage_collect` background +/// task activation +#[derive(Serialize, Deserialize, Default, Debug, PartialEq, Eq)] +pub struct RegionSnapshotReplacementGarbageCollectStatus { + pub garbage_collect_requested: Vec, + pub errors: Vec, +} diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 437615938f..2e3a8fe578 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -66,6 +66,7 @@ saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 instance_updater.period_secs = 30 region_snapshot_replacement_start.period_secs = 30 +region_snapshot_replacement_garbage_collection.period_secs = 30 [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index 95dcca14ae..dbd61e953d 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -66,6 +66,7 @@ saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 instance_updater.period_secs = 30 region_snapshot_replacement_start.period_secs = 30 +region_snapshot_replacement_garbage_collection.period_secs = 30 [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. From 4543057882bff644354e1c0cc897bbd4753510e7 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Thu, 15 Aug 2024 13:25:44 -0400 Subject: [PATCH 45/91] [common] Move policy constants to their own module (#6340) These constants really don't belong in `address.rs` as they are only tangentially related to addresses. Also add policy for replicated clickhouse. This will be used in a follow up PR. Fixes #6299 --- common/src/address.rs | 26 +------------ common/src/lib.rs | 1 + common/src/policy.rs | 40 ++++++++++++++++++++ nexus/reconfigurator/execution/src/dns.rs | 6 +-- nexus/reconfigurator/planning/src/system.rs | 3 +- nexus/reconfigurator/preparation/src/lib.rs | 7 ++-- nexus/src/app/deployment.rs | 6 +-- nexus/types/src/deployment/planning_input.rs | 18 +++++++++ sled-agent/src/rack_setup/plan/service.rs | 18 ++++++--- 9 files changed, 84 insertions(+), 41 deletions(-) create mode 100644 common/src/policy.rs diff --git a/common/src/address.rs b/common/src/address.rs index ba1193c7f0..c23f5c41ed 100644 --- a/common/src/address.rs +++ b/common/src/address.rs @@ -8,6 +8,7 @@ //! and Nexus, who need to agree upon addressing schemes. use crate::api::external::{self, Error}; +use crate::policy::{DNS_REDUNDANCY, MAX_DNS_REDUNDANCY}; use ipnetwork::Ipv6Network; use once_cell::sync::Lazy; use oxnet::{Ipv4Net, Ipv6Net}; @@ -25,31 +26,6 @@ pub const MAX_PORT: u16 = u16::MAX; /// minimum possible value for a tcp or udp port pub const MIN_PORT: u16 = u16::MIN; -/// The amount of redundancy for boundary NTP servers. -pub const BOUNDARY_NTP_REDUNDANCY: usize = 2; - -/// The amount of redundancy for Nexus services. 
-/// -/// This is used by both RSS (to distribute the initial set of services) and the -/// Reconfigurator (to know whether to add new Nexus zones) -pub const NEXUS_REDUNDANCY: usize = 3; - -/// The amount of redundancy for CockroachDb services. -/// -/// This is used by both RSS (to distribute the initial set of services) and the -/// Reconfigurator (to know whether to add new crdb zones) -pub const COCKROACHDB_REDUNDANCY: usize = 5; - -/// The amount of redundancy for internal DNS servers. -/// -/// Must be less than or equal to MAX_DNS_REDUNDANCY. -pub const DNS_REDUNDANCY: usize = 3; - -/// The maximum amount of redundancy for DNS servers. -/// -/// This determines the number of addresses which are reserved for DNS servers. -pub const MAX_DNS_REDUNDANCY: usize = 5; - pub const DNS_PORT: u16 = 53; pub const DNS_HTTP_PORT: u16 = 5353; pub const SLED_AGENT_PORT: u16 = 12345; diff --git a/common/src/lib.rs b/common/src/lib.rs index e4f53cbfab..6da32c56ba 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -26,6 +26,7 @@ pub mod backoff; pub mod cmd; pub mod disk; pub mod ledger; +pub mod policy; pub mod progenitor_operation_retry; pub mod update; pub mod vlan; diff --git a/common/src/policy.rs b/common/src/policy.rs new file mode 100644 index 0000000000..677dbfe2b9 --- /dev/null +++ b/common/src/policy.rs @@ -0,0 +1,40 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Fleet policy related functionality used by both Reconfigurator and RSS. + +/// The amount of redundancy for boundary NTP servers. +pub const BOUNDARY_NTP_REDUNDANCY: usize = 2; + +/// The amount of redundancy for Nexus services. +/// +/// This is used by both RSS (to distribute the initial set of services) and the +/// Reconfigurator (to know whether to add new Nexus zones) +pub const NEXUS_REDUNDANCY: usize = 3; + +/// The amount of redundancy for CockroachDb services. +/// +/// This is used by both RSS (to distribute the initial set of services) and the +/// Reconfigurator (to know whether to add new crdb zones) +pub const COCKROACHDB_REDUNDANCY: usize = 5; + +/// The amount of redundancy for internal DNS servers. +/// +/// Must be less than or equal to MAX_DNS_REDUNDANCY. +pub const DNS_REDUNDANCY: usize = 3; + +/// The maximum amount of redundancy for DNS servers. +/// +/// This determines the number of addresses which are reserved for DNS servers. 
+pub const MAX_DNS_REDUNDANCY: usize = 5; + +/// The amount of redundancy for clickhouse servers +/// +/// Clickhouse servers contain lazily replicated data +pub const CLICKHOUSE_SERVER_REDUNDANCY: usize = 3; + +/// The amount of redundancy for clickhouse keepers +/// +/// Keepers maintain strongly consistent metadata about data replication +pub const CLICKHOUSE_KEEPER_REDUNDANCY: usize = 5; diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 885ffa67d1..846d19ead3 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -502,13 +502,13 @@ mod test { use omicron_common::address::get_switch_zone_address; use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; - use omicron_common::address::BOUNDARY_NTP_REDUNDANCY; - use omicron_common::address::COCKROACHDB_REDUNDANCY; - use omicron_common::address::NEXUS_REDUNDANCY; use omicron_common::address::RACK_PREFIX; use omicron_common::address::SLED_PREFIX; use omicron_common::api::external::Generation; use omicron_common::api::external::IdentityMetadataCreateParams; + use omicron_common::policy::BOUNDARY_NTP_REDUNDANCY; + use omicron_common::policy::COCKROACHDB_REDUNDANCY; + use omicron_common::policy::NEXUS_REDUNDANCY; use omicron_common::zpool_name::ZpoolName; use omicron_test_utils::dev::test_setup_log; use omicron_uuid_kinds::ExternalIpUuid; diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index f6989be9ef..7298db7a73 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -33,13 +33,13 @@ use nexus_types::inventory::SpType; use omicron_common::address::get_sled_address; use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; -use omicron_common::address::NEXUS_REDUNDANCY; use omicron_common::address::RACK_PREFIX; use omicron_common::address::SLED_PREFIX; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Generation; use omicron_common::disk::DiskIdentity; use omicron_common::disk::DiskVariant; +use omicron_common::policy::NEXUS_REDUNDANCY; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; @@ -328,6 +328,7 @@ impl SystemDescription { target_cockroachdb_zone_count: self.target_cockroachdb_zone_count, target_cockroachdb_cluster_version: self .target_cockroachdb_cluster_version, + clickhouse_policy: None, }; let mut builder = PlanningInputBuilder::new( policy, diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs index e0ba0f10ba..fc0e4638f8 100644 --- a/nexus/reconfigurator/preparation/src/lib.rs +++ b/nexus/reconfigurator/preparation/src/lib.rs @@ -33,13 +33,13 @@ use nexus_types::identity::Resource; use nexus_types::inventory::Collection; use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; -use omicron_common::address::BOUNDARY_NTP_REDUNDANCY; -use omicron_common::address::COCKROACHDB_REDUNDANCY; -use omicron_common::address::NEXUS_REDUNDANCY; use omicron_common::address::SLED_PREFIX; use omicron_common::api::external::Error; use omicron_common::api::external::LookupType; use omicron_common::disk::DiskIdentity; +use omicron_common::policy::BOUNDARY_NTP_REDUNDANCY; +use omicron_common::policy::COCKROACHDB_REDUNDANCY; +use omicron_common::policy::NEXUS_REDUNDANCY; use omicron_uuid_kinds::GenericUuid; use 
omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::PhysicalDiskUuid; @@ -82,6 +82,7 @@ impl PlanningInputFromDb<'_> { target_cockroachdb_zone_count: self.target_cockroachdb_zone_count, target_cockroachdb_cluster_version: self .target_cockroachdb_cluster_version, + clickhouse_policy: None, }; let mut builder = PlanningInputBuilder::new( policy, diff --git a/nexus/src/app/deployment.rs b/nexus/src/app/deployment.rs index e9095cc991..50ae332d3f 100644 --- a/nexus/src/app/deployment.rs +++ b/nexus/src/app/deployment.rs @@ -17,9 +17,6 @@ use nexus_types::deployment::CockroachDbClusterVersion; use nexus_types::deployment::PlanningInput; use nexus_types::deployment::SledFilter; use nexus_types::inventory::Collection; -use omicron_common::address::BOUNDARY_NTP_REDUNDANCY; -use omicron_common::address::COCKROACHDB_REDUNDANCY; -use omicron_common::address::NEXUS_REDUNDANCY; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::DeleteResult; @@ -28,6 +25,9 @@ use omicron_common::api::external::InternalContext; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; +use omicron_common::policy::BOUNDARY_NTP_REDUNDANCY; +use omicron_common::policy::COCKROACHDB_REDUNDANCY; +use omicron_common::policy::NEXUS_REDUNDANCY; use slog_error_chain::InlineErrorChain; use uuid::Uuid; diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index 1af3636d0e..c6a61aac78 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -709,6 +709,23 @@ pub struct Policy { /// at present this is hardcoded based on the version of CockroachDB we /// presently ship and the tick-tock pattern described in RFD 469. pub target_cockroachdb_cluster_version: CockroachDbClusterVersion, + + /// Policy information for a replicated clickhouse setup + /// + /// If this policy is `None`, then we are using a single node clickhouse + /// setup. Eventually we will only allow multi-node setups and this will no + /// longer be an option. 
+ pub clickhouse_policy: Option, +} + +/// Policy for replicated clickhouse setups +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ClickhousePolicy { + /// Desired number of clickhouse servers + pub target_servers: usize, + + /// Desired number of clickhouse keepers + pub target_keepers: usize, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -761,6 +778,7 @@ impl PlanningInputBuilder { target_cockroachdb_zone_count: 0, target_cockroachdb_cluster_version: CockroachDbClusterVersion::POLICY, + clickhouse_policy: None, }, internal_dns_version: Generation::new(), external_dns_version: Generation::new(), diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index ff137f131f..c9ed0c2248 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -14,10 +14,8 @@ use nexus_sled_agent_shared::inventory::{ }; use omicron_common::address::{ get_sled_address, get_switch_zone_address, Ipv6Subnet, ReservedRackSubnet, - BOUNDARY_NTP_REDUNDANCY, COCKROACHDB_REDUNDANCY, DENDRITE_PORT, - DNS_HTTP_PORT, DNS_PORT, DNS_REDUNDANCY, MAX_DNS_REDUNDANCY, MGD_PORT, - MGS_PORT, NEXUS_REDUNDANCY, NTP_PORT, NUM_SOURCE_NAT_PORTS, - RSS_RESERVED_ADDRESSES, SLED_PREFIX, + DENDRITE_PORT, DNS_HTTP_PORT, DNS_PORT, MGD_PORT, MGS_PORT, NTP_PORT, + NUM_SOURCE_NAT_PORTS, RSS_RESERVED_ADDRESSES, SLED_PREFIX, }; use omicron_common::api::external::{Generation, MacAddr, Vni}; use omicron_common::api::internal::shared::{ @@ -31,6 +29,10 @@ use omicron_common::disk::{ DiskVariant, OmicronPhysicalDiskConfig, OmicronPhysicalDisksConfig, }; use omicron_common::ledger::{self, Ledger, Ledgerable}; +use omicron_common::policy::{ + BOUNDARY_NTP_REDUNDANCY, COCKROACHDB_REDUNDANCY, DNS_REDUNDANCY, + MAX_DNS_REDUNDANCY, NEXUS_REDUNDANCY, +}; use omicron_uuid_kinds::{GenericUuid, OmicronZoneUuid, SledUuid, ZpoolUuid}; use rand::prelude::SliceRandom; use schemars::JsonSchema; @@ -54,11 +56,15 @@ use uuid::Uuid; const OXIMETER_COUNT: usize = 1; // TODO(https://github.com/oxidecomputer/omicron/issues/732): Remove // when Nexus provisions Clickhouse. -// TODO(https://github.com/oxidecomputer/omicron/issues/4000): Set to 2 once we enable replicated ClickHouse +// TODO(https://github.com/oxidecomputer/omicron/issues/4000): Use +// omicron_common::policy::CLICKHOUSE_SERVER_REDUNDANCY once we enable +// replicated ClickHouse const CLICKHOUSE_COUNT: usize = 1; // TODO(https://github.com/oxidecomputer/omicron/issues/732): Remove // when Nexus provisions Clickhouse keeper. -// TODO(https://github.com/oxidecomputer/omicron/issues/4000): Set to 3 once we enable replicated ClickHouse +// TODO(https://github.com/oxidecomputer/omicron/issues/4000): Use +// omicron_common::policy::CLICKHOUSE_KEEPER_REDUNDANCY once we enable +// replicated ClickHouse const CLICKHOUSE_KEEPER_COUNT: usize = 0; // TODO(https://github.com/oxidecomputer/omicron/issues/732): Remove. // when Nexus provisions Crucible. From 64ef70cb973a9a1f0810c8e01b3009764ffb7531 Mon Sep 17 00:00:00 2001 From: Rain Date: Thu, 15 Aug 2024 10:34:46 -0700 Subject: [PATCH 46/91] [installinator] use camino-tempfile rather than tempfile (#6346) I was in this code and spotted this -- we can just use camino-tempfile here. 
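[Editor's note: an illustrative sketch, not part of this patch. camino-tempfile mirrors the tempfile API but speaks Utf8Path/Utf8PathBuf, which is why the fallible try_into() conversion in write.rs below can simply disappear. Assuming the camino-tempfile crate as a dependency:

    use camino::Utf8Path;
    use camino_tempfile::Utf8TempDir;

    fn demo() -> std::io::Result<()> {
        // Same entry point as tempfile::tempdir(), but the handle exposes
        // a Utf8Path directly, so no TryInto<&Utf8Path> is needed.
        let dir: Utf8TempDir = camino_tempfile::tempdir()?;
        let path: &Utf8Path = dir.path();
        let _host_image = path.join("test-host.bin");
        Ok(())
    }

The directory and its contents are removed when `dir` is dropped, matching tempfile's semantics.]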
--- Cargo.lock | 2 +- installinator/Cargo.toml | 3 +-- installinator/src/async_temp_file.rs | 12 ++++++------ installinator/src/write.rs | 4 ++-- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b6cadd5e69..544097d1e5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3832,6 +3832,7 @@ dependencies = [ "buf-list", "bytes", "camino", + "camino-tempfile", "cancel-safe-futures", "clap", "display-error-chain", @@ -3861,7 +3862,6 @@ dependencies = [ "slog-envlogger", "slog-term", "smf", - "tempfile", "test-strategy", "thiserror", "tokio", diff --git a/installinator/Cargo.toml b/installinator/Cargo.toml index 00dfb6440b..0d59950a2a 100644 --- a/installinator/Cargo.toml +++ b/installinator/Cargo.toml @@ -13,6 +13,7 @@ async-trait.workspace = true buf-list.workspace = true bytes.workspace = true camino.workspace = true +camino-tempfile.workspace = true cancel-safe-futures.workspace = true clap.workspace = true display-error-chain.workspace = true @@ -37,7 +38,6 @@ slog-async.workspace = true slog-envlogger.workspace = true slog-term.workspace = true smf.workspace = true -tempfile.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["full"] } tufaceous-lib.workspace = true @@ -50,7 +50,6 @@ omicron-test-utils.workspace = true hex-literal.workspace = true partial-io.workspace = true proptest.workspace = true -tempfile.workspace = true test-strategy.workspace = true tokio = { workspace = true, features = ["test-util"] } tokio-stream.workspace = true diff --git a/installinator/src/async_temp_file.rs b/installinator/src/async_temp_file.rs index c884908ac8..168fffa2aa 100644 --- a/installinator/src/async_temp_file.rs +++ b/installinator/src/async_temp_file.rs @@ -3,13 +3,13 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use camino::Utf8PathBuf; +use camino_tempfile::NamedUtf8TempFile; +use camino_tempfile::Utf8PathPersistError; +use camino_tempfile::Utf8TempPath; use std::io; use std::pin::Pin; use std::task::Context; use std::task::Poll; -use tempfile::NamedTempFile; -use tempfile::PathPersistError; -use tempfile::TempPath; use tokio::fs::File; use tokio::io::AsyncWrite; @@ -18,7 +18,7 @@ pub(crate) struct AsyncNamedTempFile { // in our `persist()` method below. This allows us to drop the temp path // (deleting the temporary file) if we're dropped before `persist()` is // called. - temp_path: Option, + temp_path: Option, destination: Utf8PathBuf, inner: File, } @@ -41,7 +41,7 @@ impl AsyncNamedTempFile { .to_owned(); let temp_file = - tokio::task::spawn_blocking(|| NamedTempFile::new_in(parent)) + tokio::task::spawn_blocking(|| NamedUtf8TempFile::new_in(parent)) .await .unwrap()?; let temp_path = temp_file.into_temp_path(); @@ -62,7 +62,7 @@ impl AsyncNamedTempFile { tokio::task::spawn_blocking(move || temp_path.persist(&destination)) .await .unwrap() - .map_err(|PathPersistError { error, .. }| error) + .map_err(|Utf8PathPersistError { error, .. 
}| error) } } diff --git a/installinator/src/write.rs b/installinator/src/write.rs index fdc83cffa2..583c5a7b51 100644 --- a/installinator/src/write.rs +++ b/installinator/src/write.rs @@ -918,6 +918,7 @@ mod tests { use anyhow::Result; use bytes::{Buf, Bytes}; use camino::Utf8Path; + use camino_tempfile::tempdir; use futures::StreamExt; use installinator_common::{ Event, InstallinatorCompletionMetadata, InstallinatorComponent, @@ -934,7 +935,6 @@ mod tests { PartialAsyncWrite, PartialOp, }; use proptest::prelude::*; - use tempfile::tempdir; use test_strategy::proptest; use tokio::io::AsyncReadExt; use tokio::sync::Mutex; @@ -1032,7 +1032,7 @@ mod tests { ) -> Result<()> { let logctx = test_setup_log("test_write_artifact"); let tempdir = tempdir()?; - let tempdir_path: &Utf8Path = tempdir.path().try_into()?; + let tempdir_path = tempdir.path(); let destination_host = tempdir_path.join("test-host.bin"); let destination_control_plane = From 93aa572287ed927960ad138304d0e3055674f6f4 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 15 Aug 2024 15:40:53 -0400 Subject: [PATCH 47/91] Use the correct background task name (#6348) region_snapshot_replacement -> region_snapshot_replacement_start --- dev-tools/omdb/src/bin/omdb/nexus.rs | 2 +- dev-tools/omdb/tests/successes.out | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 0828cef892..ede2743404 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -1453,7 +1453,7 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { } } }; - } else if name == "region_snapshot_replacement" { + } else if name == "region_snapshot_replacement_start" { match serde_json::from_value::( details.clone(), ) { diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 41c5a15a1c..9d432525c3 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -587,7 +587,9 @@ task: "region_snapshot_replacement_start" currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms -warning: unknown background task: "region_snapshot_replacement_start" (don't know how to interpret details: Object {"errors": Array [], "requests_created_ok": Array [], "start_invoked_ok": Array []}) + total requests created ok: 0 + total start saga invoked ok: 0 + errors: 0 task: "saga_recovery" configured period: every 10m @@ -993,7 +995,9 @@ task: "region_snapshot_replacement_start" currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms -warning: unknown background task: "region_snapshot_replacement_start" (don't know how to interpret details: Object {"errors": Array [], "requests_created_ok": Array [], "start_invoked_ok": Array []}) + total requests created ok: 0 + total start saga invoked ok: 0 + errors: 0 task: "saga_recovery" configured period: every 10m From 5646755848e1f3ace5066ae5196c40e15d088133 Mon Sep 17 00:00:00 2001 From: Rain Date: Thu, 15 Aug 2024 13:22:39 -0700 Subject: [PATCH 48/91] [meta] remove profile override for rand_hc (#6351) rand_hc is no longer in our dependency graph. (Thanks @ahl for pointing this out!) 
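[Editor's note: the removed lines below use Cargo's per-package profile overrides, the same mechanism the neighboring entries in Cargo.toml still rely on. The pattern looks like this (entry shown for context only):

    # Build just this dependency with optimizations even under the dev
    # profile; the rest of the workspace keeps the default opt-level for
    # fast incremental builds. RNG and crypto crates are common targets.
    [profile.dev.package.rand_core]
    opt-level = 3

Once rand_hc left the dependency graph, its override became dead configuration, hence this cleanup.]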
--- Cargo.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a1ae9858ab..92c4ead65f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -734,8 +734,6 @@ opt-level = 3 opt-level = 3 [profile.dev.package.rand_core] opt-level = 3 -[profile.dev.package.rand_hc] -opt-level = 3 [profile.dev.package.rand_xorshift] opt-level = 3 [profile.dev.package.rsa] From 66ac7b328889e1921f84cc6b082765023daf722a Mon Sep 17 00:00:00 2001 From: Adam Leventhal Date: Thu, 15 Aug 2024 15:40:49 -0700 Subject: [PATCH 49/91] follow trust-dns to its new name: hickory (#5912) --- Cargo.lock | 204 ++++++++++++++----------- Cargo.toml | 8 +- clients/oxide-client/Cargo.toml | 2 +- clients/oxide-client/src/lib.rs | 19 +-- dns-server/Cargo.toml | 13 +- dns-server/src/bin/dns-server.rs | 2 +- dns-server/src/dns_server.rs | 33 ++-- dns-server/src/lib.rs | 18 ++- dns-server/src/storage.rs | 10 +- dns-server/tests/basic_test.rs | 24 +-- end-to-end-tests/Cargo.toml | 2 +- end-to-end-tests/src/helpers/ctx.rs | 2 +- internal-dns-cli/Cargo.toml | 2 +- internal-dns-cli/src/bin/dnswait.rs | 6 +- internal-dns/Cargo.toml | 2 +- internal-dns/src/resolver.rs | 46 ++++-- nexus/Cargo.toml | 4 +- nexus/src/app/external_dns.rs | 23 ++- nexus/test-utils/Cargo.toml | 2 +- nexus/test-utils/src/lib.rs | 18 +-- nexus/tests/integration_tests/silos.rs | 4 +- wicketd/Cargo.toml | 2 +- wicketd/src/preflight_check/uplink.rs | 38 ++--- workspace-hack/Cargo.toml | 4 +- 24 files changed, 268 insertions(+), 220 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 544097d1e5..5fd17fd158 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2106,6 +2106,10 @@ dependencies = [ "dns-service-client", "dropshot", "expectorate", + "hickory-client", + "hickory-proto", + "hickory-resolver", + "hickory-server", "http 0.2.12", "omicron-test-utils", "omicron-workspace-hack", @@ -2125,10 +2129,6 @@ dependencies = [ "thiserror", "tokio", "toml 0.8.19", - "trust-dns-client", - "trust-dns-proto", - "trust-dns-resolver", - "trust-dns-server", "uuid", ] @@ -2408,6 +2408,7 @@ dependencies = [ "clap", "colored", "dhcproto", + "hickory-resolver", "http 0.2.12", "humantime", "hyper 0.14.30", @@ -2428,7 +2429,6 @@ dependencies = [ "socket2 0.5.7", "tokio", "toml 0.8.19", - "trust-dns-resolver", "uuid", ] @@ -2450,6 +2450,18 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "enum-as-inner" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ffccbb6966c05b32ef8fbac435df276c4ae4d3dc55a8cd0eb9745e6c12f546a" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "syn 2.0.74", +] + [[package]] name = "env_logger" version = "0.9.3" @@ -3318,6 +3330,90 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" +[[package]] +name = "hickory-client" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bab9683b08d8f8957a857b0236455d80e1886eaa8c6178af556aa7871fb61b55" +dependencies = [ + "cfg-if", + "data-encoding", + "futures-channel", + "futures-util", + "hickory-proto", + "once_cell", + "radix_trie", + "rand", + "thiserror", + "tokio", + "tracing", +] + +[[package]] +name = "hickory-proto" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07698b8420e2f0d6447a436ba999ec85d8fbf2a398bbd737b82cac4a2e96e512" +dependencies = [ + "async-trait", + "cfg-if", + "data-encoding", + "enum-as-inner 0.6.0", + 
"futures-channel", + "futures-io", + "futures-util", + "idna 0.4.0", + "ipnet", + "once_cell", + "rand", + "thiserror", + "tinyvec", + "tokio", + "tracing", + "url", +] + +[[package]] +name = "hickory-resolver" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28757f23aa75c98f254cf0405e6d8c25b831b32921b050a66692427679b1f243" +dependencies = [ + "cfg-if", + "futures-util", + "hickory-proto", + "ipconfig", + "lru-cache", + "once_cell", + "parking_lot 0.12.2", + "rand", + "resolv-conf", + "smallvec 1.13.2", + "thiserror", + "tokio", + "tracing", +] + +[[package]] +name = "hickory-server" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9be0e43c556b9b3fdb6c7c71a9a32153a2275d02419e3de809e520bfcfe40c37" +dependencies = [ + "async-trait", + "bytes", + "cfg-if", + "enum-as-inner 0.6.0", + "futures-util", + "hickory-proto", + "serde", + "thiserror", + "time", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "highway" version = "1.2.0" @@ -3681,6 +3777,16 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "idna" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + [[package]] name = "idna" version = "0.5.0" @@ -3947,6 +4053,7 @@ dependencies = [ "dropshot", "expectorate", "futures", + "hickory-resolver", "hyper 0.14.30", "omicron-common", "omicron-test-utils", @@ -3961,7 +4068,6 @@ dependencies = [ "tempfile", "thiserror", "tokio", - "trust-dns-resolver", "uuid", ] @@ -3972,12 +4078,12 @@ dependencies = [ "anyhow", "clap", "dropshot", + "hickory-resolver", "internal-dns", "omicron-common", "omicron-workspace-hack", "slog", "tokio", - "trust-dns-resolver", ] [[package]] @@ -5292,6 +5398,7 @@ dependencies = [ "gateway-messages", "gateway-test-utils", "headers", + "hickory-resolver", "http 0.2.12", "hyper 0.14.30", "illumos-utils", @@ -5318,7 +5425,6 @@ dependencies = [ "slog", "tokio", "tokio-util", - "trust-dns-resolver", "uuid", ] @@ -5920,6 +6026,7 @@ dependencies = [ "gateway-test-utils", "headers", "hex", + "hickory-resolver", "http 0.2.12", "httptest", "hubtools", @@ -6012,7 +6119,6 @@ dependencies = [ "tokio-postgres", "tokio-util", "tough", - "trust-dns-resolver", "tufaceous", "tufaceous-lib", "update-common", @@ -6368,6 +6474,7 @@ dependencies = [ "group", "hashbrown 0.14.5", "hex", + "hickory-proto", "hmac", "hyper 0.14.30", "indexmap 2.4.0", @@ -6431,7 +6538,6 @@ dependencies = [ "toml_edit 0.19.15", "toml_edit 0.22.20", "tracing", - "trust-dns-proto", "unicode-bidi", "unicode-normalization", "unicode-xid", @@ -6677,6 +6783,7 @@ dependencies = [ "base64 0.22.1", "chrono", "futures", + "hickory-resolver", "http 0.2.12", "hyper 0.14.30", "omicron-workspace-hack", @@ -6688,7 +6795,6 @@ dependencies = [ "serde_json", "thiserror", "tokio", - "trust-dns-resolver", "uuid", ] @@ -10573,15 +10679,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "toml" -version = "0.5.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" -dependencies = [ - "serde", -] - [[package]] name = "toml" version = "0.7.8" @@ -10754,26 +10851,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "trust-dns-client" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6c408c32e6a9dbb38037cece35740f2cf23c875d8ca134d33631cec83f74d3fe" -dependencies = [ - "cfg-if", - "data-encoding", - "futures-channel", - "futures-util", - "lazy_static", - "radix_trie", - "rand", - "thiserror", - "time", - "tokio", - "tracing", - "trust-dns-proto", -] - [[package]] name = "trust-dns-proto" version = "0.22.0" @@ -10783,7 +10860,7 @@ dependencies = [ "async-trait", "cfg-if", "data-encoding", - "enum-as-inner", + "enum-as-inner 0.5.1", "futures-channel", "futures-io", "futures-util", @@ -10794,53 +10871,10 @@ dependencies = [ "smallvec 1.13.2", "thiserror", "tinyvec", - "tokio", "tracing", "url", ] -[[package]] -name = "trust-dns-resolver" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aff21aa4dcefb0a1afbfac26deb0adc93888c7d295fb63ab273ef276ba2b7cfe" -dependencies = [ - "cfg-if", - "futures-util", - "ipconfig", - "lazy_static", - "lru-cache", - "parking_lot 0.12.2", - "resolv-conf", - "smallvec 1.13.2", - "thiserror", - "tokio", - "tracing", - "trust-dns-proto", -] - -[[package]] -name = "trust-dns-server" -version = "0.22.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99022f9befa6daec2a860be68ac28b1f0d9d7ccf441d8c5a695e35a58d88840d" -dependencies = [ - "async-trait", - "bytes", - "cfg-if", - "enum-as-inner", - "futures-executor", - "futures-util", - "serde", - "thiserror", - "time", - "tokio", - "toml 0.5.11", - "tracing", - "trust-dns-client", - "trust-dns-proto", -] - [[package]] name = "try-lock" version = "0.2.5" @@ -11707,6 +11741,7 @@ dependencies = [ "gateway-messages", "gateway-test-utils", "hex", + "hickory-resolver", "http 0.2.12", "hubtools", "hyper 0.14.30", @@ -11746,7 +11781,6 @@ dependencies = [ "tokio-util", "toml 0.8.19", "tough", - "trust-dns-resolver", "tufaceous", "tufaceous-lib", "update-common", diff --git a/Cargo.toml b/Cargo.toml index 92c4ead65f..2bb189b6c0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -362,6 +362,10 @@ headers = "0.3.9" heck = "0.5" hex = "0.4.3" hex-literal = "0.4.1" +hickory-client = "0.24.1" +hickory-proto = "0.24.1" +hickory-resolver = "0.24.1" +hickory-server = "0.24.1" highway = "1.2.0" hkdf = "0.12.4" http = "0.2.12" @@ -572,10 +576,6 @@ tokio-util = { version = "0.7.11", features = ["io", "io-util"] } toml = "0.8.19" toml_edit = "0.22.20" tough = { version = "0.17.1", features = [ "http" ] } -trust-dns-client = "0.22" -trust-dns-proto = "0.22" -trust-dns-resolver = "0.22" -trust-dns-server = "0.22" trybuild = "1.0.99" tufaceous = { path = "tufaceous" } tufaceous-lib = { path = "tufaceous-lib" } diff --git a/clients/oxide-client/Cargo.toml b/clients/oxide-client/Cargo.toml index f2adcacb1b..183640946f 100644 --- a/clients/oxide-client/Cargo.toml +++ b/clients/oxide-client/Cargo.toml @@ -12,6 +12,7 @@ anyhow.workspace = true base64.workspace = true chrono.workspace = true futures.workspace = true +hickory-resolver.workspace = true http.workspace = true hyper.workspace = true progenitor.workspace = true @@ -22,6 +23,5 @@ serde.workspace = true serde_json.workspace = true thiserror.workspace = true tokio = { workspace = true, features = [ "net" ] } -trust-dns-resolver.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true diff --git a/clients/oxide-client/src/lib.rs b/clients/oxide-client/src/lib.rs index 07a190c38e..249ea18146 100644 --- a/clients/oxide-client/src/lib.rs +++ b/clients/oxide-client/src/lib.rs @@ -7,13 +7,13 @@ use anyhow::anyhow; use anyhow::Context; use futures::FutureExt; +use 
hickory_resolver::config::{ + NameServerConfig, Protocol, ResolverConfig, ResolverOpts, +}; +use hickory_resolver::TokioAsyncResolver; use std::net::SocketAddr; use std::sync::Arc; use thiserror::Error; -use trust_dns_resolver::config::{ - NameServerConfig, Protocol, ResolverConfig, ResolverOpts, -}; -use trust_dns_resolver::TokioAsyncResolver; progenitor::generate_api!( spec = "../../openapi/nexus.json", @@ -46,14 +46,15 @@ impl CustomDnsResolver { socket_addr: dns_addr, protocol: Protocol::Udp, tls_dns_name: None, - trust_nx_responses: false, + trust_negative_responses: false, bind_addr: None, }); + let mut resolver_opts = ResolverOpts::default(); + // Enable edns for potentially larger records + resolver_opts.edns0 = true; - let resolver = Arc::new( - TokioAsyncResolver::tokio(resolver_config, ResolverOpts::default()) - .context("failed to create resolver")?, - ); + let resolver = + Arc::new(TokioAsyncResolver::tokio(resolver_config, resolver_opts)); Ok(CustomDnsResolver { dns_addr, resolver }) } diff --git a/dns-server/Cargo.toml b/dns-server/Cargo.toml index d11dabaf85..b4516b8b77 100644 --- a/dns-server/Cargo.toml +++ b/dns-server/Cargo.toml @@ -15,24 +15,24 @@ clap.workspace = true dns-server-api.workspace = true dns-service-client.workspace = true dropshot.workspace = true +hickory-client.workspace = true +hickory-proto.workspace = true +hickory-resolver.workspace = true +hickory-server.workspace = true http.workspace = true pretty-hex.workspace = true schemars.workspace = true serde.workspace = true serde_json.workspace = true sled.workspace = true -slog.workspace = true -slog-term.workspace = true slog-async.workspace = true slog-envlogger.workspace = true +slog-term.workspace = true +slog.workspace = true tempfile.workspace = true thiserror.workspace = true tokio = { workspace = true, features = [ "full" ] } toml.workspace = true -trust-dns-client.workspace = true -trust-dns-proto.workspace = true -trust-dns-resolver.workspace = true -trust-dns-server.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true @@ -44,4 +44,3 @@ openapiv3.workspace = true openapi-lint.workspace = true serde_json.workspace = true subprocess.workspace = true -trust-dns-resolver.workspace = true diff --git a/dns-server/src/bin/dns-server.rs b/dns-server/src/bin/dns-server.rs index 52a9c17c0d..9e8d098ee2 100644 --- a/dns-server/src/bin/dns-server.rs +++ b/dns-server/src/bin/dns-server.rs @@ -3,7 +3,7 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. //! Executable that starts the HTTP-configurable DNS server used for both -//! internal DNS (RFD 248) and extenral DNS (RFD 357) for the Oxide system +//! 
internal DNS (RFD 248) and external DNS (RFD 357) for the Oxide system use anyhow::anyhow; use anyhow::Context; diff --git a/dns-server/src/dns_server.rs b/dns-server/src/dns_server.rs index 5c761f2aa3..4ecbe382c8 100644 --- a/dns-server/src/dns_server.rs +++ b/dns-server/src/dns_server.rs @@ -13,6 +13,19 @@ use crate::storage::Store; use anyhow::anyhow; use anyhow::Context; use dns_server_api::DnsRecord; +use hickory_proto::op::Header; +use hickory_proto::op::ResponseCode; +use hickory_proto::rr::rdata::SRV; +use hickory_proto::rr::RData; +use hickory_proto::rr::Record; +use hickory_proto::rr::RecordType; +use hickory_proto::serialize::binary::BinDecodable; +use hickory_proto::serialize::binary::BinDecoder; +use hickory_proto::serialize::binary::BinEncoder; +use hickory_resolver::Name; +use hickory_server::authority::MessageRequest; +use hickory_server::authority::MessageResponse; +use hickory_server::authority::MessageResponseBuilder; use pretty_hex::*; use serde::Deserialize; use slog::{debug, error, info, o, trace, Logger}; @@ -21,17 +34,6 @@ use std::str::FromStr; use std::sync::Arc; use thiserror::Error; use tokio::net::UdpSocket; -use trust_dns_proto::op::header::Header; -use trust_dns_proto::op::response_code::ResponseCode; -use trust_dns_proto::rr::rdata::SRV; -use trust_dns_proto::rr::record_data::RData; -use trust_dns_proto::rr::record_type::RecordType; -use trust_dns_proto::rr::{Name, Record}; -use trust_dns_proto::serialize::binary::{ - BinDecodable, BinDecoder, BinEncoder, -}; -use trust_dns_server::authority::MessageResponse; -use trust_dns_server::authority::{MessageRequest, MessageResponseBuilder}; use uuid::Uuid; /// Configuration related to the DNS server @@ -167,7 +169,10 @@ async fn handle_dns_packet(request: Request) { Err(error) => { let header = Header::response_from_request(mr.header()); let rb_servfail = MessageResponseBuilder::from_message_request(&mr); - error!(log, "failed to handle incoming DNS message: {:#}", error); + error!( + log, + "failed to handle incoming DNS message: {:#?} {:#}", mr, error + ); match error { RequestError::NxDomain(_) => { let rb_nxdomain = @@ -222,7 +227,7 @@ fn dns_record_to_record( let mut a = Record::new(); a.set_name(name.clone()) .set_rr_type(RecordType::A) - .set_data(Some(RData::A(addr))); + .set_data(Some(RData::A(addr.into()))); Ok(a) } @@ -230,7 +235,7 @@ fn dns_record_to_record( let mut aaaa = Record::new(); aaaa.set_name(name.clone()) .set_rr_type(RecordType::AAAA) - .set_data(Some(RData::AAAA(addr))); + .set_data(Some(RData::AAAA(addr.into()))); Ok(aaaa) } diff --git a/dns-server/src/lib.rs b/dns-server/src/lib.rs index 424159e41d..8abd3b945e 100644 --- a/dns-server/src/lib.rs +++ b/dns-server/src/lib.rs @@ -47,13 +47,13 @@ pub mod http_server; pub mod storage; use anyhow::{anyhow, Context}; +use hickory_resolver::config::NameServerConfig; +use hickory_resolver::config::Protocol; +use hickory_resolver::config::ResolverConfig; +use hickory_resolver::config::ResolverOpts; +use hickory_resolver::TokioAsyncResolver; use slog::o; use std::net::SocketAddr; -use trust_dns_resolver::config::NameServerConfig; -use trust_dns_resolver::config::Protocol; -use trust_dns_resolver::config::ResolverConfig; -use trust_dns_resolver::config::ResolverOpts; -use trust_dns_resolver::TokioAsyncResolver; /// Starts both the HTTP and DNS servers over a given store. 
pub async fn start_servers( @@ -167,12 +167,14 @@ impl TransientServer { socket_addr: self.dns_server.local_address(), protocol: Protocol::Udp, tls_dns_name: None, - trust_nx_responses: false, + trust_negative_responses: false, bind_addr: None, }); + let mut resolver_opts = ResolverOpts::default(); + // Enable edns for potentially larger records + resolver_opts.edns0 = true; let resolver = - TokioAsyncResolver::tokio(resolver_config, ResolverOpts::default()) - .context("creating DNS resolver")?; + TokioAsyncResolver::tokio(resolver_config, resolver_opts); Ok(resolver) } } diff --git a/dns-server/src/storage.rs b/dns-server/src/storage.rs index 85b2e79b8b..b3141f6751 100644 --- a/dns-server/src/storage.rs +++ b/dns-server/src/storage.rs @@ -95,6 +95,8 @@ use anyhow::{anyhow, Context}; use camino::Utf8PathBuf; use dns_server_api::{DnsConfig, DnsConfigParams, DnsConfigZone, DnsRecord}; +use hickory_proto::rr::LowerName; +use hickory_resolver::Name; use serde::{Deserialize, Serialize}; use sled::transaction::ConflictableTransactionError; use slog::{debug, error, info, o, warn}; @@ -104,8 +106,6 @@ use std::sync::atomic::Ordering; use std::sync::Arc; use thiserror::Error; use tokio::sync::Mutex; -use trust_dns_client::rr::LowerName; -use trust_dns_client::rr::Name; const KEY_CONFIG: &'static str = "config"; @@ -586,7 +586,7 @@ impl Store { /// If the returned set would have been empty, returns `QueryError::NoName`. pub(crate) fn query( &self, - mr: &trust_dns_server::authority::MessageRequest, + mr: &hickory_server::authority::MessageRequest, ) -> Result, QueryError> { let name = mr.query().name(); let orig_name = mr.query().original().name(); @@ -784,14 +784,14 @@ mod test { use dns_server_api::DnsConfigParams; use dns_server_api::DnsConfigZone; use dns_server_api::DnsRecord; + use hickory_proto::rr::LowerName; + use hickory_resolver::Name; use omicron_test_utils::dev::test_setup_log; use std::collections::BTreeSet; use std::collections::HashMap; use std::net::Ipv6Addr; use std::str::FromStr; use std::sync::Arc; - use trust_dns_client::rr::LowerName; - use trust_dns_client::rr::Name; /// As usual, `TestContext` groups the various pieces we need in a bunch of /// our tests and helps make sure they get cleaned up properly. 
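[Editor's note: the same mechanical rewrite appears at every resolver construction site in this patch, so it is worth seeing once in isolation. A condensed, standalone sketch of the post-migration shape (not part of the diff):

    use hickory_resolver::config::{
        NameServerConfig, Protocol, ResolverConfig, ResolverOpts,
    };
    use hickory_resolver::TokioAsyncResolver;
    use std::net::SocketAddr;

    fn resolver_for(addr: SocketAddr) -> TokioAsyncResolver {
        let mut config = ResolverConfig::new();
        config.add_name_server(NameServerConfig {
            socket_addr: addr,
            protocol: Protocol::Udp,
            tls_dns_name: None,
            // renamed from trust-dns's `trust_nx_responses`
            trust_negative_responses: false,
            bind_addr: None,
        });
        let mut opts = ResolverOpts::default();
        // Enable EDNS(0) so records larger than the classic 512-byte UDP
        // limit are not truncated.
        opts.edns0 = true;
        // hickory 0.24 makes construction infallible; trust-dns 0.22
        // returned a Result here.
        TokioAsyncResolver::tokio(config, opts)
    }
]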
diff --git a/dns-server/tests/basic_test.rs b/dns-server/tests/basic_test.rs index b3b7f37378..fa5bfea468 100644 --- a/dns-server/tests/basic_test.rs +++ b/dns-server/tests/basic_test.rs @@ -9,6 +9,12 @@ use dns_service_client::{ Client, }; use dropshot::{test_util::LogContext, HandlerTaskMode}; +use hickory_resolver::error::ResolveErrorKind; +use hickory_resolver::TokioAsyncResolver; +use hickory_resolver::{ + config::{NameServerConfig, Protocol, ResolverConfig, ResolverOpts}, + proto::op::ResponseCode, +}; use omicron_test_utils::dev::test_setup_log; use slog::o; use std::{ @@ -16,12 +22,6 @@ use std::{ net::Ipv6Addr, net::{IpAddr, Ipv4Addr}, }; -use trust_dns_resolver::error::ResolveErrorKind; -use trust_dns_resolver::TokioAsyncResolver; -use trust_dns_resolver::{ - config::{NameServerConfig, Protocol, ResolverConfig, ResolverOpts}, - proto::op::ResponseCode, -}; const TEST_ZONE: &'static str = "oxide.internal"; @@ -374,17 +374,19 @@ async fn init_client_server( ) .await?; - let mut rc = ResolverConfig::new(); - rc.add_name_server(NameServerConfig { + let mut resolver_config = ResolverConfig::new(); + resolver_config.add_name_server(NameServerConfig { socket_addr: dns_server.local_address(), protocol: Protocol::Udp, tls_dns_name: None, - trust_nx_responses: false, + trust_negative_responses: false, bind_addr: None, }); + let mut resolver_opts = ResolverOpts::default(); + // Enable edns for potentially larger records + resolver_opts.edns0 = true; - let resolver = - TokioAsyncResolver::tokio(rc, ResolverOpts::default()).unwrap(); + let resolver = TokioAsyncResolver::tokio(resolver_config, resolver_opts); let client = Client::new(&format!("http://{}", dropshot_server.local_addr()), log); diff --git a/end-to-end-tests/Cargo.toml b/end-to-end-tests/Cargo.toml index 781f3fb1c6..b2400f7603 100644 --- a/end-to-end-tests/Cargo.toml +++ b/end-to-end-tests/Cargo.toml @@ -26,7 +26,7 @@ serde_json.workspace = true sled-agent-types.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } toml.workspace = true -trust-dns-resolver.workspace = true +hickory-resolver.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true ispf.workspace = true diff --git a/end-to-end-tests/src/helpers/ctx.rs b/end-to-end-tests/src/helpers/ctx.rs index d9a2d7027a..5363557502 100644 --- a/end-to-end-tests/src/helpers/ctx.rs +++ b/end-to-end-tests/src/helpers/ctx.rs @@ -1,6 +1,7 @@ use crate::helpers::generate_name; use anyhow::{anyhow, Context as _, Result}; use chrono::Utc; +use hickory_resolver::error::ResolveErrorKind; use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; use oxide_client::types::{Name, ProjectCreate}; use oxide_client::CustomDnsResolver; @@ -13,7 +14,6 @@ use std::net::IpAddr; use std::net::SocketAddr; use std::sync::Arc; use std::time::Duration; -use trust_dns_resolver::error::ResolveErrorKind; use uuid::Uuid; const RSS_CONFIG_STR: &str = include_str!(concat!( diff --git a/internal-dns-cli/Cargo.toml b/internal-dns-cli/Cargo.toml index dae0af0280..3e34c21622 100644 --- a/internal-dns-cli/Cargo.toml +++ b/internal-dns-cli/Cargo.toml @@ -11,9 +11,9 @@ workspace = true anyhow.workspace = true clap.workspace = true dropshot.workspace = true +hickory-resolver.workspace = true internal-dns.workspace = true omicron-common.workspace = true slog.workspace = true tokio.workspace = true -trust-dns-resolver.workspace = true omicron-workspace-hack.workspace = true diff --git a/internal-dns-cli/src/bin/dnswait.rs 
b/internal-dns-cli/src/bin/dnswait.rs index 9e003ed14f..8dbd675d64 100644 --- a/internal-dns-cli/src/bin/dnswait.rs +++ b/internal-dns-cli/src/bin/dnswait.rs @@ -65,10 +65,8 @@ async fn main() -> Result<()> { let resolver = if opt.nameserver_addresses.is_empty() { info!(&log, "using system configuration"); - let async_resolver = - trust_dns_resolver::AsyncResolver::tokio_from_system_conf() - .context("initializing resolver from system configuration")?; - Resolver::new_with_resolver(log.clone(), async_resolver) + Resolver::new_from_system_conf(log.clone()) + .context("initializing resolver from system configuration")? } else { let addrs = opt.nameserver_addresses; info!(&log, "using explicit nameservers"; "nameservers" => ?addrs); diff --git a/internal-dns/Cargo.toml b/internal-dns/Cargo.toml index c08cc012c1..c12035e2cb 100644 --- a/internal-dns/Cargo.toml +++ b/internal-dns/Cargo.toml @@ -18,7 +18,7 @@ omicron-uuid-kinds.workspace = true reqwest = { workspace = true, features = ["rustls-tls", "stream"] } slog.workspace = true thiserror.workspace = true -trust-dns-resolver.workspace = true +hickory-resolver.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true diff --git a/internal-dns/src/resolver.rs b/internal-dns/src/resolver.rs index fdd5dce428..b3dadf16d2 100644 --- a/internal-dns/src/resolver.rs +++ b/internal-dns/src/resolver.rs @@ -2,24 +2,24 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use hickory_resolver::config::{ + LookupIpStrategy, NameServerConfig, Protocol, ResolverConfig, ResolverOpts, +}; +use hickory_resolver::lookup::SrvLookup; +use hickory_resolver::TokioAsyncResolver; use hyper::client::connect::dns::Name; use omicron_common::address::{ Ipv6Subnet, ReservedRackSubnet, AZ_PREFIX, DNS_PORT, }; use slog::{debug, error, info, trace}; use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6}; -use trust_dns_resolver::config::{ - LookupIpStrategy, NameServerConfig, Protocol, ResolverConfig, ResolverOpts, -}; -use trust_dns_resolver::lookup::SrvLookup; -use trust_dns_resolver::TokioAsyncResolver; pub type DnsError = dns_service_client::Error; #[derive(Debug, Clone, thiserror::Error)] pub enum ResolveError { #[error(transparent)] - Resolve(#[from] trust_dns_resolver::error::ResolveError), + Resolve(#[from] hickory_resolver::error::ResolveError), #[error("Record not found for SRV key: {}", .0.dns_name())] NotFound(crate::ServiceName), @@ -52,6 +52,19 @@ impl reqwest::dns::Resolve for Resolver { } impl Resolver { + /// Construct a new DNS resolver from the system configuration. + pub fn new_from_system_conf( + log: slog::Logger, + ) -> Result { + let (rc, mut opts) = hickory_resolver::system_conf::read_system_conf()?; + // Enable edns for potentially larger records + opts.edns0 = true; + + let resolver = TokioAsyncResolver::tokio(rc, opts); + + Ok(Self { log, resolver }) + } + /// Construct a new DNS resolver from specific DNS server addresses. pub fn new_from_addrs( log: slog::Logger, @@ -66,18 +79,20 @@ impl Resolver { socket_addr, protocol: Protocol::Udp, tls_dns_name: None, - trust_nx_responses: false, + trust_negative_responses: false, bind_addr: None, }); } let mut opts = ResolverOpts::default(); + // Enable edns for potentially larger records + opts.edns0 = true; opts.use_hosts_file = false; opts.num_concurrent_reqs = dns_server_count; // The underlay is IPv6 only, so this helps avoid needless lookups of // the IPv4 variant. 
opts.ip_strategy = LookupIpStrategy::Ipv6Only; opts.negative_max_ttl = Some(std::time::Duration::from_secs(15)); - let resolver = TokioAsyncResolver::tokio(rc, opts)?; + let resolver = TokioAsyncResolver::tokio(rc, opts); Ok(Self { log, resolver }) } @@ -163,7 +178,7 @@ impl Resolver { .iter() .next() .ok_or_else(|| ResolveError::NotFound(srv))?; - Ok(*address) + Ok(address.0) } /// Returns the targets of the SRV records for a DNS name @@ -313,7 +328,7 @@ impl Resolver { // (1) it returns `IpAddr`'s rather than `SocketAddr`'s // (2) it doesn't actually return all the addresses from the Additional // section of the DNS server's response. - // See bluejekyll/trust-dns#1980 + // See hickory-dns/hickory-dns#1980 // // (1) is not a huge deal as we can try to match up the targets ourselves // to grab the port for creating a `SocketAddr` but (2) means we need to do @@ -350,10 +365,9 @@ impl Resolver { .await .into_iter() .flat_map(move |target| match target { - Ok((ips, port)) => Some( - ips.into_iter() - .map(move |ip| SocketAddrV6::new(ip, port, 0, 0)), - ), + Ok((ips, port)) => Some(ips.into_iter().map(move |aaaa| { + SocketAddrV6::new(aaaa.into(), port, 0, 0) + })), Err((target, err)) => { error!( log, @@ -511,7 +525,7 @@ mod test { assert!( matches!( dns_error.kind(), - trust_dns_resolver::error::ResolveErrorKind::NoRecordsFound { .. }, + hickory_resolver::error::ResolveErrorKind::NoRecordsFound { .. }, ), "Saw error: {dns_error}", ); @@ -664,7 +678,7 @@ mod test { error, ResolveError::Resolve(error) if matches!(error.kind(), - trust_dns_resolver::error::ResolveErrorKind::NoRecordsFound { .. } + hickory_resolver::error::ResolveErrorKind::NoRecordsFound { .. } ) ); diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 1128cd8f0f..8977507505 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -35,6 +35,7 @@ futures.workspace = true gateway-client.workspace = true headers.workspace = true hex.workspace = true +hickory-resolver.workspace = true http.workspace = true hyper.workspace = true illumos-utils.workspace = true @@ -87,7 +88,6 @@ tokio = { workspace = true, features = ["full"] } tokio-postgres = { workspace = true, features = ["with-serde_json-1"] } tokio-util = { workspace = true, features = ["codec"] } tough.workspace = true -trust-dns-resolver.workspace = true uuid.workspace = true nexus-auth.workspace = true @@ -143,7 +143,7 @@ sp-sim.workspace = true rustls.workspace = true subprocess.workspace = true term.workspace = true -trust-dns-resolver.workspace = true +hickory-resolver.workspace = true tufaceous.workspace = true tufaceous-lib.workspace = true httptest.workspace = true diff --git a/nexus/src/app/external_dns.rs b/nexus/src/app/external_dns.rs index c6a8d833c2..4732146ce2 100644 --- a/nexus/src/app/external_dns.rs +++ b/nexus/src/app/external_dns.rs @@ -5,15 +5,15 @@ use std::net::IpAddr; use std::net::SocketAddr; +use hickory_resolver::config::NameServerConfig; +use hickory_resolver::config::Protocol; +use hickory_resolver::config::ResolverConfig; +use hickory_resolver::config::ResolverOpts; +use hickory_resolver::TokioAsyncResolver; use hyper::client::connect::dns::Name; use omicron_common::address::DNS_PORT; -use trust_dns_resolver::config::NameServerConfig; -use trust_dns_resolver::config::Protocol; -use trust_dns_resolver::config::ResolverConfig; -use trust_dns_resolver::config::ResolverOpts; -use trust_dns_resolver::TokioAsyncResolver; -/// Wrapper around trust-dns-resolver to provide name resolution +/// Wrapper around hickory-resolver to provide name resolution /// 
using a given set of DNS servers for use with reqwest. pub struct Resolver(TokioAsyncResolver); @@ -26,18 +26,17 @@ impl Resolver { socket_addr: SocketAddr::new(*addr, DNS_PORT), protocol: Protocol::Udp, tls_dns_name: None, - trust_nx_responses: false, + trust_negative_responses: false, bind_addr: None, }); } let mut opts = ResolverOpts::default(); + // Enable edns for potentially larger records + opts.edns0 = true; opts.use_hosts_file = false; // Do as many requests in parallel as we have configured servers opts.num_concurrent_reqs = dns_servers.len(); - Resolver( - TokioAsyncResolver::tokio(rc, opts) - .expect("creating resovler shouldn't fail"), - ) + Resolver(TokioAsyncResolver::tokio(rc, opts)) } } @@ -48,7 +47,7 @@ impl reqwest::dns::Resolve for Resolver { let ips = resolver.lookup_ip(name.as_str()).await?; let addrs = ips .into_iter() - // trust-dns-resolver returns `IpAddr`s but reqwest wants + // hickory-resolver returns `IpAddr`s but reqwest wants // `SocketAddr`s (useful if you have a custom resolver that // returns a scoped IPv6 address). The port provided here // is ignored in favour of the scheme default (http/80, diff --git a/nexus/test-utils/Cargo.toml b/nexus/test-utils/Cargo.toml index a883bc83c5..50110ecaca 100644 --- a/nexus/test-utils/Cargo.toml +++ b/nexus/test-utils/Cargo.toml @@ -46,7 +46,7 @@ sled-agent-client.workspace = true slog.workspace = true tokio.workspace = true tokio-util.workspace = true -trust-dns-resolver.workspace = true +hickory-resolver.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 7c190974a1..3dcffb399b 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -17,6 +17,11 @@ use dropshot::HandlerTaskMode; use futures::future::BoxFuture; use futures::FutureExt; use gateway_test_utils::setup::GatewayTestContext; +use hickory_resolver::config::NameServerConfig; +use hickory_resolver::config::Protocol; +use hickory_resolver::config::ResolverConfig; +use hickory_resolver::config::ResolverOpts; +use hickory_resolver::TokioAsyncResolver; use nexus_config::Database; use nexus_config::DpdConfig; use nexus_config::InternalDns; @@ -73,11 +78,6 @@ use std::collections::HashMap; use std::fmt::Debug; use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6}; use std::time::Duration; -use trust_dns_resolver::config::NameServerConfig; -use trust_dns_resolver::config::Protocol; -use trust_dns_resolver::config::ResolverConfig; -use trust_dns_resolver::config::ResolverOpts; -use trust_dns_resolver::TokioAsyncResolver; use uuid::Uuid; pub use sim::TEST_HARDWARE_THREADS; @@ -1586,12 +1586,12 @@ pub async fn start_dns_server( socket_addr: dns_server.local_address(), protocol: Protocol::Udp, tls_dns_name: None, - trust_nx_responses: false, + trust_negative_responses: false, bind_addr: None, }); - let resolver = - TokioAsyncResolver::tokio(resolver_config, ResolverOpts::default()) - .context("creating DNS resolver")?; + let mut resolver_opts = ResolverOpts::default(); + resolver_opts.edns0 = true; + let resolver = TokioAsyncResolver::tokio(resolver_config, resolver_opts); Ok((dns_server, http_server, resolver)) } diff --git a/nexus/tests/integration_tests/silos.rs b/nexus/tests/integration_tests/silos.rs index 2c861ff159..0de4d31395 100644 --- a/nexus/tests/integration_tests/silos.rs +++ b/nexus/tests/integration_tests/silos.rs @@ -37,6 +37,7 @@ use std::fmt::Write; use std::str::FromStr; use base64::Engine; +use 
hickory_resolver::error::ResolveErrorKind; use http::method::Method; use http::StatusCode; use httptest::{matchers::*, responders::*, Expectation, Server}; @@ -44,7 +45,6 @@ use nexus_types::external_api::shared::{FleetRole, SiloRole}; use std::convert::Infallible; use std::net::Ipv4Addr; use std::time::Duration; -use trust_dns_resolver::error::ResolveErrorKind; use uuid::Uuid; type ControlPlaneTestContext = @@ -2164,7 +2164,7 @@ pub async fn verify_silo_dns_name( .await { Ok(result) => { - let addrs: Vec<_> = result.iter().collect(); + let addrs: Vec<_> = result.iter().map(|a| &a.0).collect(); if addrs.is_empty() { false } else { diff --git a/wicketd/Cargo.toml b/wicketd/Cargo.toml index 324ae01b42..6e2c27a97e 100644 --- a/wicketd/Cargo.toml +++ b/wicketd/Cargo.toml @@ -25,6 +25,7 @@ flume.workspace = true futures.workspace = true gateway-messages.workspace = true hex.workspace = true +hickory-resolver.workspace = true http.workspace = true hubtools.workspace = true hyper.workspace = true @@ -46,7 +47,6 @@ tokio-stream.workspace = true tokio-util.workspace = true toml.workspace = true tough.workspace = true -trust-dns-resolver.workspace = true uuid.workspace = true bootstrap-agent-client.workspace = true diff --git a/wicketd/src/preflight_check/uplink.rs b/wicketd/src/preflight_check/uplink.rs index 36a4f61779..fb0914e836 100644 --- a/wicketd/src/preflight_check/uplink.rs +++ b/wicketd/src/preflight_check/uplink.rs @@ -14,6 +14,11 @@ use dpd_client::types::PortSpeed as DpdPortSpeed; use dpd_client::Client as DpdClient; use dpd_client::ClientState as DpdClientState; use either::Either; +use hickory_resolver::config::NameServerConfigGroup; +use hickory_resolver::config::ResolverConfig; +use hickory_resolver::config::ResolverOpts; +use hickory_resolver::error::ResolveErrorKind; +use hickory_resolver::TokioAsyncResolver; use illumos_utils::zone::SVCCFG; use illumos_utils::PFEXEC; use omicron_common::address::DENDRITE_PORT; @@ -35,12 +40,6 @@ use std::time::Duration; use std::time::Instant; use tokio::process::Command; use tokio::sync::mpsc; -use trust_dns_resolver::config::NameServerConfigGroup; -use trust_dns_resolver::config::ResolverConfig; -use trust_dns_resolver::config::ResolverOpts; -use trust_dns_resolver::error::ResolveError; -use trust_dns_resolver::error::ResolveErrorKind; -use trust_dns_resolver::TokioAsyncResolver; use wicket_common::preflight_check::EventBuffer; use wicket_common::preflight_check::StepContext; use wicket_common::preflight_check::StepProgress; @@ -930,16 +929,7 @@ impl DnsLookupStep { }; 'dns_servers: for &dns_ip in dns_servers { - let resolver = match self.build_resolver(dns_ip) { - Ok(resolver) => resolver, - Err(err) => { - self.warnings.push(format!( - "failed to create resolver for {dns_ip}: {}", - DisplayErrorChain::new(&err) - )); - continue; - } - }; + let resolver = self.build_resolver(dns_ip); // Attempt to resolve any NTP servers that aren't IP addresses. 
for &ntp_name in &ntp_names_to_resolve {
@@ -1052,14 +1042,18 @@ impl DnsLookupStep {
                 (
                     "A",
                     resolver.ipv4_lookup(name).await.map(|records| {
-                        Either::Left(records.into_iter().map(IpAddr::V4))
+                        Either::Left(
+                            records.into_iter().map(|x| IpAddr::V4(x.into())),
+                        )
                     }),
                 )
             } else {
                 (
                     "AAAA",
                     resolver.ipv6_lookup(name).await.map(|records| {
-                        Either::Right(records.into_iter().map(IpAddr::V6))
+                        Either::Right(
+                            records.into_iter().map(|x| IpAddr::V6(x.into())),
+                        )
                     }),
                 )
             };
@@ -1175,12 +1169,12 @@ impl DnsLookupStep {
     ///
     /// If building it fails, we'll append to our internal `warnings` and return
     /// `None`.
-    fn build_resolver(
-        &mut self,
-        dns_ip: IpAddr,
-    ) -> Result<TokioAsyncResolver, ResolveError> {
+    fn build_resolver(&mut self, dns_ip: IpAddr) -> TokioAsyncResolver {
         let mut options = ResolverOpts::default();
 
+        // Enable edns for potentially larger records
+        options.edns0 = true;
+
         // We will retry ourselves; we don't want the resolver
         // retrying internally too.
         options.attempts = 1;
diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml
index 35c266cdf3..5edfbccf93 100644
--- a/workspace-hack/Cargo.toml
+++ b/workspace-hack/Cargo.toml
@@ -60,6 +60,7 @@ getrandom = { version = "0.2.14", default-features = false, features = ["js", "r
 group = { version = "0.13.0", default-features = false, features = ["alloc"] }
 hashbrown = { version = "0.14.5", features = ["raw"] }
 hex = { version = "0.4.3", features = ["serde"] }
+hickory-proto = { version = "0.24.1", features = ["text-parsing"] }
 hmac = { version = "0.12.1", default-features = false, features = ["reset"] }
 hyper = { version = "0.14.30", features = ["full"] }
 indexmap = { version = "2.4.0", features = ["serde"] }
@@ -113,7 +114,6 @@ tokio-util = { version = "0.7.11", features = ["codec", "io-util"] }
 toml = { version = "0.7.8" }
 toml_edit-3c51e837cfc5589a = { package = "toml_edit", version = "0.22.20", features = ["serde"] }
 tracing = { version = "0.1.40", features = ["log"] }
-trust-dns-proto = { version = "0.22.0" }
 unicode-bidi = { version = "0.3.15" }
 unicode-normalization = { version = "0.1.23" }
 usdt = { version = "0.5.0" }
@@ -167,6 +167,7 @@ getrandom = { version = "0.2.14", default-features = false, features = ["js", "r
 group = { version = "0.13.0", default-features = false, features = ["alloc"] }
 hashbrown = { version = "0.14.5", features = ["raw"] }
 hex = { version = "0.4.3", features = ["serde"] }
+hickory-proto = { version = "0.24.1", features = ["text-parsing"] }
 hmac = { version = "0.12.1", default-features = false, features = ["reset"] }
 hyper = { version = "0.14.30", features = ["full"] }
 indexmap = { version = "2.4.0", features = ["serde"] }
@@ -222,7 +223,6 @@ tokio-util = { version = "0.7.11", features = ["codec", "io-util"] }
 toml = { version = "0.7.8" }
 toml_edit-3c51e837cfc5589a = { package = "toml_edit", version = "0.22.20", features = ["serde"] }
 tracing = { version = "0.1.40", features = ["log"] }
-trust-dns-proto = { version = "0.22.0" }
 unicode-bidi = { version = "0.3.15" }
 unicode-normalization = { version = "0.1.23" }
 unicode-xid = { version = "0.2.4" }

From 8ae0833152dd4b01ad7426f5c931f1e74f99cbd4 Mon Sep 17 00:00:00 2001
From: iximeow
Date: Fri, 16 Aug 2024 03:40:56 +0000
Subject: [PATCH 50/91] [internal-dns] remove lookup_ipv6 in favor of
 lookup_socket_v6 (#6320)

`lookup_ipv6` is both buggy and easy to misuse:

* it sends an AAAA query for a domain which should have a SRV record -
  this works only because
  https://github.com/oxidecomputer/omicron/issues/4051 means the SRV
  record is incorrectly returned, along with the actually-desired AAAA
  for the SRV's target in Additionals
* it looks up an IPv6 address from a SRV record *but ignores the port*.
  in places `lookup_ipv6` was used, it was paired consistently with the
  hardcoded port NEXUS_INTERNAL_PORT and matched what should be in the
  resolved SRV record. if we for example wanted to move Nexus' port (or
  start a test Nexus on an atypical port), the authoritative port
  number in the SRV response would be ignored for the hardcoded port.
  let's just use the port that we told DNS we're at!

we may still want a bare IPv6 address for a service if we're going to
test network reachability, for example, but we're not doing that with
this function today.

this all is distinct from helpers like `lookup_all_ipv6`. if we need a
service's IPv6 address to use with an alternate port to access a
different API, we *probably* should have a distinct SRV record for that
lookup to use instead? i've found three instances of this:

* wicket assumes the techport proxy is on the same IP as Nexus' API,
  but that isn't necessarily true
* we assume the CRDB admin service listens on the same IP as CRDB
  itself, but that doesn't have to be true
* we look up addresses for MGS via `ServiceName::Dendrite`, but there's
  a `ServiceName::ManagementGatewayService`, so either that's a typo or
  can be made to have its own SRV records

there are some uses of `lookup_all_ipv6` that make a lot of sense
still, where we're discovering the rack's network and _really_ do not
care about the port that Dendrite happens to be on.
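to make the hazard concrete, here's the shape of the change at a
typical call site (a sketch built only from names that appear in the
diffs below, not new API):

    // before: resolve an AAAA, then bolt on a hardcoded port
    let ip = resolver.lookup_ipv6(ServiceName::Nexus).await?;
    let addr = SocketAddrV6::new(ip, NEXUS_INTERNAL_PORT, 0, 0);

    // after: address *and* authoritative port come from the SRV record
    let addr = resolver.lookup_socket_v6(ServiceName::Nexus).await?;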
---
 internal-dns/src/resolver.rs    | 73 ++++++++++++++-------------
 oximeter/collector/src/agent.rs |  8 +---
 oximeter/collector/src/lib.rs   | 17 ++++----
 3 files changed, 40 insertions(+), 58 deletions(-)

diff --git a/internal-dns/src/resolver.rs b/internal-dns/src/resolver.rs
index b3dadf16d2..5d3832a417 100644
--- a/internal-dns/src/resolver.rs
+++ b/internal-dns/src/resolver.rs
@@ -160,27 +160,6 @@ impl Resolver {
         self.resolver.clear_cache();
     }
 
-    /// Looks up a single [`Ipv6Addr`] based on the SRV name.
-    /// Returns an error if the record does not exist.
-    // TODO: There are lots of ways this API can expand: Caching,
-    // actually respecting TTL, looking up ports, etc.
-    //
-    // For now, however, it serves as a very simple "get everyone using DNS"
-    // API that can be improved upon later.
-    pub async fn lookup_ipv6(
-        &self,
-        srv: crate::ServiceName,
-    ) -> Result<Ipv6Addr, ResolveError> {
-        let name = srv.srv_name();
-        debug!(self.log, "lookup_ipv6 srv"; "dns_name" => &name);
-        let response = self.resolver.ipv6_lookup(&name).await?;
-        let address = response
-            .iter()
-            .next()
-            .ok_or_else(|| ResolveError::NotFound(srv))?;
-        Ok(address.0)
-    }
-
     /// Returns the targets of the SRV records for a DNS name
     ///
     /// The returned values are generally other DNS names that themselves would
@@ -235,6 +214,12 @@ impl Resolver {
     // TODO-robustness: any callers of this should probably be using
     // all the targets for a given SRV and not just the first one
     // we get, see [`Resolver::lookup_all_socket_v6`].
+    //
+    // TODO: There are lots of ways this API can expand: Caching,
+    // actually respecting TTL, looking up ports, etc.
+    //
+    // For now, however, it serves as a very simple "get everyone using DNS"
+    // API that can be improved upon later.
pub async fn lookup_socket_v6( &self, service: crate::ServiceName, @@ -549,11 +534,11 @@ mod test { dns_server.update(&dns_config).await.unwrap(); let resolver = dns_server.resolver().unwrap(); - let found_ip = resolver - .lookup_ipv6(ServiceName::Cockroach) + let found_addr = resolver + .lookup_socket_v6(ServiceName::Cockroach) .await .expect("Should have been able to look up IP address"); - assert_eq!(found_ip, ip,); + assert_eq!(found_addr.ip(), &ip,); dns_server.cleanup_successful(); logctx.cleanup_successful(); @@ -631,11 +616,13 @@ mod test { // Look up Cockroach let resolver = dns_server.resolver().unwrap(); - let ip = resolver - .lookup_ipv6(ServiceName::Cockroach) + let resolved_addr = resolver + .lookup_socket_v6(ServiceName::Cockroach) .await .expect("Should have been able to look up IP address"); - assert!(cockroach_addrs.iter().any(|addr| addr.ip() == &ip)); + assert!(cockroach_addrs + .iter() + .any(|addr| addr.ip() == resolved_addr.ip())); // Look up all the Cockroach addresses. let mut ips = @@ -649,18 +636,18 @@ mod test { ); // Look up Clickhouse - let ip = resolver - .lookup_ipv6(ServiceName::Clickhouse) + let addr = resolver + .lookup_socket_v6(ServiceName::Clickhouse) .await .expect("Should have been able to look up IP address"); - assert_eq!(&ip, clickhouse_addr.ip()); + assert_eq!(addr.ip(), clickhouse_addr.ip()); // Look up Backend Service - let ip = resolver - .lookup_ipv6(srv_backend) + let addr = resolver + .lookup_socket_v6(srv_backend) .await .expect("Should have been able to look up IP address"); - assert_eq!(&ip, crucible_addr.ip()); + assert_eq!(addr.ip(), crucible_addr.ip()); // If we deploy a new generation that removes all records, then we don't // find anything any more. @@ -671,7 +658,7 @@ mod test { // If we remove the records for all services, we won't find them any // more. (e.g., there's no hidden caching going on) let error = resolver - .lookup_ipv6(ServiceName::Cockroach) + .lookup_socket_v6(ServiceName::Cockroach) .await .expect_err("unexpectedly found records"); assert_matches!( @@ -708,11 +695,11 @@ mod test { dns_builder.service_backend_zone(srv_crdb, &zone, 12345).unwrap(); let dns_config = dns_builder.build_full_config_for_initial_generation(); dns_server.update(&dns_config).await.unwrap(); - let found_ip = resolver - .lookup_ipv6(ServiceName::Cockroach) + let found_addr = resolver + .lookup_socket_v6(ServiceName::Cockroach) .await .expect("Should have been able to look up IP address"); - assert_eq!(found_ip, ip1); + assert_eq!(found_addr.ip(), &ip1); // If we insert the same record with a new address, it should be // updated. @@ -726,11 +713,11 @@ mod test { dns_builder.build_full_config_for_initial_generation(); dns_config.generation += 1; dns_server.update(&dns_config).await.unwrap(); - let found_ip = resolver - .lookup_ipv6(ServiceName::Cockroach) + let found_addr = resolver + .lookup_socket_v6(ServiceName::Cockroach) .await .expect("Should have been able to look up IP address"); - assert_eq!(found_ip, ip2); + assert_eq!(found_addr.ip(), &ip2); dns_server.cleanup_successful(); logctx.cleanup_successful(); @@ -861,11 +848,11 @@ mod test { dns_server.update(&dns_config).await.unwrap(); // Confirm that we can access this record manually. 
- let found_ip = resolver - .lookup_ipv6(ServiceName::Nexus) + let found_addr = resolver + .lookup_socket_v6(ServiceName::Nexus) .await .expect("Should have been able to look up IP address"); - assert_eq!(found_ip, ip); + assert_eq!(found_addr.ip(), &ip); // Confirm that the progenitor client can access this record too. let value = client.test_endpoint().await.unwrap(); diff --git a/oximeter/collector/src/agent.rs b/oximeter/collector/src/agent.rs index 5da9a1dfa8..8271b2e068 100644 --- a/oximeter/collector/src/agent.rs +++ b/oximeter/collector/src/agent.rs @@ -18,7 +18,6 @@ use internal_dns::resolver::Resolver; use internal_dns::ServiceName; use nexus_client::types::IdSortMode; use omicron_common::address::CLICKHOUSE_PORT; -use omicron_common::address::NEXUS_INTERNAL_PORT; use omicron_common::backoff; use omicron_common::backoff::BackoffError; use oximeter::types::ProducerResults; @@ -816,7 +815,7 @@ async fn refresh_producer_list(agent: OximeterAgent, resolver: Resolver) { async fn resolve_nexus_with_backoff( log: &Logger, resolver: &Resolver, -) -> SocketAddr { +) -> SocketAddrV6 { let log_failure = |error, delay| { warn!( log, @@ -827,12 +826,9 @@ async fn resolve_nexus_with_backoff( }; let do_lookup = || async { resolver - .lookup_ipv6(ServiceName::Nexus) + .lookup_socket_v6(ServiceName::Nexus) .await .map_err(|e| BackoffError::transient(e.to_string())) - .map(|ip| { - SocketAddr::V6(SocketAddrV6::new(ip, NEXUS_INTERNAL_PORT, 0, 0)) - }) }; backoff::retry_notify( backoff::retry_policy_internal_service(), diff --git a/oximeter/collector/src/lib.rs b/oximeter/collector/src/lib.rs index 02bf9152f4..7dd423d074 100644 --- a/oximeter/collector/src/lib.rs +++ b/oximeter/collector/src/lib.rs @@ -14,7 +14,6 @@ use dropshot::HttpServerStarter; use internal_dns::resolver::ResolveError; use internal_dns::resolver::Resolver; use internal_dns::ServiceName; -use omicron_common::address::NEXUS_INTERNAL_PORT; use omicron_common::api::internal::nexus::ProducerEndpoint; use omicron_common::backoff; use omicron_common::FileKv; @@ -251,14 +250,14 @@ impl Oximeter { let nexus_address = if let Some(address) = config.nexus_address { address } else { - SocketAddr::V6(SocketAddrV6::new( - resolver.lookup_ipv6(ServiceName::Nexus).await.map_err( - |e| backoff::BackoffError::transient(e.to_string()), - )?, - NEXUS_INTERNAL_PORT, - 0, - 0, - )) + SocketAddr::V6( + resolver + .lookup_socket_v6(ServiceName::Nexus) + .await + .map_err(|e| { + backoff::BackoffError::transient(e.to_string()) + })?, + ) }; let client = nexus_client::Client::new( &format!("http://{nexus_address}"), From d7d4beaf0dbfa82c6ae0da10a6ce43b3c5a89142 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Fri, 16 Aug 2024 04:47:25 +0000 Subject: [PATCH 51/91] Update Rust crate tokio to 1.39.2 (#6249) --- Cargo.lock | 15 +++++++-------- Cargo.toml | 2 +- workspace-hack/Cargo.toml | 20 ++++++++++---------- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5fd17fd158..f561aed2ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6488,7 +6488,7 @@ dependencies = [ "log", "managed", "memchr", - "mio 0.8.11", + "mio 1.0.2", "nix 0.28.0", "nom", "num-bigint-dig", @@ -10546,28 +10546,27 @@ dependencies = [ [[package]] name = "tokio" -version = "1.38.1" +version = "1.39.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df" +checksum = 
"daa4fb1bc778bd6f04cbfc4bb2d06a7396a8f299dc33ea1900cedaa316f467b1" dependencies = [ "backtrace", "bytes", "libc", - "mio 0.8.11", - "num_cpus", + "mio 1.0.2", "parking_lot 0.12.2", "pin-project-lite", "signal-hook-registry", "socket2 0.5.7", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "tokio-macros" -version = "2.3.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 2bb189b6c0..1b62af959d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -568,7 +568,7 @@ textwrap = "0.16.1" test-strategy = "0.3.1" thiserror = "1.0" tofino = { git = "https://github.com/oxidecomputer/tofino", branch = "main" } -tokio = "1.38.1" +tokio = "1.39.2" tokio-postgres = { version = "0.7", features = [ "with-chrono-0_4", "with-uuid-1" ] } tokio-stream = "0.1.15" tokio-tungstenite = "0.20" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 5edfbccf93..7983c38052 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -107,7 +107,7 @@ string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.74", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time = { version = "0.3.36", features = ["formatting", "local-offset", "macros", "parsing"] } -tokio = { version = "1.38.1", features = ["full", "test-util"] } +tokio = { version = "1.39.2", features = ["full", "test-util"] } tokio-postgres = { version = "0.7.11", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } tokio-stream = { version = "0.1.15", features = ["net"] } tokio-util = { version = "0.7.11", features = ["codec", "io-util"] } @@ -216,7 +216,7 @@ syn-dff4ba8e3ae991db = { package = "syn", version = "1.0.109", features = ["extr syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.74", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time = { version = "0.3.36", features = ["formatting", "local-offset", "macros", "parsing"] } time-macros = { version = "0.2.18", default-features = false, features = ["formatting", "parsing"] } -tokio = { version = "1.38.1", features = ["full", "test-util"] } +tokio = { version = "1.39.2", features = ["full", "test-util"] } tokio-postgres = { version = "0.7.11", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } tokio-stream = { version = "0.1.15", features = ["net"] } tokio-util = { version = "0.7.11", features = ["codec", "io-util"] } @@ -235,7 +235,7 @@ zeroize = { version = "1.7.0", features = ["std", "zeroize_derive"] } [target.x86_64-unknown-linux-gnu.dependencies] dof = { version = "0.3.0", default-features = false, features = ["des"] } linux-raw-sys = { version = "0.4.13", default-features = false, features = ["elf", "errno", "general", "ioctl", "no_std", "std", "system"] } -mio = { version = "0.8.11", features = ["net", "os-ext"] } +mio = { version = "1.0.2", features = ["net", "os-ext"] } nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } @@ -244,35 +244,35 @@ signal-hook-mio = { version = "0.2.4", default-features = false, features = ["su 
[target.x86_64-unknown-linux-gnu.build-dependencies] dof = { version = "0.3.0", default-features = false, features = ["des"] } linux-raw-sys = { version = "0.4.13", default-features = false, features = ["elf", "errno", "general", "ioctl", "no_std", "std", "system"] } -mio = { version = "0.8.11", features = ["net", "os-ext"] } +mio = { version = "1.0.2", features = ["net", "os-ext"] } nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.x86_64-apple-darwin.dependencies] -mio = { version = "0.8.11", features = ["net", "os-ext"] } +mio = { version = "1.0.2", features = ["net", "os-ext"] } nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.x86_64-apple-darwin.build-dependencies] -mio = { version = "0.8.11", features = ["net", "os-ext"] } +mio = { version = "1.0.2", features = ["net", "os-ext"] } nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.aarch64-apple-darwin.dependencies] -mio = { version = "0.8.11", features = ["net", "os-ext"] } +mio = { version = "1.0.2", features = ["net", "os-ext"] } nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.aarch64-apple-darwin.build-dependencies] -mio = { version = "0.8.11", features = ["net", "os-ext"] } +mio = { version = "1.0.2", features = ["net", "os-ext"] } nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } @@ -280,7 +280,7 @@ signal-hook-mio = { version = "0.2.4", default-features = false, features = ["su [target.x86_64-unknown-illumos.dependencies] dof = { version = "0.3.0", default-features = false, features = ["des"] } -mio = { version = "0.8.11", features = ["net", "os-ext"] } +mio = { version = "1.0.2", features = ["net", "os-ext"] } nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } @@ -290,7 +290,7 @@ toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", featu [target.x86_64-unknown-illumos.build-dependencies] dof = { version = "0.3.0", default-features = false, features = ["des"] } -mio = { version = "0.8.11", features = ["net", "os-ext"] } +mio = { version = "1.0.2", features = ["net", "os-ext"] } nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", 
"signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } From 3c585f2180f149c532e268eaaed17329fdc935e4 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Fri, 16 Aug 2024 06:22:35 +0000 Subject: [PATCH 52/91] Update Rust crate camino to v1.1.8 (#6357) --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f561aed2ae..a62b4e2983 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -815,9 +815,9 @@ dependencies = [ [[package]] name = "camino" -version = "1.1.7" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0ec6b951b160caa93cc0c7b209e5a3bff7aae9062213451ac99493cd844c239" +checksum = "3054fea8a20d8ff3968d5b22cc27501d2b08dc4decdb31b184323f00c5ef23bb" dependencies = [ "serde", ] From a7885d1da2e2f60c2cc648d9b80d25c1216fde3b Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:41:12 -0700 Subject: [PATCH 53/91] Update Rust crate clap to v4.5.16 (#6360) --- Cargo.lock | 4 ++-- workspace-hack/Cargo.toml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a62b4e2983..849c82f1f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1074,9 +1074,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.15" +version = "4.5.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d8838454fda655dafd3accb2b6e2bea645b9e4078abe84a22ceb947235c5cc" +checksum = "ed6719fffa43d0d87e5fd8caeab59be1554fb028cd30edc88fc4369b17971019" dependencies = [ "clap_builder", "clap_derive", diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 7983c38052..5dc3bc11e7 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -33,7 +33,7 @@ byteorder = { version = "1.5.0" } bytes = { version = "1.7.1", features = ["serde"] } chrono = { version = "0.4.38", features = ["serde"] } cipher = { version = "0.4.4", default-features = false, features = ["block-padding", "zeroize"] } -clap = { version = "4.5.15", features = ["cargo", "derive", "env", "wrap_help"] } +clap = { version = "4.5.16", features = ["cargo", "derive", "env", "wrap_help"] } clap_builder = { version = "4.5.15", default-features = false, features = ["cargo", "color", "env", "std", "suggestions", "usage", "wrap_help"] } console = { version = "0.15.8" } const-oid = { version = "0.9.6", default-features = false, features = ["db", "std"] } @@ -140,7 +140,7 @@ bytes = { version = "1.7.1", features = ["serde"] } cc = { version = "1.0.97", default-features = false, features = ["parallel"] } chrono = { version = "0.4.38", features = ["serde"] } cipher = { version = "0.4.4", default-features = false, features = ["block-padding", "zeroize"] } -clap = { version = "4.5.15", features = ["cargo", "derive", "env", "wrap_help"] } +clap = { version = "4.5.16", features = ["cargo", "derive", "env", "wrap_help"] } clap_builder = { version = "4.5.15", default-features = false, features = ["cargo", "color", "env", "std", "suggestions", "usage", "wrap_help"] } console = { version = "0.15.8" } const-oid = { version = "0.9.6", default-features = false, features = ["db", "std"] } From 921ec6d58ca1e0f5797924050e585bb7803618db Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 16 Aug 2024 15:04:19 -0700 Subject: [PATCH 54/91] remove redundant background 
task log entries (#6323) --- nexus/src/app/background/tasks/lookup_region_port.rs | 3 --- nexus/src/app/background/tasks/phantom_disks.rs | 4 +--- .../app/background/tasks/physical_disk_adoption.rs | 3 --- nexus/src/app/background/tasks/region_replacement.rs | 3 --- .../app/background/tasks/region_replacement_driver.rs | 5 ----- .../region_snapshot_replacement_garbage_collect.rs | 11 ----------- .../tasks/region_snapshot_replacement_start.rs | 5 ----- 7 files changed, 1 insertion(+), 33 deletions(-) diff --git a/nexus/src/app/background/tasks/lookup_region_port.rs b/nexus/src/app/background/tasks/lookup_region_port.rs index fbfc5c5af2..df501fe6b1 100644 --- a/nexus/src/app/background/tasks/lookup_region_port.rs +++ b/nexus/src/app/background/tasks/lookup_region_port.rs @@ -53,7 +53,6 @@ impl BackgroundTask for LookupRegionPort { ) -> BoxFuture<'a, serde_json::Value> { async { let log = &opctx.log; - info!(&log, "lookup region port task started"); let mut status = LookupRegionPortStatus::default(); @@ -147,8 +146,6 @@ impl BackgroundTask for LookupRegionPort { } } - info!(&log, "lookup region port task done"); - json!(status) } .boxed() diff --git a/nexus/src/app/background/tasks/phantom_disks.rs b/nexus/src/app/background/tasks/phantom_disks.rs index 4b0d8bec38..7f3fceab1c 100644 --- a/nexus/src/app/background/tasks/phantom_disks.rs +++ b/nexus/src/app/background/tasks/phantom_disks.rs @@ -43,7 +43,6 @@ impl BackgroundTask for PhantomDiskDetector { ) -> BoxFuture<'a, serde_json::Value> { async { let log = &opctx.log; - warn!(&log, "phantom disk task started"); let phantom_disks = match self.datastore.find_phantom_disks().await { @@ -83,14 +82,13 @@ impl BackgroundTask for PhantomDiskDetector { } else { info!( &log, - "phandom disk {} un-deleted andset to faulted ok", + "phandom disk {} un-deleted and set to faulted ok", disk.id(), ); phantom_disk_deleted_ok += 1; } } - warn!(&log, "phantom disk task done"); json!({ "phantom_disk_deleted_ok": phantom_disk_deleted_ok, "phantom_disk_deleted_err": phantom_disk_deleted_err, diff --git a/nexus/src/app/background/tasks/physical_disk_adoption.rs b/nexus/src/app/background/tasks/physical_disk_adoption.rs index f3b9e8ac62..b1eceed0b6 100644 --- a/nexus/src/app/background/tasks/physical_disk_adoption.rs +++ b/nexus/src/app/background/tasks/physical_disk_adoption.rs @@ -96,8 +96,6 @@ impl BackgroundTask for PhysicalDiskAdoption { } let mut disks_added = 0; - let log = &opctx.log; - warn!(&log, "physical disk adoption task started"); let collection_id = *self.rx_inventory_collection.borrow(); let Some(collection_id) = collection_id else { @@ -171,7 +169,6 @@ impl BackgroundTask for PhysicalDiskAdoption { ); } - warn!(&log, "physical disk adoption task done"); json!({ "physical_disks_added": disks_added, }) diff --git a/nexus/src/app/background/tasks/region_replacement.rs b/nexus/src/app/background/tasks/region_replacement.rs index f852f21734..ba0e7f86fb 100644 --- a/nexus/src/app/background/tasks/region_replacement.rs +++ b/nexus/src/app/background/tasks/region_replacement.rs @@ -61,7 +61,6 @@ impl BackgroundTask for RegionReplacementDetector { ) -> BoxFuture<'a, serde_json::Value> { async { let log = &opctx.log; - warn!(&log, "region replacement task started"); let mut ok = 0; let mut err = 0; @@ -182,8 +181,6 @@ impl BackgroundTask for RegionReplacementDetector { } } - warn!(&log, "region replacement task done"); - json!({ "region_replacement_started_ok": ok, "region_replacement_started_err": err, diff --git 
a/nexus/src/app/background/tasks/region_replacement_driver.rs b/nexus/src/app/background/tasks/region_replacement_driver.rs index 284ed2c368..02db86eab3 100644 --- a/nexus/src/app/background/tasks/region_replacement_driver.rs +++ b/nexus/src/app/background/tasks/region_replacement_driver.rs @@ -227,16 +227,11 @@ impl BackgroundTask for RegionReplacementDriver { opctx: &'a OpContext, ) -> BoxFuture<'a, serde_json::Value> { async { - let log = &opctx.log; - info!(&log, "region replacement driver task started"); - let mut status = RegionReplacementDriverStatus::default(); self.drive_running_replacements_forward(opctx, &mut status).await; self.complete_done_replacements(opctx, &mut status).await; - info!(&log, "region replacement driver task done"); - json!(status) } .boxed() diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs index 4c66c166ff..77dc87c060 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs @@ -129,12 +129,6 @@ impl BackgroundTask for RegionSnapshotReplacementGarbageCollect { opctx: &'a OpContext, ) -> BoxFuture<'a, serde_json::Value> { async move { - let log = &opctx.log; - info!( - &log, - "region snapshot replacement garbage collect task started", - ); - let mut status = RegionSnapshotReplacementGarbageCollectStatus::default(); @@ -144,11 +138,6 @@ impl BackgroundTask for RegionSnapshotReplacementGarbageCollect { ) .await; - info!( - &log, - "region snapshot replacement garbage collect task done" - ); - json!(status) } .boxed() diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs index 9bc66d48c8..1fdc17690d 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs @@ -232,9 +232,6 @@ impl BackgroundTask for RegionSnapshotReplacementDetector { opctx: &'a OpContext, ) -> BoxFuture<'a, serde_json::Value> { async { - let log = &opctx.log; - info!(&log, "region snapshot replacement start task started"); - let mut status = RegionSnapshotReplacementStartStatus::default(); self.create_requests_for_region_snapshots_on_expunged_disks( @@ -249,8 +246,6 @@ impl BackgroundTask for RegionSnapshotReplacementDetector { ) .await; - info!(&log, "region snapshot replacement start task done"); - json!(status) } .boxed() From 6bb3c13e79488efccd39b6daa8f9def6a727616f Mon Sep 17 00:00:00 2001 From: David Crespo Date: Fri, 16 Aug 2024 17:09:48 -0500 Subject: [PATCH 55/91] Bump web console (vpc routers + routes, instance polling, edit quotas) (#6366) https://github.com/oxidecomputer/console/compare/17ae890c...33b7a505 * [33b7a505](https://github.com/oxidecomputer/console/commit/33b7a505) oxidecomputer/console#2360 * [1a2cb52d](https://github.com/oxidecomputer/console/commit/1a2cb52d) oxidecomputer/console#2369 * [9e831174](https://github.com/oxidecomputer/console/commit/9e831174) oxidecomputer/console#2374 * [e30f2eb8](https://github.com/oxidecomputer/console/commit/e30f2eb8) oxidecomputer/console#2373 * [bb53f1b2](https://github.com/oxidecomputer/console/commit/bb53f1b2) oxidecomputer/console#2371 * [29398e74](https://github.com/oxidecomputer/console/commit/29398e74) oxidecomputer/console#2343 * [68e2dc89](https://github.com/oxidecomputer/console/commit/68e2dc89) 
oxidecomputer/console#2359 * [11e29ed8](https://github.com/oxidecomputer/console/commit/11e29ed8) bump omicron to latest main * [b6ed3757](https://github.com/oxidecomputer/console/commit/b6ed3757) oxidecomputer/console#2370 * [af6c1f4a](https://github.com/oxidecomputer/console/commit/af6c1f4a) oxidecomputer/console#2368 * [60ef745c](https://github.com/oxidecomputer/console/commit/60ef745c) disallow unreachable code in ts config, fix one case of it * [3a6f815a](https://github.com/oxidecomputer/console/commit/3a6f815a) oxidecomputer/console#2364 * [80b3f2f3](https://github.com/oxidecomputer/console/commit/80b3f2f3) oxidecomputer/console#2366 * [dab60d9d](https://github.com/oxidecomputer/console/commit/dab60d9d) oxidecomputer/console#2358 * [8e3314f1](https://github.com/oxidecomputer/console/commit/8e3314f1) oxidecomputer/console#2362 * [9b5cdfa0](https://github.com/oxidecomputer/console/commit/9b5cdfa0) bump TS generator for bugfix (just adds whitespace) * [07b6c151](https://github.com/oxidecomputer/console/commit/07b6c151) oxidecomputer/console#2349 * [d32fddc2](https://github.com/oxidecomputer/console/commit/d32fddc2) Revert "Focus confirm button instead of cancel in modals (oxidecomputer/console#2340)" * [84a1501e](https://github.com/oxidecomputer/console/commit/84a1501e) oxidecomputer/console#2341 * [6615cb6b](https://github.com/oxidecomputer/console/commit/6615cb6b) oxidecomputer/console#2340 * [e48b0096](https://github.com/oxidecomputer/console/commit/e48b0096) delete unused vscode tasks * [22a6c50f](https://github.com/oxidecomputer/console/commit/22a6c50f) tighten TypeValueCell spacing * [4eacb3d7](https://github.com/oxidecomputer/console/commit/4eacb3d7) oxidecomputer/console#2338 * [f278a747](https://github.com/oxidecomputer/console/commit/f278a747) oxidecomputer/console#2332 * [016ad1b4](https://github.com/oxidecomputer/console/commit/016ad1b4) oxidecomputer/console#2337 * [2d1a22a2](https://github.com/oxidecomputer/console/commit/2d1a22a2) oxidecomputer/console#2336 * [be0f087f](https://github.com/oxidecomputer/console/commit/be0f087f) oxidecomputer/console#2329 --- tools/console_version | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/console_version b/tools/console_version index 4f67064733..994d30396b 100644 --- a/tools/console_version +++ b/tools/console_version @@ -1,2 +1,2 @@ -COMMIT="17ae890c68a5277fbefe773694e790a8f1b178b4" -SHA2="273a31ba14546305bfafeb9aedb2d9a7530328a0359cda363380c9ca3240b948" +COMMIT="33b7a505a222b258a155636e8ee79c7ee3c132d2" +SHA2="f9089e18d52d7a54149b364a0b3ae4efba421c13eca6f7752a23b74dc3fa1a8e" From c86ff799803a918d858b744478af3100de7d927c Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 16 Aug 2024 15:56:32 -0700 Subject: [PATCH 56/91] clear `OMDB_` environment variables when running omdb tests (#6368) --- dev-tools/omdb/tests/test_all_output.rs | 24 ++++++++++++++++++++++++ nexus/test-utils/src/lib.rs | 1 + 2 files changed, 25 insertions(+) diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index 45492c14ce..1afee71122 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -56,6 +56,7 @@ fn assert_oximeter_list_producers_output( #[tokio::test] async fn test_omdb_usage_errors() { + clear_omdb_env(); let cmd_path = path_to_executable(CMD_OMDB); let mut output = String::new(); let invocations: &[&[&'static str]] = &[ @@ -111,6 +112,8 @@ async fn test_omdb_usage_errors() { #[nexus_test] async fn test_omdb_success_cases(cptestctx: 
&ControlPlaneTestContext) { + clear_omdb_env(); + let gwtestctx = gateway_test_utils::setup::test_setup( "test_omdb_success_case", gateway_messages::SpPort::One, @@ -271,6 +274,8 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { /// that's covered by the success tests above. #[nexus_test] async fn test_omdb_env_settings(cptestctx: &ControlPlaneTestContext) { + clear_omdb_env(); + let cmd_path = path_to_executable(CMD_OMDB); let postgres_url = cptestctx.database.listen_url().to_string(); let nexus_internal_url = @@ -504,3 +509,22 @@ async fn do_run_extra( write!(output, "=============================================\n").unwrap(); } + +// We're testing behavior that can be affected by OMDB-related environment +// variables. Clear all of them from the current process so that all child +// processes don't have them. OMDB environment variables can affect even the +// help output provided by clap. See clap-rs/clap#5673 for an example. +fn clear_omdb_env() { + // Rust documents that it's not safe to manipulate the environment in a + // multi-threaded process outside of Windows because it's possible that + // other threads are reading or writing the environment and most systems do + // not support this. On illumos, the underlying interfaces are broadly + // thread-safe. Further, Omicron only supports running tests under `cargo + // nextest`, in which case there are no threads running concurrently here + // that may be reading or modifying the environment. + for (env_var, _) in std::env::vars().filter(|(k, _)| k.starts_with("OMDB_")) + { + eprintln!("removing {:?} from environment", env_var); + std::env::remove_var(env_var); + } +} diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 3dcffb399b..ea46f2d017 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -4,6 +4,7 @@ //! Integration testing facilities for Nexus +#[cfg(feature = "omicron-dev")] use anyhow::Context; use anyhow::Result; use camino::Utf8Path; From f334531ecf2c570fa1be931fcb518911bc2b6e1c Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Fri, 16 Aug 2024 16:19:35 -0700 Subject: [PATCH 57/91] Make OxQL UUID parsing case-insensitive (#6359) Fixes #6358 --- oximeter/db/src/oxql/ast/grammar.rs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/oximeter/db/src/oxql/ast/grammar.rs b/oximeter/db/src/oxql/ast/grammar.rs index a7585402b6..cbca4470f9 100644 --- a/oximeter/db/src/oxql/ast/grammar.rs +++ b/oximeter/db/src/oxql/ast/grammar.rs @@ -189,11 +189,11 @@ peg::parser! { rule dashed_uuid_literal() -> Uuid = s:$( "\"" - ['a'..='f' | '0'..='9']*<8> "-" - ['a'..='f' | '0'..='9']*<4> "-" - ['a'..='f' | '0'..='9']*<4> "-" - ['a'..='f' | '0'..='9']*<4> "-" - ['a'..='f' | '0'..='9']*<12> + ['a'..='f' | 'A'..='F' | '0'..='9']*<8> "-" + ['a'..='f' | 'A'..='F' | '0'..='9']*<4> "-" + ['a'..='f' | 'A'..='F' | '0'..='9']*<4> "-" + ['a'..='f' | 'A'..='F' | '0'..='9']*<4> "-" + ['a'..='f' | 'A'..='F' | '0'..='9']*<12> "\"" ) {? let Some(middle) = s.get(1..37) else { @@ -202,7 +202,7 @@ peg::parser! { middle.parse().or(Err("invalid UUID literal")) } rule undashed_uuid_literal() -> Uuid - = s:$("\"" ['a'..='f' | '0'..='9']*<32> "\"") {? + = s:$("\"" ['a'..='f' | 'A'..='F' | '0'..='9']*<32> "\"") {? 
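+            // Note: widening the character classes is the entire fix here;
+            // `middle.parse()` below goes through `Uuid`'s `FromStr` impl,
+            // which already accepts mixed-case hex, so no explicit
+            // normalization is needed before parsing.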
let Some(middle) = s.get(1..33) else { return Err("invalid UUID literal"); }; @@ -734,6 +734,15 @@ mod tests { .is_err()); } + #[test] + fn test_uuid_literal_is_case_insensitive() { + const ID: Uuid = uuid::uuid!("880D82A1-102F-4699-BE1A-7E2A6A469E8E"); + let as_str = format!("\"{ID}\""); + let as_lower = as_str.to_lowercase(); + assert_eq!(query_parser::uuid_literal_impl(&as_str).unwrap(), ID,); + assert_eq!(query_parser::uuid_literal_impl(&as_lower).unwrap(), ID,); + } + #[test] fn test_integer_literal() { assert_eq!(query_parser::integer_literal_impl("1").unwrap(), 1); From c28455a0625704a9b9af874cbb40202fe5429166 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 16 Aug 2024 16:22:35 -0700 Subject: [PATCH 58/91] [oximeter] Use `fmt::Display` for TOML errors (#6365) --- oximeter/timeseries-macro/src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/oximeter/timeseries-macro/src/lib.rs b/oximeter/timeseries-macro/src/lib.rs index 317a8533a4..0c70e73445 100644 --- a/oximeter/timeseries-macro/src/lib.rs +++ b/oximeter/timeseries-macro/src/lib.rs @@ -59,9 +59,8 @@ pub fn use_timeseries( Err(e) => { let msg = format!( "Failed to generate timeseries types \ - from '{}': {:?}", + from '{}': {e}", path.display(), - e, ); return syn::Error::new(token.span(), msg) .into_compile_error() From 8e4ac4cf1a1f255f8bde4532de42fa38102b44e2 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 16 Aug 2024 16:37:00 -0700 Subject: [PATCH 59/91] re-assign sagas from expunged Nexus instances (#6215) --- nexus/db-queries/src/db/datastore/saga.rs | 249 ++++++++++++++++-- nexus/db-queries/src/db/sec_store.rs | 9 +- .../execution/src/cockroachdb.rs | 3 +- nexus/reconfigurator/execution/src/dns.rs | 10 +- nexus/reconfigurator/execution/src/lib.rs | 61 +++-- nexus/reconfigurator/execution/src/sagas.rs | 71 +++++ nexus/src/app/background/init.rs | 3 +- .../background/tasks/blueprint_execution.rs | 34 ++- 8 files changed, 376 insertions(+), 64 deletions(-) create mode 100644 nexus/reconfigurator/execution/src/sagas.rs diff --git a/nexus/db-queries/src/db/datastore/saga.rs b/nexus/db-queries/src/db/datastore/saga.rs index 939929e665..0b626804e1 100644 --- a/nexus/db-queries/src/db/datastore/saga.rs +++ b/nexus/db-queries/src/db/datastore/saga.rs @@ -9,7 +9,6 @@ use super::SQL_BATCH_SIZE; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::model::Generation; use crate::db::pagination::paginated; use crate::db::pagination::paginated_multicolumn; use crate::db::pagination::Paginator; @@ -17,10 +16,12 @@ use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; +use nexus_auth::authz; use nexus_auth::context::OpContext; use omicron_common::api::external::Error; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; +use std::ops::Add; impl DataStore { pub async fn saga_create( @@ -80,21 +81,15 @@ impl DataStore { /// now, we're implementing saga adoption only in cases where the original /// SEC/Nexus has been expunged.) /// - /// However, in the future, it may be possible for multiple SECs to try and - /// update the same saga, and overwrite each other's state. For example, - /// one SEC might try and update the state to Running while the other one - /// updates it to Done. 
That case would have to be carefully considered and
-    /// tested here, probably using the (currently unused)
-    /// `current_adopt_generation` field to enable optimistic concurrency.
-    ///
-    /// To reiterate, we are *not* considering the case where several SECs try
-    /// to update the same saga. That will be a future enhancement.
+    /// It's conceivable that multiple SECs do try to update the same saga
+    /// concurrently. That would be a bug. This is noticed and prevented by
+    /// making this query conditional on current_sec and failing with a conflict
+    /// if the current SEC has changed.
     pub async fn saga_update_state(
         &self,
         saga_id: steno::SagaId,
         new_state: steno::SagaCachedState,
         current_sec: db::saga_types::SecId,
-        current_adopt_generation: Generation,
     ) -> Result<(), Error> {
         use db::schema::saga::dsl;
 
@@ -102,7 +97,6 @@
         let result = diesel::update(dsl::saga)
             .filter(dsl::id.eq(saga_id))
             .filter(dsl::current_sec.eq(current_sec))
-            .filter(dsl::adopt_generation.eq(current_adopt_generation))
             .set(dsl::saga_state.eq(db::saga_types::SagaCachedState(new_state)))
             .check_if_exists::<db::saga_types::Saga>(saga_id)
             .execute_and_check(&*self.pool_connection_unauthorized().await?)
@@ -119,20 +113,19 @@
         match result.status {
             UpdateStatus::Updated => Ok(()),
-            UpdateStatus::NotUpdatedButExists => Err(Error::invalid_request(
-                format!(
-                    "failed to update saga {:?} with state {:?}: preconditions not met: \
-                    expected current_sec = {:?}, adopt_generation = {:?}, \
-                    but found current_sec = {:?}, adopt_generation = {:?}, state = {:?}",
+            UpdateStatus::NotUpdatedButExists => {
+                Err(Error::invalid_request(format!(
+                    "failed to update saga {:?} with state {:?}: \
+                    preconditions not met: \
+                    expected current_sec = {:?}, \
+                    but found current_sec = {:?}, state = {:?}",
                     saga_id,
                     new_state,
                     current_sec,
-                    current_adopt_generation,
                     result.found.current_sec,
-                    result.found.adopt_generation,
                     result.found.saga_state,
-                )
-            )),
+                )))
+            }
         }
     }
 
@@ -207,16 +200,75 @@
             Ok(events)
     }
+
+    /// Updates all sagas that are currently assigned to any of the SEC ids in
+    /// `sec_ids`, assigning them to `new_sec_id` instead.
+    ///
+    /// Generally, an SEC id corresponds to a Nexus id. This change causes the
+    /// Nexus instance `new_sec_id` to discover these sagas and resume executing
+    /// them the next time it performs saga recovery (which is normally on
+    /// startup and periodically). Generally, `new_sec_id` is the _current_
+    /// Nexus instance and the caller should activate the saga recovery
+    /// background task after calling this function to immediately resume the
+    /// newly-assigned sagas.
+    ///
+    /// **Warning:** This operation is only safe if the other SECs `sec_ids` are
+    /// not currently running. If those SECs are still running, then two (or
+    /// more) SECs may wind up running the same saga concurrently. This would
+    /// likely violate implicit assumptions made by various saga actions,
+    /// leading to hard-to-debug errors and state corruption.
+    pub async fn sagas_reassign_sec(
+        &self,
+        opctx: &OpContext,
+        sec_ids: &[db::saga_types::SecId],
+        new_sec_id: db::saga_types::SecId,
+    ) -> Result<usize, Error> {
+        opctx.authorize(authz::Action::Modify, &authz::FLEET).await?;
+
+        let now = chrono::Utc::now();
+        let conn = self.pool_connection_authorized(opctx).await?;
+
+        // It would be more robust to do this in batches. However, Diesel does
+        // not appear to support the UPDATE ... LIMIT syntax using the normal
+        // builder. In practice, it's extremely unlikely we'd have so many
+        // in-progress sagas that this would be a problem.
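+
+        // In effect this issues the following statement (a sketch of the
+        // query shape, not the exact SQL that Diesel renders):
+        //
+        //     UPDATE saga
+        //     SET current_sec = $new_sec_id,
+        //         adopt_generation = adopt_generation + 1,
+        //         adopt_time = $now
+        //     WHERE current_sec IS NOT NULL
+        //       AND current_sec IN ($sec_ids)
+        //       AND saga_state != 'done';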
+        use db::schema::saga::dsl;
+        diesel::update(
+            dsl::saga
+                .filter(dsl::current_sec.is_not_null())
+                .filter(
+                    dsl::current_sec.eq_any(
+                        sec_ids.into_iter().cloned().collect::<Vec<_>>(),
+                    ),
+                )
+                .filter(dsl::saga_state.ne(db::saga_types::SagaCachedState(
+                    steno::SagaCachedState::Done,
+                ))),
+        )
+        .set((
+            dsl::current_sec.eq(Some(new_sec_id)),
+            dsl::adopt_generation.eq(dsl::adopt_generation.add(1)),
+            dsl::adopt_time.eq(now),
+        ))
+        .execute_async(&*conn)
+        .await
+        .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
+    }
 }
 
 #[cfg(test)]
 mod test {
     use super::*;
     use crate::db::datastore::test_utils::datastore_test;
+    use async_bb8_diesel::AsyncConnection;
+    use async_bb8_diesel::AsyncSimpleConnection;
+    use db::queries::ALLOW_FULL_TABLE_SCAN_SQL;
     use nexus_db_model::{SagaNodeEvent, SecId};
     use nexus_test_utils::db::test_setup_database;
+    use omicron_common::api::external::Generation;
     use omicron_test_utils::dev;
     use rand::seq::SliceRandom;
+    use std::collections::BTreeSet;
     use uuid::Uuid;
 
     // Tests pagination in listing sagas that are candidates for recovery
@@ -440,7 +492,6 @@
                 node_cx.saga_id,
                 steno::SagaCachedState::Running,
                 node_cx.sec_id,
-                db::model::Generation::new(),
             )
             .await
             .expect("updating state to Running again");
@@ -451,7 +502,6 @@
                 node_cx.saga_id,
                 steno::SagaCachedState::Done,
                 node_cx.sec_id,
-                db::model::Generation::new(),
             )
             .await
             .expect("updating state to Done");
@@ -463,7 +513,6 @@
                 node_cx.saga_id,
                 steno::SagaCachedState::Done,
                 node_cx.sec_id,
-                db::model::Generation::new(),
             )
             .await
             .expect("updating state to Done again");
@@ -509,4 +558,156 @@
             SagaNodeEvent::new(event, self.sec_id)
         }
     }
+
+    #[tokio::test]
+    async fn test_saga_reassignment() {
+        // Test setup
+        let logctx = dev::test_setup_log("test_saga_reassignment");
+        let mut db = test_setup_database(&logctx.log).await;
+        let (_, datastore) = datastore_test(&logctx, &db).await;
+        let opctx = OpContext::for_tests(logctx.log.clone(), datastore.clone());
+
+        // Populate the database with a few different sagas:
+        //
+        // - assigned to SEC A: done, running, and unwinding
+        // - assigned to SEC B: done, running, and unwinding
+        // - assigned to SEC C: done, running, and unwinding
+        // - assigned to SEC D: done, running, and unwinding
+        //
+        // Then we'll reassign SECs B's and C's sagas to SEC A and check
+        // exactly which sagas were changed by this. This exercises:
+        // - that we don't touch A's sagas (the one we're assigning *to*)
+        // - that we do touch both B's and C's sagas (the ones we're assigning
+        //   *from*)
+        // - that we don't touch D's sagas (some other SEC)
+        // - that we don't touch any "done" sagas
+        // - that we do touch both running and unwinding sagas
+        let mut sagas_to_insert = Vec::new();
+        let sec_a = SecId(Uuid::new_v4());
+        let sec_b = SecId(Uuid::new_v4());
+        let sec_c = SecId(Uuid::new_v4());
+        let sec_d = SecId(Uuid::new_v4());
+
+        for sec_id in [sec_a, sec_b, sec_c, sec_d] {
+            for state in [
+                steno::SagaCachedState::Running,
+                steno::SagaCachedState::Unwinding,
+                steno::SagaCachedState::Done,
+            ] {
+                let params = steno::SagaCreateParams {
+                    id: steno::SagaId(Uuid::new_v4()),
+                    name: steno::SagaName::new("test saga"),
+                    dag: serde_json::value::Value::Null,
+                    state,
+                };
+
+                sagas_to_insert
+                    .push(db::model::saga_types::Saga::new(sec_id, params));
+            }
+        }
+        println!("sagas to insert: {:?}", sagas_to_insert);
+
+        // These two sets are complements, but we write out the conditions to
+        // double-check that we've got it right.
+        let sagas_affected: BTreeSet<_> = sagas_to_insert
+            .iter()
+            .filter_map(|saga| {
+                ((saga.creator == sec_b || saga.creator == sec_c)
+                    && (saga.saga_state.0 == steno::SagaCachedState::Running
+                        || saga.saga_state.0
+                            == steno::SagaCachedState::Unwinding))
+                    .then(|| saga.id)
+            })
+            .collect();
+        let sagas_unaffected: BTreeSet<_> = sagas_to_insert
+            .iter()
+            .filter_map(|saga| {
+                (saga.creator == sec_a
+                    || saga.creator == sec_d
+                    || saga.saga_state.0 == steno::SagaCachedState::Done)
+                    .then(|| saga.id)
+            })
+            .collect();
+        println!("sagas affected: {:?}", sagas_affected);
+        println!("sagas UNaffected: {:?}", sagas_unaffected);
+        assert_eq!(sagas_affected.intersection(&sagas_unaffected).count(), 0);
+        assert_eq!(
+            sagas_affected.len() + sagas_unaffected.len(),
+            sagas_to_insert.len()
+        );
+
+        // Insert the sagas.
+        let count = {
+            use db::schema::saga::dsl;
+            let conn = datastore.pool_connection_for_tests().await.unwrap();
+            diesel::insert_into(dsl::saga)
+                .values(sagas_to_insert)
+                .execute_async(&*conn)
+                .await
+                .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
+                .expect("successful insertion")
+        };
+        assert_eq!(count, sagas_affected.len() + sagas_unaffected.len());
+
+        // Reassign uncompleted sagas from SECs B and C to SEC A.
+        let nreassigned = datastore
+            .sagas_reassign_sec(&opctx, &[sec_b, sec_c], sec_a)
+            .await
+            .expect("failed to re-assign sagas");
+
+        // Fetch all the sagas and check their states.
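+
+        // (Listing every saga back out of the database requires explicitly
+        // opting in to a full table scan; the test does that below via
+        // ALLOW_FULL_TABLE_SCAN_SQL before reading the whole `saga` table.)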
+ let all_sagas: Vec<_> = datastore + .pool_connection_for_tests() + .await + .unwrap() + .transaction_async(|conn| async move { + use db::schema::saga::dsl; + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await?; + dsl::saga + .select(nexus_db_model::Saga::as_select()) + .load_async(&conn) + .await + }) + .await + .unwrap(); + + for saga in all_sagas { + println!("checking saga: {:?}", saga); + let current_sec = saga.current_sec.unwrap(); + if sagas_affected.contains(&saga.id) { + assert!(saga.creator == sec_b || saga.creator == sec_c); + assert_eq!(current_sec, sec_a); + assert_eq!(*saga.adopt_generation, Generation::from(2)); + assert!( + saga.saga_state.0 == steno::SagaCachedState::Running + || saga.saga_state.0 + == steno::SagaCachedState::Unwinding + ); + } else if sagas_unaffected.contains(&saga.id) { + assert_eq!(current_sec, saga.creator); + assert_eq!(*saga.adopt_generation, Generation::from(1)); + // Its SEC and state could be anything since we've deliberately + // included sagas with various states and SECs that should not + // be affected by the reassignment. + } else { + println!( + "ignoring saga that was not created by this test: {:?}", + saga + ); + } + } + + assert_eq!(nreassigned, sagas_affected.len()); + + // If we do it again, we should make no changes. + let nreassigned = datastore + .sagas_reassign_sec(&opctx, &[sec_b, sec_c], sec_a) + .await + .expect("failed to re-assign sagas"); + assert_eq!(nreassigned, 0); + + // Test cleanup + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } } diff --git a/nexus/db-queries/src/db/sec_store.rs b/nexus/db-queries/src/db/sec_store.rs index 0dcc3aa717..920ff3aee1 100644 --- a/nexus/db-queries/src/db/sec_store.rs +++ b/nexus/db-queries/src/db/sec_store.rs @@ -4,7 +4,7 @@ //! 
Implementation of [`steno::SecStore`] backed by Omicron's database -use crate::db::{self, model::Generation}; +use crate::db; use anyhow::Context; use async_trait::async_trait; use dropshot::HttpError; @@ -102,12 +102,7 @@ impl steno::SecStore for CockroachDbSecStore { &log, || { self.datastore - .saga_update_state( - id, - update, - self.sec_id, - Generation::new(), - ) + .saga_update_state(id, update, self.sec_id) .map_err(backoff::BackoffError::transient) }, "updating saga state", diff --git a/nexus/reconfigurator/execution/src/cockroachdb.rs b/nexus/reconfigurator/execution/src/cockroachdb.rs index 498944598d..277f5f91c4 100644 --- a/nexus/reconfigurator/execution/src/cockroachdb.rs +++ b/nexus/reconfigurator/execution/src/cockroachdb.rs @@ -39,6 +39,7 @@ mod test { use nexus_test_utils_macros::nexus_test; use nexus_types::deployment::CockroachDbClusterVersion; use std::sync::Arc; + use uuid::Uuid; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; @@ -101,7 +102,7 @@ mod test { datastore, resolver, &blueprint, - "test-suite", + Uuid::new_v4(), &overrides, ) .await diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 846d19ead3..4395944b25 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -1250,7 +1250,7 @@ mod test { datastore, resolver, &blueprint, - "test-suite", + Uuid::new_v4(), &overrides, ) .await @@ -1390,7 +1390,7 @@ mod test { datastore, resolver, &blueprint2, - "test-suite", + Uuid::new_v4(), &overrides, ) .await @@ -1464,7 +1464,7 @@ mod test { datastore, resolver, &blueprint2, - "test-suite", + Uuid::new_v4(), &overrides, ) .await @@ -1500,7 +1500,7 @@ mod test { datastore, resolver, &blueprint2, - "test-suite", + Uuid::new_v4(), &overrides, ) .await @@ -1594,7 +1594,7 @@ mod test { datastore, resolver, &blueprint, - "test-suite", + Uuid::new_v4(), &overrides, ) .await diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index bb525b1b8b..8606187762 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -24,6 +24,7 @@ use slog::info; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; use std::net::SocketAddrV6; +use uuid::Uuid; mod cockroachdb; mod datasets; @@ -31,6 +32,7 @@ mod dns; mod omicron_physical_disks; mod omicron_zones; mod overridables; +mod sagas; mod sled_state; pub use dns::blueprint_external_dns_config; @@ -73,38 +75,32 @@ impl From for Sled { /// /// The assumption is that callers are running this periodically or in a loop to /// deal with transient errors or changes in the underlying system state. 
-pub async fn realize_blueprint<S>(
+pub async fn realize_blueprint(
     opctx: &OpContext,
     datastore: &DataStore,
     resolver: &Resolver,
     blueprint: &Blueprint,
-    nexus_label: S,
-) -> Result<(), Vec<anyhow::Error>>
-where
-    String: From<S>,
-{
+    nexus_id: Uuid,
+) -> Result<bool, Vec<anyhow::Error>> {
     realize_blueprint_with_overrides(
         opctx,
         datastore,
         resolver,
         blueprint,
-        nexus_label,
+        nexus_id,
         &Default::default(),
     )
     .await
 }
 
-pub async fn realize_blueprint_with_overrides<S>(
+pub async fn realize_blueprint_with_overrides(
     opctx: &OpContext,
     datastore: &DataStore,
     resolver: &Resolver,
     blueprint: &Blueprint,
-    nexus_label: S,
+    nexus_id: Uuid,
     overrides: &Overridables,
-) -> Result<(), Vec<anyhow::Error>>
-where
-    String: From<S>,
-{
+) -> Result<bool, Vec<anyhow::Error>> {
     let opctx = opctx.child(BTreeMap::from([(
         "comment".to_string(),
         blueprint.comment.clone(),
@@ -182,7 +178,7 @@ where
         dns::deploy_dns(
             &opctx,
             datastore,
-            String::from(nexus_label),
+            nexus_id.to_string(),
             blueprint,
             &sleds_by_id,
             overrides,
@@ -215,14 +211,43 @@ where
     omicron_physical_disks::decommission_expunged_disks(&opctx, datastore)
         .await?;
 
+    // From this point on, we'll assume that any errors that we encounter do
+    // *not* require stopping execution. We'll just accumulate them and return
+    // them all at the end.
+    //
+    // TODO We should probably do this with more of the errors above, too.
+    let mut errors = Vec::new();
+
+    // For any expunged Nexus zones, re-assign in-progress sagas to some other
+    // Nexus. If this fails for some reason, it doesn't affect anything else.
+    let sec_id = nexus_db_model::SecId(nexus_id);
+    let reassigned = sagas::reassign_sagas_from_expunged(
+        &opctx, datastore, blueprint, sec_id,
+    )
+    .await
+    .context("failed to re-assign sagas");
+    let needs_saga_recovery = match reassigned {
+        Ok(needs_recovery) => needs_recovery,
+        Err(error) => {
+            errors.push(error);
+            false
+        }
+    };
+
     // This is likely to error if any cluster upgrades are in progress (which
     // can take some time), so it should remain at the end so that other parts
     // of the blueprint can progress normally.
-    cockroachdb::ensure_settings(&opctx, datastore, blueprint)
-        .await
-        .map_err(|err| vec![err])?;
+    if let Err(error) =
+        cockroachdb::ensure_settings(&opctx, datastore, blueprint).await
+    {
+        errors.push(error);
+    }
 
-    Ok(())
+    if errors.is_empty() {
+        Ok(needs_saga_recovery)
+    } else {
+        Err(errors)
+    }
 }
 
 #[cfg(test)]
diff --git a/nexus/reconfigurator/execution/src/sagas.rs b/nexus/reconfigurator/execution/src/sagas.rs
new file mode 100644
index 0000000000..458328ef00
--- /dev/null
+++ b/nexus/reconfigurator/execution/src/sagas.rs
@@ -0,0 +1,71 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Re-assign sagas from expunged Nexus zones
+
+use nexus_db_model::SecId;
+use nexus_db_queries::context::OpContext;
+use nexus_db_queries::db::DataStore;
+use nexus_types::deployment::Blueprint;
+use nexus_types::deployment::BlueprintZoneFilter;
+use omicron_common::api::external::Error;
+use omicron_uuid_kinds::GenericUuid;
+use slog::{debug, info, warn};
+
+/// For each expunged Nexus zone, re-assign sagas owned by that Nexus to the
+/// specified nexus (`nexus_id`).
+pub(crate) async fn reassign_sagas_from_expunged(
+    opctx: &OpContext,
+    datastore: &DataStore,
+    blueprint: &Blueprint,
+    nexus_id: SecId,
+) -> Result<bool, Error> {
+    let log = &opctx.log;
+
+    // Identify any Nexus zones that have been expunged and need to have sagas
+    // re-assigned.
+    //
+    // TODO: Currently, we take any expunged Nexus instances and attempt to
+    // assign all their sagas to ourselves. Per RFD 289, we can only re-assign
+    // sagas between two instances of Nexus that are at the same version. Right
+    // now this can't happen so there's nothing to do here to ensure that
+    // constraint. However, once we support allowing the control plane to be
+    // online _during_ an upgrade, there may be multiple different Nexus
+    // instances running at the same time. At that point, we will need to make
+    // sure that we only ever try to assign ourselves sagas from other Nexus
+    // instances that we know are running the same version as ourselves.
+    let nexus_zone_ids: Vec<_> = blueprint
+        .all_omicron_zones(BlueprintZoneFilter::Expunged)
+        .filter_map(|(_, z)| {
+            z.zone_type
+                .is_nexus()
+                .then(|| nexus_db_model::SecId(z.id.into_untyped_uuid()))
+        })
+        .collect();
+
+    debug!(log, "re-assign sagas: found Nexus instances";
+        "nexus_zone_ids" => ?nexus_zone_ids);
+
+    let result =
+        datastore.sagas_reassign_sec(opctx, &nexus_zone_ids, nexus_id).await;
+
+    match result {
+        Ok(count) => {
+            info!(log, "re-assigned sagas";
+                "nexus_zone_ids" => ?nexus_zone_ids,
+                "count" => count,
+            );
+
+            Ok(count != 0)
+        }
+        Err(error) => {
+            warn!(log, "failed to re-assign sagas";
+                "nexus_zone_ids" => ?nexus_zone_ids,
+                &error,
+            );
+
+            Err(error)
+        }
+    }
+}
diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs
index 6bd805a491..37c276fa07 100644
--- a/nexus/src/app/background/init.rs
+++ b/nexus/src/app/background/init.rs
@@ -448,7 +448,8 @@ impl BackgroundTasksInitializer {
             datastore.clone(),
             resolver.clone(),
             rx_blueprint.clone(),
-            nexus_id.to_string(),
+            nexus_id,
+            task_saga_recovery.clone(),
         );
         let rx_blueprint_exec = blueprint_executor.watcher();
         driver.register(TaskDefinition {
diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs
index ee780812ae..b430270ec9 100644
--- a/nexus/src/app/background/tasks/blueprint_execution.rs
+++ b/nexus/src/app/background/tasks/blueprint_execution.rs
@@ -4,7 +4,7 @@
 
 //! Background task for realizing a plan blueprint
 
-use crate::app::background::BackgroundTask;
+use crate::app::background::{Activator, BackgroundTask};
 use futures::future::BoxFuture;
 use futures::FutureExt;
 use internal_dns::resolver::Resolver;
@@ -14,6 +14,7 @@ use nexus_types::deployment::{Blueprint, BlueprintTarget};
 use serde_json::json;
 use std::sync::Arc;
 use tokio::sync::watch;
+use uuid::Uuid;
 
 /// Background task that takes a [`Blueprint`] and realizes the change to
 /// the state of the system based on the `Blueprint`.
@@ -21,8 +22,9 @@ pub struct BlueprintExecutor {
     datastore: Arc<DataStore>,
     resolver: Resolver,
     rx_blueprint: watch::Receiver<Option<Arc<(BlueprintTarget, Blueprint)>>>,
-    nexus_label: String,
+    nexus_id: Uuid,
     tx: watch::Sender<usize>,
+    saga_recovery: Activator,
 }
 
 impl BlueprintExecutor {
@@ -32,10 +34,18 @@ impl BlueprintExecutor {
         rx_blueprint: watch::Receiver<
             Option<Arc<(BlueprintTarget, Blueprint)>>,
         >,
-        nexus_label: String,
+        nexus_id: Uuid,
+        saga_recovery: Activator,
     ) -> BlueprintExecutor {
         let (tx, _) = watch::channel(0);
-        BlueprintExecutor { datastore, resolver, rx_blueprint, nexus_label, tx }
+        BlueprintExecutor {
+            datastore,
+            resolver,
+            rx_blueprint,
+            nexus_id,
+            tx,
+            saga_recovery,
+        }
     }
 
     pub fn watcher(&self) -> watch::Receiver<usize> {
@@ -81,16 +91,23 @@ impl BlueprintExecutor {
             &self.datastore,
             &self.resolver,
             blueprint,
-            &self.nexus_label,
+            self.nexus_id,
         )
         .await;
 
         // Trigger anybody waiting for this to finish.
 self.tx.send_modify(|count| *count = *count + 1);
 
+        // If executing the blueprint requires activating the saga recovery
+        // background task, do that now.
+        if let Ok(true) = result {
+            info!(&opctx.log, "activating saga recovery task");
+            self.saga_recovery.activate();
+        }
+
         // Return the result as a `serde_json::Value`
         match result {
-            Ok(()) => json!({}),
+            Ok(_) => json!({}),
             Err(errors) => {
                 let errors: Vec<_> =
                     errors.into_iter().map(|e| format!("{:#}", e)).collect();
@@ -115,7 +132,7 @@ impl BackgroundTask for BlueprintExecutor {
 #[cfg(test)]
 mod test {
     use super::BlueprintExecutor;
-    use crate::app::background::BackgroundTask;
+    use crate::app::background::{Activator, BackgroundTask};
     use httptest::matchers::{all_of, request};
     use httptest::responders::status_code;
     use httptest::Expectation;
@@ -261,7 +278,8 @@ mod test {
             datastore.clone(),
             resolver.clone(),
             blueprint_rx,
-            String::from("test-suite"),
+            Uuid::new_v4(),
+            Activator::new(),
         );
 
         // Now we're ready.

From b927049862e2161f08fae2480dadc1b0a4572b26 Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Fri, 16 Aug 2024 17:10:24 -0700
Subject: [PATCH 60/91] add issue templates for flaky tests (#4833)

---
 .../test-flake-from-buildomat.md      | 65 +++++++++++++++++++
 .../test-flake-from-local-failure.md  | 42 ++++++++++++
 2 files changed, 107 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/test-flake-from-buildomat.md
 create mode 100644 .github/ISSUE_TEMPLATE/test-flake-from-local-failure.md

diff --git a/.github/ISSUE_TEMPLATE/test-flake-from-buildomat.md b/.github/ISSUE_TEMPLATE/test-flake-from-buildomat.md
new file mode 100644
index 0000000000..eb1ac2c6e9
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/test-flake-from-buildomat.md
@@ -0,0 +1,65 @@
+---
+name: Test flake from buildomat
+about: Report a test failure from a CI run either on "main" or on a PR where you don't
+  think the PR changes caused the failure
+title: 'test failed in CI: NAME_OF_TEST'
+labels: Test Flake
+assignees: ''
+
+---
+
+
+
+This test failed on a CI run on **"main" (or pull request XXX)**:
+
+    Link here to the GitHub page showing the test failure.
+    If it's from a PR, this might look like:
+    https://github.com/oxidecomputer/omicron/pull/4588/checks?check_run_id=19198066410
+    It could also be a link to a failure on "main", which would look like:
+    https://github.com/oxidecomputer/omicron/runs/20589829185
+    This is useful because it shows which commit failed and all the surrounding context.
+
+Log showing the specific test failure:
+
+    Link here to the specific line of output from the buildomat log showing the failure:
+    https://buildomat.eng.oxide.computer/wg/0/details/01HGH32FQYKZJNX9J62HNABKPA/31C5jyox8tyHUIuDDevKkXlDZCyNw143z4nOq8wLl3xtjKzT/01HGH32V3P0HH6B56S46AJAT63#S4455
+    This is useful because it shows all the details about the test failure.
+
+Excerpt from the log showing the failure:
+
+```
+Paste here an excerpt from the log.
+This is redundant with the log above but helps people searching for the error message
+or test name. It also works if the link above becomes unavailable.
+Here's an example:
+
+------
+
+failures:
+    integration_tests::updates::test_update_races
+
+test result: FAILED.
0 passed; 1 failed; 0 ignored; 0 measured; 4 filtered out; finished in 4.84s + + +--- STDERR: wicketd::mod integration_tests::updates::test_update_races --- +log file: /var/tmp/omicron_tmp/mod-ae2eb84a30e4213e-test_artifact_upload_while_updating.14133.0.log +note: configured to log to "/var/tmp/omicron_tmp/mod-ae2eb84a30e4213e-test_artifact_upload_while_updating.14133.0.log" +hint: Generated a random key: +hint: +hint: ed25519:826a8f799d4cc767158c990a60f721215bfd71f8f94fa88ba1960037bd6e5554 +hint: +hint: To modify this repository, you will need this key. Use the -k/--key +hint: command line flag or the TUFACEOUS_KEY environment variable: +hint: +hint: export TUFACEOUS_KEY=ed25519:826a8f799d4cc767158c990a60f721215bfd71f8f94fa88ba1960037bd6e5554 +hint: +hint: To prevent this default behavior, use --no-generate-key. +thread 'integration_tests::updates::test_update_races' panicked at wicketd/tests/integration_tests/updates.rs:482:41: +at least one event +stack backtrace: +... +``` diff --git a/.github/ISSUE_TEMPLATE/test-flake-from-local-failure.md b/.github/ISSUE_TEMPLATE/test-flake-from-local-failure.md new file mode 100644 index 0000000000..e963c83926 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/test-flake-from-local-failure.md @@ -0,0 +1,42 @@ +--- +name: Test flake from local failure +about: Report a test failure that happened locally (not CI) that you believe is not + related to local changes +title: 'test failure: TEST_NAME' +labels: Test Flake +assignees: '' + +--- + +On branch **BRANCH** commit **COMMIT**, I saw this test failure: + +``` +Include the trimmed, relevant output from `cargo nextest`. Here's an example: + +------- +failures: + integration_tests::updates::test_update_races + +test result: FAILED. 0 passed; 1 failed; 0 ignored; 0 measured; 4 filtered out; finished in 4.84s + + +--- STDERR: wicketd::mod integration_tests::updates::test_update_races --- +log file: /var/tmp/omicron_tmp/mod-ae2eb84a30e4213e-test_artifact_upload_while_updating.14133.0.log +note: configured to log to "/var/tmp/omicron_tmp/mod-ae2eb84a30e4213e-test_artifact_upload_while_updating.14133.0.log" +hint: Generated a random key: +hint: +hint: ed25519:826a8f799d4cc767158c990a60f721215bfd71f8f94fa88ba1960037bd6e5554 +hint: +hint: To modify this repository, you will need this key. Use the -k/--key +hint: command line flag or the TUFACEOUS_KEY environment variable: +hint: +hint: export TUFACEOUS_KEY=ed25519:826a8f799d4cc767158c990a60f721215bfd71f8f94fa88ba1960037bd6e5554 +hint: +hint: To prevent this default behavior, use --no-generate-key. +thread 'integration_tests::updates::test_update_races' panicked at wicketd/tests/integration_tests/updates.rs:482:41: +at least one event +stack backtrace: +... +``` + +**NOTE: Consider attaching any log files produced by the test.** From fac7e2780043fa60f35c7fc8c673af0d3dc7842d Mon Sep 17 00:00:00 2001 From: Rain Date: Fri, 16 Aug 2024 20:05:38 -0700 Subject: [PATCH 61/91] [oximeter] split oximeter-impl into -types, -schema and -test-utils (#6355) The Nexus external API uses several types from oximeter, and in the interest of keeping the upcoming nexus-external-api crate's graph small, it makes sense to split it into three crates. The crates are: * **oximeter-types**: Core type definitions (there are a lot of them). * **oximeter-schema**: Library for working with schemas. * **oximeter-test-utils**: Test utilities. The names match the other services in omicron, e.g. `sled-agent-types` and `nexus-test-utils`. 
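
For consumers, this is mostly a change of import paths. A rough sketch of the
before/after (paths taken from the hunks below; `example` is just an
illustrative name, not part of this change):

```
// Before: helpers and schema tools were reachable via oximeter-impl and its
// re-exports, e.g.
//     use oximeter::test_util;
//     use oximeter_impl::schema::ir::load_schema;

// After: they come from the split crates.
use oximeter_schema::ir::load_schema;
use oximeter_test_utils::make_sample;

fn example() -> oximeter::types::Sample {
    // Same behavior as before; only the import path changed.
    make_sample()
}
```

The core type definitions are still re-exported through `oximeter` itself
(`pub use oximeter_types::*`), so code that only uses the types should not
need changes.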
--- Cargo.lock | 97 +++-- Cargo.toml | 12 +- oximeter/db/Cargo.toml | 1 + oximeter/db/src/client/mod.rs | 27 +- oximeter/db/src/model.rs | 9 +- oximeter/db/tests/integration_test.rs | 13 +- oximeter/impl/src/test_util.rs | 130 ------- oximeter/oximeter/Cargo.toml | 3 +- oximeter/oximeter/src/lib.rs | 9 +- oximeter/schema/Cargo.toml | 24 ++ .../src/bin/oximeter-schema.rs | 6 +- .../src/schema => schema/src}/codegen.rs | 349 +++++++++--------- .../{impl/src/schema => schema/src}/ir.rs | 20 +- oximeter/schema/src/lib.rs | 12 + oximeter/test-utils/Cargo.toml | 15 + oximeter/test-utils/src/lib.rs | 295 +++++++++++++++ oximeter/timeseries-macro/Cargo.toml | 3 +- oximeter/timeseries-macro/src/lib.rs | 4 +- oximeter/{impl => types}/Cargo.toml | 12 +- oximeter/{impl => types}/benches/quantile.rs | 2 +- oximeter/{impl => types}/src/histogram.rs | 8 +- oximeter/{impl => types}/src/lib.rs | 7 +- oximeter/{impl => types}/src/quantile.rs | 16 +- .../src/schema/mod.rs => types/src/schema.rs} | 125 ------- oximeter/{impl => types}/src/traits.rs | 22 +- oximeter/{impl => types}/src/types.rs | 32 +- .../{impl => types}/tests/fail/failures.rs | 0 .../tests/fail/failures.stderr | 0 .../{impl => types}/tests/test_compilation.rs | 0 29 files changed, 663 insertions(+), 590 deletions(-) delete mode 100644 oximeter/impl/src/test_util.rs create mode 100644 oximeter/schema/Cargo.toml rename oximeter/{oximeter => schema}/src/bin/oximeter-schema.rs (93%) rename oximeter/{impl/src/schema => schema/src}/codegen.rs (73%) rename oximeter/{impl/src/schema => schema/src}/ir.rs (99%) create mode 100644 oximeter/schema/src/lib.rs create mode 100644 oximeter/test-utils/Cargo.toml create mode 100644 oximeter/test-utils/src/lib.rs rename oximeter/{impl => types}/Cargo.toml (78%) rename oximeter/{impl => types}/benches/quantile.rs (97%) rename oximeter/{impl => types}/src/histogram.rs (99%) rename oximeter/{impl => types}/src/lib.rs (92%) rename oximeter/{impl => types}/src/quantile.rs (97%) rename oximeter/{impl/src/schema/mod.rs => types/src/schema.rs} (75%) rename oximeter/{impl => types}/src/traits.rs (96%) rename oximeter/{impl => types}/src/types.rs (97%) rename oximeter/{impl => types}/tests/fail/failures.rs (100%) rename oximeter/{impl => types}/tests/fail/failures.stderr (100%) rename oximeter/{impl => types}/tests/test_compilation.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index 849c82f1f0..3b62f3001a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6821,9 +6821,10 @@ dependencies = [ "chrono", "clap", "omicron-workspace-hack", - "oximeter-impl", "oximeter-macro-impl", + "oximeter-schema", "oximeter-timeseries-macro", + "oximeter-types", "prettyplease", "syn 2.0.74", "toml 0.8.19", @@ -6925,6 +6926,7 @@ dependencies = [ "omicron-test-utils", "omicron-workspace-hack", "oximeter", + "oximeter-test-utils", "peg", "reedline", "regex", @@ -6948,39 +6950,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "oximeter-impl" -version = "0.1.0" -dependencies = [ - "approx", - "bytes", - "chrono", - "criterion", - "float-ord", - "heck 0.5.0", - "num", - "omicron-common", - "omicron-workspace-hack", - "oximeter-macro-impl", - "prettyplease", - "proc-macro2", - "quote", - "rand", - "rand_distr", - "regex", - "rstest", - "schemars", - "serde", - "serde_json", - "slog-error-chain", - "strum", - "syn 2.0.74", - "thiserror", - "toml 0.8.19", - "trybuild", - "uuid", -] - [[package]] name = "oximeter-instruments" version = "0.1.0" @@ -7041,17 +7010,75 @@ dependencies = [ "uuid", ] +[[package]] +name = "oximeter-schema" +version = "0.1.0" 
+dependencies = [ + "anyhow", + "chrono", + "clap", + "heck 0.5.0", + "omicron-workspace-hack", + "oximeter-types", + "prettyplease", + "proc-macro2", + "quote", + "schemars", + "serde", + "slog-error-chain", + "syn 2.0.74", + "toml 0.8.19", +] + +[[package]] +name = "oximeter-test-utils" +version = "0.1.0" +dependencies = [ + "chrono", + "omicron-workspace-hack", + "oximeter-macro-impl", + "oximeter-types", + "uuid", +] + [[package]] name = "oximeter-timeseries-macro" version = "0.1.0" dependencies = [ "omicron-workspace-hack", - "oximeter-impl", + "oximeter-schema", + "oximeter-types", "proc-macro2", "quote", "syn 2.0.74", ] +[[package]] +name = "oximeter-types" +version = "0.1.0" +dependencies = [ + "approx", + "bytes", + "chrono", + "criterion", + "float-ord", + "num", + "omicron-common", + "omicron-workspace-hack", + "oximeter-macro-impl", + "rand", + "rand_distr", + "regex", + "rstest", + "schemars", + "serde", + "serde_json", + "strum", + "thiserror", + "trybuild", + "uuid", +] + [[package]] name = "oxlog" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 1b62af959d..ea687936e3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,12 +77,14 @@ members = [ "oximeter/api", "oximeter/collector", "oximeter/db", - "oximeter/impl", "oximeter/instruments", "oximeter/oximeter-macro-impl", "oximeter/oximeter", "oximeter/producer", + "oximeter/schema", + "oximeter/test-utils", "oximeter/timeseries-macro", + "oximeter/types", "package", "passwords", "rpaths", @@ -191,12 +193,14 @@ default-members = [ "oximeter/api", "oximeter/collector", "oximeter/db", - "oximeter/impl", "oximeter/instruments", "oximeter/oximeter-macro-impl", "oximeter/oximeter", "oximeter/producer", + "oximeter/schema", + "oximeter/test-utils", "oximeter/timeseries-macro", + "oximeter/types", "package", "passwords", "rpaths", @@ -459,11 +463,13 @@ oximeter-api = { path = "oximeter/api" } oximeter-client = { path = "clients/oximeter-client" } oximeter-db = { path = "oximeter/db/", default-features = false } oximeter-collector = { path = "oximeter/collector" } -oximeter-impl = { path = "oximeter/impl" } oximeter-instruments = { path = "oximeter/instruments" } oximeter-macro-impl = { path = "oximeter/oximeter-macro-impl" } oximeter-producer = { path = "oximeter/producer" } +oximeter-schema = { path = "oximeter/schema" } +oximeter-test-utils = { path = "oximeter/test-utils" } oximeter-timeseries-macro = { path = "oximeter/timeseries-macro" } +oximeter-types = { path = "oximeter/types" } p256 = "0.13" parse-display = "0.10.0" partial-io = { version = "0.5.4", features = ["proptest1", "tokio1"] } diff --git a/oximeter/db/Cargo.toml b/oximeter/db/Cargo.toml index e3cf089cb5..6a7cedbc22 100644 --- a/oximeter/db/Cargo.toml +++ b/oximeter/db/Cargo.toml @@ -89,6 +89,7 @@ expectorate.workspace = true indexmap.workspace = true itertools.workspace = true omicron-test-utils.workspace = true +oximeter-test-utils.workspace = true slog-dtrace.workspace = true sqlformat.workspace = true sqlparser.workspace = true diff --git a/oximeter/db/src/client/mod.rs b/oximeter/db/src/client/mod.rs index 30ae4b68d2..176e1bd5f8 100644 --- a/oximeter/db/src/client/mod.rs +++ b/oximeter/db/src/client/mod.rs @@ -1191,7 +1191,6 @@ mod tests { }; use omicron_test_utils::dev::test_setup_log; use oximeter::histogram::Histogram; - use oximeter::test_util; use oximeter::types::MissingDatum; use oximeter::Datum; use oximeter::FieldValue; @@ -1723,7 +1722,7 @@ mod tests { let samples = { let mut s = Vec::with_capacity(8); for _ in 0..s.capacity() { - 
s.push(test_util::make_hist_sample()) + s.push(oximeter_test_utils::make_hist_sample()) } s }; @@ -1762,7 +1761,7 @@ mod tests { let client = Client::new(address, &log); db_type.init_db(&client).await.unwrap(); - let sample = test_util::make_sample(); + let sample = oximeter_test_utils::make_sample(); client.insert_samples(&[sample]).await.unwrap(); let bad_name = name_mismatch::TestTarget { @@ -1770,7 +1769,7 @@ mod tests { name2: "second_name".into(), num: 2, }; - let metric = test_util::TestMetric { + let metric = oximeter_test_utils::TestMetric { id: uuid::Uuid::new_v4(), good: true, datum: 1, @@ -1792,7 +1791,7 @@ mod tests { let client = Client::new(address, &log); db_type.init_db(&client).await.unwrap(); - let sample = test_util::make_sample(); + let sample = oximeter_test_utils::make_sample(); // Verify that this sample is considered new, i.e., we return rows to update the timeseries // schema table. @@ -1867,7 +1866,7 @@ mod tests { let client = Client::new(address, &log); db_type.init_db(&client).await.unwrap(); - let samples = test_util::generate_test_samples(2, 2, 2, 2); + let samples = oximeter_test_utils::generate_test_samples(2, 2, 2, 2); client.insert_samples(&samples).await?; let sample = samples.first().unwrap(); @@ -1956,7 +1955,7 @@ mod tests { // we'd like to exercise the logic of ClickHouse's replacing merge tree engine. let client = Client::new(address, &log); db_type.init_db(&client).await.unwrap(); - let samples = test_util::generate_test_samples(2, 2, 2, 2); + let samples = oximeter_test_utils::generate_test_samples(2, 2, 2, 2); client.insert_samples(&samples).await?; async fn assert_table_count( @@ -2631,7 +2630,7 @@ mod tests { let client = Client::new(address, &log); db_type.init_db(&client).await.unwrap(); - let samples = test_util::generate_test_samples(2, 2, 2, 2); + let samples = oximeter_test_utils::generate_test_samples(2, 2, 2, 2); client.insert_samples(&samples).await?; let original_schema = client.schema.lock().await.clone(); @@ -2656,7 +2655,7 @@ mod tests { let client = Client::new(address, &log); db_type.init_db(&client).await.unwrap(); - let samples = test_util::generate_test_samples(2, 2, 2, 2); + let samples = oximeter_test_utils::generate_test_samples(2, 2, 2, 2); client.insert_samples(&samples).await?; let limit = 100u32.try_into().unwrap(); @@ -2691,7 +2690,7 @@ mod tests { let client = Client::new(address, &log); db_type.init_db(&client).await.unwrap(); - let samples = test_util::generate_test_samples(2, 2, 2, 2); + let samples = oximeter_test_utils::generate_test_samples(2, 2, 2, 2); client.insert_samples(&samples).await?; let limit = 7u32.try_into().unwrap(); @@ -3364,7 +3363,7 @@ mod tests { // The values here don't matter much, we just want to check that // the database data hasn't been dropped. assert_eq!(0, get_schema_count(&client).await); - let sample = test_util::make_sample(); + let sample = oximeter_test_utils::make_sample(); client.insert_samples(&[sample.clone()]).await.unwrap(); assert_eq!(1, get_schema_count(&client).await); @@ -3438,7 +3437,7 @@ mod tests { // The values here don't matter much, we just want to check that // the database data gets dropped later. 
assert_eq!(0, get_schema_count(&client).await); - let sample = test_util::make_sample(); + let sample = oximeter_test_utils::make_sample(); client.insert_samples(&[sample.clone()]).await.unwrap(); assert_eq!(1, get_schema_count(&client).await); @@ -3464,7 +3463,7 @@ mod tests { let client = Client::new(address, &log); db_type.init_db(&client).await.unwrap(); - let samples = [test_util::make_sample()]; + let samples = [oximeter_test_utils::make_sample()]; client.insert_samples(&samples).await.unwrap(); // Get the count of schema directly from the DB, which should have just @@ -3549,7 +3548,7 @@ mod tests { let client = Client::new(address, &log); db_type.init_db(&client).await.unwrap(); - let samples = [test_util::make_sample()]; + let samples = [oximeter_test_utils::make_sample()]; // We're using the components of the `insert_samples()` method here, // which has been refactored explicitly for this test. We need to insert diff --git a/oximeter/db/src/model.rs b/oximeter/db/src/model.rs index f27df4ed49..3e34ad10e3 100644 --- a/oximeter/db/src/model.rs +++ b/oximeter/db/src/model.rs @@ -1880,7 +1880,6 @@ mod tests { use super::*; use chrono::Timelike; use oximeter::histogram::Record; - use oximeter::test_util; use oximeter::Datum; #[test] @@ -1983,7 +1982,7 @@ mod tests { #[test] fn test_unroll_from_source() { - let sample = test_util::make_sample(); + let sample = oximeter_test_utils::make_sample(); let out = unroll_from_source(&sample); assert_eq!(out["oximeter.fields_string"].len(), 2); assert_eq!(out["oximeter.fields_i64"].len(), 1); @@ -2003,8 +2002,8 @@ mod tests { // datum. #[test] fn test_unroll_missing_measurement_row() { - let sample = test_util::make_sample(); - let missing_sample = test_util::make_missing_sample(); + let sample = oximeter_test_utils::make_sample(); + let missing_sample = oximeter_test_utils::make_missing_sample(); let (table_name, row) = unroll_measurement_row(&sample); let (missing_table_name, missing_row) = unroll_measurement_row(&missing_sample); @@ -2022,7 +2021,7 @@ mod tests { #[test] fn test_unroll_measurement_row() { - let sample = test_util::make_hist_sample(); + let sample = oximeter_test_utils::make_hist_sample(); let (table_name, row) = unroll_measurement_row(&sample); assert_eq!(table_name, "oximeter.measurements_histogramf64"); let unpacked: HistogramF64MeasurementRow = diff --git a/oximeter/db/tests/integration_test.rs b/oximeter/db/tests/integration_test.rs index 732683c414..f5d81d51d1 100644 --- a/oximeter/db/tests/integration_test.rs +++ b/oximeter/db/tests/integration_test.rs @@ -10,7 +10,6 @@ use clickward::{ use dropshot::test_util::log_prefix_for_test; use omicron_test_utils::dev::poll; use omicron_test_utils::dev::test_setup_log; -use oximeter::test_util; use oximeter_db::{Client, DbWrite, OxqlResult, Sample, TestDbWrite}; use slog::{debug, info, Logger}; use std::collections::BTreeSet; @@ -199,7 +198,7 @@ async fn test_cluster() -> anyhow::Result<()> { // Let's write some samples to our first replica and wait for them to show // up on replica 2. 
let start = tokio::time::Instant::now(); - let samples = test_util::generate_test_samples( + let samples = oximeter_test_utils::generate_test_samples( input.n_projects, input.n_instances, input.n_cpus, @@ -261,7 +260,7 @@ async fn test_cluster() -> anyhow::Result<()> { info!(log, "successfully stopped server 1"); // Generate some new samples and insert them at replica3 - let samples = test_util::generate_test_samples( + let samples = oximeter_test_utils::generate_test_samples( input.n_projects, input.n_instances, input.n_cpus, @@ -298,7 +297,7 @@ async fn test_cluster() -> anyhow::Result<()> { .expect("failed to get samples from client1"); // We still have a quorum (2 of 3 keepers), so we should be able to insert - let samples = test_util::generate_test_samples( + let samples = oximeter_test_utils::generate_test_samples( input.n_projects, input.n_instances, input.n_cpus, @@ -321,7 +320,7 @@ async fn test_cluster() -> anyhow::Result<()> { .expect("failed to get samples from client1"); info!(log, "Attempting to insert samples without keeper quorum"); - let samples = test_util::generate_test_samples( + let samples = oximeter_test_utils::generate_test_samples( input.n_projects, input.n_instances, input.n_cpus, @@ -350,7 +349,7 @@ async fn test_cluster() -> anyhow::Result<()> { ) .await .expect("failed to sync keepers"); - let samples = test_util::generate_test_samples( + let samples = oximeter_test_utils::generate_test_samples( input.n_projects, input.n_instances, input.n_cpus, @@ -370,7 +369,7 @@ async fn test_cluster() -> anyhow::Result<()> { ) .await .expect("failed to sync keepers"); - let samples = test_util::generate_test_samples( + let samples = oximeter_test_utils::generate_test_samples( input.n_projects, input.n_instances, input.n_cpus, diff --git a/oximeter/impl/src/test_util.rs b/oximeter/impl/src/test_util.rs deleted file mode 100644 index c2ac7b34bd..0000000000 --- a/oximeter/impl/src/test_util.rs +++ /dev/null @@ -1,130 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Utilities for testing the oximeter crate. 
-// Copyright 2024 Oxide Computer Company
-
-use crate::histogram;
-use crate::histogram::{Histogram, Record};
-use crate::types::{Cumulative, Sample};
-use uuid::Uuid;
-
-#[derive(oximeter::Target)]
-pub struct TestTarget {
-    pub name1: String,
-    pub name2: String,
-    pub num: i64,
-}
-
-impl Default for TestTarget {
-    fn default() -> Self {
-        TestTarget {
-            name1: "first_name".into(),
-            name2: "second_name".into(),
-            num: 0,
-        }
-    }
-}
-
-#[derive(oximeter::Metric)]
-pub struct TestMetric {
-    pub id: Uuid,
-    pub good: bool,
-    pub datum: i64,
-}
-
-#[derive(oximeter::Metric)]
-pub struct TestCumulativeMetric {
-    pub id: Uuid,
-    pub good: bool,
-    pub datum: Cumulative<i64>,
-}
-
-#[derive(oximeter::Metric)]
-pub struct TestHistogram {
-    pub id: Uuid,
-    pub good: bool,
-    pub datum: Histogram<f64>,
-}
-
-const ID: Uuid = uuid::uuid!("e00ced4d-39d1-446a-ae85-a67f05c9750b");
-
-pub fn make_sample() -> Sample {
-    let target = TestTarget::default();
-    let metric = TestMetric { id: ID, good: true, datum: 1 };
-    Sample::new(&target, &metric).unwrap()
-}
-
-pub fn make_missing_sample() -> Sample {
-    let target = TestTarget::default();
-    let metric = TestMetric { id: ID, good: true, datum: 1 };
-    Sample::new_missing(&target, &metric).unwrap()
-}
-
-pub fn make_hist_sample() -> Sample {
-    let target = TestTarget::default();
-    let mut hist = histogram::Histogram::new(&[0.0, 5.0, 10.0]).unwrap();
-    hist.sample(1.0).unwrap();
-    hist.sample(2.0).unwrap();
-    hist.sample(6.0).unwrap();
-    let metric = TestHistogram { id: ID, good: true, datum: hist };
-    Sample::new(&target, &metric).unwrap()
-}
-
-/// A target identifying a single virtual machine instance
-#[derive(Debug, Clone, Copy, oximeter::Target)]
-pub struct VirtualMachine {
-    pub project_id: Uuid,
-    pub instance_id: Uuid,
-}
-
-/// A metric recording the total time a vCPU is busy, by its ID
-#[derive(Debug, Clone, Copy, oximeter::Metric)]
-pub struct CpuBusy {
-    cpu_id: i64,
-    datum: Cumulative<f64>,
-}
-
-pub fn generate_test_samples(
-    n_projects: usize,
-    n_instances: usize,
-    n_cpus: usize,
-    n_samples: usize,
-) -> Vec<Sample> {
-    let n_timeseries = n_projects * n_instances * n_cpus;
-    let mut samples = Vec::with_capacity(n_samples * n_timeseries);
-    for _ in 0..n_projects {
-        let project_id = Uuid::new_v4();
-        for _ in 0..n_instances {
-            let vm = VirtualMachine { project_id, instance_id: Uuid::new_v4() };
-            for cpu in 0..n_cpus {
-                for sample in 0..n_samples {
-                    let cpu_busy = CpuBusy {
-                        cpu_id: cpu as _,
-                        datum: Cumulative::new(sample as f64),
-                    };
-                    let sample = Sample::new(&vm, &cpu_busy).unwrap();
-                    samples.push(sample);
-                }
-            }
-        }
-    }
-    samples
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_gen_test_samples() {
-        let (n_projects, n_instances, n_cpus, n_samples) = (2, 2, 2, 2);
-        let samples =
-            generate_test_samples(n_projects, n_instances, n_cpus, n_samples);
-        assert_eq!(
-            samples.len(),
-            n_projects * n_instances * n_cpus * n_samples
-        );
-    }
-}
diff --git a/oximeter/oximeter/Cargo.toml b/oximeter/oximeter/Cargo.toml
index c04d1bd3ae..63b370bee6 100644
--- a/oximeter/oximeter/Cargo.toml
+++ b/oximeter/oximeter/Cargo.toml
@@ -13,9 +13,10 @@ anyhow.workspace = true
 clap.workspace = true
 chrono.workspace = true
 omicron-workspace-hack.workspace = true
-oximeter-impl.workspace = true
 oximeter-macro-impl.workspace = true
+oximeter-schema.workspace = true
 oximeter-timeseries-macro.workspace = true
+oximeter-types.workspace = true
 prettyplease.workspace = true
 syn.workspace = true
 toml.workspace = true
diff --git
a/oximeter/oximeter/src/lib.rs b/oximeter/oximeter/src/lib.rs index 5ec6a49e5c..913318b8a8 100644 --- a/oximeter/oximeter/src/lib.rs +++ b/oximeter/oximeter/src/lib.rs @@ -185,14 +185,15 @@ //! `Producer`s may be registered with the same `ProducerServer`, each with potentially different //! sampling intervals. -pub use oximeter_impl::*; +pub use oximeter_macro_impl::{Metric, Target}; pub use oximeter_timeseries_macro::use_timeseries; +pub use oximeter_types::*; #[cfg(test)] mod test { - use oximeter_impl::schema::ir::load_schema; - use oximeter_impl::schema::{FieldSource, SCHEMA_DIRECTORY}; - use oximeter_impl::TimeseriesSchema; + use oximeter_schema::ir::load_schema; + use oximeter_types::schema::{FieldSource, SCHEMA_DIRECTORY}; + use oximeter_types::TimeseriesSchema; use std::collections::BTreeMap; use std::fs; diff --git a/oximeter/schema/Cargo.toml b/oximeter/schema/Cargo.toml new file mode 100644 index 0000000000..fe2e28705a --- /dev/null +++ b/oximeter/schema/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "oximeter-schema" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +anyhow.workspace = true +chrono.workspace = true +clap.workspace = true +heck.workspace = true +omicron-workspace-hack.workspace = true +oximeter-types.workspace = true +prettyplease.workspace = true +proc-macro2.workspace = true +quote.workspace = true +schemars.workspace = true +serde.workspace = true +slog-error-chain.workspace = true +syn.workspace = true +toml.workspace = true diff --git a/oximeter/oximeter/src/bin/oximeter-schema.rs b/oximeter/schema/src/bin/oximeter-schema.rs similarity index 93% rename from oximeter/oximeter/src/bin/oximeter-schema.rs rename to oximeter/schema/src/bin/oximeter-schema.rs index 14fb31b1e8..5595a28639 100644 --- a/oximeter/oximeter/src/bin/oximeter-schema.rs +++ b/oximeter/schema/src/bin/oximeter-schema.rs @@ -9,7 +9,7 @@ use anyhow::Context as _; use clap::Parser; use clap::Subcommand; -use oximeter::schema::ir::TimeseriesDefinition; +use oximeter_schema::ir::TimeseriesDefinition; use std::num::NonZeroU8; use std::path::PathBuf; @@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> { println!("{def:#?}"); } Cmd::Schema { timeseries, version } => { - let schema = oximeter_impl::schema::ir::load_schema(&contents)?; + let schema = oximeter_schema::ir::load_schema(&contents)?; match (timeseries, version) { (None, None) => { for each in schema.into_iter() { @@ -87,7 +87,7 @@ fn main() -> anyhow::Result<()> { } } Cmd::Emit => { - let code = oximeter::schema::codegen::use_timeseries(&contents)?; + let code = oximeter_schema::codegen::use_timeseries(&contents)?; let formatted = prettyplease::unparse(&syn::parse_file(&format!("{code}"))?); println!("{formatted}"); diff --git a/oximeter/impl/src/schema/codegen.rs b/oximeter/schema/src/codegen.rs similarity index 73% rename from oximeter/impl/src/schema/codegen.rs rename to oximeter/schema/src/codegen.rs index 4778cf4970..0429cf0534 100644 --- a/oximeter/impl/src/schema/codegen.rs +++ b/oximeter/schema/src/codegen.rs @@ -6,18 +6,18 @@ //! Generate Rust types and code from oximeter schema definitions. 
-use crate::schema::ir::find_schema_version; -use crate::schema::ir::load_schema; -use crate::schema::AuthzScope; -use crate::schema::FieldSource; -use crate::schema::Units; -use crate::DatumType; -use crate::FieldSchema; -use crate::FieldType; -use crate::MetricsError; -use crate::TimeseriesSchema; +use crate::ir::find_schema_version; +use crate::ir::load_schema; use chrono::prelude::DateTime; use chrono::prelude::Utc; +use oximeter_types::AuthzScope; +use oximeter_types::DatumType; +use oximeter_types::FieldSchema; +use oximeter_types::FieldSource; +use oximeter_types::FieldType; +use oximeter_types::MetricsError; +use oximeter_types::TimeseriesSchema; +use oximeter_types::Units; use proc_macro2::TokenStream; use quote::quote; @@ -34,7 +34,7 @@ pub fn use_timeseries(contents: &str) -> Result { let latest = find_schema_version(schema.iter().cloned(), None); let mod_name = quote::format_ident!("{}", latest[0].target_name()); let types = emit_schema_types(latest); - let func = emit_schema_function(schema.into_iter()); + let func = emit_schema_function(schema.iter()); Ok(quote! { pub mod #mod_name { #types @@ -43,9 +43,10 @@ pub fn use_timeseries(contents: &str) -> Result { }) } -fn emit_schema_function( - list: impl Iterator, +fn emit_schema_function<'a>( + list: impl Iterator, ) -> TokenStream { + let list = list.map(quote_timeseries_schema); quote! { pub fn timeseries_schema() -> Vec<::oximeter::schema::TimeseriesSchema> { vec![ @@ -310,66 +311,63 @@ fn emit_one(source: FieldSource, schema: &TimeseriesSchema) -> TokenStream { // This is used so that we can emit a function that will return the same data as // we parse from the TOML file with the timeseries definition, as a way to // export the definitions without needing that actual file at runtime. -impl quote::ToTokens for DatumType { - fn to_tokens(&self, tokens: &mut TokenStream) { - let toks = match self { - DatumType::Bool => quote! { ::oximeter::DatumType::Bool }, - DatumType::I8 => quote! { ::oximeter::DatumType::I8 }, - DatumType::U8 => quote! { ::oximeter::DatumType::U8 }, - DatumType::I16 => quote! { ::oximeter::DatumType::I16 }, - DatumType::U16 => quote! { ::oximeter::DatumType::U16 }, - DatumType::I32 => quote! { ::oximeter::DatumType::I32 }, - DatumType::U32 => quote! { ::oximeter::DatumType::U32 }, - DatumType::I64 => quote! { ::oximeter::DatumType::I64 }, - DatumType::U64 => quote! { ::oximeter::DatumType::U64 }, - DatumType::F32 => quote! { ::oximeter::DatumType::F32 }, - DatumType::F64 => quote! { ::oximeter::DatumType::F64 }, - DatumType::String => quote! { ::oximeter::DatumType::String }, - DatumType::Bytes => quote! { ::oximeter::DatumType::Bytes }, - DatumType::CumulativeI64 => { - quote! { ::oximeter::DatumType::CumulativeI64 } - } - DatumType::CumulativeU64 => { - quote! { ::oximeter::DatumType::CumulativeU64 } - } - DatumType::CumulativeF32 => { - quote! { ::oximeter::DatumType::CumulativeF32 } - } - DatumType::CumulativeF64 => { - quote! { ::oximeter::DatumType::CumulativeF64 } - } - DatumType::HistogramI8 => { - quote! { ::oximeter::DatumType::HistogramI8 } - } - DatumType::HistogramU8 => { - quote! { ::oximeter::DatumType::HistogramU8 } - } - DatumType::HistogramI16 => { - quote! { ::oximeter::DatumType::HistogramI16 } - } - DatumType::HistogramU16 => { - quote! { ::oximeter::DatumType::HistogramU16 } - } - DatumType::HistogramI32 => { - quote! { ::oximeter::DatumType::HistogramI32 } - } - DatumType::HistogramU32 => { - quote! { ::oximeter::DatumType::HistogramU32 } - } - DatumType::HistogramI64 => { - quote! 
{ ::oximeter::DatumType::HistogramI64 } - } - DatumType::HistogramU64 => { - quote! { ::oximeter::DatumType::HistogramU64 } - } - DatumType::HistogramF32 => { - quote! { ::oximeter::DatumType::HistogramF32 } - } - DatumType::HistogramF64 => { - quote! { ::oximeter::DatumType::HistogramF64 } - } - }; - toks.to_tokens(tokens); +fn quote_datum_type(datum_type: DatumType) -> TokenStream { + match datum_type { + DatumType::Bool => quote! { ::oximeter::DatumType::Bool }, + DatumType::I8 => quote! { ::oximeter::DatumType::I8 }, + DatumType::U8 => quote! { ::oximeter::DatumType::U8 }, + DatumType::I16 => quote! { ::oximeter::DatumType::I16 }, + DatumType::U16 => quote! { ::oximeter::DatumType::U16 }, + DatumType::I32 => quote! { ::oximeter::DatumType::I32 }, + DatumType::U32 => quote! { ::oximeter::DatumType::U32 }, + DatumType::I64 => quote! { ::oximeter::DatumType::I64 }, + DatumType::U64 => quote! { ::oximeter::DatumType::U64 }, + DatumType::F32 => quote! { ::oximeter::DatumType::F32 }, + DatumType::F64 => quote! { ::oximeter::DatumType::F64 }, + DatumType::String => quote! { ::oximeter::DatumType::String }, + DatumType::Bytes => quote! { ::oximeter::DatumType::Bytes }, + DatumType::CumulativeI64 => { + quote! { ::oximeter::DatumType::CumulativeI64 } + } + DatumType::CumulativeU64 => { + quote! { ::oximeter::DatumType::CumulativeU64 } + } + DatumType::CumulativeF32 => { + quote! { ::oximeter::DatumType::CumulativeF32 } + } + DatumType::CumulativeF64 => { + quote! { ::oximeter::DatumType::CumulativeF64 } + } + DatumType::HistogramI8 => { + quote! { ::oximeter::DatumType::HistogramI8 } + } + DatumType::HistogramU8 => { + quote! { ::oximeter::DatumType::HistogramU8 } + } + DatumType::HistogramI16 => { + quote! { ::oximeter::DatumType::HistogramI16 } + } + DatumType::HistogramU16 => { + quote! { ::oximeter::DatumType::HistogramU16 } + } + DatumType::HistogramI32 => { + quote! { ::oximeter::DatumType::HistogramI32 } + } + DatumType::HistogramU32 => { + quote! { ::oximeter::DatumType::HistogramU32 } + } + DatumType::HistogramI64 => { + quote! { ::oximeter::DatumType::HistogramI64 } + } + DatumType::HistogramU64 => { + quote! { ::oximeter::DatumType::HistogramU64 } + } + DatumType::HistogramF32 => { + quote! { ::oximeter::DatumType::HistogramF32 } + } + DatumType::HistogramF64 => { + quote! { ::oximeter::DatumType::HistogramF64 } + } } } @@ -452,55 +450,46 @@ fn emit_rust_type_for_field(field_type: FieldType) -> TokenStream { } } -impl quote::ToTokens for FieldSource { - fn to_tokens(&self, tokens: &mut TokenStream) { - let toks = match self { - FieldSource::Target => { - quote! { ::oximeter::schema::FieldSource::Target } - } - FieldSource::Metric => { - quote! { ::oximeter::schema::FieldSource::Metric } - } - }; - toks.to_tokens(tokens); +fn quote_field_source(source: FieldSource) -> TokenStream { + match source { + FieldSource::Target => { + quote! { ::oximeter::schema::FieldSource::Target } + } + FieldSource::Metric => { + quote! { ::oximeter::schema::FieldSource::Metric } + } } } -impl quote::ToTokens for FieldType { - fn to_tokens(&self, tokens: &mut TokenStream) { - let toks = match self { - FieldType::String => quote! { ::oximeter::FieldType::String }, - FieldType::I8 => quote! { ::oximeter::FieldType::I8 }, - FieldType::U8 => quote! { ::oximeter::FieldType::U8 }, - FieldType::I16 => quote! { ::oximeter::FieldType::I16 }, - FieldType::U16 => quote! { ::oximeter::FieldType::U16 }, - FieldType::I32 => quote! { ::oximeter::FieldType::I32 }, - FieldType::U32 => quote! 
{ ::oximeter::FieldType::U32 }, - FieldType::I64 => quote! { ::oximeter::FieldType::I64 }, - FieldType::U64 => quote! { ::oximeter::FieldType::U64 }, - FieldType::IpAddr => quote! { ::oximeter::FieldType::IpAddr }, - FieldType::Uuid => quote! { ::oximeter::FieldType::Uuid }, - FieldType::Bool => quote! { ::oximeter::FieldType::Bool }, - }; - toks.to_tokens(tokens); +fn quote_field_type(field_type: FieldType) -> TokenStream { + match field_type { + FieldType::String => quote! { ::oximeter::FieldType::String }, + FieldType::I8 => quote! { ::oximeter::FieldType::I8 }, + FieldType::U8 => quote! { ::oximeter::FieldType::U8 }, + FieldType::I16 => quote! { ::oximeter::FieldType::I16 }, + FieldType::U16 => quote! { ::oximeter::FieldType::U16 }, + FieldType::I32 => quote! { ::oximeter::FieldType::I32 }, + FieldType::U32 => quote! { ::oximeter::FieldType::U32 }, + FieldType::I64 => quote! { ::oximeter::FieldType::I64 }, + FieldType::U64 => quote! { ::oximeter::FieldType::U64 }, + FieldType::IpAddr => quote! { ::oximeter::FieldType::IpAddr }, + FieldType::Uuid => quote! { ::oximeter::FieldType::Uuid }, + FieldType::Bool => quote! { ::oximeter::FieldType::Bool }, } } -impl quote::ToTokens for AuthzScope { - fn to_tokens(&self, tokens: &mut TokenStream) { - let toks = match self { - AuthzScope::Fleet => { - quote! { ::oximeter::schema::AuthzScope::Fleet } - } - AuthzScope::Silo => quote! { ::oximeter::schema::AuthzScope::Silo }, - AuthzScope::Project => { - quote! { ::oximeter::schema::AuthzScope::Project } - } - AuthzScope::ViewableToAll => { - quote! { ::oximeter::schema::AuthzScope::ViewableToAll } - } - }; - toks.to_tokens(tokens); +fn quote_authz_scope(authz_scope: AuthzScope) -> TokenStream { + match authz_scope { + AuthzScope::Fleet => { + quote! { ::oximeter::schema::AuthzScope::Fleet } + } + AuthzScope::Silo => quote! { ::oximeter::schema::AuthzScope::Silo }, + AuthzScope::Project => { + quote! { ::oximeter::schema::AuthzScope::Project } + } + AuthzScope::ViewableToAll => { + quote! { ::oximeter::schema::AuthzScope::ViewableToAll } + } } } @@ -512,85 +501,79 @@ fn quote_creation_time(created: DateTime) -> TokenStream { } } -impl quote::ToTokens for Units { - fn to_tokens(&self, tokens: &mut TokenStream) { - let toks = match self { - Units::None => quote! { ::oximeter::schema::Units::None }, - Units::Count => quote! { ::oximeter::schema::Units::Count }, - Units::Bytes => quote! { ::oximeter::schema::Units::Bytes }, - Units::Seconds => quote! { ::oximeter::schema::Units::Seconds }, - Units::Nanoseconds => { - quote! { ::oximeter::schema::Units::Nanoseconds } - } - Units::Amps => quote! { ::oximeter::schema::Units::Amps }, - Units::Volts => quote! { ::oximeter::schema::Units::Volts }, - Units::DegreesCelcius => { - quote! { ::oximeter::schema::Units::DegreesCelcius } - } - Units::Rpm => quote! { ::oximeter::schema::Units::Rpm }, - }; - toks.to_tokens(tokens); +fn quote_units(units: Units) -> TokenStream { + match units { + Units::None => quote! { ::oximeter::schema::Units::None }, + Units::Count => quote! { ::oximeter::schema::Units::Count }, + Units::Bytes => quote! { ::oximeter::schema::Units::Bytes }, + Units::Seconds => quote! { ::oximeter::schema::Units::Seconds }, + Units::Nanoseconds => { + quote! { ::oximeter::schema::Units::Nanoseconds } + } + Units::Amps => quote! { ::oximeter::schema::Units::Amps }, + Units::Volts => quote! { ::oximeter::schema::Units::Volts }, + Units::DegreesCelcius => { + quote! { ::oximeter::schema::Units::DegreesCelcius } + } + Units::Rpm => quote! 
{ ::oximeter::schema::Units::Rpm }, } } -impl quote::ToTokens for FieldSchema { - fn to_tokens(&self, tokens: &mut TokenStream) { - let name = self.name.as_str(); - let field_type = self.field_type; - let source = self.source; - let description = self.description.as_str(); - let toks = quote! { - ::oximeter::FieldSchema { - name: String::from(#name), - field_type: #field_type, - source: #source, - description: String::from(#description), - } - }; - toks.to_tokens(tokens); +fn quote_field_schema(field_schema: &FieldSchema) -> TokenStream { + let name = field_schema.name.as_str(); + let field_type = quote_field_type(field_schema.field_type); + let source = quote_field_source(field_schema.source); + let description = field_schema.description.as_str(); + quote! { + ::oximeter::FieldSchema { + name: String::from(#name), + field_type: #field_type, + source: #source, + description: String::from(#description), + } } } -impl quote::ToTokens for TimeseriesSchema { - fn to_tokens(&self, tokens: &mut TokenStream) { - let field_schema = &self.field_schema; - let timeseries_name = self.timeseries_name.to_string(); - let target_description = self.description.target.as_str(); - let metric_description = self.description.metric.as_str(); - let authz_scope = self.authz_scope; - let units = self.units; - let datum_type = self.datum_type; - let ver = self.version.get(); - let version = quote! { ::core::num::NonZeroU8::new(#ver).unwrap() }; - let created = quote_creation_time(self.created); - let toks = quote! { - ::oximeter::schema::TimeseriesSchema { - timeseries_name: - <::oximeter::TimeseriesName as ::std::convert::TryFrom<&str>>::try_from( - #timeseries_name - ).unwrap(), - description: ::oximeter::schema::TimeseriesDescription { - target: String::from(#target_description), - metric: String::from(#metric_description), - }, - authz_scope: #authz_scope, - units: #units, - field_schema: ::std::collections::BTreeSet::from([ - #(#field_schema),* - ]), - datum_type: #datum_type, - version: #version, - created: #created, - } - }; - toks.to_tokens(tokens); +fn quote_timeseries_schema( + timeseries_schema: &TimeseriesSchema, +) -> TokenStream { + let field_schema = + timeseries_schema.field_schema.iter().map(quote_field_schema); + let timeseries_name = timeseries_schema.timeseries_name.to_string(); + let target_description = timeseries_schema.description.target.as_str(); + let metric_description = timeseries_schema.description.metric.as_str(); + let authz_scope = quote_authz_scope(timeseries_schema.authz_scope); + let units = quote_units(timeseries_schema.units); + let datum_type = quote_datum_type(timeseries_schema.datum_type); + let ver = timeseries_schema.version.get(); + let version = quote! { ::core::num::NonZeroU8::new(#ver).unwrap() }; + let created = quote_creation_time(timeseries_schema.created); + quote! 
{ + ::oximeter::schema::TimeseriesSchema { + timeseries_name: + <::oximeter::TimeseriesName as ::std::convert::TryFrom<&str>>::try_from( + #timeseries_name + ).unwrap(), + description: ::oximeter::schema::TimeseriesDescription { + target: String::from(#target_description), + metric: String::from(#metric_description), + }, + authz_scope: #authz_scope, + units: #units, + field_schema: ::std::collections::BTreeSet::from([ + #(#field_schema),* + ]), + datum_type: #datum_type, + version: #version, + created: #created, + } } } #[cfg(test)] mod tests { use super::*; - use crate::schema::TimeseriesDescription; + use oximeter_types::TimeseriesDescription; use std::{collections::BTreeSet, num::NonZeroU8}; #[test] diff --git a/oximeter/impl/src/schema/ir.rs b/oximeter/schema/src/ir.rs similarity index 99% rename from oximeter/impl/src/schema/ir.rs rename to oximeter/schema/src/ir.rs index f7a209294f..370236000a 100644 --- a/oximeter/impl/src/schema/ir.rs +++ b/oximeter/schema/src/ir.rs @@ -11,17 +11,17 @@ //! inspected or used to generate code that contains the equivalent Rust types //! and trait implementations. -use crate::schema::AuthzScope; -use crate::schema::DatumType; -use crate::schema::FieldSource; -use crate::schema::FieldType; -use crate::schema::TimeseriesDescription; -use crate::schema::Units; -use crate::FieldSchema; -use crate::MetricsError; -use crate::TimeseriesName; -use crate::TimeseriesSchema; use chrono::Utc; +use oximeter_types::AuthzScope; +use oximeter_types::DatumType; +use oximeter_types::FieldSchema; +use oximeter_types::FieldSource; +use oximeter_types::FieldType; +use oximeter_types::MetricsError; +use oximeter_types::TimeseriesDescription; +use oximeter_types::TimeseriesName; +use oximeter_types::TimeseriesSchema; +use oximeter_types::Units; use serde::Deserialize; use std::collections::btree_map::Entry; use std::collections::BTreeMap; diff --git a/oximeter/schema/src/lib.rs b/oximeter/schema/src/lib.rs new file mode 100644 index 0000000000..b1ce73a940 --- /dev/null +++ b/oximeter/schema/src/lib.rs @@ -0,0 +1,12 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2024 Oxide Computer Company + +//! Tools for working with schemas for fields and timeseries. +//! +//! The actual schema type definitions are in [`oximeter_types::schema`]. + +pub mod codegen; +pub mod ir; diff --git a/oximeter/test-utils/Cargo.toml b/oximeter/test-utils/Cargo.toml new file mode 100644 index 0000000000..f463e74aca --- /dev/null +++ b/oximeter/test-utils/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "oximeter-test-utils" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +chrono.workspace = true +omicron-workspace-hack.workspace = true +oximeter-macro-impl.workspace = true +oximeter-types.workspace = true +uuid.workspace = true diff --git a/oximeter/test-utils/src/lib.rs b/oximeter/test-utils/src/lib.rs new file mode 100644 index 0000000000..04c49add65 --- /dev/null +++ b/oximeter/test-utils/src/lib.rs @@ -0,0 +1,295 @@ +// Copyright 2024 Oxide Computer Company + +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Utilities for testing the oximeter crate. + +// Export the current crate as `oximeter`. 
The macros defined in `oximeter-macro-impl` generate
+// code referring to symbols like `oximeter::traits::Target`. In consumers of this crate, that's
+// fine, but internally there _is_ no crate named `oximeter`, it's just `self` or `crate`.
+//
+// See https://github.com/rust-lang/rust/pull/55275 for the PR introducing this fix, which links to
+// lots of related issues and discussion.
+extern crate self as oximeter;
+
+use oximeter_macro_impl::{Metric, Target};
+use oximeter_types::histogram;
+use oximeter_types::histogram::{Histogram, Record};
+use oximeter_types::traits;
+use oximeter_types::types::{
+    Cumulative, Datum, DatumType, FieldType, FieldValue, Measurement, Sample,
+};
+use oximeter_types::{Metric, Target};
+use uuid::Uuid;
+
+#[derive(Target)]
+pub struct TestTarget {
+    pub name1: String,
+    pub name2: String,
+    pub num: i64,
+}
+
+impl Default for TestTarget {
+    fn default() -> Self {
+        TestTarget {
+            name1: "first_name".into(),
+            name2: "second_name".into(),
+            num: 0,
+        }
+    }
+}
+
+#[derive(Metric)]
+pub struct TestMetric {
+    pub id: Uuid,
+    pub good: bool,
+    pub datum: i64,
+}
+
+#[derive(Metric)]
+pub struct TestCumulativeMetric {
+    pub id: Uuid,
+    pub good: bool,
+    pub datum: Cumulative<i64>,
+}
+
+#[derive(Metric)]
+pub struct TestHistogram {
+    pub id: Uuid,
+    pub good: bool,
+    pub datum: Histogram<f64>,
+}
+
+const ID: Uuid = uuid::uuid!("e00ced4d-39d1-446a-ae85-a67f05c9750b");
+
+pub fn make_sample() -> Sample {
+    let target = TestTarget::default();
+    let metric = TestMetric { id: ID, good: true, datum: 1 };
+    Sample::new(&target, &metric).unwrap()
+}
+
+pub fn make_missing_sample() -> Sample {
+    let target = TestTarget::default();
+    let metric = TestMetric { id: ID, good: true, datum: 1 };
+    Sample::new_missing(&target, &metric).unwrap()
+}
+
+pub fn make_hist_sample() -> Sample {
+    let target = TestTarget::default();
+    let mut hist = histogram::Histogram::new(&[0.0, 5.0, 10.0]).unwrap();
+    hist.sample(1.0).unwrap();
+    hist.sample(2.0).unwrap();
+    hist.sample(6.0).unwrap();
+    let metric = TestHistogram { id: ID, good: true, datum: hist };
+    Sample::new(&target, &metric).unwrap()
+}
+
+/// A target identifying a single virtual machine instance
+#[derive(Debug, Clone, Copy, oximeter::Target)]
+pub struct VirtualMachine {
+    pub project_id: Uuid,
+    pub instance_id: Uuid,
+}
+
+/// A metric recording the total time a vCPU is busy, by its ID
+#[derive(Debug, Clone, Copy, oximeter::Metric)]
+pub struct CpuBusy {
+    cpu_id: i64,
+    datum: Cumulative<f64>,
+}
+
+pub fn generate_test_samples(
+    n_projects: usize,
+    n_instances: usize,
+    n_cpus: usize,
+    n_samples: usize,
+) -> Vec<Sample> {
+    let n_timeseries = n_projects * n_instances * n_cpus;
+    let mut samples = Vec::with_capacity(n_samples * n_timeseries);
+    for _ in 0..n_projects {
+        let project_id = Uuid::new_v4();
+        for _ in 0..n_instances {
+            let vm = VirtualMachine { project_id, instance_id: Uuid::new_v4() };
+            for cpu in 0..n_cpus {
+                for sample in 0..n_samples {
+                    let cpu_busy = CpuBusy {
+                        cpu_id: cpu as _,
+                        datum: Cumulative::new(sample as f64),
+                    };
+                    let sample = Sample::new(&vm, &cpu_busy).unwrap();
+                    samples.push(sample);
+                }
+            }
+        }
+    }
+    samples
+}
+
+#[cfg(test)]
+mod tests {
+    use chrono::Utc;
+    use oximeter_types::{
+        schema::{
+            default_schema_version, AuthzScope, FieldSchema, FieldSource,
+            TimeseriesSchema, Units,
+        },
+        TimeseriesName,
+    };
+
+    use super::*;
+
+    #[test]
+    fn test_gen_test_samples() {
+        let (n_projects, n_instances, n_cpus, n_samples) = (2, 2, 2, 2);
+        let samples =
+            generate_test_samples(n_projects,
n_instances, n_cpus, n_samples); + assert_eq!( + samples.len(), + n_projects * n_instances * n_cpus * n_samples + ); + } + + #[test] + fn test_sample_struct() { + let t = TestTarget::default(); + let m = TestMetric { id: Uuid::new_v4(), good: true, datum: 1i64 }; + let sample = Sample::new(&t, &m).unwrap(); + assert_eq!( + sample.timeseries_name, + format!("{}:{}", t.name(), m.name()) + ); + assert!(sample.measurement.start_time().is_none()); + assert_eq!(sample.measurement.datum(), &Datum::from(1i64)); + + let m = TestCumulativeMetric { + id: Uuid::new_v4(), + good: true, + datum: 1i64.into(), + }; + let sample = Sample::new(&t, &m).unwrap(); + assert!(sample.measurement.start_time().is_some()); + } + + #[derive(Target)] + struct MyTarget { + id: Uuid, + name: String, + } + + const ID: Uuid = uuid::uuid!("ca565ef4-65dc-4ab0-8622-7be43ed72105"); + + impl Default for MyTarget { + fn default() -> Self { + Self { id: ID, name: String::from("name") } + } + } + + #[derive(Metric)] + struct MyMetric { + happy: bool, + datum: u64, + } + + impl Default for MyMetric { + fn default() -> Self { + Self { happy: true, datum: 0 } + } + } + + #[test] + fn test_timeseries_schema_from_parts() { + let target = MyTarget::default(); + let metric = MyMetric::default(); + let schema = TimeseriesSchema::new(&target, &metric).unwrap(); + + assert_eq!(schema.timeseries_name, "my_target:my_metric"); + let f = schema.schema_for_field("id").unwrap(); + assert_eq!(f.name, "id"); + assert_eq!(f.field_type, FieldType::Uuid); + assert_eq!(f.source, FieldSource::Target); + + let f = schema.schema_for_field("name").unwrap(); + assert_eq!(f.name, "name"); + assert_eq!(f.field_type, FieldType::String); + assert_eq!(f.source, FieldSource::Target); + + let f = schema.schema_for_field("happy").unwrap(); + assert_eq!(f.name, "happy"); + assert_eq!(f.field_type, FieldType::Bool); + assert_eq!(f.source, FieldSource::Metric); + assert_eq!(schema.datum_type, DatumType::U64); + } + + #[test] + fn test_timeseries_schema_from_sample() { + let target = MyTarget::default(); + let metric = MyMetric::default(); + let sample = Sample::new(&target, &metric).unwrap(); + let schema = TimeseriesSchema::new(&target, &metric).unwrap(); + let schema_from_sample = TimeseriesSchema::from(&sample); + assert_eq!(schema, schema_from_sample); + } + + // Test that we correctly order field across a target and metric. + // + // In an earlier commit, we switched from storing fields in an unordered Vec + // to using a BTree{Map,Set} to ensure ordering by name. However, the + // `TimeseriesSchema` type stored all its fields by chaining the sorted + // fields from the target and metric, without then sorting _across_ them. + // + // This was exacerbated by the error reporting, where we did in fact sort + // all fields across the target and metric, making it difficult to tell how + // the derived schema was different, if at all. + // + // This test generates a sample with a schema where the target and metric + // fields are sorted within them, but not across them. We check that the + // derived schema are actually equal, which means we've imposed that + // ordering when deriving the schema. 
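+    // Editor's illustration (hypothetical, not part of the original change):
+    // the pitfall described above, reduced to plain string lists. Chaining
+    // two individually-sorted sequences does not yield a globally sorted
+    // sequence, while collecting across both into a `BTreeSet` does. The
+    // real test for the schema types follows below.
+    #[test]
+    fn example_chained_sorted_lists_are_not_globally_sorted() {
+        let target_fields = ["later"];
+        let metric_fields = ["earlier"];
+        // Sorted within each list, but unsorted when chained end-to-end.
+        let chained: Vec<&str> = target_fields
+            .iter()
+            .chain(metric_fields.iter())
+            .copied()
+            .collect();
+        assert_eq!(chained, ["later", "earlier"]);
+        // Collecting across both lists restores a global order by name.
+        let sorted: Vec<&str> = chained
+            .into_iter()
+            .collect::<std::collections::BTreeSet<_>>()
+            .into_iter()
+            .collect();
+        assert_eq!(sorted, ["earlier", "later"]);
+    }
+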
+ #[test] + fn test_schema_field_ordering_across_target_metric() { + let target_field = FieldSchema { + name: String::from("later"), + field_type: FieldType::U64, + source: FieldSource::Target, + description: String::new(), + }; + let metric_field = FieldSchema { + name: String::from("earlier"), + field_type: FieldType::U64, + source: FieldSource::Metric, + description: String::new(), + }; + let timeseries_name: TimeseriesName = "foo:bar".parse().unwrap(); + let datum_type = DatumType::U64; + let field_schema = + [target_field.clone(), metric_field.clone()].into_iter().collect(); + let expected_schema = TimeseriesSchema { + timeseries_name, + description: Default::default(), + field_schema, + datum_type, + version: default_schema_version(), + authz_scope: AuthzScope::Fleet, + units: Units::Count, + created: Utc::now(), + }; + + #[derive(oximeter::Target)] + struct Foo { + later: u64, + } + #[derive(oximeter::Metric)] + struct Bar { + earlier: u64, + datum: u64, + } + + let target = Foo { later: 1 }; + let metric = Bar { earlier: 2, datum: 10 }; + let sample = Sample::new(&target, &metric).unwrap(); + let derived_schema = TimeseriesSchema::from(&sample); + assert_eq!(derived_schema, expected_schema); + } +} diff --git a/oximeter/timeseries-macro/Cargo.toml b/oximeter/timeseries-macro/Cargo.toml index db591aed06..2fb8b8f312 100644 --- a/oximeter/timeseries-macro/Cargo.toml +++ b/oximeter/timeseries-macro/Cargo.toml @@ -8,7 +8,8 @@ proc-macro = true [dependencies] omicron-workspace-hack.workspace = true -oximeter-impl.workspace = true +oximeter-schema.workspace = true +oximeter-types.workspace = true proc-macro2.workspace = true quote.workspace = true syn.workspace = true diff --git a/oximeter/timeseries-macro/src/lib.rs b/oximeter/timeseries-macro/src/lib.rs index 0c70e73445..12ec2cc417 100644 --- a/oximeter/timeseries-macro/src/lib.rs +++ b/oximeter/timeseries-macro/src/lib.rs @@ -8,7 +8,7 @@ extern crate proc_macro; -use oximeter_impl::schema::SCHEMA_DIRECTORY; +use oximeter_types::schema::SCHEMA_DIRECTORY; /// Generate code to use the timeseries from one target. /// @@ -45,7 +45,7 @@ pub fn use_timeseries( .into(); } }; - match oximeter_impl::schema::codegen::use_timeseries(&contents) { + match oximeter_schema::codegen::use_timeseries(&contents) { Ok(toks) => { let path_ = path.display().to_string(); return quote::quote! 
{ diff --git a/oximeter/impl/Cargo.toml b/oximeter/types/Cargo.toml similarity index 78% rename from oximeter/impl/Cargo.toml rename to oximeter/types/Cargo.toml index 91277d9d47..6d6bbc07e6 100644 --- a/oximeter/impl/Cargo.toml +++ b/oximeter/types/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "oximeter-impl" +name = "oximeter-types" version = "0.1.0" edition = "2021" license = "MPL-2.0" @@ -11,22 +11,13 @@ workspace = true bytes = { workspace = true, features = [ "serde" ] } chrono.workspace = true float-ord.workspace = true -heck.workspace = true num.workspace = true omicron-common.workspace = true omicron-workspace-hack.workspace = true -oximeter-macro-impl.workspace = true -prettyplease.workspace = true -proc-macro2.workspace = true -quote.workspace = true regex.workspace = true schemars = { workspace = true, features = [ "uuid1", "bytes", "chrono" ] } serde.workspace = true -serde_json.workspace = true -slog-error-chain.workspace = true strum.workspace = true -syn.workspace = true -toml.workspace = true thiserror.workspace = true uuid.workspace = true @@ -34,6 +25,7 @@ uuid.workspace = true approx.workspace = true # For benchmark criterion.workspace = true +oximeter-macro-impl.workspace = true rand = { workspace = true, features = ["std_rng"] } rand_distr.workspace = true rstest.workspace = true diff --git a/oximeter/impl/benches/quantile.rs b/oximeter/types/benches/quantile.rs similarity index 97% rename from oximeter/impl/benches/quantile.rs rename to oximeter/types/benches/quantile.rs index 4540ba8f6a..b88cb211e6 100644 --- a/oximeter/impl/benches/quantile.rs +++ b/oximeter/types/benches/quantile.rs @@ -8,7 +8,7 @@ // Copyright 2024 Oxide Computer Company use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; -use oximeter_impl::Quantile; +use oximeter_types::Quantile; use rand_distr::{Distribution, Normal}; /// Emulates baseline code in a Python implementation of the P² diff --git a/oximeter/impl/src/histogram.rs b/oximeter/types/src/histogram.rs similarity index 99% rename from oximeter/impl/src/histogram.rs rename to oximeter/types/src/histogram.rs index 40df0a1b41..0b85727ee0 100644 --- a/oximeter/impl/src/histogram.rs +++ b/oximeter/types/src/histogram.rs @@ -523,9 +523,9 @@ where /// Example /// ------- /// ```rust - /// # // Rename the impl crate so the doctests can refer to the public + /// # // Rename the types crate so the doctests can refer to the public /// # // `oximeter` crate, not the private impl. - /// # use oximeter_impl as oximeter; + /// # use oximeter_types as oximeter; /// use oximeter::histogram::Histogram; /// /// let hist = Histogram::with_bins(&[(0..10).into(), (10..100).into()]).unwrap(); @@ -922,9 +922,9 @@ where /// ------- /// /// ```rust - /// # // Rename the impl crate so the doctests can refer to the public + /// # // Rename the types crate so the doctests can refer to the public /// # // `oximeter` crate, not the private impl. - /// # use oximeter_impl as oximeter; + /// # use oximeter_types as oximeter; /// use oximeter::histogram::{Histogram, BinRange}; /// use std::ops::{RangeBounds, Bound}; /// diff --git a/oximeter/impl/src/lib.rs b/oximeter/types/src/lib.rs similarity index 92% rename from oximeter/impl/src/lib.rs rename to oximeter/types/src/lib.rs index 5acbeb9422..7a1a480f8d 100644 --- a/oximeter/impl/src/lib.rs +++ b/oximeter/types/src/lib.rs @@ -4,8 +4,6 @@ // Copyright 2024 Oxide Computer Company -pub use oximeter_macro_impl::*; - // Export the current crate as `oximeter`. 
The macros defined in `oximeter-macro-impl` generate // code referring to symbols like `oximeter::traits::Target`. In consumers of this crate, that's // fine, but internally there _is_ no crate named `oximeter`, it's just `self` or `crate`. @@ -17,15 +15,18 @@ extern crate self as oximeter; pub mod histogram; pub mod quantile; pub mod schema; -pub mod test_util; pub mod traits; pub mod types; pub use quantile::Quantile; pub use quantile::QuantileError; +pub use schema::AuthzScope; pub use schema::FieldSchema; +pub use schema::FieldSource; +pub use schema::TimeseriesDescription; pub use schema::TimeseriesName; pub use schema::TimeseriesSchema; +pub use schema::Units; pub use traits::Metric; pub use traits::Producer; pub use traits::Target; diff --git a/oximeter/impl/src/quantile.rs b/oximeter/types/src/quantile.rs similarity index 97% rename from oximeter/impl/src/quantile.rs rename to oximeter/types/src/quantile.rs index fafe9c9ece..40777217e5 100644 --- a/oximeter/impl/src/quantile.rs +++ b/oximeter/types/src/quantile.rs @@ -78,9 +78,9 @@ impl Quantile { /// # Examples /// /// ``` - /// # // Rename the impl crate so the doctests can refer to the public + /// # // Rename the types crate so the doctests can refer to the public /// # // `oximeter` crate, not the private impl. - /// # use oximeter_impl as oximeter; + /// # use oximeter_types as oximeter; /// use oximeter::Quantile; /// let q = Quantile::new(0.5).unwrap(); /// @@ -116,9 +116,9 @@ impl Quantile { /// /// # Examples /// ``` - /// # // Rename the impl crate so the doctests can refer to the public + /// # // Rename the types crate so the doctests can refer to the public /// # // `oximeter` crate, not the private impl. - /// # use oximeter_impl as oximeter; + /// # use oximeter_types as oximeter; /// use oximeter::Quantile; /// let q = Quantile::from_parts( /// 0.5, @@ -200,9 +200,9 @@ impl Quantile { /// # Examples /// /// ``` - /// # // Rename the impl crate so the doctests can refer to the public + /// # // Rename the types crate so the doctests can refer to the public /// # // `oximeter` crate, not the private impl. - /// # use oximeter_impl as oximeter; + /// # use oximeter_types as oximeter; /// use oximeter::Quantile; /// let mut q = Quantile::new(0.5).unwrap(); /// for o in 1..=100 { @@ -243,9 +243,9 @@ impl Quantile { /// # Examples /// /// ``` - /// # // Rename the impl crate so the doctests can refer to the public + /// # // Rename the types crate so the doctests can refer to the public /// # // `oximeter` crate, not the private impl. - /// # use oximeter_impl as oximeter; + /// # use oximeter_types as oximeter; /// use oximeter::Quantile; /// let mut q = Quantile::new(0.9).unwrap(); /// q.append(10).unwrap(); diff --git a/oximeter/impl/src/schema/mod.rs b/oximeter/types/src/schema.rs similarity index 75% rename from oximeter/impl/src/schema/mod.rs rename to oximeter/types/src/schema.rs index 250604d7be..2efd5265ff 100644 --- a/oximeter/impl/src/schema/mod.rs +++ b/oximeter/types/src/schema.rs @@ -6,9 +6,6 @@ //! Tools for working with schema for fields and timeseries. 
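// Editor's note: a sketch of the crate split implied by this hunk, with paths
// taken from the surrounding diffs. The schema *types* remain importable from
// `oximeter-types`, while the codegen and IR tooling move to the new
// `oximeter-schema` crate:
//
//     use oximeter_types::schema::{TimeseriesSchema, SCHEMA_DIRECTORY};
//     use oximeter_schema::{codegen, ir};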
-pub mod codegen; -pub mod ir; - use crate::types::DatumType; use crate::types::FieldType; use crate::types::MetricsError; @@ -402,7 +399,6 @@ pub enum AuthzScope { mod tests { use super::*; use std::convert::TryFrom; - use uuid::Uuid; #[test] fn test_timeseries_name() { @@ -426,127 +422,6 @@ mod tests { assert!(TimeseriesName::try_from("x.a:b").is_err()); } - #[derive(Target)] - struct MyTarget { - id: Uuid, - name: String, - } - - const ID: Uuid = uuid::uuid!("ca565ef4-65dc-4ab0-8622-7be43ed72105"); - - impl Default for MyTarget { - fn default() -> Self { - Self { id: ID, name: String::from("name") } - } - } - - #[derive(Metric)] - struct MyMetric { - happy: bool, - datum: u64, - } - - impl Default for MyMetric { - fn default() -> Self { - Self { happy: true, datum: 0 } - } - } - - #[test] - fn test_timeseries_schema_from_parts() { - let target = MyTarget::default(); - let metric = MyMetric::default(); - let schema = TimeseriesSchema::new(&target, &metric).unwrap(); - - assert_eq!(schema.timeseries_name, "my_target:my_metric"); - let f = schema.schema_for_field("id").unwrap(); - assert_eq!(f.name, "id"); - assert_eq!(f.field_type, FieldType::Uuid); - assert_eq!(f.source, FieldSource::Target); - - let f = schema.schema_for_field("name").unwrap(); - assert_eq!(f.name, "name"); - assert_eq!(f.field_type, FieldType::String); - assert_eq!(f.source, FieldSource::Target); - - let f = schema.schema_for_field("happy").unwrap(); - assert_eq!(f.name, "happy"); - assert_eq!(f.field_type, FieldType::Bool); - assert_eq!(f.source, FieldSource::Metric); - assert_eq!(schema.datum_type, DatumType::U64); - } - - #[test] - fn test_timeseries_schema_from_sample() { - let target = MyTarget::default(); - let metric = MyMetric::default(); - let sample = Sample::new(&target, &metric).unwrap(); - let schema = TimeseriesSchema::new(&target, &metric).unwrap(); - let schema_from_sample = TimeseriesSchema::from(&sample); - assert_eq!(schema, schema_from_sample); - } - - // Test that we correctly order field across a target and metric. - // - // In an earlier commit, we switched from storing fields in an unordered Vec - // to using a BTree{Map,Set} to ensure ordering by name. However, the - // `TimeseriesSchema` type stored all its fields by chaining the sorted - // fields from the target and metric, without then sorting _across_ them. - // - // This was exacerbated by the error reporting, where we did in fact sort - // all fields across the target and metric, making it difficult to tell how - // the derived schema was different, if at all. - // - // This test generates a sample with a schema where the target and metric - // fields are sorted within them, but not across them. We check that the - // derived schema are actually equal, which means we've imposed that - // ordering when deriving the schema. 
- #[test] - fn test_schema_field_ordering_across_target_metric() { - let target_field = FieldSchema { - name: String::from("later"), - field_type: FieldType::U64, - source: FieldSource::Target, - description: String::new(), - }; - let metric_field = FieldSchema { - name: String::from("earlier"), - field_type: FieldType::U64, - source: FieldSource::Metric, - description: String::new(), - }; - let timeseries_name: TimeseriesName = "foo:bar".parse().unwrap(); - let datum_type = DatumType::U64; - let field_schema = - [target_field.clone(), metric_field.clone()].into_iter().collect(); - let expected_schema = TimeseriesSchema { - timeseries_name, - description: Default::default(), - field_schema, - datum_type, - version: default_schema_version(), - authz_scope: AuthzScope::Fleet, - units: Units::Count, - created: Utc::now(), - }; - - #[derive(oximeter::Target)] - struct Foo { - later: u64, - } - #[derive(oximeter::Metric)] - struct Bar { - earlier: u64, - datum: u64, - } - - let target = Foo { later: 1 }; - let metric = Bar { earlier: 2, datum: 10 }; - let sample = Sample::new(&target, &metric).unwrap(); - let derived_schema = TimeseriesSchema::from(&sample); - assert_eq!(derived_schema, expected_schema); - } - #[test] fn test_field_schema_ordering() { let mut fields = BTreeSet::new(); diff --git a/oximeter/impl/src/traits.rs b/oximeter/types/src/traits.rs similarity index 96% rename from oximeter/impl/src/traits.rs rename to oximeter/types/src/traits.rs index 16baa4f619..91ecca817d 100644 --- a/oximeter/impl/src/traits.rs +++ b/oximeter/types/src/traits.rs @@ -45,9 +45,9 @@ use std::ops::AddAssign; /// -------- /// /// ```rust -/// # // Rename the impl crate so the doctests can refer to the public +/// # // Rename the types crate so the doctests can refer to the public /// # // `oximeter` crate, not the private impl. -/// # extern crate oximeter_impl as oximeter; +/// # extern crate oximeter_types as oximeter; /// # use oximeter_macro_impl::*; /// use oximeter::{traits::Target, types::FieldType}; /// use uuid::Uuid; @@ -75,9 +75,9 @@ use std::ops::AddAssign; /// supported types. /// /// ```compile_fail -/// # // Rename the impl crate so the doctests can refer to the public +/// # // Rename the types crate so the doctests can refer to the public /// # // `oximeter` crate, not the private impl. -/// # extern crate oximeter_impl as oximeter; +/// # extern crate oximeter_types as oximeter; /// # use oximeter_macro_impl::*; /// #[derive(oximeter::Target)] /// struct Bad { @@ -160,9 +160,9 @@ pub trait Target { /// Example /// ------- /// ```rust -/// # // Rename the impl crate so the doctests can refer to the public +/// # // Rename the types crate so the doctests can refer to the public /// # // `oximeter` crate, not the private impl. -/// # extern crate oximeter_impl as oximeter; +/// # extern crate oximeter_types as oximeter; /// # use oximeter_macro_impl::*; /// use chrono::Utc; /// use oximeter::Metric; @@ -185,9 +185,9 @@ pub trait Target { /// an unsupported type. /// /// ```compile_fail -/// # // Rename the impl crate so the doctests can refer to the public +/// # // Rename the types crate so the doctests can refer to the public /// # // `oximeter` crate, not the private impl. 
-/// # extern crate oximeter_impl as oximeter; +/// # extern crate oximeter_types as oximeter; /// # use oximeter_macro_impl::*; /// #[derive(Metric)] /// pub struct BadType { @@ -364,9 +364,9 @@ pub use crate::histogram::HistogramSupport; /// Example /// ------- /// ```rust -/// # // Rename the impl crate so the doctests can refer to the public +/// # // Rename the types crate so the doctests can refer to the public /// # // `oximeter` crate, not the private impl. -/// # extern crate oximeter_impl as oximeter; +/// # extern crate oximeter_types as oximeter; /// # use oximeter_macro_impl::*; /// use oximeter::{Datum, MetricsError, Metric, Producer, Target}; /// use oximeter::types::{Measurement, Sample, Cumulative}; @@ -464,6 +464,8 @@ pub trait Producer: Send + Sync + std::fmt::Debug + 'static { #[cfg(test)] mod tests { + use oximeter_macro_impl::{Metric, Target}; + use crate::types; use crate::{ Datum, DatumType, FieldType, FieldValue, Metric, MetricsError, diff --git a/oximeter/impl/src/types.rs b/oximeter/types/src/types.rs similarity index 97% rename from oximeter/impl/src/types.rs rename to oximeter/types/src/types.rs index 370557f7f7..60260e3649 100644 --- a/oximeter/impl/src/types.rs +++ b/oximeter/types/src/types.rs @@ -850,7 +850,7 @@ pub struct Sample { /// The version of the timeseries this sample belongs to // // TODO-cleanup: This should be removed once schema are tracked in CRDB. - #[serde(default = "::oximeter::schema::default_schema_version")] + #[serde(default = "crate::schema::default_schema_version")] pub timeseries_version: NonZeroU8, // Target name and fields @@ -1104,15 +1104,10 @@ mod tests { use super::Measurement; use super::MetricsError; use super::Sample; - use crate::test_util; - use crate::types; - use crate::Metric; - use crate::Target; use bytes::Bytes; use std::collections::BTreeMap; use std::net::Ipv4Addr; use std::net::Ipv6Addr; - use uuid::Uuid; #[test] fn test_cumulative_i64() { @@ -1176,31 +1171,6 @@ mod tests { assert!(measurement.timestamp() >= measurement.start_time().unwrap()); } - #[test] - fn test_sample_struct() { - let t = test_util::TestTarget::default(); - let m = test_util::TestMetric { - id: Uuid::new_v4(), - good: true, - datum: 1i64, - }; - let sample = types::Sample::new(&t, &m).unwrap(); - assert_eq!( - sample.timeseries_name, - format!("{}:{}", t.name(), m.name()) - ); - assert!(sample.measurement.start_time().is_none()); - assert_eq!(sample.measurement.datum(), &Datum::from(1i64)); - - let m = test_util::TestCumulativeMetric { - id: Uuid::new_v4(), - good: true, - datum: 1i64.into(), - }; - let sample = types::Sample::new(&t, &m).unwrap(); - assert!(sample.measurement.start_time().is_some()); - } - #[rstest::rstest] #[case::as_string("some string", FieldValue::String("some string".into()))] #[case::as_i8("2", FieldValue::I8(2))] diff --git a/oximeter/impl/tests/fail/failures.rs b/oximeter/types/tests/fail/failures.rs similarity index 100% rename from oximeter/impl/tests/fail/failures.rs rename to oximeter/types/tests/fail/failures.rs diff --git a/oximeter/impl/tests/fail/failures.stderr b/oximeter/types/tests/fail/failures.stderr similarity index 100% rename from oximeter/impl/tests/fail/failures.stderr rename to oximeter/types/tests/fail/failures.stderr diff --git a/oximeter/impl/tests/test_compilation.rs b/oximeter/types/tests/test_compilation.rs similarity index 100% rename from oximeter/impl/tests/test_compilation.rs rename to oximeter/types/tests/test_compilation.rs From dae480a84551638886a4c66c1d29d2f8d6669a9d Mon Sep 17 
00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Sat, 17 Aug 2024 05:36:18 +0000 Subject: [PATCH 62/91] Update Rust crate libc to 0.2.156 (#6375) Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- workspace-hack/Cargo.toml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3b62f3001a..bad1af8f6b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4307,9 +4307,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.155" +version = "0.2.156" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" +checksum = "a5f43f184355eefb8d17fc948dbecf6c13be3c141f20d834ae842193a448c72a" [[package]] name = "libdlpi-sys" diff --git a/Cargo.toml b/Cargo.toml index ea687936e3..cac3395a7d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -396,7 +396,7 @@ ipnetwork = { version = "0.20", features = ["schemars"] } ispf = { git = "https://github.com/oxidecomputer/ispf" } key-manager = { path = "key-manager" } kstat-rs = "0.2.4" -libc = "0.2.155" +libc = "0.2.156" libfalcon = { git = "https://github.com/oxidecomputer/falcon", rev = "e69694a1f7cc9fe31fab27f321017280531fb5f7" } libnvme = { git = "https://github.com/oxidecomputer/libnvme", rev = "dd5bb221d327a1bc9287961718c3c10d6bd37da0" } linear-map = "1.2.0" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 5dc3bc11e7..854179a4db 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -69,7 +69,7 @@ itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12.1" } itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10.5" } lalrpop-util = { version = "0.19.12" } lazy_static = { version = "1.5.0", default-features = false, features = ["spin_no_std"] } -libc = { version = "0.2.155", features = ["extra_traits"] } +libc = { version = "0.2.156", features = ["extra_traits"] } log = { version = "0.4.21", default-features = false, features = ["std"] } managed = { version = "0.8.0", default-features = false, features = ["alloc", "map"] } memchr = { version = "2.7.2" } @@ -176,7 +176,7 @@ itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12.1" } itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10.5" } lalrpop-util = { version = "0.19.12" } lazy_static = { version = "1.5.0", default-features = false, features = ["spin_no_std"] } -libc = { version = "0.2.155", features = ["extra_traits"] } +libc = { version = "0.2.156", features = ["extra_traits"] } log = { version = "0.4.21", default-features = false, features = ["std"] } managed = { version = "0.8.0", default-features = false, features = ["alloc", "map"] } memchr = { version = "2.7.2" } From f69ea9d4ec7f71ccf7babc53e94ee175674bfe06 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Sat, 17 Aug 2024 01:41:19 -0700 Subject: [PATCH 63/91] Update Rust crate rstest to 0.22.0 (#6347) --- Cargo.lock | 31 ++++++++++++++++++++++++++----- Cargo.toml | 2 +- workspace-hack/Cargo.toml | 4 ++-- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bad1af8f6b..3f084cd822 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5673,7 +5673,7 @@ version = "0.5.11" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "dcbff9bc912032c62bf65ef1d5aea88983b420f4f839db1e9b0c281a25c9c799" dependencies = [ - "proc-macro-crate", + "proc-macro-crate 1.3.1", "proc-macro2", "quote", "syn 1.0.109", @@ -7786,6 +7786,15 @@ dependencies = [ "toml_edit 0.19.15", ] +[[package]] +name = "proc-macro-crate" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d37c51ca738a55da99dc0c4a34860fd675453b8b36209178c2249bb13651284" +dependencies = [ + "toml_edit 0.21.1", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -8562,9 +8571,9 @@ dependencies = [ [[package]] name = "rstest" -version = "0.19.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d5316d2a1479eeef1ea21e7f9ddc67c191d497abc8fc3ba2467857abbb68330" +checksum = "7b423f0e62bdd61734b67cd21ff50871dfaeb9cc74f869dcd6af974fbcb19936" dependencies = [ "futures", "futures-timer", @@ -8574,12 +8583,13 @@ dependencies = [ [[package]] name = "rstest_macros" -version = "0.19.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04a9df72cc1f67020b0d63ad9bfe4a323e459ea7eb68e03bd9824db49f9a4c25" +checksum = "c5e1711e7d14f74b12a58411c542185ef7fb7f2e7f8ee6e2940a883628522b42" dependencies = [ "cfg-if", "glob", + "proc-macro-crate 3.1.0", "proc-macro2", "quote", "regex", @@ -10751,6 +10761,17 @@ dependencies = [ "winnow 0.5.40", ] +[[package]] +name = "toml_edit" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8534fd7f78b5405e860340ad6575217ce99f38d4d5c8f2442cb5ecb50090e1" +dependencies = [ + "indexmap 2.4.0", + "toml_datetime", + "winnow 0.5.40", +] + [[package]] name = "toml_edit" version = "0.22.20" diff --git a/Cargo.toml b/Cargo.toml index cac3395a7d..8cb1b667d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -506,7 +506,7 @@ regress = "0.9.1" reqwest = { version = "0.11", default-features = false } ring = "0.17.8" rpassword = "7.3.1" -rstest = "0.19.0" +rstest = "0.22.0" rustfmt-wrapper = "0.2" rustls = "0.22.2" rustls-pemfile = "2.1.3" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 854179a4db..b42f8093fb 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -112,6 +112,7 @@ tokio-postgres = { version = "0.7.11", features = ["with-chrono-0_4", "with-serd tokio-stream = { version = "0.1.15", features = ["net"] } tokio-util = { version = "0.7.11", features = ["codec", "io-util"] } toml = { version = "0.7.8" } +toml_datetime = { version = "0.6.8", default-features = false, features = ["serde"] } toml_edit-3c51e837cfc5589a = { package = "toml_edit", version = "0.22.20", features = ["serde"] } tracing = { version = "0.1.40", features = ["log"] } unicode-bidi = { version = "0.3.15" } @@ -221,6 +222,7 @@ tokio-postgres = { version = "0.7.11", features = ["with-chrono-0_4", "with-serd tokio-stream = { version = "0.1.15", features = ["net"] } tokio-util = { version = "0.7.11", features = ["codec", "io-util"] } toml = { version = "0.7.8" } +toml_datetime = { version = "0.6.8", default-features = false, features = ["serde"] } toml_edit-3c51e837cfc5589a = { package = "toml_edit", version = "0.22.20", features = ["serde"] } tracing = { version = "0.1.40", features = ["log"] } unicode-bidi = { version = "0.3.15" } @@ -285,7 +287,6 @@ nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signa once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", 
features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } -toml_datetime = { version = "0.6.8", default-features = false, features = ["serde"] } toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", features = ["serde"] } [target.x86_64-unknown-illumos.build-dependencies] @@ -295,7 +296,6 @@ nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signa once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } -toml_datetime = { version = "0.6.8", default-features = false, features = ["serde"] } toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", features = ["serde"] } ### END HAKARI SECTION From f1e28fd27f4060580ff0a8945a0923c25864ee09 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Sat, 17 Aug 2024 10:12:45 +0000 Subject: [PATCH 64/91] Update Rust crate serde to v1.0.208 (#6376) --- Cargo.lock | 8 ++++---- workspace-hack/Cargo.toml | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3f084cd822..d0ac123970 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9142,9 +9142,9 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.207" +version = "1.0.208" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5665e14a49a4ea1b91029ba7d3bca9f299e1f7cfa194388ccc20f14743e784f2" +checksum = "cff085d2cb684faa248efb494c39b68e522822ac0de72ccf08109abde717cfb2" dependencies = [ "serde_derive", ] @@ -9180,9 +9180,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.207" +version = "1.0.208" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6aea2634c86b0e8ef2cfdc0c340baede54ec27b1e46febd7f80dffb2aa44a00e" +checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf" dependencies = [ "proc-macro2", "quote", diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index b42f8093fb..973b87c00a 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -95,7 +95,7 @@ rsa = { version = "0.9.6", features = ["serde", "sha2"] } schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } scopeguard = { version = "1.2.0" } semver = { version = "1.0.23", features = ["serde"] } -serde = { version = "1.0.207", features = ["alloc", "derive", "rc"] } +serde = { version = "1.0.208", features = ["alloc", "derive", "rc"] } serde_json = { version = "1.0.125", features = ["raw_value", "unbounded_depth"] } sha1 = { version = "0.10.6", features = ["oid"] } sha2 = { version = "0.10.8", features = ["oid"] } @@ -203,7 +203,7 @@ rsa = { version = "0.9.6", features = ["serde", "sha2"] } schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } scopeguard = { version = "1.2.0" } semver = { version = "1.0.23", features = ["serde"] } -serde = { version = "1.0.207", features = ["alloc", "derive", "rc"] } +serde = { version = "1.0.208", features = ["alloc", "derive", "rc"] } serde_json = { version = "1.0.125", features = ["raw_value", "unbounded_depth"] } sha1 = { version = "0.10.6", features = ["oid"] } sha2 = { version = "0.10.8", features = ["oid"] } From 8fcd26f86398ed7732b02ed8ab584bdb489c40a7 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" 
<146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Sat, 17 Aug 2024 10:13:39 +0000 Subject: [PATCH 65/91] Update Rust crate serde_with to 3.9.0 (#6377) --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d0ac123970..4ce34b9f84 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9286,9 +9286,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.8.3" +version = "3.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e73139bc5ec2d45e6c5fd85be5a46949c1c39a4c18e56915f5eb4c12f975e377" +checksum = "69cecfa94848272156ea67b2b1a53f20fc7bc638c4a46d2f8abde08f05f4b857" dependencies = [ "base64 0.22.1", "chrono", @@ -9304,9 +9304,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.8.3" +version = "3.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b80d3d6b56b64335c0180e5ffde23b3c5e08c14c585b51a15bd0e95393f46703" +checksum = "a8fee4991ef4f274617a51ad4af30519438dacb2f56ac773b08a1922ff743350" dependencies = [ "darling", "proc-macro2", diff --git a/Cargo.toml b/Cargo.toml index 8cb1b667d7..a2b4363455 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -521,7 +521,7 @@ serde_json = "1.0.125" serde_path_to_error = "0.1.16" serde_tokenstream = "0.2" serde_urlencoded = "0.7.1" -serde_with = "3.8.3" +serde_with = "3.9.0" sha2 = "0.10.8" sha3 = "0.10.8" shell-words = "1.1.0" From b1b5572a44c9b636c49e897f1173e4f8d4f7b235 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Sat, 17 Aug 2024 10:11:22 -0700 Subject: [PATCH 66/91] Update Rust crate similar to 2.6.0 (#6378) --- Cargo.lock | 7 +++---- Cargo.toml | 2 +- workspace-hack/Cargo.toml | 10 ++++------ 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4ce34b9f84..51bf324679 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6439,7 +6439,6 @@ dependencies = [ "bit-vec", "bitflags 1.3.2", "bitflags 2.6.0", - "bstr 0.2.17", "bstr 1.9.1", "byteorder", "bytes", @@ -9435,11 +9434,11 @@ dependencies = [ [[package]] name = "similar" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa42c91313f1d05da9b26f267f931cf178d4aba455b4c4622dd7355eb80c6640" +checksum = "1de1d4f81173b03af4c0cbed3c898f6bff5b870e4a7f5d6f4057d62a7a4b686e" dependencies = [ - "bstr 0.2.17", + "bstr 1.9.1", "unicode-segmentation", ] diff --git a/Cargo.toml b/Cargo.toml index a2b4363455..a86d986e80 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -528,7 +528,7 @@ shell-words = "1.1.0" signal-hook = "0.3" signal-hook-tokio = { version = "0.3", features = [ "futures-v0_3" ] } sigpipe = "0.1.3" -similar = { version = "2.5.0", features = ["bytes"] } +similar = { version = "2.6.0", features = ["bytes"] } similar-asserts = "1.5.0" # Don't change sled's version on accident; sled's on-disk format is not yet # stable and requires manual migrations. 
In the limit this won't matter because diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 973b87c00a..fb1b94ae0f 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -27,8 +27,7 @@ bit-set = { version = "0.5.3" } bit-vec = { version = "0.6.3" } bitflags-dff4ba8e3ae991db = { package = "bitflags", version = "1.3.2" } bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.6.0", default-features = false, features = ["serde", "std"] } -bstr-6f8ce4dd05d13bba = { package = "bstr", version = "0.2.17" } -bstr-dff4ba8e3ae991db = { package = "bstr", version = "1.9.1" } +bstr = { version = "1.9.1" } byteorder = { version = "1.5.0" } bytes = { version = "1.7.1", features = ["serde"] } chrono = { version = "0.4.38", features = ["serde"] } @@ -99,7 +98,7 @@ serde = { version = "1.0.208", features = ["alloc", "derive", "rc"] } serde_json = { version = "1.0.125", features = ["raw_value", "unbounded_depth"] } sha1 = { version = "0.10.6", features = ["oid"] } sha2 = { version = "0.10.8", features = ["oid"] } -similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } +similar = { version = "2.6.0", features = ["bytes", "inline", "unicode"] } slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] } smallvec = { version = "1.13.2", default-features = false, features = ["const_new"] } spin = { version = "0.9.8" } @@ -134,8 +133,7 @@ bit-set = { version = "0.5.3" } bit-vec = { version = "0.6.3" } bitflags-dff4ba8e3ae991db = { package = "bitflags", version = "1.3.2" } bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.6.0", default-features = false, features = ["serde", "std"] } -bstr-6f8ce4dd05d13bba = { package = "bstr", version = "0.2.17" } -bstr-dff4ba8e3ae991db = { package = "bstr", version = "1.9.1" } +bstr = { version = "1.9.1" } byteorder = { version = "1.5.0" } bytes = { version = "1.7.1", features = ["serde"] } cc = { version = "1.0.97", default-features = false, features = ["parallel"] } @@ -207,7 +205,7 @@ serde = { version = "1.0.208", features = ["alloc", "derive", "rc"] } serde_json = { version = "1.0.125", features = ["raw_value", "unbounded_depth"] } sha1 = { version = "0.10.6", features = ["oid"] } sha2 = { version = "0.10.8", features = ["oid"] } -similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } +similar = { version = "2.6.0", features = ["bytes", "inline", "unicode"] } slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] } smallvec = { version = "1.13.2", default-features = false, features = ["const_new"] } spin = { version = "0.9.8" } From 24c129b40579d9f51e664f9cea114705f16444d8 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Sun, 18 Aug 2024 04:33:07 +0000 Subject: [PATCH 67/91] Update taiki-e/install-action digest to 2d7ff60 (#6381) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [taiki-e/install-action](https://togithub.com/taiki-e/install-action) | action | digest | [`4f13fb6` -> `2d7ff60`](https://togithub.com/taiki-e/install-action/compare/4f13fb6...2d7ff60) | --- ### Configuration 📅 **Schedule**: Branch creation - "after 8pm,before 6am" in timezone America/Los_Angeles, Automerge - "after 8pm,before 6am" in timezone 
America/Los_Angeles. 🚦 **Automerge**: Enabled. ♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [Renovate Bot](https://togithub.com/renovatebot/renovate). Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- .github/workflows/hakari.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index ddc4ffc021..4b4d09ba35 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -24,7 +24,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@4f13fb62448d53782828736cd5b0fd395b5f0c06 # v2 + uses: taiki-e/install-action@2d7ff60c815c5236dc38fd3909d97d6d605315d2 # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date From 4e8c731a6848a73eff1c449f6785228563dec016 Mon Sep 17 00:00:00 2001 From: Rain Date: Sat, 17 Aug 2024 21:47:19 -0700 Subject: [PATCH 68/91] [2/6] [oxql] move core types to a new oxql-types crate (#6364) Means that the Nexus external API crate doesn't have to pull in oximeter-db. I did have to expose a few mutators, but my general reasoning is that because these types derive `Deserialize`, it's always possible to make these changes by serializing into JSON and working with them there. --- Cargo.lock | 16 + Cargo.toml | 3 + nexus/Cargo.toml | 1 + nexus/src/app/metrics.rs | 4 +- nexus/src/external_api/http_entrypoints.rs | 2 +- nexus/tests/integration_tests/metrics.rs | 6 +- openapi/nexus.json | 14 +- oximeter/db/Cargo.toml | 1 + oximeter/db/src/client/mod.rs | 4 +- oximeter/db/src/client/oxql.rs | 30 +- oximeter/db/src/lib.rs | 3 +- oximeter/db/src/model.rs | 2 +- oximeter/db/src/oxql/ast/table_ops/align.rs | 35 +- oximeter/db/src/oxql/ast/table_ops/filter.rs | 47 +- .../db/src/oxql/ast/table_ops/group_by.rs | 73 ++- oximeter/db/src/oxql/ast/table_ops/join.rs | 292 +--------- oximeter/db/src/oxql/ast/table_ops/limit.rs | 108 ++-- oximeter/db/src/oxql/ast/table_ops/mod.rs | 2 +- oximeter/db/src/oxql/mod.rs | 4 - oximeter/db/src/oxql/query/mod.rs | 10 - oximeter/db/src/query.rs | 5 +- oximeter/db/src/shells/oxql.rs | 3 +- oximeter/oxql-types/Cargo.toml | 18 + oximeter/oxql-types/src/lib.rs | 23 + .../{db/src/oxql => oxql-types/src}/point.rs | 525 ++++++++++++++---- .../{db/src/oxql => oxql-types/src}/table.rs | 89 ++- oximeter/types/src/schema.rs | 2 + 27 files changed, 710 insertions(+), 612 deletions(-) create mode 100644 oximeter/oxql-types/Cargo.toml create mode 100644 oximeter/oxql-types/src/lib.rs rename oximeter/{db/src/oxql => oxql-types/src}/point.rs (82%) rename oximeter/{db/src/oxql => oxql-types/src}/table.rs (75%) diff --git a/Cargo.lock b/Cargo.lock index 51bf324679..2943a4f2c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6077,6 +6077,7 @@ dependencies = [ "oximeter-instruments", "oximeter-producer", "oxnet", + "oxql-types", "parse-display", "paste", "pem", @@ -6926,6 +6927,7 @@ dependencies = [ "omicron-workspace-hack", "oximeter", "oximeter-test-utils", + "oxql-types", "peg", "reedline", "regex", @@ -7102,6 +7104,20 @@ dependencies = [ "serde_json", ] +[[package]] +name = "oxql-types" +version = "0.1.0" +dependencies = [ + "anyhow", + "chrono", + "highway", + "num", + "omicron-workspace-hack", + "oximeter-types", + "schemars", + "serde", +] + [[package]] name 
= "p256" version = "0.13.2" diff --git a/Cargo.toml b/Cargo.toml index a86d986e80..83aea83ddf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -80,6 +80,7 @@ members = [ "oximeter/instruments", "oximeter/oximeter-macro-impl", "oximeter/oximeter", + "oximeter/oxql-types", "oximeter/producer", "oximeter/schema", "oximeter/test-utils", @@ -196,6 +197,7 @@ default-members = [ "oximeter/instruments", "oximeter/oximeter-macro-impl", "oximeter/oximeter", + "oximeter/oxql-types", "oximeter/producer", "oximeter/schema", "oximeter/test-utils", @@ -470,6 +472,7 @@ oximeter-schema = { path = "oximeter/schema" } oximeter-test-utils = { path = "oximeter/test-utils" } oximeter-timeseries-macro = { path = "oximeter/timeseries-macro" } oximeter-types = { path = "oximeter/types" } +oxql-types = { path = "oximeter/oxql-types" } p256 = "0.13" parse-display = "0.10.0" partial-io = { version = "0.5.4", features = ["proptest1", "tokio1"] } diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 8977507505..5b181c7fa0 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -57,6 +57,7 @@ openssl.workspace = true oximeter-client.workspace = true oximeter-db = { workspace = true, default-features = false, features = [ "oxql" ] } oxnet.workspace = true +oxql-types.workspace = true parse-display.workspace = true paste.workspace = true # See omicron-rpaths for more about the "pq-sys" dependency. diff --git a/nexus/src/app/metrics.rs b/nexus/src/app/metrics.rs index 3728a3bdc1..3a6e7e27be 100644 --- a/nexus/src/app/metrics.rs +++ b/nexus/src/app/metrics.rs @@ -14,7 +14,7 @@ use nexus_db_queries::{ }; use omicron_common::api::external::{Error, InternalContext}; use oximeter_db::{ - oxql, Measurement, TimeseriesSchema, TimeseriesSchemaPaginationParams, + Measurement, TimeseriesSchema, TimeseriesSchemaPaginationParams, }; use std::num::NonZeroU32; @@ -138,7 +138,7 @@ impl super::Nexus { &self, opctx: &OpContext, query: impl AsRef, - ) -> Result, Error> { + ) -> Result, Error> { // Must be a fleet user to list timeseries schema. // // TODO-security: We need to figure out how to implement proper security diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 8e8b63229b..df522f18ab 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -6386,7 +6386,7 @@ async fn timeseries_schema_list( async fn timeseries_query( rqctx: RequestContext, body: TypedBody, -) -> Result>, HttpError> { +) -> Result>, HttpError> { let apictx = rqctx.context(); let handler = async { let nexus = &apictx.context.nexus; diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 9cfa0350e8..e24de2a3ad 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -284,7 +284,7 @@ async fn test_timeseries_schema_list( pub async fn timeseries_query( cptestctx: &ControlPlaneTestContext, query: impl ToString, -) -> Vec { +) -> Vec { // first, make sure the latest timeseries have been collected. 
cptestctx.oximeter.force_collect().await; @@ -429,11 +429,11 @@ async fn test_instance_watcher_metrics( #[track_caller] fn count_state( - table: &oximeter_db::oxql::Table, + table: &oxql_types::Table, instance_id: InstanceUuid, state: &'static str, ) -> i64 { - use oximeter_db::oxql::point::ValueArray; + use oxql_types::point::ValueArray; let uuid = FieldValue::Uuid(instance_id.into_untyped_uuid()); let state = FieldValue::String(state.into()); let mut timeserieses = table.timeseries().filter(|ts| { diff --git a/openapi/nexus.json b/openapi/nexus.json index 27e2870b6e..a0cbfa2f63 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -20131,10 +20131,20 @@ "type": "object", "properties": { "metric_type": { - "$ref": "#/components/schemas/MetricType" + "description": "The type of this metric.", + "allOf": [ + { + "$ref": "#/components/schemas/MetricType" + } + ] }, "values": { - "$ref": "#/components/schemas/ValueArray" + "description": "The data values.", + "allOf": [ + { + "$ref": "#/components/schemas/ValueArray" + } + ] } }, "required": [ diff --git a/oximeter/db/Cargo.toml b/oximeter/db/Cargo.toml index 6a7cedbc22..2a9c615da2 100644 --- a/oximeter/db/Cargo.toml +++ b/oximeter/db/Cargo.toml @@ -24,6 +24,7 @@ num.workspace = true omicron-common.workspace = true omicron-workspace-hack.workspace = true oximeter.workspace = true +oxql-types.workspace = true regex.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/oximeter/db/src/client/mod.rs b/oximeter/db/src/client/mod.rs index 176e1bd5f8..c2b07ebaa6 100644 --- a/oximeter/db/src/client/mod.rs +++ b/oximeter/db/src/client/mod.rs @@ -22,8 +22,6 @@ use crate::Error; use crate::Metric; use crate::Target; use crate::Timeseries; -use crate::TimeseriesKey; -use crate::TimeseriesName; use crate::TimeseriesPageSelector; use crate::TimeseriesScanParams; use crate::TimeseriesSchema; @@ -31,7 +29,9 @@ use dropshot::EmptyScanParams; use dropshot::PaginationOrder; use dropshot::ResultsPage; use dropshot::WhichPage; +use oximeter::schema::TimeseriesKey; use oximeter::types::Sample; +use oximeter::TimeseriesName; use regex::Regex; use regex::RegexBuilder; use slog::debug; diff --git a/oximeter/db/src/client/oxql.rs b/oximeter/db/src/client/oxql.rs index 29586b8189..4005fa873e 100644 --- a/oximeter/db/src/client/oxql.rs +++ b/oximeter/db/src/client/oxql.rs @@ -18,7 +18,7 @@ use crate::query::field_table_name; use crate::Error; use crate::Metric; use crate::Target; -use crate::TimeseriesKey; +use oximeter::schema::TimeseriesKey; use oximeter::TimeseriesSchema; use slog::debug; use slog::trace; @@ -68,7 +68,7 @@ pub struct OxqlResult { pub query_summaries: Vec, /// The list of OxQL tables returned from the query. - pub tables: Vec, + pub tables: Vec, } /// The maximum number of data values fetched from the database for an OxQL @@ -479,7 +479,9 @@ impl Client { query_id, total_duration: query_start.elapsed(), query_summaries, - tables: vec![oxql::Table::new(schema.timeseries_name.as_str())], + tables: vec![oxql_types::Table::new( + schema.timeseries_name.as_str(), + )], }; return Ok(result); } @@ -503,7 +505,7 @@ impl Client { // At this point, let's construct a set of tables and run the results // through the transformation pipeline. 
- let mut tables = vec![oxql::Table::from_timeseries( + let mut tables = vec![oxql_types::Table::from_timeseries( schema.timeseries_name.as_str(), timeseries_by_key.into_values(), )?]; @@ -553,7 +555,7 @@ impl Client { limit: Option, total_rows_fetched: &mut u64, ) -> Result< - (Vec, BTreeMap), + (Vec, BTreeMap), Error, > { // We'll create timeseries for each key on the fly. To enable computing @@ -624,25 +626,25 @@ impl Client { for (key, measurements) in measurements_by_key.into_iter() { // Constuct a new timeseries, from the target/metric info. let (target, metric) = info.get(&key).unwrap(); - let mut timeseries = oxql::Timeseries::new( + let mut timeseries = oxql_types::Timeseries::new( target .fields .iter() .chain(metric.fields.iter()) .map(|field| (field.name.clone(), field.value.clone())), - oxql::point::DataType::try_from(schema.datum_type)?, + oxql_types::point::DataType::try_from(schema.datum_type)?, if schema.datum_type.is_cumulative() { - oxql::point::MetricType::Delta + oxql_types::point::MetricType::Delta } else { - oxql::point::MetricType::Gauge + oxql_types::point::MetricType::Gauge }, )?; // Covert its oximeter measurements into OxQL data types. let points = if schema.datum_type.is_cumulative() { - oxql::point::Points::delta_from_cumulative(&measurements)? + oxql_types::point::Points::delta_from_cumulative(&measurements)? } else { - oxql::point::Points::gauge_from_gauge(&measurements)? + oxql_types::point::Points::gauge_from_gauge(&measurements)? }; timeseries.points = points; debug!( @@ -1108,10 +1110,7 @@ fn update_total_rows_and_check( mod tests { use super::ConsistentKeyGroup; use crate::client::oxql::chunk_consistent_key_groups_impl; - use crate::{ - oxql::{point::Points, Table, Timeseries}, - Client, DbWrite, - }; + use crate::{Client, DbWrite}; use crate::{Metric, Target}; use chrono::{DateTime, Utc}; use dropshot::test_util::LogContext; @@ -1119,6 +1118,7 @@ mod tests { use omicron_test_utils::dev::test_setup_log; use oximeter::{types::Cumulative, FieldValue}; use oximeter::{DatumType, Sample}; + use oxql_types::{point::Points, Table, Timeseries}; use std::collections::BTreeMap; use std::time::Duration; diff --git a/oximeter/db/src/lib.rs b/oximeter/db/src/lib.rs index 9ad382c97d..5d56d802c9 100644 --- a/oximeter/db/src/lib.rs +++ b/oximeter/db/src/lib.rs @@ -14,6 +14,7 @@ use dropshot::EmptyScanParams; use dropshot::PaginationParams; pub use oximeter::schema::FieldSchema; pub use oximeter::schema::FieldSource; +use oximeter::schema::TimeseriesKey; pub use oximeter::schema::TimeseriesName; pub use oximeter::schema::TimeseriesSchema; pub use oximeter::DatumType; @@ -267,8 +268,6 @@ pub async fn make_client( Ok(client) } -pub(crate) type TimeseriesKey = u64; - // TODO-cleanup: Add the timeseries version in to the computation of the key. // This will require a full drop of the database, since we're changing the // sorting key and the timeseries key on each past sample. 
See diff --git a/oximeter/db/src/model.rs b/oximeter/db/src/model.rs index 3e34ad10e3..986bf00225 100644 --- a/oximeter/db/src/model.rs +++ b/oximeter/db/src/model.rs @@ -11,13 +11,13 @@ use crate::FieldSchema; use crate::FieldSource; use crate::Metric; use crate::Target; -use crate::TimeseriesKey; use crate::TimeseriesSchema; use bytes::Bytes; use chrono::DateTime; use chrono::Utc; use num::traits::Zero; use oximeter::histogram::Histogram; +use oximeter::schema::TimeseriesKey; use oximeter::traits; use oximeter::types::Cumulative; use oximeter::types::Datum; diff --git a/oximeter/db/src/oxql/ast/table_ops/align.rs b/oximeter/db/src/oxql/ast/table_ops/align.rs index cf54ebc312..b0cd7d80f1 100644 --- a/oximeter/db/src/oxql/ast/table_ops/align.rs +++ b/oximeter/db/src/oxql/ast/table_ops/align.rs @@ -6,19 +6,19 @@ // Copyright 2024 Oxide Computer Company -use crate::oxql::point::DataType; -use crate::oxql::point::MetricType; -use crate::oxql::point::Points; -use crate::oxql::point::ValueArray; -use crate::oxql::point::Values; -use crate::oxql::query::Alignment; -use crate::oxql::Error; -use crate::oxql::Table; -use crate::oxql::Timeseries; use anyhow::Context; +use anyhow::Error; use chrono::DateTime; use chrono::TimeDelta; use chrono::Utc; +use oxql_types::point::DataType; +use oxql_types::point::MetricType; +use oxql_types::point::Points; +use oxql_types::point::ValueArray; +use oxql_types::point::Values; +use oxql_types::Alignment; +use oxql_types::Table; +use oxql_types::Timeseries; use std::time::Duration; // The maximum factor by which an alignment operation may upsample data. @@ -144,7 +144,7 @@ fn align_mean_within( "Alignment by mean requires a gauge or delta metric, not {}", metric_type, ); - verify_max_upsampling_ratio(&points.timestamps, &period)?; + verify_max_upsampling_ratio(points.timestamps(), &period)?; // Always convert the output to doubles, when computing the mean. The // output is always a gauge, so we do not need the start times of the @@ -179,7 +179,7 @@ fn align_mean_within( // - Compute the mean of those. let period_ = TimeDelta::from_std(*period).context("time delta out of range")?; - let first_timestamp = points.timestamps[0]; + let first_timestamp = points.timestamps()[0]; let mut ix: u32 = 0; loop { // Compute the next output timestamp, by shifting the query end time @@ -220,15 +220,15 @@ fn align_mean_within( // entries. 
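// Editor's worked example (hypothetical values, not from the original
// source): with `period` = 10s and a query end time of t = 100s, the loop
// above produces output timestamps at t = 100s, 90s, 80s, and so on. A
// gauge with samples 4.0 at t = 93s and 6.0 at t = 97s contributes their
// mean, 5.0, to the window ending at t = 100s.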
let output_value = if matches!(metric_type, MetricType::Gauge) { mean_gauge_value_in_window( - &points.timestamps, + points.timestamps(), &input_points, window_start, output_time, ) } else { mean_delta_value_in_window( - points.start_times.as_ref().unwrap(), - &points.timestamps, + points.start_times().unwrap(), + points.timestamps(), &input_points, window_start, output_time, @@ -255,10 +255,9 @@ fn align_mean_within( ValueArray::Double(output_values.into_iter().rev().collect()); let timestamps = output_timestamps.into_iter().rev().collect(); let values = Values { values, metric_type: MetricType::Gauge }; - new_timeseries.points = - Points { start_times: None, timestamps, values: vec![values] }; - new_timeseries.alignment = - Some(Alignment { end_time: *query_end, period: *period }); + new_timeseries.points = Points::new(None, timestamps, vec![values]); + new_timeseries + .set_alignment(Alignment { end_time: *query_end, period: *period }); output_table.insert(new_timeseries).unwrap(); } Ok(output_table) diff --git a/oximeter/db/src/oxql/ast/table_ops/filter.rs b/oximeter/db/src/oxql/ast/table_ops/filter.rs index b6fc533e4d..ad398da983 100644 --- a/oximeter/db/src/oxql/ast/table_ops/filter.rs +++ b/oximeter/db/src/oxql/ast/table_ops/filter.rs @@ -12,18 +12,18 @@ use crate::oxql::ast::literal::Literal; use crate::oxql::ast::logical_op::LogicalOp; use crate::oxql::ast::table_ops::limit::Limit; use crate::oxql::ast::table_ops::limit::LimitKind; -use crate::oxql::point::DataType; -use crate::oxql::point::MetricType; -use crate::oxql::point::Points; -use crate::oxql::point::ValueArray; use crate::oxql::Error; -use crate::oxql::Table; -use crate::oxql::Timeseries; use crate::shells::special_idents; use chrono::DateTime; use chrono::Utc; use oximeter::FieldType; use oximeter::FieldValue; +use oxql_types::point::DataType; +use oxql_types::point::MetricType; +use oxql_types::point::Points; +use oxql_types::point::ValueArray; +use oxql_types::Table; +use oxql_types::Timeseries; use regex::Regex; use std::collections::BTreeSet; use std::fmt; @@ -340,16 +340,13 @@ impl Filter { // Apply the filter to the data points as well. let points = self.filter_points(&input.points)?; - // Similar to above, if the filter removes all data points in - // the timeseries, let's remove the timeseries altogether. - if points.is_empty() { - continue; + if let Some(new_timeseries) = input.copy_with_points(points) { + timeseries.push(new_timeseries); + } else { + // None means that the filter removed all data points in + // the timeseries. In that case, we remove the timeseries + // altogether. 
} - timeseries.push(Timeseries { - fields: input.fields.clone(), - points, - alignment: input.alignment, - }) } output_tables.push(Table::from_timeseries( table.name(), @@ -823,7 +820,7 @@ impl SimpleFilter { ) -> Result, Error> { let ident = self.ident.as_str(); if ident == "timestamp" { - self.filter_points_by_timestamp(negated, &points.timestamps) + self.filter_points_by_timestamp(negated, points.timestamps()) } else if ident == "datum" { anyhow::ensure!( points.dimensionality() == 1, @@ -1151,15 +1148,15 @@ impl SimpleFilter { mod tests { use crate::oxql::ast::grammar::query_parser; use crate::oxql::ast::logical_op::LogicalOp; - use crate::oxql::point::DataType; - use crate::oxql::point::MetricType; - use crate::oxql::point::Points; - use crate::oxql::point::ValueArray; - use crate::oxql::point::Values; - use crate::oxql::Table; - use crate::oxql::Timeseries; use chrono::Utc; use oximeter::FieldValue; + use oxql_types::point::DataType; + use oxql_types::point::MetricType; + use oxql_types::point::Points; + use oxql_types::point::ValueArray; + use oxql_types::point::Values; + use oxql_types::Table; + use oxql_types::Timeseries; use std::time::Duration; use uuid::Uuid; @@ -1172,7 +1169,7 @@ mod tests { values: ValueArray::Double(vec![Some(0.0), Some(2.0)]), metric_type: MetricType::Gauge, }]; - let points = Points { start_times, timestamps, values }; + let points = Points::new(start_times, timestamps, values); // This filter should remove the first point based on its timestamp. let t = Utc::now() + Duration::from_secs(10); @@ -1205,7 +1202,7 @@ mod tests { values: ValueArray::Double(vec![Some(0.0), Some(2.0)]), metric_type: MetricType::Gauge, }]; - let points = Points { start_times, timestamps, values }; + let points = Points::new(start_times, timestamps, values); let filter = query_parser::filter("filter datum < \"something\"").unwrap(); diff --git a/oximeter/db/src/oxql/ast/table_ops/group_by.rs b/oximeter/db/src/oxql/ast/table_ops/group_by.rs index f40572d762..c48804a788 100644 --- a/oximeter/db/src/oxql/ast/table_ops/group_by.rs +++ b/oximeter/db/src/oxql/ast/table_ops/group_by.rs @@ -10,13 +10,13 @@ use chrono::DateTime; use chrono::Utc; use crate::oxql::ast::ident::Ident; -use crate::oxql::point::DataType; -use crate::oxql::point::MetricType; -use crate::oxql::point::ValueArray; -use crate::oxql::Error; -use crate::oxql::Table; -use crate::oxql::Timeseries; -use crate::TimeseriesKey; +use anyhow::Error; +use oximeter::schema::TimeseriesKey; +use oxql_types::point::DataType; +use oxql_types::point::MetricType; +use oxql_types::point::ValueArray; +use oxql_types::Table; +use oxql_types::Timeseries; use std::collections::btree_map::Entry; use std::collections::BTreeMap; @@ -98,7 +98,7 @@ impl GroupBy { ValueArray::Double(new_values), ValueArray::Double(existing_values), ) => { - let new_timestamps = &dropped.points.timestamps; + let new_timestamps = dropped.points.timestamps(); // We will be merging the new data with the // existing, but borrow-checking limits the degree @@ -106,7 +106,7 @@ impl GroupBy { // entry in the output table. Instead, aggregate // everything into a copy of the expected data. let mut timestamps = - existing.points.timestamps.clone(); + existing.points.timestamps().to_owned(); let mut values = existing_values.clone(); // Merge in the new values, so long as they actually @@ -152,10 +152,7 @@ impl GroupBy { // Replace the existing output timeseries's // timestamps and data arrays. 
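// Editor's note: `set_timestamps` below is one of the newly exposed
// mutators mentioned in this patch's commit message. A hypothetical
// sketch of the argument made there -- because these point types
// derive `Deserialize`, an equivalent edit is always possible via a
// JSON round-trip, so the mutator adds convenience, not new power:
//
//     let mut v = serde_json::to_value(&points)?;
//     v["timestamps"] = serde_json::json!([/* replacement timestamps */]);
//     let points: Points = serde_json::from_value(v)?;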
- std::mem::swap( - &mut existing.points.timestamps, - &mut timestamps, - ); + existing.points.set_timestamps(timestamps); existing .points .values_mut(0) @@ -166,7 +163,7 @@ impl GroupBy { ValueArray::Integer(new_values), ValueArray::Integer(existing_values), ) => { - let new_timestamps = &dropped.points.timestamps; + let new_timestamps = dropped.points.timestamps(); // We will be merging the new data with the // existing, but borrow-checking limits the degree @@ -174,7 +171,7 @@ impl GroupBy { // entry in the output table. Instead, aggregate // everything into a copy of the expected data. let mut timestamps = - existing.points.timestamps.clone(); + existing.points.timestamps().to_owned(); let mut values = existing_values.clone(); // Merge in the new values, so long as they actually @@ -220,10 +217,7 @@ impl GroupBy { // Replace the existing output timeseries's // timestamps and data arrays. - std::mem::swap( - &mut existing.points.timestamps, - &mut timestamps, - ); + existing.points.set_timestamps(timestamps); existing .points .values_mut(0) @@ -286,14 +280,15 @@ impl GroupBy { else { unreachable!(); }; - let new_timestamps = &new_points.timestamps; + let new_timestamps = new_points.timestamps(); // We will be merging the new data with the // existing, but borrow-checking limits the degree // to which we can easily do this on the `existing` // entry in the output table. Instead, aggregate // everything into a copy of the expected data. - let mut timestamps = existing.points.timestamps.clone(); + let mut timestamps = + existing.points.timestamps().to_owned(); let mut values = existing .points .values(0) @@ -360,10 +355,7 @@ impl GroupBy { // Replace the existing output timeseries's // timestamps and data arrays. - std::mem::swap( - &mut existing.points.timestamps, - &mut timestamps, - ); + existing.points.set_timestamps(timestamps); existing .points .values_mut(0) @@ -388,7 +380,7 @@ impl GroupBy { // _zero_ for any where the values are none. let counts = new_timeseries .points - .timestamps + .timestamps() .iter() .zip(values) .map(|(timestamp, maybe_value)| { @@ -434,16 +426,16 @@ pub enum Reducer { #[cfg(test)] mod tests { use super::{GroupBy, Reducer}; - use crate::oxql::{ - ast::{ - ident::Ident, - table_ops::align::{Align, AlignmentMethod}, - }, - point::{DataType, MetricType, ValueArray}, - Table, Timeseries, + use crate::oxql::ast::{ + ident::Ident, + table_ops::align::{Align, AlignmentMethod}, }; use chrono::{DateTime, Utc}; use oximeter::FieldValue; + use oxql_types::{ + point::{DataType, MetricType, ValueArray}, + Table, Timeseries, + }; use std::{collections::BTreeMap, time::Duration}; // Which timeseries the second data point is missing from. @@ -495,8 +487,8 @@ mod tests { MetricType::Gauge, ) .unwrap(); - ts0.points.start_times = None; - ts0.points.timestamps.clone_from(×tamps); + ts0.points.clear_start_times(); + ts0.points.set_timestamps(timestamps.clone()); *ts0.points.values_mut(0).unwrap() = ValueArray::Double(vec![ Some(1.0), if matches!( @@ -527,7 +519,7 @@ mod tests { MetricType::Gauge, ) .unwrap(); - ts1.points.start_times = None; + ts1.points.clear_start_times(); // Non-overlapping in this test setup means that we just shift one // value from this array backward in time by one additional second. @@ -538,7 +530,7 @@ mod tests { // // When reducing, t0 is never changed, and t1-t2 are always reduced // together, if the values are present. 
- ts1.points.timestamps = if cfg.overlapping_times { + let new_timestamps = if cfg.overlapping_times { timestamps.clone() } else { let mut new_timestamps = timestamps.clone(); @@ -546,6 +538,7 @@ mod tests { timestamps.insert(0, new_timestamps[0]); new_timestamps }; + ts1.points.set_timestamps(new_timestamps); *ts1.points.values_mut(0).unwrap() = ValueArray::Double(vec![ Some(2.0), if matches!(cfg.missing_value, MissingValue::Both) { @@ -604,11 +597,13 @@ mod tests { let points = &grouped_timeseries.points; assert_eq!(points.dimensionality(), 1, "Points should still be 1D"); assert_eq!( - points.start_times, None, + points.start_times(), + None, "Points should not have start times" ); assert_eq!( - points.timestamps, test.timestamps, + points.timestamps(), + test.timestamps, "Points do not have correct timestamps" ); diff --git a/oximeter/db/src/oxql/ast/table_ops/join.rs b/oximeter/db/src/oxql/ast/table_ops/join.rs index 3c150a4acf..2893f6cf3e 100644 --- a/oximeter/db/src/oxql/ast/table_ops/join.rs +++ b/oximeter/db/src/oxql/ast/table_ops/join.rs @@ -6,12 +6,10 @@ // Copyright 2024 Oxide Computer Company -use crate::oxql::point::MetricType; -use crate::oxql::point::Points; -use crate::oxql::point::Values; -use crate::oxql::Error; -use crate::oxql::Table; use anyhow::Context; +use anyhow::Error; +use oxql_types::point::MetricType; +use oxql_types::Table; /// An AST node for a natural inner join. #[derive(Clone, Copy, Debug, PartialEq)] @@ -80,10 +78,8 @@ impl Join { // 1. They have the same alignment, and // 2. We merge the timepoints rather than simply creating a // ragged array of points. - timeseries.points = inner_join_point_arrays( - ×eries.points, - &next_timeseries.points, - )?; + timeseries.points = + timeseries.points.inner_join(&next_timeseries.points)?; } // We'll also update the name, to indicate the joined data. out.name.push(','); @@ -93,101 +89,6 @@ impl Join { } } -// Given two arrays of points, stack them together at matching timepoints. -// -// For time points in either which do not have a corresponding point in the -// other, the entire time point is elided. -fn inner_join_point_arrays( - left: &Points, - right: &Points, -) -> Result { - // Create an output array with roughly the right capacity, and double the - // number of dimensions. We're trying to stack output value arrays together - // along the dimension axis. - let data_types = - left.data_types().chain(right.data_types()).collect::>(); - let metric_types = - left.metric_types().chain(right.metric_types()).collect::>(); - let mut out = Points::with_capacity( - left.len().max(right.len()), - data_types.iter().copied(), - metric_types.iter().copied(), - )?; - - // Iterate through each array until one is exhausted. We're only inserting - // values from both arrays where the timestamps actually match, since this - // is an inner join. We may want to insert missing values where timestamps - // do not match on either side, when we support an outer join of some kind. 
- let n_left_dim = left.values.len(); - let mut left_ix = 0; - let mut right_ix = 0; - while left_ix < left.len() && right_ix < right.len() { - let left_timestamp = left.timestamps[left_ix]; - let right_timestamp = right.timestamps[right_ix]; - if left_timestamp == right_timestamp { - out.timestamps.push(left_timestamp); - push_concrete_values( - &mut out.values[..n_left_dim], - &left.values, - left_ix, - ); - push_concrete_values( - &mut out.values[n_left_dim..], - &right.values, - right_ix, - ); - left_ix += 1; - right_ix += 1; - } else if left_timestamp < right_timestamp { - left_ix += 1; - } else { - right_ix += 1; - } - } - Ok(out) -} - -// Push the `i`th value from each dimension of `from` onto `to`. -fn push_concrete_values(to: &mut [Values], from: &[Values], i: usize) { - assert_eq!(to.len(), from.len()); - for (output, input) in to.iter_mut().zip(from.iter()) { - let input_array = &input.values; - let output_array = &mut output.values; - assert_eq!(input_array.data_type(), output_array.data_type()); - if let Ok(ints) = input_array.as_integer() { - output_array.as_integer_mut().unwrap().push(ints[i]); - continue; - } - if let Ok(doubles) = input_array.as_double() { - output_array.as_double_mut().unwrap().push(doubles[i]); - continue; - } - if let Ok(bools) = input_array.as_boolean() { - output_array.as_boolean_mut().unwrap().push(bools[i]); - continue; - } - if let Ok(strings) = input_array.as_string() { - output_array.as_string_mut().unwrap().push(strings[i].clone()); - continue; - } - if let Ok(dists) = input_array.as_integer_distribution() { - output_array - .as_integer_distribution_mut() - .unwrap() - .push(dists[i].clone()); - continue; - } - if let Ok(dists) = input_array.as_double_distribution() { - output_array - .as_double_distribution_mut() - .unwrap() - .push(dists[i].clone()); - continue; - } - unreachable!(); - } -} - // Return an error if any metric types are not suitable for joining. fn ensure_all_metric_types( mut metric_types: impl ExactSizeIterator, @@ -200,186 +101,3 @@ fn ensure_all_metric_types( ); Ok(()) } - -#[cfg(test)] -mod tests { - use super::*; - use crate::oxql::point::DataType; - use crate::oxql::point::Datum; - use crate::oxql::point::ValueArray; - use chrono::Utc; - use std::time::Duration; - - #[test] - fn test_push_concrete_values() { - let mut points = Points::with_capacity( - 2, - [DataType::Integer, DataType::Double].into_iter(), - [MetricType::Gauge, MetricType::Gauge].into_iter(), - ) - .unwrap(); - - // Push a concrete value for the integer dimension - let from_ints = vec![Values { - values: ValueArray::Integer(vec![Some(1)]), - metric_type: MetricType::Gauge, - }]; - push_concrete_values(&mut points.values[..1], &from_ints, 0); - - // And another for the double dimension. 
- let from_doubles = vec![Values { - values: ValueArray::Double(vec![Some(2.0)]), - metric_type: MetricType::Gauge, - }]; - push_concrete_values(&mut points.values[1..], &from_doubles, 0); - - assert_eq!( - points.dimensionality(), - 2, - "Points should have 2 dimensions", - ); - let ints = points.values[0].values.as_integer().unwrap(); - assert_eq!( - ints.len(), - 1, - "Should have pushed one point in the first dimension" - ); - assert_eq!( - ints[0], - Some(1), - "Should have pushed 1 onto the first dimension" - ); - let doubles = points.values[1].values.as_double().unwrap(); - assert_eq!( - doubles.len(), - 1, - "Should have pushed one point in the second dimension" - ); - assert_eq!( - doubles[0], - Some(2.0), - "Should have pushed 2.0 onto the second dimension" - ); - } - - #[test] - fn test_join_point_arrays() { - let now = Utc::now(); - - // Create a set of integer points to join with. - // - // This will have two timestamps, one of which will match the points - // below that are merged in. - let int_points = Points { - start_times: None, - timestamps: vec![ - now - Duration::from_secs(3), - now - Duration::from_secs(2), - now, - ], - values: vec![Values { - values: ValueArray::Integer(vec![Some(1), Some(2), Some(3)]), - metric_type: MetricType::Gauge, - }], - }; - - // Create an additional set of double points. - // - // This also has two timepoints, one of which matches with the above, - // and one of which does not. - let double_points = Points { - start_times: None, - timestamps: vec![ - now - Duration::from_secs(3), - now - Duration::from_secs(1), - now, - ], - values: vec![Values { - values: ValueArray::Double(vec![ - Some(4.0), - Some(5.0), - Some(6.0), - ]), - metric_type: MetricType::Gauge, - }], - }; - - // Merge the arrays. - let merged = - inner_join_point_arrays(&int_points, &double_points).unwrap(); - - // Basic checks that we merged in the right values and have the right - // types and dimensions. - assert_eq!( - merged.dimensionality(), - 2, - "Should have appended the dimensions from each input array" - ); - assert_eq!(merged.len(), 2, "Should have merged two common points",); - assert_eq!( - merged.data_types().collect::>(), - &[DataType::Integer, DataType::Double], - "Should have combined the data types of the input arrays" - ); - assert_eq!( - merged.metric_types().collect::>(), - &[MetricType::Gauge, MetricType::Gauge], - "Should have combined the metric types of the input arrays" - ); - - // Check the actual values of the array. - let mut points = merged.iter_points(); - - // The first and last timepoint overlapped between the two arrays, so we - // should have both of them as concrete samples. - let pt = points.next().unwrap(); - assert_eq!(pt.start_time, None, "Gauges don't have a start time"); - assert_eq!( - *pt.timestamp, int_points.timestamps[0], - "Should have taken the first input timestamp from both arrays", - ); - assert_eq!( - *pt.timestamp, double_points.timestamps[0], - "Should have taken the first input timestamp from both arrays", - ); - let values = pt.values; - assert_eq!(values.len(), 2, "Should have 2 dimensions"); - assert_eq!( - &values[0], - &(Datum::Integer(Some(&1)), MetricType::Gauge), - "Should have pulled value from first integer array." - ); - assert_eq!( - &values[1], - &(Datum::Double(Some(&4.0)), MetricType::Gauge), - "Should have pulled value from second double array." 
- ); - - // And the next point - let pt = points.next().unwrap(); - assert_eq!(pt.start_time, None, "Gauges don't have a start time"); - assert_eq!( - *pt.timestamp, int_points.timestamps[2], - "Should have taken the input timestamp from both arrays", - ); - assert_eq!( - *pt.timestamp, double_points.timestamps[2], - "Should have taken the input timestamp from both arrays", - ); - let values = pt.values; - assert_eq!(values.len(), 2, "Should have 2 dimensions"); - assert_eq!( - &values[0], - &(Datum::Integer(Some(&3)), MetricType::Gauge), - "Should have pulled value from first integer array." - ); - assert_eq!( - &values[1], - &(Datum::Double(Some(&6.0)), MetricType::Gauge), - "Should have pulled value from second double array." - ); - - // And there should be no other values. - assert!(points.next().is_none(), "There should be no more points"); - } -} diff --git a/oximeter/db/src/oxql/ast/table_ops/limit.rs b/oximeter/db/src/oxql/ast/table_ops/limit.rs index 0205868f5c..89afb31a7c 100644 --- a/oximeter/db/src/oxql/ast/table_ops/limit.rs +++ b/oximeter/db/src/oxql/ast/table_ops/limit.rs @@ -6,12 +6,8 @@ // Copyright 2024 Oxide Computer Company -use crate::oxql::point::Points; -use crate::oxql::point::ValueArray; -use crate::oxql::point::Values; -use crate::oxql::Error; -use crate::oxql::Table; -use crate::oxql::Timeseries; +use anyhow::Error; +use oxql_types::Table; use std::num::NonZeroUsize; /// The kind of limiting operation @@ -65,58 +61,7 @@ impl Limit { } }; - // Slice the various data arrays. - let start_times = input_points - .start_times - .as_ref() - .map(|s| s[start..end].to_vec()); - let timestamps = - input_points.timestamps[start..end].to_vec(); - let values = input_points - .values - .iter() - .map(|vals| { - let values = match &vals.values { - ValueArray::Integer(inner) => { - ValueArray::Integer( - inner[start..end].to_vec(), - ) - } - ValueArray::Double(inner) => { - ValueArray::Double( - inner[start..end].to_vec(), - ) - } - ValueArray::Boolean(inner) => { - ValueArray::Boolean( - inner[start..end].to_vec(), - ) - } - ValueArray::String(inner) => { - ValueArray::String( - inner[start..end].to_vec(), - ) - } - ValueArray::IntegerDistribution(inner) => { - ValueArray::IntegerDistribution( - inner[start..end].to_vec(), - ) - } - ValueArray::DoubleDistribution(inner) => { - ValueArray::DoubleDistribution( - inner[start..end].to_vec(), - ) - } - }; - Values { values, metric_type: vals.metric_type } - }) - .collect(); - let points = Points { start_times, timestamps, values }; - Timeseries { - fields: timeseries.fields.clone(), - points, - alignment: timeseries.alignment, - } + timeseries.limit(start, end) }); Table::from_timeseries(table.name(), timeseries) }) @@ -127,9 +72,12 @@ impl Limit { #[cfg(test)] mod tests { use super::*; - use crate::oxql::point::{DataType, MetricType}; use chrono::Utc; use oximeter::FieldValue; + use oxql_types::{ + point::{DataType, MetricType}, + Timeseries, + }; use std::{collections::BTreeMap, time::Duration}; fn test_tables() -> Vec { @@ -150,12 +98,14 @@ mod tests { MetricType::Gauge, ) .unwrap(); - timeseries.points.timestamps.clone_from(×tamps); - timeseries.points.values[0].values.as_integer_mut().unwrap().extend([ - Some(1), - Some(2), - Some(3), - ]); + timeseries.points.set_timestamps(timestamps.clone()); + timeseries + .points + .values_mut(0) + .unwrap() + .as_integer_mut() + .unwrap() + .extend([Some(1), Some(2), Some(3)]); let table1 = Table::from_timeseries("first", std::iter::once(timeseries)) .unwrap(); @@ -166,12 +116,14 @@ mod 
tests { MetricType::Gauge, ) .unwrap(); - timeseries.points.timestamps.clone_from(×tamps); - timeseries.points.values[0].values.as_integer_mut().unwrap().extend([ - Some(4), - Some(5), - Some(6), - ]); + timeseries.points.set_timestamps(timestamps.clone()); + timeseries + .points + .values_mut(0) + .unwrap() + .as_integer_mut() + .unwrap() + .extend([Some(4), Some(5), Some(6)]); let table2 = Table::from_timeseries("second", std::iter::once(timeseries)) .unwrap(); @@ -223,7 +175,8 @@ mod tests { "Limited table should have the same fields" ); assert_eq!( - timeseries.alignment, limited_timeseries.alignment, + timeseries.alignment(), + limited_timeseries.alignment(), "Limited timeseries should have the same alignment" ); assert_eq!( @@ -237,14 +190,15 @@ mod tests { // These depend on the limit operation. let points = ×eries.points; let limited_points = &limited_timeseries.points; - assert_eq!(points.start_times, limited_points.start_times); + assert_eq!(points.start_times(), limited_points.start_times()); assert_eq!( - points.timestamps[start..end], - limited_points.timestamps + &points.timestamps()[start..end], + limited_points.timestamps() ); assert_eq!( - limited_points.values[0].values.as_integer().unwrap(), - &points.values[0].values.as_integer().unwrap()[start..end], + limited_points.values(0).unwrap().as_integer().unwrap(), + &points.values(0).unwrap().as_integer().unwrap() + [start..end], "Points should be limited to [{start}..{end}]", ); } diff --git a/oximeter/db/src/oxql/ast/table_ops/mod.rs b/oximeter/db/src/oxql/ast/table_ops/mod.rs index 46f5106a08..8b8d4cbe1b 100644 --- a/oximeter/db/src/oxql/ast/table_ops/mod.rs +++ b/oximeter/db/src/oxql/ast/table_ops/mod.rs @@ -20,10 +20,10 @@ use self::join::Join; use self::limit::Limit; use crate::oxql::ast::Query; use crate::oxql::Error; -use crate::oxql::Table; use chrono::DateTime; use chrono::Utc; use oximeter::TimeseriesName; +use oxql_types::Table; /// A basic table operation, the atoms of an OxQL query. #[derive(Clone, Debug, PartialEq)] diff --git a/oximeter/db/src/oxql/mod.rs b/oximeter/db/src/oxql/mod.rs index 3961fae1cc..fcdfb783c5 100644 --- a/oximeter/db/src/oxql/mod.rs +++ b/oximeter/db/src/oxql/mod.rs @@ -10,13 +10,9 @@ use peg::error::ParseError as PegError; use peg::str::LineCol; pub mod ast; -pub mod point; pub mod query; -pub mod table; pub use self::query::Query; -pub use self::table::Table; -pub use self::table::Timeseries; pub use anyhow::Error; /// Format a PEG parsing error into a nice anyhow error. diff --git a/oximeter/db/src/oxql/query/mod.rs b/oximeter/db/src/oxql/query/mod.rs index e1fada9f2a..46c9bbc92c 100644 --- a/oximeter/db/src/oxql/query/mod.rs +++ b/oximeter/db/src/oxql/query/mod.rs @@ -23,7 +23,6 @@ use crate::oxql::Error; use crate::TimeseriesName; use chrono::DateTime; use chrono::Utc; -use std::time::Duration; /// A parsed OxQL query. #[derive(Clone, Debug, PartialEq)] @@ -391,15 +390,6 @@ fn restrict_filter_idents( } } -/// Describes the time alignment for an OxQL query. -#[derive(Clone, Copy, Debug, PartialEq)] -pub struct Alignment { - /// The end time of the query, which the temporal reference point. - pub end_time: DateTime, - /// The alignment period, the interval on which values are produced. 
- pub period: Duration, -} - #[cfg(test)] mod tests { use super::Filter; diff --git a/oximeter/db/src/query.rs b/oximeter/db/src/query.rs index ceabf00888..556ced0437 100644 --- a/oximeter/db/src/query.rs +++ b/oximeter/db/src/query.rs @@ -6,11 +6,12 @@ // Copyright 2021 Oxide Computer Company use crate::{ - Error, FieldSchema, FieldSource, TimeseriesKey, TimeseriesSchema, - DATABASE_NAME, DATABASE_SELECT_FORMAT, + Error, FieldSchema, FieldSource, TimeseriesSchema, DATABASE_NAME, + DATABASE_SELECT_FORMAT, }; use chrono::{DateTime, Utc}; use dropshot::PaginationOrder; +use oximeter::schema::TimeseriesKey; use oximeter::types::{DatumType, FieldType, FieldValue}; use oximeter::{Metric, Target}; use regex::Regex; diff --git a/oximeter/db/src/shells/oxql.rs b/oximeter/db/src/shells/oxql.rs index 0f23ea7d64..f46d08c0cf 100644 --- a/oximeter/db/src/shells/oxql.rs +++ b/oximeter/db/src/shells/oxql.rs @@ -7,9 +7,10 @@ // Copyright 2024 Oxide Computer use super::{list_timeseries, prepare_columns}; -use crate::{make_client, oxql::Table, Client, OxqlResult}; +use crate::{make_client, Client, OxqlResult}; use clap::Args; use crossterm::style::Stylize; +use oxql_types::Table; use reedline::DefaultPrompt; use reedline::DefaultPromptSegment; use reedline::Reedline; diff --git a/oximeter/oxql-types/Cargo.toml b/oximeter/oxql-types/Cargo.toml new file mode 100644 index 0000000000..da7c7bcd1c --- /dev/null +++ b/oximeter/oxql-types/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "oxql-types" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +anyhow.workspace = true +chrono.workspace = true +highway.workspace = true +num.workspace = true +omicron-workspace-hack.workspace = true +oximeter-types.workspace = true +schemars.workspace = true +serde.workspace = true diff --git a/oximeter/oxql-types/src/lib.rs b/oximeter/oxql-types/src/lib.rs new file mode 100644 index 0000000000..00468705a9 --- /dev/null +++ b/oximeter/oxql-types/src/lib.rs @@ -0,0 +1,23 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Core types for OxQL. + +use chrono::{DateTime, Utc}; +use std::time::Duration; + +pub mod point; +pub mod table; + +pub use self::table::Table; +pub use self::table::Timeseries; + +/// Describes the time alignment for an OxQL query. +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct Alignment { + /// The end time of the query, which is the temporal reference point. + pub end_time: DateTime<Utc>, + /// The alignment period, the interval on which values are produced.
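// For example (hypothetical values, semantics assumed from the `align`
// table op): with `period = 60s`, aligned output points fall at `end_time`,
// `end_time - 60s`, `end_time - 120s`, and so on back through the query
// window — a sketch of that grid:
fn nth_aligned_time(
    alignment: &oxql_types::Alignment,
    n: u32,
) -> chrono::DateTime<chrono::Utc> {
    // Offsets count backward from the query end time.
    alignment.end_time - alignment.period * n
}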
+ pub period: Duration, +} diff --git a/oximeter/db/src/oxql/point.rs b/oximeter/oxql-types/src/point.rs similarity index 82% rename from oximeter/db/src/oxql/point.rs rename to oximeter/oxql-types/src/point.rs index e04193e8b8..6e3c7143dc 100644 --- a/oximeter/db/src/oxql/point.rs +++ b/oximeter/oxql-types/src/point.rs @@ -6,15 +6,15 @@ // Copyright 2024 Oxide Computer Company -use super::Error; use anyhow::Context; +use anyhow::Error; use chrono::DateTime; use chrono::Utc; use num::ToPrimitive; -use oximeter::traits::HistogramSupport; -use oximeter::DatumType; -use oximeter::Measurement; -use oximeter::Quantile; +use oximeter_types::traits::HistogramSupport; +use oximeter_types::DatumType; +use oximeter_types::Measurement; +use oximeter_types::Quantile; use schemars::JsonSchema; use serde::Deserialize; use serde::Serialize; @@ -131,32 +131,32 @@ impl CumulativeDatum { // not cumulative. fn from_cumulative(meas: &Measurement) -> Result { let datum = match meas.datum() { - oximeter::Datum::CumulativeI64(val) => { + oximeter_types::Datum::CumulativeI64(val) => { CumulativeDatum::Integer(val.value()) } - oximeter::Datum::CumulativeU64(val) => { + oximeter_types::Datum::CumulativeU64(val) => { let int = val .value() .try_into() .context("Overflow converting u64 to i64")?; CumulativeDatum::Integer(int) } - oximeter::Datum::CumulativeF32(val) => { + oximeter_types::Datum::CumulativeF32(val) => { CumulativeDatum::Double(val.value().into()) } - oximeter::Datum::CumulativeF64(val) => { + oximeter_types::Datum::CumulativeF64(val) => { CumulativeDatum::Double(val.value()) } - oximeter::Datum::HistogramI8(hist) => hist.into(), - oximeter::Datum::HistogramU8(hist) => hist.into(), - oximeter::Datum::HistogramI16(hist) => hist.into(), - oximeter::Datum::HistogramU16(hist) => hist.into(), - oximeter::Datum::HistogramI32(hist) => hist.into(), - oximeter::Datum::HistogramU32(hist) => hist.into(), - oximeter::Datum::HistogramI64(hist) => hist.into(), - oximeter::Datum::HistogramU64(hist) => hist.try_into()?, - oximeter::Datum::HistogramF32(hist) => hist.into(), - oximeter::Datum::HistogramF64(hist) => hist.into(), + oximeter_types::Datum::HistogramI8(hist) => hist.into(), + oximeter_types::Datum::HistogramU8(hist) => hist.into(), + oximeter_types::Datum::HistogramI16(hist) => hist.into(), + oximeter_types::Datum::HistogramU16(hist) => hist.into(), + oximeter_types::Datum::HistogramI32(hist) => hist.into(), + oximeter_types::Datum::HistogramU32(hist) => hist.into(), + oximeter_types::Datum::HistogramI64(hist) => hist.into(), + oximeter_types::Datum::HistogramU64(hist) => hist.try_into()?, + oximeter_types::Datum::HistogramF32(hist) => hist.into(), + oximeter_types::Datum::HistogramF64(hist) => hist.into(), other => anyhow::bail!( "Input datum of type {} is not cumulative", other.datum_type(), @@ -169,10 +169,10 @@ impl CumulativeDatum { /// A single list of values, for one dimension of a timeseries. #[derive(Clone, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] pub struct Values { - // The data values. - pub(super) values: ValueArray, - // The type of this metric. - pub(super) metric_type: MetricType, + /// The data values. + pub values: ValueArray, + /// The type of this metric. + pub metric_type: MetricType, } impl Values { @@ -285,14 +285,23 @@ impl<'a> fmt::Display for Datum<'a> { #[derive(Clone, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] pub struct Points { // The start time points for cumulative or delta metrics. 
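// Sketch of the two shapes this field distinguishes (hypothetical data,
// assuming the `Delta` metric type variant): a gauge carries no start
// times, while a delta sample also records when its accumulation interval
// began.
fn example_points() -> (Points, Points) {
    let now = Utc::now();
    let gauge = Points::new(
        None, // gauges have no start times
        vec![now],
        vec![Values {
            values: ValueArray::Double(vec![Some(1.0)]),
            metric_type: MetricType::Gauge,
        }],
    );
    let delta = Points::new(
        Some(vec![now - std::time::Duration::from_secs(1)]),
        vec![now],
        vec![Values {
            values: ValueArray::Double(vec![Some(1.0)]),
            metric_type: MetricType::Delta,
        }],
    );
    (gauge, delta)
}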
- pub(super) start_times: Option>>, + pub(crate) start_times: Option>>, // The timestamp of each value. - pub(super) timestamps: Vec>, + pub(crate) timestamps: Vec>, // The array of data values, one for each dimension. - pub(super) values: Vec, + pub(crate) values: Vec, } impl Points { + /// Construct a new `Points` with the provided data. + pub fn new( + start_times: Option>>, + timestamps: Vec>, + values: Vec, + ) -> Self { + Self { start_times, timestamps, values } + } + /// Construct an empty array of points to hold data of the provided type. pub fn empty(data_type: DataType, metric_type: MetricType) -> Self { Self::with_capacity( @@ -303,8 +312,28 @@ impl Points { .unwrap() } - // Return a mutable reference to the value array of the specified dimension, if any. - pub(super) fn values_mut(&mut self, dim: usize) -> Option<&mut ValueArray> { + /// Return the start times of the points, if any. + pub fn start_times(&self) -> Option<&[DateTime]> { + self.start_times.as_deref() + } + + /// Clear the start times of the points. + pub fn clear_start_times(&mut self) { + self.start_times = None; + } + + /// Return the timestamps of the points. + pub fn timestamps(&self) -> &[DateTime] { + &self.timestamps + } + + pub fn set_timestamps(&mut self, timestamps: Vec>) { + self.timestamps = timestamps; + } + + /// Return a mutable reference to the value array of the specified + /// dimension, if any. + pub fn values_mut(&mut self, dim: usize) -> Option<&mut ValueArray> { self.values.get_mut(dim).map(|val| &mut val.values) } @@ -563,8 +592,8 @@ impl Points { }) } - // Filter points in self to those where `to_keep` is true. - pub(crate) fn filter(&self, to_keep: Vec) -> Result { + /// Filter points in self to those where `to_keep` is true. + pub fn filter(&self, to_keep: Vec) -> Result { anyhow::ensure!( to_keep.len() == self.len(), "Filter array must be the same length as self", @@ -646,8 +675,8 @@ impl Points { Ok(out) } - // Return a new set of points, with the values casted to the provided types. - pub(crate) fn cast(&self, types: &[DataType]) -> Result { + /// Return a new set of points, with the values casted to the provided types. + pub fn cast(&self, types: &[DataType]) -> Result { anyhow::ensure!( types.len() == self.dimensionality(), "Cannot cast to {} types, the data has dimensionality {}", @@ -863,12 +892,104 @@ impl Points { Ok(Self { start_times, timestamps, values: new_values }) } + /// Given two arrays of points, stack them together at matching timepoints. + /// + /// For time points in either which do not have a corresponding point in + /// the other, the entire time point is elided. + pub fn inner_join(&self, right: &Points) -> Result { + // Create an output array with roughly the right capacity, and double the + // number of dimensions. We're trying to stack output value arrays together + // along the dimension axis. + let data_types = + self.data_types().chain(right.data_types()).collect::>(); + let metric_types = + self.metric_types().chain(right.metric_types()).collect::>(); + let mut out = Points::with_capacity( + self.len().max(right.len()), + data_types.iter().copied(), + metric_types.iter().copied(), + )?; + + // Iterate through each array until one is exhausted. We're only inserting + // values from both arrays where the timestamps actually match, since this + // is an inner join. We may want to insert missing values where timestamps + // do not match on either side, when we support an outer join of some kind. 
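// A small usage sketch (hypothetical data): with timestamps [t0, t1] on the
// left and [t1, t2] on the right, only the shared `t1` survives, and the
// result stacks one value dimension from each input.
fn inner_join_example() -> Result<(), Error> {
    let t0 = Utc::now();
    let t1 = t0 + std::time::Duration::from_secs(1);
    let t2 = t0 + std::time::Duration::from_secs(2);
    let left = Points::new(
        None,
        vec![t0, t1],
        vec![Values {
            values: ValueArray::Integer(vec![Some(1), Some(2)]),
            metric_type: MetricType::Gauge,
        }],
    );
    let right = Points::new(
        None,
        vec![t1, t2],
        vec![Values {
            values: ValueArray::Double(vec![Some(1.0), Some(2.0)]),
            metric_type: MetricType::Gauge,
        }],
    );
    let joined = left.inner_join(&right)?;
    assert_eq!(joined.timestamps(), &[t1]);
    assert_eq!(joined.dimensionality(), 2);
    Ok(())
}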
+ let n_left_dim = self.dimensionality(); + let mut left_ix = 0; + let mut right_ix = 0; + while left_ix < self.len() && right_ix < right.len() { + let left_timestamp = self.timestamps()[left_ix]; + let right_timestamp = right.timestamps()[right_ix]; + if left_timestamp == right_timestamp { + out.timestamps.push(left_timestamp); + push_concrete_values( + &mut out.values[..n_left_dim], + &self.values, + left_ix, + ); + push_concrete_values( + &mut out.values[n_left_dim..], + &right.values, + right_ix, + ); + left_ix += 1; + right_ix += 1; + } else if left_timestamp < right_timestamp { + left_ix += 1; + } else { + right_ix += 1; + } + } + Ok(out) + } + /// Return true if self contains no data points. pub fn is_empty(&self) -> bool { self.len() == 0 } } +// Push the `i`th value from each dimension of `from` onto `to`. +fn push_concrete_values(to: &mut [Values], from: &[Values], i: usize) { + assert_eq!(to.len(), from.len()); + for (output, input) in to.iter_mut().zip(from.iter()) { + let input_array = &input.values; + let output_array = &mut output.values; + assert_eq!(input_array.data_type(), output_array.data_type()); + if let Ok(ints) = input_array.as_integer() { + output_array.as_integer_mut().unwrap().push(ints[i]); + continue; + } + if let Ok(doubles) = input_array.as_double() { + output_array.as_double_mut().unwrap().push(doubles[i]); + continue; + } + if let Ok(bools) = input_array.as_boolean() { + output_array.as_boolean_mut().unwrap().push(bools[i]); + continue; + } + if let Ok(strings) = input_array.as_string() { + output_array.as_string_mut().unwrap().push(strings[i].clone()); + continue; + } + if let Ok(dists) = input_array.as_integer_distribution() { + output_array + .as_integer_distribution_mut() + .unwrap() + .push(dists[i].clone()); + continue; + } + if let Ok(dists) = input_array.as_double_distribution() { + output_array + .as_double_distribution_mut() + .unwrap() + .push(dists[i].clone()); + continue; + } + unreachable!(); + } +} + /// List of data values for one timeseries. /// /// Each element is an option, where `None` represents a missing sample. @@ -900,8 +1021,8 @@ impl ValueArray { } } - // Return the data type in self. - pub(super) fn data_type(&self) -> DataType { + /// Return the data type in self. + pub fn data_type(&self) -> DataType { match self { ValueArray::Integer(_) => DataType::Integer, ValueArray::Double(_) => DataType::Double, @@ -947,10 +1068,8 @@ impl ValueArray { Ok(inner) } - // Access the inner array of integers, if possible. - pub(super) fn as_integer_mut( - &mut self, - ) -> Result<&mut Vec>, Error> { + /// Access the inner array of integers, if possible. + pub fn as_integer_mut(&mut self) -> Result<&mut Vec>, Error> { let ValueArray::Integer(inner) = self else { anyhow::bail!( "Cannot access value array as integer type, it has type {}", @@ -1107,91 +1226,97 @@ impl ValueArray { // Push a value directly from a datum, without modification. 
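// For example: an `I8` datum of 3 is widened and pushed as `Some(3_i64)`,
// while a `U64` value too large for `i64` fails with a conversion error
// rather than wrapping.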
fn push_value_from_datum( &mut self, - datum: &oximeter::Datum, + datum: &oximeter_types::Datum, ) -> Result<(), Error> { match datum { - oximeter::Datum::Bool(b) => self.as_boolean_mut()?.push(Some(*b)), - oximeter::Datum::I8(i) => { + oximeter_types::Datum::Bool(b) => { + self.as_boolean_mut()?.push(Some(*b)) + } + oximeter_types::Datum::I8(i) => { self.as_integer_mut()?.push(Some(i64::from(*i))) } - oximeter::Datum::U8(i) => { + oximeter_types::Datum::U8(i) => { self.as_integer_mut()?.push(Some(i64::from(*i))) } - oximeter::Datum::I16(i) => { + oximeter_types::Datum::I16(i) => { self.as_integer_mut()?.push(Some(i64::from(*i))) } - oximeter::Datum::U16(i) => { + oximeter_types::Datum::U16(i) => { self.as_integer_mut()?.push(Some(i64::from(*i))) } - oximeter::Datum::I32(i) => { + oximeter_types::Datum::I32(i) => { self.as_integer_mut()?.push(Some(i64::from(*i))) } - oximeter::Datum::U32(i) => { + oximeter_types::Datum::U32(i) => { self.as_integer_mut()?.push(Some(i64::from(*i))) } - oximeter::Datum::I64(i) => self.as_integer_mut()?.push(Some(*i)), - oximeter::Datum::U64(i) => { + oximeter_types::Datum::I64(i) => { + self.as_integer_mut()?.push(Some(*i)) + } + oximeter_types::Datum::U64(i) => { let i = i.to_i64().context("Failed to convert u64 datum to i64")?; self.as_integer_mut()?.push(Some(i)); } - oximeter::Datum::F32(f) => { + oximeter_types::Datum::F32(f) => { self.as_double_mut()?.push(Some(f64::from(*f))) } - oximeter::Datum::F64(f) => self.as_double_mut()?.push(Some(*f)), - oximeter::Datum::String(s) => { + oximeter_types::Datum::F64(f) => { + self.as_double_mut()?.push(Some(*f)) + } + oximeter_types::Datum::String(s) => { self.as_string_mut()?.push(Some(s.clone())) } - oximeter::Datum::Bytes(_) => { + oximeter_types::Datum::Bytes(_) => { anyhow::bail!("Bytes data types are not yet supported") } - oximeter::Datum::CumulativeI64(c) => { + oximeter_types::Datum::CumulativeI64(c) => { self.as_integer_mut()?.push(Some(c.value())) } - oximeter::Datum::CumulativeU64(c) => { + oximeter_types::Datum::CumulativeU64(c) => { let c = c .value() .to_i64() .context("Failed to convert u64 datum to i64")?; self.as_integer_mut()?.push(Some(c)); } - oximeter::Datum::CumulativeF32(c) => { + oximeter_types::Datum::CumulativeF32(c) => { self.as_double_mut()?.push(Some(f64::from(c.value()))) } - oximeter::Datum::CumulativeF64(c) => { + oximeter_types::Datum::CumulativeF64(c) => { self.as_double_mut()?.push(Some(c.value())) } - oximeter::Datum::HistogramI8(h) => self + oximeter_types::Datum::HistogramI8(h) => self .as_integer_distribution_mut()? .push(Some(Distribution::from(h))), - oximeter::Datum::HistogramU8(h) => self + oximeter_types::Datum::HistogramU8(h) => self .as_integer_distribution_mut()? .push(Some(Distribution::from(h))), - oximeter::Datum::HistogramI16(h) => self + oximeter_types::Datum::HistogramI16(h) => self .as_integer_distribution_mut()? .push(Some(Distribution::from(h))), - oximeter::Datum::HistogramU16(h) => self + oximeter_types::Datum::HistogramU16(h) => self .as_integer_distribution_mut()? .push(Some(Distribution::from(h))), - oximeter::Datum::HistogramI32(h) => self + oximeter_types::Datum::HistogramI32(h) => self .as_integer_distribution_mut()? .push(Some(Distribution::from(h))), - oximeter::Datum::HistogramU32(h) => self + oximeter_types::Datum::HistogramU32(h) => self .as_integer_distribution_mut()? .push(Some(Distribution::from(h))), - oximeter::Datum::HistogramI64(h) => self + oximeter_types::Datum::HistogramI64(h) => self .as_integer_distribution_mut()? 
.push(Some(Distribution::from(h))), - oximeter::Datum::HistogramU64(h) => self + oximeter_types::Datum::HistogramU64(h) => self .as_integer_distribution_mut()? .push(Some(Distribution::try_from(h)?)), - oximeter::Datum::HistogramF32(h) => self + oximeter_types::Datum::HistogramF32(h) => self .as_double_distribution_mut()? .push(Some(Distribution::from(h))), - oximeter::Datum::HistogramF64(h) => self + oximeter_types::Datum::HistogramF64(h) => self .as_double_distribution_mut()? .push(Some(Distribution::from(h))), - oximeter::Datum::Missing(missing) => { + oximeter_types::Datum::Missing(missing) => { self.push_missing(missing.datum_type())? } } @@ -1216,7 +1341,7 @@ impl ValueArray { fn push_diff_from_last_to_datum( &mut self, last_datum: &Option, - new_datum: &oximeter::Datum, + new_datum: &oximeter_types::Datum, data_type: DataType, ) -> Result<(), Error> { match (last_datum.as_ref(), new_datum.is_missing()) { @@ -1253,49 +1378,49 @@ impl ValueArray { match (last_datum, new_datum) { ( CumulativeDatum::Integer(last), - oximeter::Datum::I8(new), + oximeter_types::Datum::I8(new), ) => { let new = i64::from(*new); self.as_integer_mut()?.push(Some(new - last)); } ( CumulativeDatum::Integer(last), - oximeter::Datum::U8(new), + oximeter_types::Datum::U8(new), ) => { let new = i64::from(*new); self.as_integer_mut()?.push(Some(new - last)); } ( CumulativeDatum::Integer(last), - oximeter::Datum::I16(new), + oximeter_types::Datum::I16(new), ) => { let new = i64::from(*new); self.as_integer_mut()?.push(Some(new - last)); } ( CumulativeDatum::Integer(last), - oximeter::Datum::U16(new), + oximeter_types::Datum::U16(new), ) => { let new = i64::from(*new); self.as_integer_mut()?.push(Some(new - last)); } ( CumulativeDatum::Integer(last), - oximeter::Datum::I32(new), + oximeter_types::Datum::I32(new), ) => { let new = i64::from(*new); self.as_integer_mut()?.push(Some(new - last)); } ( CumulativeDatum::Integer(last), - oximeter::Datum::U32(new), + oximeter_types::Datum::U32(new), ) => { let new = i64::from(*new); self.as_integer_mut()?.push(Some(new - last)); } ( CumulativeDatum::Integer(last), - oximeter::Datum::I64(new), + oximeter_types::Datum::I64(new), ) => { let diff = new .checked_sub(*last) @@ -1304,7 +1429,7 @@ impl ValueArray { } ( CumulativeDatum::Integer(last), - oximeter::Datum::U64(new), + oximeter_types::Datum::U64(new), ) => { let new = new .to_i64() @@ -1316,20 +1441,20 @@ impl ValueArray { } ( CumulativeDatum::Double(last), - oximeter::Datum::F32(new), + oximeter_types::Datum::F32(new), ) => { self.as_double_mut()? .push(Some(f64::from(*new) - last)); } ( CumulativeDatum::Double(last), - oximeter::Datum::F64(new), + oximeter_types::Datum::F64(new), ) => { self.as_double_mut()?.push(Some(new - last)); } ( CumulativeDatum::Integer(last), - oximeter::Datum::CumulativeI64(new), + oximeter_types::Datum::CumulativeI64(new), ) => { let new = new.value(); let diff = new @@ -1339,7 +1464,7 @@ impl ValueArray { } ( CumulativeDatum::Integer(last), - oximeter::Datum::CumulativeU64(new), + oximeter_types::Datum::CumulativeU64(new), ) => { let new = new .value() @@ -1352,20 +1477,20 @@ impl ValueArray { } ( CumulativeDatum::Double(last), - oximeter::Datum::CumulativeF32(new), + oximeter_types::Datum::CumulativeF32(new), ) => { self.as_double_mut()? 
.push(Some(f64::from(new.value()) - last)); } ( CumulativeDatum::Double(last), - oximeter::Datum::CumulativeF64(new), + oximeter_types::Datum::CumulativeF64(new), ) => { self.as_double_mut()?.push(Some(new.value() - last)); } ( CumulativeDatum::IntegerDistribution(last), - oximeter::Datum::HistogramI8(new), + oximeter_types::Datum::HistogramI8(new), ) => { let new = Distribution::from(new); self.as_integer_distribution_mut()? @@ -1373,7 +1498,7 @@ impl ValueArray { } ( CumulativeDatum::IntegerDistribution(last), - oximeter::Datum::HistogramU8(new), + oximeter_types::Datum::HistogramU8(new), ) => { let new = Distribution::from(new); self.as_integer_distribution_mut()? @@ -1381,7 +1506,7 @@ impl ValueArray { } ( CumulativeDatum::IntegerDistribution(last), - oximeter::Datum::HistogramI16(new), + oximeter_types::Datum::HistogramI16(new), ) => { let new = Distribution::from(new); self.as_integer_distribution_mut()? @@ -1389,7 +1514,7 @@ impl ValueArray { } ( CumulativeDatum::IntegerDistribution(last), - oximeter::Datum::HistogramU16(new), + oximeter_types::Datum::HistogramU16(new), ) => { let new = Distribution::from(new); self.as_integer_distribution_mut()? @@ -1397,7 +1522,7 @@ impl ValueArray { } ( CumulativeDatum::IntegerDistribution(last), - oximeter::Datum::HistogramI32(new), + oximeter_types::Datum::HistogramI32(new), ) => { let new = Distribution::from(new); self.as_integer_distribution_mut()? @@ -1405,7 +1530,7 @@ impl ValueArray { } ( CumulativeDatum::IntegerDistribution(last), - oximeter::Datum::HistogramU32(new), + oximeter_types::Datum::HistogramU32(new), ) => { let new = Distribution::from(new); self.as_integer_distribution_mut()? @@ -1413,7 +1538,7 @@ impl ValueArray { } ( CumulativeDatum::IntegerDistribution(last), - oximeter::Datum::HistogramI64(new), + oximeter_types::Datum::HistogramI64(new), ) => { let new = Distribution::from(new); self.as_integer_distribution_mut()? @@ -1421,7 +1546,7 @@ impl ValueArray { } ( CumulativeDatum::IntegerDistribution(last), - oximeter::Datum::HistogramU64(new), + oximeter_types::Datum::HistogramU64(new), ) => { let new = Distribution::try_from(new)?; self.as_integer_distribution_mut()? @@ -1429,7 +1554,7 @@ impl ValueArray { } ( CumulativeDatum::DoubleDistribution(last), - oximeter::Datum::HistogramF32(new), + oximeter_types::Datum::HistogramF32(new), ) => { let new = Distribution::::from(new); self.as_double_distribution_mut()? @@ -1437,7 +1562,7 @@ impl ValueArray { } ( CumulativeDatum::DoubleDistribution(last), - oximeter::Datum::HistogramF64(new), + oximeter_types::Datum::HistogramF64(new), ) => { let new = Distribution::::from(new); self.as_double_distribution_mut()? @@ -1486,8 +1611,8 @@ impl ValueArray { } } - // Swap the value in self with other, asserting they're the same type. - pub(crate) fn swap(&mut self, mut values: ValueArray) { + /// Swap the value in self with other, asserting they're the same type. + pub fn swap(&mut self, mut values: ValueArray) { use std::mem::swap; match (self, &mut values) { (ValueArray::Integer(x), ValueArray::Integer(y)) => swap(x, y), @@ -1733,8 +1858,10 @@ where macro_rules! 
i64_dist_from { ($t:ty) => { - impl From<&oximeter::histogram::Histogram<$t>> for Distribution { - fn from(hist: &oximeter::histogram::Histogram<$t>) -> Self { + impl From<&oximeter_types::histogram::Histogram<$t>> + for Distribution + { + fn from(hist: &oximeter_types::histogram::Histogram<$t>) -> Self { let (bins, counts) = hist.bins_and_counts(); Self { bins: bins.into_iter().map(i64::from).collect(), @@ -1750,8 +1877,10 @@ macro_rules! i64_dist_from { } } - impl From<&oximeter::histogram::Histogram<$t>> for CumulativeDatum { - fn from(hist: &oximeter::histogram::Histogram<$t>) -> Self { + impl From<&oximeter_types::histogram::Histogram<$t>> + for CumulativeDatum + { + fn from(hist: &oximeter_types::histogram::Histogram<$t>) -> Self { CumulativeDatum::IntegerDistribution(hist.into()) } } @@ -1766,10 +1895,10 @@ i64_dist_from!(i32); i64_dist_from!(u32); i64_dist_from!(i64); -impl TryFrom<&oximeter::histogram::Histogram> for Distribution { +impl TryFrom<&oximeter_types::histogram::Histogram> for Distribution { type Error = Error; fn try_from( - hist: &oximeter::histogram::Histogram, + hist: &oximeter_types::histogram::Histogram, ) -> Result { let (bins, counts) = hist.bins_and_counts(); let bins = bins @@ -1791,10 +1920,10 @@ impl TryFrom<&oximeter::histogram::Histogram> for Distribution { } } -impl TryFrom<&oximeter::histogram::Histogram> for CumulativeDatum { +impl TryFrom<&oximeter_types::histogram::Histogram> for CumulativeDatum { type Error = Error; fn try_from( - hist: &oximeter::histogram::Histogram, + hist: &oximeter_types::histogram::Histogram, ) -> Result { hist.try_into().map(CumulativeDatum::IntegerDistribution) } @@ -1802,8 +1931,10 @@ impl TryFrom<&oximeter::histogram::Histogram> for CumulativeDatum { macro_rules! f64_dist_from { ($t:ty) => { - impl From<&oximeter::histogram::Histogram<$t>> for Distribution { - fn from(hist: &oximeter::histogram::Histogram<$t>) -> Self { + impl From<&oximeter_types::histogram::Histogram<$t>> + for Distribution + { + fn from(hist: &oximeter_types::histogram::Histogram<$t>) -> Self { let (bins, counts) = hist.bins_and_counts(); Self { bins: bins.into_iter().map(f64::from).collect(), @@ -1819,8 +1950,10 @@ macro_rules! 
f64_dist_from { } } - impl From<&oximeter::histogram::Histogram<$t>> for CumulativeDatum { - fn from(hist: &oximeter::histogram::Histogram<$t>) -> Self { + impl From<&oximeter_types::histogram::Histogram<$t>> + for CumulativeDatum + { + fn from(hist: &oximeter_types::histogram::Histogram<$t>) -> Self { CumulativeDatum::DoubleDistribution(hist.into()) } } @@ -1833,9 +1966,9 @@ f64_dist_from!(f64); #[cfg(test)] mod tests { use super::{Distribution, MetricType, Points, Values}; - use crate::oxql::point::{DataType, ValueArray}; + use crate::point::{push_concrete_values, DataType, Datum, ValueArray}; use chrono::{DateTime, Utc}; - use oximeter::{ + use oximeter_types::{ histogram::Record, types::Cumulative, Measurement, Quantile, }; use std::time::Duration; @@ -1939,12 +2072,12 @@ mod tests { let now = Utc::now(); let current1 = now + Duration::from_secs(1); let mut hist1 = - oximeter::histogram::Histogram::new(&[0i64, 10, 20]).unwrap(); + oximeter_types::histogram::Histogram::new(&[0i64, 10, 20]).unwrap(); hist1.sample(1).unwrap(); hist1.set_start_time(current1); let current2 = now + Duration::from_secs(2); let mut hist2 = - oximeter::histogram::Histogram::new(&[0i64, 10, 20]).unwrap(); + oximeter_types::histogram::Histogram::new(&[0i64, 10, 20]).unwrap(); hist2.sample(5).unwrap(); hist2.sample(10).unwrap(); hist2.sample(15).unwrap(); @@ -2273,4 +2406,176 @@ mod tests { .cast(&[DataType::DoubleDistribution, DataType::DoubleDistribution]) .is_err()); } + + #[test] + fn test_push_concrete_values() { + let mut points = Points::with_capacity( + 2, + [DataType::Integer, DataType::Double].into_iter(), + [MetricType::Gauge, MetricType::Gauge].into_iter(), + ) + .unwrap(); + + // Push a concrete value for the integer dimension + let from_ints = vec![Values { + values: ValueArray::Integer(vec![Some(1)]), + metric_type: MetricType::Gauge, + }]; + push_concrete_values(&mut points.values[..1], &from_ints, 0); + + // And another for the double dimension. + let from_doubles = vec![Values { + values: ValueArray::Double(vec![Some(2.0)]), + metric_type: MetricType::Gauge, + }]; + push_concrete_values(&mut points.values[1..], &from_doubles, 0); + + assert_eq!( + points.dimensionality(), + 2, + "Points should have 2 dimensions", + ); + let ints = points.values[0].values.as_integer().unwrap(); + assert_eq!( + ints.len(), + 1, + "Should have pushed one point in the first dimension" + ); + assert_eq!( + ints[0], + Some(1), + "Should have pushed 1 onto the first dimension" + ); + let doubles = points.values[1].values.as_double().unwrap(); + assert_eq!( + doubles.len(), + 1, + "Should have pushed one point in the second dimension" + ); + assert_eq!( + doubles[0], + Some(2.0), + "Should have pushed 2.0 onto the second dimension" + ); + } + + #[test] + fn test_join_point_arrays() { + let now = Utc::now(); + + // Create a set of integer points to join with. + // + // This will have two timestamps, one of which will match the points + // below that are merged in. + let int_points = Points { + start_times: None, + timestamps: vec![ + now - Duration::from_secs(3), + now - Duration::from_secs(2), + now, + ], + values: vec![Values { + values: ValueArray::Integer(vec![Some(1), Some(2), Some(3)]), + metric_type: MetricType::Gauge, + }], + }; + + // Create an additional set of double points. + // + // This also has two timepoints, one of which matches with the above, + // and one of which does not. 
+ let double_points = Points { + start_times: None, + timestamps: vec![ + now - Duration::from_secs(3), + now - Duration::from_secs(1), + now, + ], + values: vec![Values { + values: ValueArray::Double(vec![ + Some(4.0), + Some(5.0), + Some(6.0), + ]), + metric_type: MetricType::Gauge, + }], + }; + + // Merge the arrays. + let merged = int_points.inner_join(&double_points).unwrap(); + + // Basic checks that we merged in the right values and have the right + // types and dimensions. + assert_eq!( + merged.dimensionality(), + 2, + "Should have appended the dimensions from each input array" + ); + assert_eq!(merged.len(), 2, "Should have merged two common points",); + assert_eq!( + merged.data_types().collect::>(), + &[DataType::Integer, DataType::Double], + "Should have combined the data types of the input arrays" + ); + assert_eq!( + merged.metric_types().collect::>(), + &[MetricType::Gauge, MetricType::Gauge], + "Should have combined the metric types of the input arrays" + ); + + // Check the actual values of the array. + let mut points = merged.iter_points(); + + // The first and last timepoint overlapped between the two arrays, so we + // should have both of them as concrete samples. + let pt = points.next().unwrap(); + assert_eq!(pt.start_time, None, "Gauges don't have a start time"); + assert_eq!( + *pt.timestamp, int_points.timestamps[0], + "Should have taken the first input timestamp from both arrays", + ); + assert_eq!( + *pt.timestamp, double_points.timestamps[0], + "Should have taken the first input timestamp from both arrays", + ); + let values = pt.values; + assert_eq!(values.len(), 2, "Should have 2 dimensions"); + assert_eq!( + &values[0], + &(Datum::Integer(Some(&1)), MetricType::Gauge), + "Should have pulled value from first integer array." + ); + assert_eq!( + &values[1], + &(Datum::Double(Some(&4.0)), MetricType::Gauge), + "Should have pulled value from second double array." + ); + + // And the next point + let pt = points.next().unwrap(); + assert_eq!(pt.start_time, None, "Gauges don't have a start time"); + assert_eq!( + *pt.timestamp, int_points.timestamps[2], + "Should have taken the input timestamp from both arrays", + ); + assert_eq!( + *pt.timestamp, double_points.timestamps[2], + "Should have taken the input timestamp from both arrays", + ); + let values = pt.values; + assert_eq!(values.len(), 2, "Should have 2 dimensions"); + assert_eq!( + &values[0], + &(Datum::Integer(Some(&3)), MetricType::Gauge), + "Should have pulled value from first integer array." + ); + assert_eq!( + &values[1], + &(Datum::Double(Some(&6.0)), MetricType::Gauge), + "Should have pulled value from second double array." + ); + + // And there should be no other values. 
+ assert!(points.next().is_none(), "There should be no more points"); + } } diff --git a/oximeter/db/src/oxql/table.rs b/oximeter/oxql-types/src/table.rs similarity index 75% rename from oximeter/db/src/oxql/table.rs rename to oximeter/oxql-types/src/table.rs index 2cd141d2fa..f37992942f 100644 --- a/oximeter/db/src/oxql/table.rs +++ b/oximeter/oxql-types/src/table.rs @@ -6,14 +6,16 @@ // Copyright 2024 Oxide Computer Company -use super::point::DataType; -use super::point::MetricType; -use super::point::Points; -use super::query::Alignment; -use super::Error; -use crate::TimeseriesKey; +use crate::point::DataType; +use crate::point::MetricType; +use crate::point::Points; +use crate::point::ValueArray; +use crate::point::Values; +use crate::Alignment; +use anyhow::Error; use highway::HighwayHasher; -use oximeter::FieldValue; +use oximeter_types::schema::TimeseriesKey; +use oximeter_types::FieldValue; use schemars::JsonSchema; use serde::Deserialize; use serde::Serialize; @@ -67,10 +69,20 @@ impl Timeseries { hasher.finish() } + /// Return the alignment of this timeseries, if any. + pub fn alignment(&self) -> Option { + self.alignment + } + + /// Set the alignment of this timeseries. + pub fn set_alignment(&mut self, alignment: Alignment) { + self.alignment = Some(alignment); + } + /// Return a copy of the timeseries, keeping only the provided fields. /// /// An error is returned if the timeseries does not contain those fields. - pub(crate) fn copy_with_fields( + pub fn copy_with_fields( &self, kept_fields: &[&str], ) -> Result { @@ -88,6 +100,20 @@ impl Timeseries { }) } + /// Return a copy of the timeseries, keeping only the provided points. + /// + /// Returns `None` if `kept_points` is empty. + pub fn copy_with_points(&self, kept_points: Points) -> Option { + if kept_points.is_empty() { + return None; + } + Some(Self { + fields: self.fields.clone(), + points: kept_points, + alignment: self.alignment, + }) + } + // Return `true` if the schema in `other` matches that of `self`. fn matches_schema(&self, other: &Timeseries) -> bool { if self.fields.len() != other.fields.len() { @@ -125,7 +151,7 @@ impl Timeseries { /// This returns an error if the points cannot be so cast, or the /// dimensionality of the types requested differs from the dimensionality of /// the points themselves. - pub(crate) fn cast(&self, types: &[DataType]) -> Result { + pub fn cast(&self, types: &[DataType]) -> Result { let fields = self.fields.clone(); Ok(Self { fields, @@ -133,6 +159,49 @@ impl Timeseries { alignment: self.alignment, }) } + + /// Return a new timeseries, with the points limited to the provided range. + pub fn limit(&self, start: usize, end: usize) -> Self { + let input_points = &self.points; + + // Slice the various data arrays. 
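// For instance (hypothetical sizes): with five samples and `start = 1`,
// `end = 3`, each array below keeps exactly the samples at indices 1 and 2,
// so start times, timestamps, and values stay in lockstep.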
+ let start_times = + input_points.start_times().map(|s| s[start..end].to_vec()); + let timestamps = input_points.timestamps()[start..end].to_vec(); + let values = input_points + .values + .iter() + .map(|vals| { + let values = match &vals.values { + ValueArray::Integer(inner) => { + ValueArray::Integer(inner[start..end].to_vec()) + } + ValueArray::Double(inner) => { + ValueArray::Double(inner[start..end].to_vec()) + } + ValueArray::Boolean(inner) => { + ValueArray::Boolean(inner[start..end].to_vec()) + } + ValueArray::String(inner) => { + ValueArray::String(inner[start..end].to_vec()) + } + ValueArray::IntegerDistribution(inner) => { + ValueArray::IntegerDistribution( + inner[start..end].to_vec(), + ) + } + ValueArray::DoubleDistribution(inner) => { + ValueArray::DoubleDistribution( + inner[start..end].to_vec(), + ) + } + }; + Values { values, metric_type: vals.metric_type } + }) + .collect(); + let points = Points::new(start_times, timestamps, values); + Self { fields: self.fields.clone(), points, alignment: self.alignment } + } } /// A table represents one or more timeseries with the same schema. @@ -146,7 +215,7 @@ pub struct Table { // // This starts as the name of the timeseries schema the data is derived // from, but can be modified as operations are done. - pub(super) name: String, + pub name: String, // The set of timeseries in the table, ordered by key. timeseries: BTreeMap, } diff --git a/oximeter/types/src/schema.rs b/oximeter/types/src/schema.rs index 2efd5265ff..80aaa6f101 100644 --- a/oximeter/types/src/schema.rs +++ b/oximeter/types/src/schema.rs @@ -28,6 +28,8 @@ use std::num::NonZeroU8; pub const SCHEMA_DIRECTORY: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../oximeter/schema"); +pub type TimeseriesKey = u64; + /// The name and type information for a field of a timeseries schema. 
#[derive( Clone, From a63e7840eabaaf3e4105dc4af4685c262409a4d0 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Sat, 17 Aug 2024 23:35:26 -0700 Subject: [PATCH 69/91] Make OxQL query response type into an object (#6374) - Fixes #6371 --- Cargo.lock | 1 + nexus/src/external_api/http_entrypoints.rs | 4 ++-- nexus/tests/integration_tests/metrics.rs | 13 ++++++++----- nexus/types/Cargo.toml | 1 + nexus/types/src/external_api/views.rs | 9 +++++++++ openapi/nexus.json | 22 +++++++++++++++++----- 6 files changed, 38 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2943a4f2c2..040ecd9e98 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5461,6 +5461,7 @@ dependencies = [ "omicron-workspace-hack", "openssl", "oxnet", + "oxql-types", "parse-display", "proptest", "schemars", diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index df522f18ab..5b80c973e3 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -6386,7 +6386,7 @@ async fn timeseries_schema_list( async fn timeseries_query( rqctx: RequestContext, body: TypedBody, -) -> Result>, HttpError> { +) -> Result, HttpError> { let apictx = rqctx.context(); let handler = async { let nexus = &apictx.context.nexus; @@ -6395,7 +6395,7 @@ async fn timeseries_query( nexus .timeseries_query(&opctx, &query) .await - .map(HttpResponseOk) + .map(|tables| HttpResponseOk(views::OxqlQueryResult { tables })) .map_err(HttpError::from) }; apictx diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index e24de2a3ad..3b808984ae 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -19,6 +19,7 @@ use nexus_test_utils::resource_helpers::{ }; use nexus_test_utils::ControlPlaneTestContext; use nexus_test_utils_macros::nexus_test; +use nexus_types::external_api::views::OxqlQueryResult; use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; use oximeter::types::Datum; @@ -307,12 +308,14 @@ pub async fn timeseries_query( .unwrap_or_else(|e| { panic!("timeseries query failed: {e:?}\nquery: {query}") }); - rsp.parsed_body().unwrap_or_else(|e| { - panic!( - "could not parse timeseries query response: {e:?}\n\ + rsp.parsed_body::() + .unwrap_or_else(|e| { + panic!( + "could not parse timeseries query response: {e:?}\n\ query: {query}\nresponse: {rsp:#?}" - ); - }) + ); + }) + .tables } #[nexus_test] diff --git a/nexus/types/Cargo.toml b/nexus/types/Cargo.toml index a4418d2a74..8dd6292d5c 100644 --- a/nexus/types/Cargo.toml +++ b/nexus/types/Cargo.toml @@ -19,6 +19,7 @@ humantime.workspace = true ipnetwork.workspace = true omicron-uuid-kinds.workspace = true openssl.workspace = true +oxql-types.workspace = true oxnet.workspace = true parse-display.workspace = true schemars = { workspace = true, features = ["chrono", "uuid1"] } diff --git a/nexus/types/src/external_api/views.rs b/nexus/types/src/external_api/views.rs index e241f849ee..58c2e560ab 100644 --- a/nexus/types/src/external_api/views.rs +++ b/nexus/types/src/external_api/views.rs @@ -971,3 +971,12 @@ pub struct AllowList { /// The allowlist of IPs or subnets. pub allowed_ips: ExternalAllowedSourceIps, } + +// OxQL QUERIES + +/// The result of a successful OxQL query. +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct OxqlQueryResult { + /// Tables resulting from the query, each containing timeseries. 
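// The wire-format change in one line (sketch of the JSON shape): a query
// that previously returned a bare array `[ ...tables... ]` now returns an
// object, `{ "tables": [ ...tables... ] }`.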
+ pub tables: Vec<oxql_types::Table>,
+}
diff --git a/openapi/nexus.json b/openapi/nexus.json
index a0cbfa2f63..c29cb8a95c 100644
--- a/openapi/nexus.json
+++ b/openapi/nexus.json
@@ -8026,11 +8026,7 @@
 "content": {
 "application/json": {
 "schema": {
- "title": "Array_of_Table",
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/Table"
- }
+ "$ref": "#/components/schemas/OxqlQueryResult"
 }
 }
 }
@@ -16501,6 +16497,22 @@
 }
 ]
 },
+ "OxqlQueryResult": {
+ "description": "The result of a successful OxQL query.",
+ "type": "object",
+ "properties": {
+ "tables": {
+ "description": "Tables resulting from the query, each containing timeseries.",
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Table"
+ }
+ }
+ },
+ "required": [
+ "tables"
+ ]
+ },
 "Password": {
 "title": "A password used to authenticate a user",
 "description": "Passwords may be subject to additional constraints.",

From ede17c7a26017d29c2fccc1f0183a945793dd693 Mon Sep 17 00:00:00 2001
From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com>
Date: Sun, 18 Aug 2024 16:42:00 -0700
Subject: [PATCH 70/91] Update Rust crate camino to v1.1.9 (#6382)

---
 Cargo.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 040ecd9e98..f3aafb9b7d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -815,9 +815,9 @@ dependencies = [

 [[package]]
 name = "camino"
-version = "1.1.8"
+version = "1.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3054fea8a20d8ff3968d5b22cc27501d2b08dc4decdb31b184323f00c5ef23bb"
+checksum = "8b96ec4966b5813e2c0507c1f86115c8c5abaadc3980879c3424042a02fd1ad3"
 dependencies = [
 "serde",
 ]

From b449abb736c10313df1d5e0c8f126970b2b968e5 Mon Sep 17 00:00:00 2001
From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com>
Date: Mon, 19 Aug 2024 04:32:11 +0000
Subject: [PATCH 71/91] Update taiki-e/install-action digest to 37129d5 (#6384)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR contains the following updates:

| Package | Type | Update | Change |
|---|---|---|---|
| [taiki-e/install-action](https://togithub.com/taiki-e/install-action) | action | digest | [`2d7ff60` -> `37129d5`](https://togithub.com/taiki-e/install-action/compare/2d7ff60...37129d5) |

---

### Configuration

📅 **Schedule**: Branch creation - "after 8pm,before 6am" in timezone America/Los_Angeles, Automerge - "after 8pm,before 6am" in timezone America/Los_Angeles.

🚦 **Automerge**: Enabled.

♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox.

🔕 **Ignore**: Close this PR and you won't be reminded about this update again.

---

 - [ ] If you want to rebase/retry this PR, check this box

---

This PR has been generated by [Renovate Bot](https://togithub.com/renovatebot/renovate).
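A note on the response-shape change in PATCH 69 above: wrapping the bare array of tables in an `OxqlQueryResult` object means fields can be added to the query response later without breaking clients. Below is a minimal client-side sketch of the new shape, assuming serde/serde_json and using pared-down, hypothetical mirror types (the real `Table` in oxql-types carries much more than a name):

    use serde::Deserialize;

    // Hypothetical mirror of the `OxqlQueryResult` schema added in PATCH 69.
    #[derive(Debug, Deserialize)]
    struct OxqlQueryResult {
        tables: Vec<Table>,
    }

    // Illustrative stand-in; unknown fields in the real payload are ignored
    // by serde's default behavior.
    #[derive(Debug, Deserialize)]
    struct Table {
        name: String,
    }

    fn main() -> Result<(), serde_json::Error> {
        // Before #6374 the body was a bare array: [{"name": "..."}, ...].
        // After, the same tables arrive wrapped in an object:
        let body = r#"{"tables": [{"name": "example:timeseries"}]}"#;
        let result: OxqlQueryResult = serde_json::from_str(body)?;
        assert_eq!(result.tables.len(), 1);
        println!("parsed {} table(s)", result.tables.len());
        Ok(())
    }

This mirrors what the test helper in PATCH 69 now does with `rsp.parsed_body::<OxqlQueryResult>()` before returning `.tables`.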
Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- .github/workflows/hakari.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index 4b4d09ba35..63752880d6 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -24,7 +24,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@2d7ff60c815c5236dc38fd3909d97d6d605315d2 # v2 + uses: taiki-e/install-action@37129d5de13e9122cce55a7a5e7e49981cef514c # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date From 5eb0284d3733ad1ae8633332458bcaa26c3b5c29 Mon Sep 17 00:00:00 2001 From: Ryan Goodfellow Date: Mon, 19 Aug 2024 09:05:01 -0700 Subject: [PATCH 72/91] releng: allow building os image from local sources (#6126) --- dev-tools/releng/src/main.rs | 73 ++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/dev-tools/releng/src/main.rs b/dev-tools/releng/src/main.rs index ee649e79b2..264eec2503 100644 --- a/dev-tools/releng/src/main.rs +++ b/dev-tools/releng/src/main.rs @@ -143,6 +143,10 @@ struct Args { /// Path to a pre-built omicron-package binary (skips building if set) #[clap(long, env = "OMICRON_PACKAGE")] omicron_package_bin: Option, + + /// Build the helios OS image from local sources. + #[clap(long)] + helios_local: bool, } impl Args { @@ -286,7 +290,7 @@ async fn main() -> Result<()> { logger, "helios checkout at {0} is out-of-date; run \ `git pull -C {0}`, or run omicron-releng with \ - --ignore-helios-origin or --helios-path", + --ignore-helios-origin or --helios-dir", shell_words::quote(args.helios_dir.as_str()) ); preflight_ok = false; @@ -496,39 +500,42 @@ async fn main() -> Result<()> { Utc::now().format("%Y-%m-%d %H:%M") ); - // helios-build experiment-image - jobs.push_command( - format!("{}-image", target), - Command::new("ptime") - .arg("-m") - .arg(args.helios_dir.join("helios-build")) - .arg("experiment-image") - .arg("-o") // output directory for image - .arg(args.output_dir.join(format!("os-{}", target))) + let mut image_cmd = Command::new("ptime") + .arg("-m") + .arg(args.helios_dir.join("helios-build")) + .arg("experiment-image") + .arg("-o") // output directory for image + .arg(args.output_dir.join(format!("os-{}", target))) + .arg("-F") // pass extra image builder features + .arg(format!("optever={}", opte_version.trim())) + .arg("-P") // include all files from extra proto area + .arg(proto_dir.join("root")) + .arg("-N") // image name + .arg(image_name) + .arg("-s") // tempdir name suffix + .arg(target.as_str()) + .args(target.image_build_args()) + .current_dir(&args.helios_dir) + .env( + "IMAGE_DATASET", + match target { + Target::Host => &args.host_dataset, + Target::Recovery => &args.recovery_dataset, + }, + ) + .env_remove("CARGO") + .env_remove("RUSTUP_TOOLCHAIN"); + + if !args.helios_local { + image_cmd = image_cmd .arg("-p") // use an external package repository - .arg(format!("helios-dev={}", HELIOS_REPO)) - .arg("-F") // pass extra image builder features - .arg(format!("optever={}", opte_version.trim())) - .arg("-P") // include all files from extra proto area - .arg(proto_dir.join("root")) - .arg("-N") // image name - .arg(image_name) - .arg("-s") // tempdir name suffix - .arg(target.as_str()) - .args(target.image_build_args()) - .current_dir(&args.helios_dir) - .env( - "IMAGE_DATASET", - match target { - Target::Host => &args.host_dataset, - 
Target::Recovery => &args.recovery_dataset,
- },
- )
- .env_remove("CARGO")
- .env_remove("RUSTUP_TOOLCHAIN"),
- )
- .after("helios-setup")
- .after(format!("{}-proto", target));
+ .arg("-p") // use an external package repository
+ .arg(format!("helios-dev={HELIOS_REPO}"))
+ }
+
+ // helios-build experiment-image
+ jobs.push_command(format!("{}-image", target), image_cmd)
+ .after("helios-setup")
+ .after(format!("{}-proto", target));
 }
 // Build the recovery target after we build the host target. Only one
 // of these will build at a time since Cargo locks its target directory;

From b29947e9efd4fc79864a85d1a6776acfd2d94ea8 Mon Sep 17 00:00:00 2001
From: Benjamin Naecker
Date: Mon, 19 Aug 2024 09:25:39 -0700
Subject: [PATCH 73/91] Slim down the ClickHouse database (#6352)

- Add TTLs to all field tables, by using a materialized column with the
  time each record is inserted. ClickHouse will retain the latest
  timestamp, so when we stop inserting, the TTL clock will start counting
  down on those timeseries records.
- Update Dropshot dependency.
- Add operation ID to HTTP service timeseries, remove other fields.
  Expunge the old timeseries too.
- Remove unnecessary stringifying of URIs in latency tracking.

---
 Cargo.lock | 7 +-
 ...ast_updated_column_to_fields_i64_local.sql | 1 +
 ...ast_updated_column_on_fields_i64_local.sql | 1 +
 .../10/02_add_ttl_to_fields_i64_local.sql | 1 +
 ...st_updated_column_to_fields_uuid_local.sql | 1 +
 ...st_updated_column_on_fields_uuid_local.sql | 1 +
 .../10/05_add_ttl_to_fields_uuid_local.sql | 1 +
 ...st_updated_column_to_fields_bool_local.sql | 1 +
 ...st_updated_column_on_fields_bool_local.sql | 1 +
 .../10/08_add_ttl_to_fields_bool_local.sql | 1 +
 ..._updated_column_to_fields_ipaddr_local.sql | 1 +
 ..._updated_column_on_fields_ipaddr_local.sql | 1 +
 .../10/11_add_ttl_to_fields_ipaddr_local.sql | 1 +
 ..._updated_column_to_fields_string_local.sql | 1 +
 ..._updated_column_on_fields_string_local.sql | 1 +
 .../10/14_add_ttl_to_fields_string_local.sql | 1 +
 ...last_updated_column_to_fields_i8_local.sql | 1 +
 ...last_updated_column_on_fields_i8_local.sql | 1 +
 .../10/17_add_ttl_to_fields_i8_local.sql | 1 +
 ...last_updated_column_to_fields_u8_local.sql | 1 +
 ...last_updated_column_on_fields_u8_local.sql | 1 +
 .../10/20_add_ttl_to_fields_u8_local.sql | 1 +
 ...ast_updated_column_to_fields_i16_local.sql | 1 +
 ...ast_updated_column_on_fields_i16_local.sql | 1 +
 .../10/23_add_ttl_to_fields_i16_local.sql | 1 +
 ...ast_updated_column_to_fields_u16_local.sql | 1 +
 ...ast_updated_column_on_fields_u16_local.sql | 1 +
 .../10/26_add_ttl_to_fields_u16_local.sql | 1 +
 ...ast_updated_column_to_fields_i32_local.sql | 1 +
 ...ast_updated_column_on_fields_i32_local.sql | 1 +
 .../10/29_add_ttl_to_fields_i32_local.sql | 1 +
 ...ast_updated_column_to_fields_u32_local.sql | 1 +
 ...ast_updated_column_on_fields_u32_local.sql | 1 +
 .../10/32_add_ttl_to_fields_u32_local.sql | 1 +
 ...ast_updated_column_to_fields_u64_local.sql | 1 +
 ...ast_updated_column_on_fields_u64_local.sql | 1 +
 .../10/35_add_ttl_to_fields_u64_local.sql | 1 +
 .../replicated/10/timeseries-to-delete.txt | 1 +
 oximeter/db/schema/replicated/db-init-1.sql | 12 +-
 oximeter/db/schema/replicated/db-init-2.sql | 60 ++++++---
 ...add_last_updated_column_to_fields_bool.sql | 1 +
 ...ize_last_updated_column_on_fields_bool.sql | 1 +
 .../10/02_add_ttl_to_fields_bool.sql | 1 +
 ...3_add_last_updated_column_to_fields_i8.sql | 1 +
 ...alize_last_updated_column_on_fields_i8.sql | 1 +
 .../10/05_add_ttl_to_fields_i8.sql | 1 +
 ...6_add_last_updated_column_to_fields_u8.sql | 1 +
...alize_last_updated_column_on_fields_u8.sql | 1 + .../10/08_add_ttl_to_fields_u8.sql | 1 + ..._add_last_updated_column_to_fields_i16.sql | 1 + ...lize_last_updated_column_on_fields_i16.sql | 1 + .../10/11_add_ttl_to_fields_i16.sql | 1 + ..._add_last_updated_column_to_fields_u16.sql | 1 + ...lize_last_updated_column_on_fields_u16.sql | 1 + .../10/14_add_ttl_to_fields_u16.sql | 1 + ..._add_last_updated_column_to_fields_i32.sql | 1 + ...lize_last_updated_column_on_fields_i32.sql | 1 + .../10/17_add_ttl_to_fields_i32.sql | 1 + ..._add_last_updated_column_to_fields_u32.sql | 1 + ...lize_last_updated_column_on_fields_u32.sql | 1 + .../10/20_add_ttl_to_fields_u32.sql | 1 + ..._add_last_updated_column_to_fields_i64.sql | 1 + ...lize_last_updated_column_on_fields_i64.sql | 1 + .../10/23_add_ttl_to_fields_i64.sql | 1 + ..._add_last_updated_column_to_fields_u64.sql | 1 + ...lize_last_updated_column_on_fields_u64.sql | 1 + .../10/26_add_ttl_to_fields_u64.sql | 1 + ...d_last_updated_column_to_fields_ipaddr.sql | 1 + ...e_last_updated_column_on_fields_ipaddr.sql | 1 + .../10/29_add_ttl_to_fields_ipaddr.sql | 1 + ...d_last_updated_column_to_fields_string.sql | 1 + ...e_last_updated_column_on_fields_string.sql | 1 + .../10/32_add_ttl_to_fields_string.sql | 1 + ...add_last_updated_column_to_fields_uuid.sql | 1 + ...ize_last_updated_column_on_fields_uuid.sql | 1 + .../10/35_add_ttl_to_fields_uuid.sql | 1 + .../single-node/10/timeseries-to-delete.txt | 1 + oximeter/db/schema/single-node/db-init.sql | 80 ++++++++---- oximeter/db/src/model.rs | 2 +- oximeter/instruments/src/http.rs | 119 ++++++++---------- oximeter/oximeter/schema/http-service.toml | 15 +-- workspace-hack/Cargo.toml | 2 + 82 files changed, 244 insertions(+), 127 deletions(-) create mode 100644 oximeter/db/schema/replicated/10/00_add_last_updated_column_to_fields_i64_local.sql create mode 100644 oximeter/db/schema/replicated/10/01_materialize_last_updated_column_on_fields_i64_local.sql create mode 100644 oximeter/db/schema/replicated/10/02_add_ttl_to_fields_i64_local.sql create mode 100644 oximeter/db/schema/replicated/10/03_add_last_updated_column_to_fields_uuid_local.sql create mode 100644 oximeter/db/schema/replicated/10/04_materialize_last_updated_column_on_fields_uuid_local.sql create mode 100644 oximeter/db/schema/replicated/10/05_add_ttl_to_fields_uuid_local.sql create mode 100644 oximeter/db/schema/replicated/10/06_add_last_updated_column_to_fields_bool_local.sql create mode 100644 oximeter/db/schema/replicated/10/07_materialize_last_updated_column_on_fields_bool_local.sql create mode 100644 oximeter/db/schema/replicated/10/08_add_ttl_to_fields_bool_local.sql create mode 100644 oximeter/db/schema/replicated/10/09_add_last_updated_column_to_fields_ipaddr_local.sql create mode 100644 oximeter/db/schema/replicated/10/10_materialize_last_updated_column_on_fields_ipaddr_local.sql create mode 100644 oximeter/db/schema/replicated/10/11_add_ttl_to_fields_ipaddr_local.sql create mode 100644 oximeter/db/schema/replicated/10/12_add_last_updated_column_to_fields_string_local.sql create mode 100644 oximeter/db/schema/replicated/10/13_materialize_last_updated_column_on_fields_string_local.sql create mode 100644 oximeter/db/schema/replicated/10/14_add_ttl_to_fields_string_local.sql create mode 100644 oximeter/db/schema/replicated/10/15_add_last_updated_column_to_fields_i8_local.sql create mode 100644 oximeter/db/schema/replicated/10/16_materialize_last_updated_column_on_fields_i8_local.sql create mode 100644 
oximeter/db/schema/replicated/10/17_add_ttl_to_fields_i8_local.sql create mode 100644 oximeter/db/schema/replicated/10/18_add_last_updated_column_to_fields_u8_local.sql create mode 100644 oximeter/db/schema/replicated/10/19_materialize_last_updated_column_on_fields_u8_local.sql create mode 100644 oximeter/db/schema/replicated/10/20_add_ttl_to_fields_u8_local.sql create mode 100644 oximeter/db/schema/replicated/10/21_add_last_updated_column_to_fields_i16_local.sql create mode 100644 oximeter/db/schema/replicated/10/22_materialize_last_updated_column_on_fields_i16_local.sql create mode 100644 oximeter/db/schema/replicated/10/23_add_ttl_to_fields_i16_local.sql create mode 100644 oximeter/db/schema/replicated/10/24_add_last_updated_column_to_fields_u16_local.sql create mode 100644 oximeter/db/schema/replicated/10/25_materialize_last_updated_column_on_fields_u16_local.sql create mode 100644 oximeter/db/schema/replicated/10/26_add_ttl_to_fields_u16_local.sql create mode 100644 oximeter/db/schema/replicated/10/27_add_last_updated_column_to_fields_i32_local.sql create mode 100644 oximeter/db/schema/replicated/10/28_materialize_last_updated_column_on_fields_i32_local.sql create mode 100644 oximeter/db/schema/replicated/10/29_add_ttl_to_fields_i32_local.sql create mode 100644 oximeter/db/schema/replicated/10/30_add_last_updated_column_to_fields_u32_local.sql create mode 100644 oximeter/db/schema/replicated/10/31_materialize_last_updated_column_on_fields_u32_local.sql create mode 100644 oximeter/db/schema/replicated/10/32_add_ttl_to_fields_u32_local.sql create mode 100644 oximeter/db/schema/replicated/10/33_add_last_updated_column_to_fields_u64_local.sql create mode 100644 oximeter/db/schema/replicated/10/34_materialize_last_updated_column_on_fields_u64_local.sql create mode 100644 oximeter/db/schema/replicated/10/35_add_ttl_to_fields_u64_local.sql create mode 100644 oximeter/db/schema/replicated/10/timeseries-to-delete.txt create mode 100644 oximeter/db/schema/single-node/10/00_add_last_updated_column_to_fields_bool.sql create mode 100644 oximeter/db/schema/single-node/10/01_materialize_last_updated_column_on_fields_bool.sql create mode 100644 oximeter/db/schema/single-node/10/02_add_ttl_to_fields_bool.sql create mode 100644 oximeter/db/schema/single-node/10/03_add_last_updated_column_to_fields_i8.sql create mode 100644 oximeter/db/schema/single-node/10/04_materialize_last_updated_column_on_fields_i8.sql create mode 100644 oximeter/db/schema/single-node/10/05_add_ttl_to_fields_i8.sql create mode 100644 oximeter/db/schema/single-node/10/06_add_last_updated_column_to_fields_u8.sql create mode 100644 oximeter/db/schema/single-node/10/07_materialize_last_updated_column_on_fields_u8.sql create mode 100644 oximeter/db/schema/single-node/10/08_add_ttl_to_fields_u8.sql create mode 100644 oximeter/db/schema/single-node/10/09_add_last_updated_column_to_fields_i16.sql create mode 100644 oximeter/db/schema/single-node/10/10_materialize_last_updated_column_on_fields_i16.sql create mode 100644 oximeter/db/schema/single-node/10/11_add_ttl_to_fields_i16.sql create mode 100644 oximeter/db/schema/single-node/10/12_add_last_updated_column_to_fields_u16.sql create mode 100644 oximeter/db/schema/single-node/10/13_materialize_last_updated_column_on_fields_u16.sql create mode 100644 oximeter/db/schema/single-node/10/14_add_ttl_to_fields_u16.sql create mode 100644 oximeter/db/schema/single-node/10/15_add_last_updated_column_to_fields_i32.sql create mode 100644 
oximeter/db/schema/single-node/10/16_materialize_last_updated_column_on_fields_i32.sql create mode 100644 oximeter/db/schema/single-node/10/17_add_ttl_to_fields_i32.sql create mode 100644 oximeter/db/schema/single-node/10/18_add_last_updated_column_to_fields_u32.sql create mode 100644 oximeter/db/schema/single-node/10/19_materialize_last_updated_column_on_fields_u32.sql create mode 100644 oximeter/db/schema/single-node/10/20_add_ttl_to_fields_u32.sql create mode 100644 oximeter/db/schema/single-node/10/21_add_last_updated_column_to_fields_i64.sql create mode 100644 oximeter/db/schema/single-node/10/22_materialize_last_updated_column_on_fields_i64.sql create mode 100644 oximeter/db/schema/single-node/10/23_add_ttl_to_fields_i64.sql create mode 100644 oximeter/db/schema/single-node/10/24_add_last_updated_column_to_fields_u64.sql create mode 100644 oximeter/db/schema/single-node/10/25_materialize_last_updated_column_on_fields_u64.sql create mode 100644 oximeter/db/schema/single-node/10/26_add_ttl_to_fields_u64.sql create mode 100644 oximeter/db/schema/single-node/10/27_add_last_updated_column_to_fields_ipaddr.sql create mode 100644 oximeter/db/schema/single-node/10/28_materialize_last_updated_column_on_fields_ipaddr.sql create mode 100644 oximeter/db/schema/single-node/10/29_add_ttl_to_fields_ipaddr.sql create mode 100644 oximeter/db/schema/single-node/10/30_add_last_updated_column_to_fields_string.sql create mode 100644 oximeter/db/schema/single-node/10/31_materialize_last_updated_column_on_fields_string.sql create mode 100644 oximeter/db/schema/single-node/10/32_add_ttl_to_fields_string.sql create mode 100644 oximeter/db/schema/single-node/10/33_add_last_updated_column_to_fields_uuid.sql create mode 100644 oximeter/db/schema/single-node/10/34_materialize_last_updated_column_on_fields_uuid.sql create mode 100644 oximeter/db/schema/single-node/10/35_add_ttl_to_fields_uuid.sql create mode 100644 oximeter/db/schema/single-node/10/timeseries-to-delete.txt diff --git a/Cargo.lock b/Cargo.lock index f3aafb9b7d..874b33134f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2213,7 +2213,7 @@ dependencies = [ [[package]] name = "dropshot" version = "0.10.2-dev" -source = "git+https://github.com/oxidecomputer/dropshot?branch=main#52d900a470b8f08eddf021813470b2a9194f2cc0" +source = "git+https://github.com/oxidecomputer/dropshot?branch=main#06c8dab40e28d313f8bb0e15e1027eeace3bce89" dependencies = [ "async-stream", "async-trait", @@ -2259,7 +2259,7 @@ dependencies = [ [[package]] name = "dropshot_endpoint" version = "0.10.2-dev" -source = "git+https://github.com/oxidecomputer/dropshot?branch=main#52d900a470b8f08eddf021813470b2a9194f2cc0" +source = "git+https://github.com/oxidecomputer/dropshot?branch=main#06c8dab40e28d313f8bb0e15e1027eeace3bce89" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -3616,7 +3616,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "socket2 0.5.7", + "socket2 0.4.10", "tokio", "tower-service", "tracing", @@ -6523,6 +6523,7 @@ dependencies = [ "similar", "slog", "smallvec 1.13.2", + "socket2 0.5.7", "spin 0.9.8", "string_cache", "subtle", diff --git a/oximeter/db/schema/replicated/10/00_add_last_updated_column_to_fields_i64_local.sql b/oximeter/db/schema/replicated/10/00_add_last_updated_column_to_fields_i64_local.sql new file mode 100644 index 0000000000..04158b36ce --- /dev/null +++ b/oximeter/db/schema/replicated/10/00_add_last_updated_column_to_fields_i64_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i64_local ON CLUSTER oximeter_cluster ADD COLUMN IF NOT 
EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/replicated/10/01_materialize_last_updated_column_on_fields_i64_local.sql b/oximeter/db/schema/replicated/10/01_materialize_last_updated_column_on_fields_i64_local.sql new file mode 100644 index 0000000000..2e35dd2793 --- /dev/null +++ b/oximeter/db/schema/replicated/10/01_materialize_last_updated_column_on_fields_i64_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i64_local ON CLUSTER oximeter_cluster MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/replicated/10/02_add_ttl_to_fields_i64_local.sql b/oximeter/db/schema/replicated/10/02_add_ttl_to_fields_i64_local.sql new file mode 100644 index 0000000000..25e5303e5a --- /dev/null +++ b/oximeter/db/schema/replicated/10/02_add_ttl_to_fields_i64_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i64_local ON CLUSTER oximeter_cluster MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/replicated/10/03_add_last_updated_column_to_fields_uuid_local.sql b/oximeter/db/schema/replicated/10/03_add_last_updated_column_to_fields_uuid_local.sql new file mode 100644 index 0000000000..f26fdedbb6 --- /dev/null +++ b/oximeter/db/schema/replicated/10/03_add_last_updated_column_to_fields_uuid_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_uuid_local ON CLUSTER oximeter_cluster ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/replicated/10/04_materialize_last_updated_column_on_fields_uuid_local.sql b/oximeter/db/schema/replicated/10/04_materialize_last_updated_column_on_fields_uuid_local.sql new file mode 100644 index 0000000000..1bc623f418 --- /dev/null +++ b/oximeter/db/schema/replicated/10/04_materialize_last_updated_column_on_fields_uuid_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_uuid_local ON CLUSTER oximeter_cluster MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/replicated/10/05_add_ttl_to_fields_uuid_local.sql b/oximeter/db/schema/replicated/10/05_add_ttl_to_fields_uuid_local.sql new file mode 100644 index 0000000000..b98bba1e88 --- /dev/null +++ b/oximeter/db/schema/replicated/10/05_add_ttl_to_fields_uuid_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_uuid_local ON CLUSTER oximeter_cluster MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/replicated/10/06_add_last_updated_column_to_fields_bool_local.sql b/oximeter/db/schema/replicated/10/06_add_last_updated_column_to_fields_bool_local.sql new file mode 100644 index 0000000000..bf3c16dde5 --- /dev/null +++ b/oximeter/db/schema/replicated/10/06_add_last_updated_column_to_fields_bool_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_bool_local ON CLUSTER oximeter_cluster ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/replicated/10/07_materialize_last_updated_column_on_fields_bool_local.sql b/oximeter/db/schema/replicated/10/07_materialize_last_updated_column_on_fields_bool_local.sql new file mode 100644 index 0000000000..3ddb0eec84 --- /dev/null +++ b/oximeter/db/schema/replicated/10/07_materialize_last_updated_column_on_fields_bool_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_bool_local ON CLUSTER oximeter_cluster MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/replicated/10/08_add_ttl_to_fields_bool_local.sql b/oximeter/db/schema/replicated/10/08_add_ttl_to_fields_bool_local.sql new file mode 100644 index 0000000000..58d599cf49 --- /dev/null +++ 
b/oximeter/db/schema/replicated/10/08_add_ttl_to_fields_bool_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_bool_local ON CLUSTER oximeter_cluster MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/replicated/10/09_add_last_updated_column_to_fields_ipaddr_local.sql b/oximeter/db/schema/replicated/10/09_add_last_updated_column_to_fields_ipaddr_local.sql new file mode 100644 index 0000000000..94696b7b06 --- /dev/null +++ b/oximeter/db/schema/replicated/10/09_add_last_updated_column_to_fields_ipaddr_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_ipaddr_local ON CLUSTER oximeter_cluster ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/replicated/10/10_materialize_last_updated_column_on_fields_ipaddr_local.sql b/oximeter/db/schema/replicated/10/10_materialize_last_updated_column_on_fields_ipaddr_local.sql new file mode 100644 index 0000000000..f621033d56 --- /dev/null +++ b/oximeter/db/schema/replicated/10/10_materialize_last_updated_column_on_fields_ipaddr_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_ipaddr_local ON CLUSTER oximeter_cluster MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/replicated/10/11_add_ttl_to_fields_ipaddr_local.sql b/oximeter/db/schema/replicated/10/11_add_ttl_to_fields_ipaddr_local.sql new file mode 100644 index 0000000000..4a01da9e74 --- /dev/null +++ b/oximeter/db/schema/replicated/10/11_add_ttl_to_fields_ipaddr_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_ipaddr_local ON CLUSTER oximeter_cluster MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/replicated/10/12_add_last_updated_column_to_fields_string_local.sql b/oximeter/db/schema/replicated/10/12_add_last_updated_column_to_fields_string_local.sql new file mode 100644 index 0000000000..173d803437 --- /dev/null +++ b/oximeter/db/schema/replicated/10/12_add_last_updated_column_to_fields_string_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_string_local ON CLUSTER oximeter_cluster ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/replicated/10/13_materialize_last_updated_column_on_fields_string_local.sql b/oximeter/db/schema/replicated/10/13_materialize_last_updated_column_on_fields_string_local.sql new file mode 100644 index 0000000000..d9fcc84eba --- /dev/null +++ b/oximeter/db/schema/replicated/10/13_materialize_last_updated_column_on_fields_string_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_string_local ON CLUSTER oximeter_cluster MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/replicated/10/14_add_ttl_to_fields_string_local.sql b/oximeter/db/schema/replicated/10/14_add_ttl_to_fields_string_local.sql new file mode 100644 index 0000000000..8c9aecca9d --- /dev/null +++ b/oximeter/db/schema/replicated/10/14_add_ttl_to_fields_string_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_string_local ON CLUSTER oximeter_cluster MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/replicated/10/15_add_last_updated_column_to_fields_i8_local.sql b/oximeter/db/schema/replicated/10/15_add_last_updated_column_to_fields_i8_local.sql new file mode 100644 index 0000000000..8d071424f6 --- /dev/null +++ b/oximeter/db/schema/replicated/10/15_add_last_updated_column_to_fields_i8_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i8_local ON CLUSTER oximeter_cluster ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git 
a/oximeter/db/schema/replicated/10/16_materialize_last_updated_column_on_fields_i8_local.sql b/oximeter/db/schema/replicated/10/16_materialize_last_updated_column_on_fields_i8_local.sql new file mode 100644 index 0000000000..ac5fa948ae --- /dev/null +++ b/oximeter/db/schema/replicated/10/16_materialize_last_updated_column_on_fields_i8_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i8_local ON CLUSTER oximeter_cluster MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/replicated/10/17_add_ttl_to_fields_i8_local.sql b/oximeter/db/schema/replicated/10/17_add_ttl_to_fields_i8_local.sql new file mode 100644 index 0000000000..3caa1b93f6 --- /dev/null +++ b/oximeter/db/schema/replicated/10/17_add_ttl_to_fields_i8_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i8_local ON CLUSTER oximeter_cluster MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/replicated/10/18_add_last_updated_column_to_fields_u8_local.sql b/oximeter/db/schema/replicated/10/18_add_last_updated_column_to_fields_u8_local.sql new file mode 100644 index 0000000000..ed6978c7e6 --- /dev/null +++ b/oximeter/db/schema/replicated/10/18_add_last_updated_column_to_fields_u8_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u8_local ON CLUSTER oximeter_cluster ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/replicated/10/19_materialize_last_updated_column_on_fields_u8_local.sql b/oximeter/db/schema/replicated/10/19_materialize_last_updated_column_on_fields_u8_local.sql new file mode 100644 index 0000000000..81ce8626a7 --- /dev/null +++ b/oximeter/db/schema/replicated/10/19_materialize_last_updated_column_on_fields_u8_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u8_local ON CLUSTER oximeter_cluster MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/replicated/10/20_add_ttl_to_fields_u8_local.sql b/oximeter/db/schema/replicated/10/20_add_ttl_to_fields_u8_local.sql new file mode 100644 index 0000000000..2a7c757dc8 --- /dev/null +++ b/oximeter/db/schema/replicated/10/20_add_ttl_to_fields_u8_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u8_local ON CLUSTER oximeter_cluster MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/replicated/10/21_add_last_updated_column_to_fields_i16_local.sql b/oximeter/db/schema/replicated/10/21_add_last_updated_column_to_fields_i16_local.sql new file mode 100644 index 0000000000..cbe0b08fe4 --- /dev/null +++ b/oximeter/db/schema/replicated/10/21_add_last_updated_column_to_fields_i16_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i16_local ON CLUSTER oximeter_cluster ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/replicated/10/22_materialize_last_updated_column_on_fields_i16_local.sql b/oximeter/db/schema/replicated/10/22_materialize_last_updated_column_on_fields_i16_local.sql new file mode 100644 index 0000000000..d4854807b7 --- /dev/null +++ b/oximeter/db/schema/replicated/10/22_materialize_last_updated_column_on_fields_i16_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i16_local ON CLUSTER oximeter_cluster MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/replicated/10/23_add_ttl_to_fields_i16_local.sql b/oximeter/db/schema/replicated/10/23_add_ttl_to_fields_i16_local.sql new file mode 100644 index 0000000000..c84b634a00 --- /dev/null +++ b/oximeter/db/schema/replicated/10/23_add_ttl_to_fields_i16_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i16_local 
ON CLUSTER oximeter_cluster MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/replicated/10/24_add_last_updated_column_to_fields_u16_local.sql b/oximeter/db/schema/replicated/10/24_add_last_updated_column_to_fields_u16_local.sql new file mode 100644 index 0000000000..60c28c0047 --- /dev/null +++ b/oximeter/db/schema/replicated/10/24_add_last_updated_column_to_fields_u16_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u16_local ON CLUSTER oximeter_cluster ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/replicated/10/25_materialize_last_updated_column_on_fields_u16_local.sql b/oximeter/db/schema/replicated/10/25_materialize_last_updated_column_on_fields_u16_local.sql new file mode 100644 index 0000000000..b38cdda831 --- /dev/null +++ b/oximeter/db/schema/replicated/10/25_materialize_last_updated_column_on_fields_u16_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u16_local ON CLUSTER oximeter_cluster MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/replicated/10/26_add_ttl_to_fields_u16_local.sql b/oximeter/db/schema/replicated/10/26_add_ttl_to_fields_u16_local.sql new file mode 100644 index 0000000000..cd533ffd8f --- /dev/null +++ b/oximeter/db/schema/replicated/10/26_add_ttl_to_fields_u16_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u16_local ON CLUSTER oximeter_cluster MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/replicated/10/27_add_last_updated_column_to_fields_i32_local.sql b/oximeter/db/schema/replicated/10/27_add_last_updated_column_to_fields_i32_local.sql new file mode 100644 index 0000000000..1ea7093d8f --- /dev/null +++ b/oximeter/db/schema/replicated/10/27_add_last_updated_column_to_fields_i32_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i32_local ON CLUSTER oximeter_cluster ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/replicated/10/28_materialize_last_updated_column_on_fields_i32_local.sql b/oximeter/db/schema/replicated/10/28_materialize_last_updated_column_on_fields_i32_local.sql new file mode 100644 index 0000000000..f9f6464729 --- /dev/null +++ b/oximeter/db/schema/replicated/10/28_materialize_last_updated_column_on_fields_i32_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i32_local ON CLUSTER oximeter_cluster MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/replicated/10/29_add_ttl_to_fields_i32_local.sql b/oximeter/db/schema/replicated/10/29_add_ttl_to_fields_i32_local.sql new file mode 100644 index 0000000000..7c37ee9b21 --- /dev/null +++ b/oximeter/db/schema/replicated/10/29_add_ttl_to_fields_i32_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i32_local ON CLUSTER oximeter_cluster MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/replicated/10/30_add_last_updated_column_to_fields_u32_local.sql b/oximeter/db/schema/replicated/10/30_add_last_updated_column_to_fields_u32_local.sql new file mode 100644 index 0000000000..b15eab9387 --- /dev/null +++ b/oximeter/db/schema/replicated/10/30_add_last_updated_column_to_fields_u32_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u32_local ON CLUSTER oximeter_cluster ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/replicated/10/31_materialize_last_updated_column_on_fields_u32_local.sql b/oximeter/db/schema/replicated/10/31_materialize_last_updated_column_on_fields_u32_local.sql new file mode 100644 index 
0000000000..caa96ab5eb --- /dev/null +++ b/oximeter/db/schema/replicated/10/31_materialize_last_updated_column_on_fields_u32_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u32_local ON CLUSTER oximeter_cluster MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/replicated/10/32_add_ttl_to_fields_u32_local.sql b/oximeter/db/schema/replicated/10/32_add_ttl_to_fields_u32_local.sql new file mode 100644 index 0000000000..25af5ee660 --- /dev/null +++ b/oximeter/db/schema/replicated/10/32_add_ttl_to_fields_u32_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u32_local ON CLUSTER oximeter_cluster MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/replicated/10/33_add_last_updated_column_to_fields_u64_local.sql b/oximeter/db/schema/replicated/10/33_add_last_updated_column_to_fields_u64_local.sql new file mode 100644 index 0000000000..e85bd845d4 --- /dev/null +++ b/oximeter/db/schema/replicated/10/33_add_last_updated_column_to_fields_u64_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u64_local ON CLUSTER oximeter_cluster ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/replicated/10/34_materialize_last_updated_column_on_fields_u64_local.sql b/oximeter/db/schema/replicated/10/34_materialize_last_updated_column_on_fields_u64_local.sql new file mode 100644 index 0000000000..d287a02c6f --- /dev/null +++ b/oximeter/db/schema/replicated/10/34_materialize_last_updated_column_on_fields_u64_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u64_local ON CLUSTER oximeter_cluster MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/replicated/10/35_add_ttl_to_fields_u64_local.sql b/oximeter/db/schema/replicated/10/35_add_ttl_to_fields_u64_local.sql new file mode 100644 index 0000000000..02eb09c300 --- /dev/null +++ b/oximeter/db/schema/replicated/10/35_add_ttl_to_fields_u64_local.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u64_local ON CLUSTER oximeter_cluster MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/replicated/10/timeseries-to-delete.txt b/oximeter/db/schema/replicated/10/timeseries-to-delete.txt new file mode 100644 index 0000000000..40b90e05ff --- /dev/null +++ b/oximeter/db/schema/replicated/10/timeseries-to-delete.txt @@ -0,0 +1 @@ +http_service:request_latency_histogram diff --git a/oximeter/db/schema/replicated/db-init-1.sql b/oximeter/db/schema/replicated/db-init-1.sql index 176e5b64f7..4eac2b4e37 100644 --- a/oximeter/db/schema/replicated/db-init-1.sql +++ b/oximeter/db/schema/replicated/db-init-1.sql @@ -78,10 +78,12 @@ CREATE TABLE IF NOT EXISTS oximeter.fields_i64_local ON CLUSTER oximeter_cluster timeseries_name String, timeseries_key UInt64, field_name String, - field_value Int64 + field_value Int64, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{shard}/fields_i64_local', '{replica}') -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_i64 ON CLUSTER oximeter_cluster AS oximeter.fields_i64_local @@ -93,10 +95,12 @@ CREATE TABLE IF NOT EXISTS oximeter.fields_uuid_local ON CLUSTER oximeter_cluste timeseries_name String, timeseries_key UInt64, field_name String, - field_value UUID + field_value UUID, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = 
ReplicatedReplacingMergeTree('/clickhouse/tables/{shard}/fields_uuid_local', '{replica}') -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_uuid ON CLUSTER oximeter_cluster AS oximeter.fields_uuid_local diff --git a/oximeter/db/schema/replicated/db-init-2.sql b/oximeter/db/schema/replicated/db-init-2.sql index ae0431ec84..51e64e20e0 100644 --- a/oximeter/db/schema/replicated/db-init-2.sql +++ b/oximeter/db/schema/replicated/db-init-2.sql @@ -595,10 +595,12 @@ CREATE TABLE IF NOT EXISTS oximeter.fields_bool_local ON CLUSTER oximeter_cluste timeseries_name String, timeseries_key UInt64, field_name String, - field_value UInt8 + field_value UInt8, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{shard}/fields_bool_local', '{replica}') -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_bool ON CLUSTER oximeter_cluster AS oximeter.fields_bool_local @@ -609,10 +611,12 @@ CREATE TABLE IF NOT EXISTS oximeter.fields_ipaddr_local ON CLUSTER oximeter_clus timeseries_name String, timeseries_key UInt64, field_name String, - field_value IPv6 + field_value IPv6, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{shard}/fields_ipaddr_local', '{replica}') -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_ipaddr ON CLUSTER oximeter_cluster AS oximeter.fields_ipaddr_local @@ -623,10 +627,12 @@ CREATE TABLE IF NOT EXISTS oximeter.fields_string_local ON CLUSTER oximeter_clus timeseries_name String, timeseries_key UInt64, field_name String, - field_value String + field_value String, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{shard}/fields_string_local', '{replica}') -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_string ON CLUSTER oximeter_cluster AS oximeter.fields_string_local @@ -637,10 +643,12 @@ CREATE TABLE IF NOT EXISTS oximeter.fields_i8_local ON CLUSTER oximeter_cluster timeseries_name String, timeseries_key UInt64, field_name String, - field_value Int8 + field_value Int8, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{shard}/fields_i8_local', '{replica}') -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_i8 ON CLUSTER oximeter_cluster AS oximeter.fields_i8_local @@ -651,10 +659,12 @@ CREATE TABLE IF NOT EXISTS oximeter.fields_u8_local ON CLUSTER oximeter_cluster timeseries_name String, timeseries_key UInt64, field_name String, - field_value UInt8 + field_value UInt8, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{shard}/fields_u8_local', '{replica}') -ORDER BY 
(timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_u8 ON CLUSTER oximeter_cluster AS oximeter.fields_u8_local @@ -665,10 +675,12 @@ CREATE TABLE IF NOT EXISTS oximeter.fields_i16_local ON CLUSTER oximeter_cluster timeseries_name String, timeseries_key UInt64, field_name String, - field_value Int16 + field_value Int16, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{shard}/fields_i16_local', '{replica}') -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_i16 ON CLUSTER oximeter_cluster AS oximeter.fields_i16_local @@ -679,10 +691,12 @@ CREATE TABLE IF NOT EXISTS oximeter.fields_u16_local ON CLUSTER oximeter_cluster timeseries_name String, timeseries_key UInt64, field_name String, - field_value UInt16 + field_value UInt16, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{shard}/fields_u16_local', '{replica}') -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_u16 ON CLUSTER oximeter_cluster AS oximeter.fields_u16_local @@ -693,10 +707,12 @@ CREATE TABLE IF NOT EXISTS oximeter.fields_i32_local ON CLUSTER oximeter_cluster timeseries_name String, timeseries_key UInt64, field_name String, - field_value Int32 + field_value Int32, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{shard}/fields_i32_local', '{replica}') -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_i32 ON CLUSTER oximeter_cluster AS oximeter.fields_i32_local @@ -707,10 +723,12 @@ CREATE TABLE IF NOT EXISTS oximeter.fields_u32_local ON CLUSTER oximeter_cluster timeseries_name String, timeseries_key UInt64, field_name String, - field_value UInt32 + field_value UInt32, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{shard}/fields_u32_local', '{replica}') -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_u32 ON CLUSTER oximeter_cluster AS oximeter.fields_u32_local @@ -721,10 +739,12 @@ CREATE TABLE IF NOT EXISTS oximeter.fields_u64_local ON CLUSTER oximeter_cluster timeseries_name String, timeseries_key UInt64, field_name String, - field_value UInt64 + field_value UInt64, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{shard}/fields_u64_local', '{replica}') -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_u64 ON CLUSTER oximeter_cluster AS oximeter.fields_u64_local diff --git 
a/oximeter/db/schema/single-node/10/00_add_last_updated_column_to_fields_bool.sql b/oximeter/db/schema/single-node/10/00_add_last_updated_column_to_fields_bool.sql new file mode 100644 index 0000000000..86f46a43bf --- /dev/null +++ b/oximeter/db/schema/single-node/10/00_add_last_updated_column_to_fields_bool.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_bool ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/single-node/10/01_materialize_last_updated_column_on_fields_bool.sql b/oximeter/db/schema/single-node/10/01_materialize_last_updated_column_on_fields_bool.sql new file mode 100644 index 0000000000..6ebec2d506 --- /dev/null +++ b/oximeter/db/schema/single-node/10/01_materialize_last_updated_column_on_fields_bool.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_bool MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/single-node/10/02_add_ttl_to_fields_bool.sql b/oximeter/db/schema/single-node/10/02_add_ttl_to_fields_bool.sql new file mode 100644 index 0000000000..cc07b8cd1d --- /dev/null +++ b/oximeter/db/schema/single-node/10/02_add_ttl_to_fields_bool.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_bool MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/single-node/10/03_add_last_updated_column_to_fields_i8.sql b/oximeter/db/schema/single-node/10/03_add_last_updated_column_to_fields_i8.sql new file mode 100644 index 0000000000..884b5ffed6 --- /dev/null +++ b/oximeter/db/schema/single-node/10/03_add_last_updated_column_to_fields_i8.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i8 ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/single-node/10/04_materialize_last_updated_column_on_fields_i8.sql b/oximeter/db/schema/single-node/10/04_materialize_last_updated_column_on_fields_i8.sql new file mode 100644 index 0000000000..ef569d80c3 --- /dev/null +++ b/oximeter/db/schema/single-node/10/04_materialize_last_updated_column_on_fields_i8.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i8 MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/single-node/10/05_add_ttl_to_fields_i8.sql b/oximeter/db/schema/single-node/10/05_add_ttl_to_fields_i8.sql new file mode 100644 index 0000000000..adfc3dd1a4 --- /dev/null +++ b/oximeter/db/schema/single-node/10/05_add_ttl_to_fields_i8.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i8 MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/single-node/10/06_add_last_updated_column_to_fields_u8.sql b/oximeter/db/schema/single-node/10/06_add_last_updated_column_to_fields_u8.sql new file mode 100644 index 0000000000..0f4e43ce2c --- /dev/null +++ b/oximeter/db/schema/single-node/10/06_add_last_updated_column_to_fields_u8.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u8 ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/single-node/10/07_materialize_last_updated_column_on_fields_u8.sql b/oximeter/db/schema/single-node/10/07_materialize_last_updated_column_on_fields_u8.sql new file mode 100644 index 0000000000..8dcbb32bb2 --- /dev/null +++ b/oximeter/db/schema/single-node/10/07_materialize_last_updated_column_on_fields_u8.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u8 MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/single-node/10/08_add_ttl_to_fields_u8.sql b/oximeter/db/schema/single-node/10/08_add_ttl_to_fields_u8.sql new file mode 100644 index 0000000000..11a83bde7a --- /dev/null +++ 
b/oximeter/db/schema/single-node/10/08_add_ttl_to_fields_u8.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u8 MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/single-node/10/09_add_last_updated_column_to_fields_i16.sql b/oximeter/db/schema/single-node/10/09_add_last_updated_column_to_fields_i16.sql new file mode 100644 index 0000000000..d27f38f94f --- /dev/null +++ b/oximeter/db/schema/single-node/10/09_add_last_updated_column_to_fields_i16.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i16 ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/single-node/10/10_materialize_last_updated_column_on_fields_i16.sql b/oximeter/db/schema/single-node/10/10_materialize_last_updated_column_on_fields_i16.sql new file mode 100644 index 0000000000..cd60a2a1e9 --- /dev/null +++ b/oximeter/db/schema/single-node/10/10_materialize_last_updated_column_on_fields_i16.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i16 MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/single-node/10/11_add_ttl_to_fields_i16.sql b/oximeter/db/schema/single-node/10/11_add_ttl_to_fields_i16.sql new file mode 100644 index 0000000000..5b1b2fcfb6 --- /dev/null +++ b/oximeter/db/schema/single-node/10/11_add_ttl_to_fields_i16.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i16 MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/single-node/10/12_add_last_updated_column_to_fields_u16.sql b/oximeter/db/schema/single-node/10/12_add_last_updated_column_to_fields_u16.sql new file mode 100644 index 0000000000..a71753f95d --- /dev/null +++ b/oximeter/db/schema/single-node/10/12_add_last_updated_column_to_fields_u16.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u16 ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/single-node/10/13_materialize_last_updated_column_on_fields_u16.sql b/oximeter/db/schema/single-node/10/13_materialize_last_updated_column_on_fields_u16.sql new file mode 100644 index 0000000000..c8dbfb494e --- /dev/null +++ b/oximeter/db/schema/single-node/10/13_materialize_last_updated_column_on_fields_u16.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u16 MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/single-node/10/14_add_ttl_to_fields_u16.sql b/oximeter/db/schema/single-node/10/14_add_ttl_to_fields_u16.sql new file mode 100644 index 0000000000..30da688c8c --- /dev/null +++ b/oximeter/db/schema/single-node/10/14_add_ttl_to_fields_u16.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u16 MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/single-node/10/15_add_last_updated_column_to_fields_i32.sql b/oximeter/db/schema/single-node/10/15_add_last_updated_column_to_fields_i32.sql new file mode 100644 index 0000000000..eb0f377e2d --- /dev/null +++ b/oximeter/db/schema/single-node/10/15_add_last_updated_column_to_fields_i32.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i32 ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/single-node/10/16_materialize_last_updated_column_on_fields_i32.sql b/oximeter/db/schema/single-node/10/16_materialize_last_updated_column_on_fields_i32.sql new file mode 100644 index 0000000000..9cd4fa05c8 --- /dev/null +++ b/oximeter/db/schema/single-node/10/16_materialize_last_updated_column_on_fields_i32.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i32 MATERIALIZE COLUMN last_updated_at; diff --git 
a/oximeter/db/schema/single-node/10/17_add_ttl_to_fields_i32.sql b/oximeter/db/schema/single-node/10/17_add_ttl_to_fields_i32.sql new file mode 100644 index 0000000000..5230634097 --- /dev/null +++ b/oximeter/db/schema/single-node/10/17_add_ttl_to_fields_i32.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i32 MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/single-node/10/18_add_last_updated_column_to_fields_u32.sql b/oximeter/db/schema/single-node/10/18_add_last_updated_column_to_fields_u32.sql new file mode 100644 index 0000000000..9d967784e9 --- /dev/null +++ b/oximeter/db/schema/single-node/10/18_add_last_updated_column_to_fields_u32.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u32 ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/single-node/10/19_materialize_last_updated_column_on_fields_u32.sql b/oximeter/db/schema/single-node/10/19_materialize_last_updated_column_on_fields_u32.sql new file mode 100644 index 0000000000..f625138b59 --- /dev/null +++ b/oximeter/db/schema/single-node/10/19_materialize_last_updated_column_on_fields_u32.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u32 MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/single-node/10/20_add_ttl_to_fields_u32.sql b/oximeter/db/schema/single-node/10/20_add_ttl_to_fields_u32.sql new file mode 100644 index 0000000000..fc80ce7102 --- /dev/null +++ b/oximeter/db/schema/single-node/10/20_add_ttl_to_fields_u32.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u32 MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/single-node/10/21_add_last_updated_column_to_fields_i64.sql b/oximeter/db/schema/single-node/10/21_add_last_updated_column_to_fields_i64.sql new file mode 100644 index 0000000000..26256d3924 --- /dev/null +++ b/oximeter/db/schema/single-node/10/21_add_last_updated_column_to_fields_i64.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i64 ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/single-node/10/22_materialize_last_updated_column_on_fields_i64.sql b/oximeter/db/schema/single-node/10/22_materialize_last_updated_column_on_fields_i64.sql new file mode 100644 index 0000000000..a81294e535 --- /dev/null +++ b/oximeter/db/schema/single-node/10/22_materialize_last_updated_column_on_fields_i64.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i64 MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/single-node/10/23_add_ttl_to_fields_i64.sql b/oximeter/db/schema/single-node/10/23_add_ttl_to_fields_i64.sql new file mode 100644 index 0000000000..43ca166755 --- /dev/null +++ b/oximeter/db/schema/single-node/10/23_add_ttl_to_fields_i64.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_i64 MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/single-node/10/24_add_last_updated_column_to_fields_u64.sql b/oximeter/db/schema/single-node/10/24_add_last_updated_column_to_fields_u64.sql new file mode 100644 index 0000000000..46074c79ce --- /dev/null +++ b/oximeter/db/schema/single-node/10/24_add_last_updated_column_to_fields_u64.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u64 ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/single-node/10/25_materialize_last_updated_column_on_fields_u64.sql b/oximeter/db/schema/single-node/10/25_materialize_last_updated_column_on_fields_u64.sql new file mode 100644 index 0000000000..a68d449de7 --- /dev/null +++ 
b/oximeter/db/schema/single-node/10/25_materialize_last_updated_column_on_fields_u64.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u64 MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/single-node/10/26_add_ttl_to_fields_u64.sql b/oximeter/db/schema/single-node/10/26_add_ttl_to_fields_u64.sql new file mode 100644 index 0000000000..48afb51bf1 --- /dev/null +++ b/oximeter/db/schema/single-node/10/26_add_ttl_to_fields_u64.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_u64 MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/single-node/10/27_add_last_updated_column_to_fields_ipaddr.sql b/oximeter/db/schema/single-node/10/27_add_last_updated_column_to_fields_ipaddr.sql new file mode 100644 index 0000000000..d3c6be9072 --- /dev/null +++ b/oximeter/db/schema/single-node/10/27_add_last_updated_column_to_fields_ipaddr.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_ipaddr ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/single-node/10/28_materialize_last_updated_column_on_fields_ipaddr.sql b/oximeter/db/schema/single-node/10/28_materialize_last_updated_column_on_fields_ipaddr.sql new file mode 100644 index 0000000000..5bdffd4b2e --- /dev/null +++ b/oximeter/db/schema/single-node/10/28_materialize_last_updated_column_on_fields_ipaddr.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_ipaddr MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/single-node/10/29_add_ttl_to_fields_ipaddr.sql b/oximeter/db/schema/single-node/10/29_add_ttl_to_fields_ipaddr.sql new file mode 100644 index 0000000000..4551db90cd --- /dev/null +++ b/oximeter/db/schema/single-node/10/29_add_ttl_to_fields_ipaddr.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_ipaddr MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/single-node/10/30_add_last_updated_column_to_fields_string.sql b/oximeter/db/schema/single-node/10/30_add_last_updated_column_to_fields_string.sql new file mode 100644 index 0000000000..024c5f8f94 --- /dev/null +++ b/oximeter/db/schema/single-node/10/30_add_last_updated_column_to_fields_string.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_string ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git a/oximeter/db/schema/single-node/10/31_materialize_last_updated_column_on_fields_string.sql b/oximeter/db/schema/single-node/10/31_materialize_last_updated_column_on_fields_string.sql new file mode 100644 index 0000000000..67d3b7a596 --- /dev/null +++ b/oximeter/db/schema/single-node/10/31_materialize_last_updated_column_on_fields_string.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_string MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/single-node/10/32_add_ttl_to_fields_string.sql b/oximeter/db/schema/single-node/10/32_add_ttl_to_fields_string.sql new file mode 100644 index 0000000000..c5272df459 --- /dev/null +++ b/oximeter/db/schema/single-node/10/32_add_ttl_to_fields_string.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_string MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/single-node/10/33_add_last_updated_column_to_fields_uuid.sql b/oximeter/db/schema/single-node/10/33_add_last_updated_column_to_fields_uuid.sql new file mode 100644 index 0000000000..8d01b382fe --- /dev/null +++ b/oximeter/db/schema/single-node/10/33_add_last_updated_column_to_fields_uuid.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_uuid ADD COLUMN IF NOT EXISTS last_updated_at DateTime MATERIALIZED now(); diff --git 
a/oximeter/db/schema/single-node/10/34_materialize_last_updated_column_on_fields_uuid.sql b/oximeter/db/schema/single-node/10/34_materialize_last_updated_column_on_fields_uuid.sql new file mode 100644 index 0000000000..06fbd94d02 --- /dev/null +++ b/oximeter/db/schema/single-node/10/34_materialize_last_updated_column_on_fields_uuid.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_uuid MATERIALIZE COLUMN last_updated_at; diff --git a/oximeter/db/schema/single-node/10/35_add_ttl_to_fields_uuid.sql b/oximeter/db/schema/single-node/10/35_add_ttl_to_fields_uuid.sql new file mode 100644 index 0000000000..481055d4f5 --- /dev/null +++ b/oximeter/db/schema/single-node/10/35_add_ttl_to_fields_uuid.sql @@ -0,0 +1 @@ +ALTER TABLE oximeter.fields_uuid MODIFY TTL last_updated_at + INTERVAL 30 DAY; diff --git a/oximeter/db/schema/single-node/10/timeseries-to-delete.txt b/oximeter/db/schema/single-node/10/timeseries-to-delete.txt new file mode 100644 index 0000000000..40b90e05ff --- /dev/null +++ b/oximeter/db/schema/single-node/10/timeseries-to-delete.txt @@ -0,0 +1 @@ +http_service:request_latency_histogram diff --git a/oximeter/db/schema/single-node/db-init.sql b/oximeter/db/schema/single-node/db-init.sql index 38e9d0b70c..184951feeb 100644 --- a/oximeter/db/schema/single-node/db-init.sql +++ b/oximeter/db/schema/single-node/db-init.sql @@ -504,126 +504,158 @@ TTL toDateTime(timestamp) + INTERVAL 30 DAY; * timeseries name and then key, since it would improve lookups where one * already has the key. Realistically though, these tables are quite small and * so performance benefits will be low in absolute terms. + * + * TTL: We use a materialized column to expire old field table records. This + * column is generated automatically by the database whenever a new row is + * inserted. It cannot be inserted directly, nor is it returned in a `SELECT *` + * query. Since these tables are `ReplacingMergeTree`s, that means the last + * record will remain during a deduplication, which will have the last + * timestamp. ClickHouse will then expire old data for us, similar to the + * measurement tables. 
*/ CREATE TABLE IF NOT EXISTS oximeter.fields_bool ( timeseries_name String, timeseries_key UInt64, field_name String, - field_value UInt8 + field_value UInt8, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplacingMergeTree() -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_i8 ( timeseries_name String, timeseries_key UInt64, field_name String, - field_value Int8 + field_value Int8, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplacingMergeTree() -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_u8 ( timeseries_name String, timeseries_key UInt64, field_name String, - field_value UInt8 + field_value UInt8, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplacingMergeTree() -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_i16 ( timeseries_name String, timeseries_key UInt64, field_name String, - field_value Int16 + field_value Int16, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplacingMergeTree() -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_u16 ( timeseries_name String, timeseries_key UInt64, field_name String, - field_value UInt16 + field_value UInt16, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplacingMergeTree() -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_i32 ( timeseries_name String, timeseries_key UInt64, field_name String, - field_value Int32 + field_value Int32, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplacingMergeTree() -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_u32 ( timeseries_name String, timeseries_key UInt64, field_name String, - field_value UInt32 + field_value UInt32, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplacingMergeTree() -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_i64 ( timeseries_name String, timeseries_key UInt64, field_name String, - field_value Int64 + field_value Int64, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplacingMergeTree() -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_u64 ( timeseries_name String, timeseries_key UInt64, field_name String, - field_value UInt64 + field_value UInt64, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplacingMergeTree() -ORDER BY 
(timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_ipaddr ( timeseries_name String, timeseries_key UInt64, field_name String, - field_value IPv6 + field_value IPv6, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplacingMergeTree() -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_string ( timeseries_name String, timeseries_key UInt64, field_name String, - field_value String + field_value String, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplacingMergeTree() -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; CREATE TABLE IF NOT EXISTS oximeter.fields_uuid ( timeseries_name String, timeseries_key UInt64, field_name String, - field_value UUID + field_value UUID, + last_updated_at DateTime MATERIALIZED now() ) ENGINE = ReplacingMergeTree() -ORDER BY (timeseries_name, field_name, field_value, timeseries_key); +ORDER BY (timeseries_name, field_name, field_value, timeseries_key) +TTL last_updated_at + INTERVAL 30 DAY; /* The timeseries schema table stores the extracted schema for the samples * oximeter collects. diff --git a/oximeter/db/src/model.rs b/oximeter/db/src/model.rs index 986bf00225..7608f81e45 100644 --- a/oximeter/db/src/model.rs +++ b/oximeter/db/src/model.rs @@ -45,7 +45,7 @@ use uuid::Uuid; /// - [`crate::Client::initialize_db_with_version`] /// - [`crate::Client::ensure_schema`] /// - The `clickhouse-schema-updater` binary in this crate -pub const OXIMETER_VERSION: u64 = 9; +pub const OXIMETER_VERSION: u64 = 10; // Wrapper type to represent a boolean in the database. // diff --git a/oximeter/instruments/src/http.rs b/oximeter/instruments/src/http.rs index 6a0a35ce63..2eef327d02 100644 --- a/oximeter/instruments/src/http.rs +++ b/oximeter/instruments/src/http.rs @@ -6,17 +6,14 @@ // Copyright 2024 Oxide Computer Company -use dropshot::{ - HttpError, HttpResponse, RequestContext, RequestInfo, ServerContext, -}; +use dropshot::{HttpError, HttpResponse, RequestContext, ServerContext}; use futures::Future; use http::StatusCode; -use http::Uri; use oximeter::{ histogram::Histogram, histogram::Record, MetricsError, Producer, Sample, }; -use std::borrow::Cow; -use std::collections::BTreeMap; +use std::collections::HashMap; +use std::hash::{DefaultHasher, Hash as _, Hasher}; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; @@ -24,28 +21,18 @@ oximeter::use_timeseries!("http-service.toml"); pub use http_service::HttpService; pub use http_service::RequestLatencyHistogram; -// Return the route portion of the request, normalized to include a single -// leading slash and no trailing slashes. -fn normalized_uri_path(uri: &Uri) -> Cow<'static, str> { - Cow::Owned(format!( - "/{}", - uri.path().trim_end_matches('/').trim_start_matches('/') - )) -} - impl RequestLatencyHistogram { /// Build a new `RequestLatencyHistogram` with a specified histogram. /// /// Latencies are expressed in seconds. 
pub fn new( - request: &RequestInfo, + operation_id: &str, status_code: StatusCode, histogram: Histogram, ) -> Self { Self { - route: normalized_uri_path(request.uri()), - method: request.method().to_string().into(), - status_code: status_code.as_u16().into(), + operation_id: operation_id.to_string().into(), + status_code: status_code.as_u16(), datum: histogram, } } @@ -59,25 +46,27 @@ impl RequestLatencyHistogram { /// /// Latencies are expressed as seconds. pub fn with_latency_decades( - request: &RequestInfo, + operation_id: &str, status_code: StatusCode, start_decade: i16, end_decade: i16, ) -> Result { Ok(Self::new( - request, + operation_id, status_code, Histogram::span_decades(start_decade, end_decade)?, )) } - fn key_for(request: &RequestInfo, status_code: StatusCode) -> String { - format!( - "{}:{}:{}", - normalized_uri_path(request.uri()), - request.method(), - status_code.as_u16() - ) + /// Return a key used to ID this histogram. + /// + /// This is a quick way to look up the histogram tracking any particular + /// request and response. + fn key_for(operation_id: &str, status_code: StatusCode) -> u64 { + let mut hasher = DefaultHasher::new(); + operation_id.hash(&mut hasher); + status_code.hash(&mut hasher); + hasher.finish() } } @@ -92,8 +81,19 @@ impl RequestLatencyHistogram { /// The `LatencyTracker` can be used to produce metric data collected by `oximeter`. #[derive(Debug, Clone)] pub struct LatencyTracker { + /// The HTTP service target for which we're tracking request histograms. pub service: HttpService, - latencies: Arc>>, + /// The latency histogram for each request. + /// + /// The map here use a hash of the request fields (operation and status + /// code) as the key to each histogram. It's a bit redundant to then store + /// that in a hashmap, but this lets us avoid creating a new + /// `RequestLatencyHistogram` when handling a request that we already have + /// one for. Instead, we use this key to get the existing entry. + latencies: Arc>>, + /// The histogram used to track each request. + /// + /// We store it here to clone as we see new requests. histogram: Histogram, } @@ -104,7 +104,7 @@ impl LatencyTracker { pub fn new(service: HttpService, histogram: Histogram) -> Self { Self { service, - latencies: Arc::new(Mutex::new(BTreeMap::new())), + latencies: Arc::new(Mutex::new(HashMap::new())), histogram, } } @@ -129,15 +129,15 @@ impl LatencyTracker { /// to which the other arguments belong. (One is created if it does not exist.) 
pub fn update( &self, - request: &RequestInfo, + operation_id: &str, status_code: StatusCode, latency: Duration, ) -> Result<(), MetricsError> { - let key = RequestLatencyHistogram::key_for(request, status_code); + let key = RequestLatencyHistogram::key_for(operation_id, status_code); let mut latencies = self.latencies.lock().unwrap(); let entry = latencies.entry(key).or_insert_with(|| { RequestLatencyHistogram::new( - request, + operation_id, status_code, self.histogram.clone(), ) @@ -170,14 +170,14 @@ impl LatencyTracker { Ok(response) => response.status_code(), Err(ref e) => e.status_code, }; - if let Err(e) = self.update(&context.request, status_code, latency) { + if let Err(e) = self.update(&context.operation_id, status_code, latency) + { slog::error!( &context.log, "error instrumenting dropshot handler"; "error" => ?e, "status_code" => status_code.as_u16(), - "method" => %context.request.method(), - "uri" => %context.request.uri(), + "operation_id" => &context.operation_id, "remote_addr" => context.request.remote_addr(), "latency" => ?latency, ); @@ -220,41 +220,24 @@ mod tests { HttpService { name: "my-service".into(), id: ID.parse().unwrap() }; let hist = Histogram::new(&[0.0, 1.0]).unwrap(); let tracker = LatencyTracker::new(service, hist); - let request = http::request::Builder::new() - .method(http::Method::GET) - .uri("/some/uri") - .body(()) + let status_code0 = StatusCode::OK; + let status_code1 = StatusCode::NOT_FOUND; + let operation_id = "some_operation_id"; + tracker + .update(operation_id, status_code0, Duration::from_secs_f64(0.5)) .unwrap(); - let status_code = StatusCode::OK; tracker - .update( - &RequestInfo::new(&request, "0.0.0.0:0".parse().unwrap()), - status_code, - Duration::from_secs_f64(0.5), - ) + .update(operation_id, status_code1, Duration::from_secs_f64(0.5)) .unwrap(); - - let key = "/some/uri:GET:200"; - let actual_hist = tracker.latencies.lock().unwrap()[key].datum.clone(); - assert_eq!(actual_hist.n_samples(), 1); - let bins = actual_hist.iter().collect::>(); - assert_eq!(bins[1].count, 1); - } - - #[test] - fn test_normalize_uri_path() { - const EXPECTED: &str = "/foo/bar"; - const TESTS: &[&str] = &[ - "/foo/bar", - "/foo/bar/", - "//foo/bar", - "//foo/bar/", - "/foo/bar//", - "////foo/bar/////", - ]; - for test in TESTS.iter() { - println!("{test}"); - assert_eq!(normalized_uri_path(&test.parse().unwrap()), EXPECTED); + let key0 = RequestLatencyHistogram::key_for(operation_id, status_code0); + let key1 = RequestLatencyHistogram::key_for(operation_id, status_code1); + let latencies = tracker.latencies.lock().unwrap(); + assert_eq!(latencies.len(), 2); + for key in [key0, key1] { + let actual_hist = &latencies[&key].datum; + assert_eq!(actual_hist.n_samples(), 1); + let bins = actual_hist.iter().collect::>(); + assert_eq!(bins[1].count, 1); } } } diff --git a/oximeter/oximeter/schema/http-service.toml b/oximeter/oximeter/schema/http-service.toml index 9098110656..5270f6942c 100644 --- a/oximeter/oximeter/schema/http-service.toml +++ b/oximeter/oximeter/schema/http-service.toml @@ -14,7 +14,7 @@ description = "Duration for the server to handle a request" units = "seconds" datum_type = "histogram_f64" versions = [ - { added_in = 1, fields = [ "route", "method", "status_code" ] } + { added_in = 1, fields = [ "operation_id", "status_code" ] } ] [fields.name] @@ -25,14 +25,15 @@ description = "The name of the HTTP server, or program running it" type = "uuid" description = "UUID of the HTTP server" -[fields.route] +[fields.operation_id] type = "string" 
-description = "HTTP route in the request" +description = """\ +The identifier for the HTTP operation.\ -[fields.method] -type = "string" -description = "HTTP method in the request" +In most cases, this the OpenAPI `operationId` field that uniquely identifies the +endpoint the request is targeted to and the HTTP method used. +""" [fields.status_code] -type = "i64" +type = "u16" description = "HTTP status code in the server's response" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index fb1b94ae0f..1c58626d2d 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -101,6 +101,7 @@ sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.6.0", features = ["bytes", "inline", "unicode"] } slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] } smallvec = { version = "1.13.2", default-features = false, features = ["const_new"] } +socket2 = { version = "0.5.7", default-features = false, features = ["all"] } spin = { version = "0.9.8" } string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } @@ -208,6 +209,7 @@ sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.6.0", features = ["bytes", "inline", "unicode"] } slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] } smallvec = { version = "1.13.2", default-features = false, features = ["const_new"] } +socket2 = { version = "0.5.7", default-features = false, features = ["all"] } spin = { version = "0.9.8" } string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } From 845298e32594018b579353c1487f9c6456efffea Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Mon, 19 Aug 2024 09:45:55 -0700 Subject: [PATCH 74/91] Support hex integer literals in OxQL (#6380) --- oximeter/db/src/oxql/ast/grammar.rs | 47 ++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/oximeter/db/src/oxql/ast/grammar.rs b/oximeter/db/src/oxql/ast/grammar.rs index cbca4470f9..62182ec553 100644 --- a/oximeter/db/src/oxql/ast/grammar.rs +++ b/oximeter/db/src/oxql/ast/grammar.rs @@ -279,11 +279,27 @@ peg::parser! { pub rule string_literal() -> Literal = s:string_literal_impl() { Literal::String(s) } + pub(super) rule hex_integer_literal_impl() -> i128 + = n:$("0x" ['0'..='9' | 'a'..='f' | 'A'..='F']+ !['.']) + {? + let Some((maybe_sign, digits)) = n.split_once("0x") else { + return Err("hex literals should start with '0x'"); + }; + i128::from_str_radix(digits, 16).map_err(|_| "invalid hex literal") + } + + pub(super) rule dec_integer_literal_impl() -> i128 + = n:$(['0'..='9']+ !['e' | 'E' | '.']) + {? + n.parse().map_err(|_| "integer literal") + } + pub(super) rule integer_literal_impl() -> i128 - = n:$("-"? ['0'..='9']+ !['e' | 'E' | '.']) + = maybe_sign:$("-"?) n:(hex_integer_literal_impl() / dec_integer_literal_impl()) {? 
- let Ok(x) = n.parse() else { - return Err("integer literal"); + let sign = if maybe_sign == "-" { -1 } else { 1 }; + let Some(x) = n.checked_mul(sign) else { + return Err("negative overflow"); }; if x < i128::from(i64::MIN) { Err("negative overflow") @@ -747,13 +763,36 @@ mod tests { fn test_integer_literal() { assert_eq!(query_parser::integer_literal_impl("1").unwrap(), 1); assert_eq!(query_parser::integer_literal_impl("-1").unwrap(), -1); - assert_eq!(query_parser::integer_literal_impl("-1").unwrap(), -1); assert!(query_parser::integer_literal_impl("-1.0").is_err()); assert!(query_parser::integer_literal_impl("-1.").is_err()); assert!(query_parser::integer_literal_impl("1e3").is_err()); } + #[test] + fn test_hex_integer_literal() { + assert_eq!(query_parser::integer_literal_impl("0x1").unwrap(), 1); + assert_eq!(query_parser::integer_literal_impl("-0x1").unwrap(), -1); + assert_eq!(query_parser::integer_literal_impl("-0xa").unwrap(), -0xa); + assert_eq!( + query_parser::integer_literal_impl("0xfeed").unwrap(), + 0xfeed + ); + assert_eq!( + query_parser::integer_literal_impl("0xFEED").unwrap(), + 0xfeed + ); + + // Out of range in either direction + assert!(query_parser::integer_literal_impl("0xFFFFFFFFFFFFFFFFFFFF") + .is_err()); + assert!(query_parser::integer_literal_impl("-0xFFFFFFFFFFFFFFFFFFFF") + .is_err()); + + assert!(query_parser::integer_literal_impl("-0x1.0").is_err()); + assert!(query_parser::integer_literal_impl("-0x1.").is_err()); + } + #[test] fn test_double_literal() { assert_eq!(query_parser::double_literal_impl("1.0").unwrap(), 1.0); From 54b387871e17579f9ad30166d4b7522ddc18ba31 Mon Sep 17 00:00:00 2001 From: Rain Date: Mon, 19 Aug 2024 10:36:31 -0700 Subject: [PATCH 75/91] [sled-agent] make sled-agent-sim implement the sled-agent API trait (#6339) `sled-agent-sim` is another implementation of the API, so it can be backed by the same trait, reducing the likelihood of confusion. I did have to add stub implementations for unimplemented methods, though -- I think that's okay. (I'd like to address this via the `api_description` macro at some point.) --- sled-agent/src/sim/http_entrypoints.rs | 894 +++++++++++++------------ 1 file changed, 455 insertions(+), 439 deletions(-) diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index c219a747ce..e93bebad98 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -5,18 +5,27 @@ //! 
HTTP entrypoint functions for the sled agent's exposed API use super::collection::PokeMode; +use camino::Utf8PathBuf; +use dropshot::endpoint; use dropshot::ApiDescription; +use dropshot::FreeformBody; use dropshot::HttpError; +use dropshot::HttpResponseCreated; +use dropshot::HttpResponseDeleted; +use dropshot::HttpResponseHeaders; use dropshot::HttpResponseOk; use dropshot::HttpResponseUpdatedNoContent; use dropshot::Path; +use dropshot::Query; use dropshot::RequestContext; +use dropshot::StreamingBody; use dropshot::TypedBody; -use dropshot::{endpoint, ApiDescriptionRegisterError}; +use nexus_sled_agent_shared::inventory::SledRole; use nexus_sled_agent_shared::inventory::{Inventory, OmicronZonesConfig}; use omicron_common::api::internal::nexus::DiskRuntimeState; use omicron_common::api::internal::nexus::SledInstanceState; use omicron_common::api::internal::nexus::UpdateArtifactId; +use omicron_common::api::internal::shared::SledIdentifiers; use omicron_common::api::internal::shared::VirtualNetworkInterfaceHost; use omicron_common::api::internal::shared::{ ResolvedVpcRouteSet, ResolvedVpcRouteState, SwitchPorts, @@ -24,8 +33,12 @@ use omicron_common::api::internal::shared::{ use omicron_common::disk::DisksManagementResult; use omicron_common::disk::OmicronPhysicalDisksConfig; use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; +use sled_agent_api::*; +use sled_agent_types::boot_disk::BootDiskOsWriteStatus; +use sled_agent_types::boot_disk::BootDiskPathParams; +use sled_agent_types::boot_disk::BootDiskUpdatePathParams; +use sled_agent_types::boot_disk::BootDiskWriteStartQueryParams; +use sled_agent_types::bootstore::BootstoreStatus; use sled_agent_types::disk::DiskEnsureBody; use sled_agent_types::early_networking::EarlyNetworkConfig; use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody; @@ -35,8 +48,14 @@ use sled_agent_types::instance::InstancePutStateBody; use sled_agent_types::instance::InstancePutStateResponse; use sled_agent_types::instance::InstanceUnregisterResponse; use sled_agent_types::sled::AddSledRequest; +use sled_agent_types::time_sync::TimeSync; +use sled_agent_types::zone_bundle::BundleUtilization; +use sled_agent_types::zone_bundle::CleanupContext; +use sled_agent_types::zone_bundle::CleanupCount; +use sled_agent_types::zone_bundle::ZoneBundleId; +use sled_agent_types::zone_bundle::ZoneBundleMetadata; +use std::collections::BTreeMap; use std::sync::Arc; -use uuid::Uuid; use super::sled_agent::SledAgent; @@ -44,510 +63,507 @@ type SledApiDescription = ApiDescription>; /// Returns a description of the sled agent API pub fn api() -> SledApiDescription { - fn register_endpoints( - api: &mut SledApiDescription, - ) -> Result<(), ApiDescriptionRegisterError> { - api.register(instance_put_state)?; - api.register(instance_get_state)?; - api.register(instance_register)?; - api.register(instance_unregister)?; - api.register(instance_put_external_ip)?; - api.register(instance_delete_external_ip)?; + fn register_endpoints() -> Result { + let mut api = sled_agent_api::sled_agent_api_mod::api_description::< + SledAgentSimImpl, + >()?; api.register(instance_poke_post)?; api.register(instance_poke_single_step_post)?; api.register(instance_post_sim_migration_source)?; - api.register(disk_put)?; api.register(disk_poke_post)?; - api.register(update_artifact)?; - api.register(instance_issue_disk_snapshot_request)?; - api.register(vpc_firewall_rules_put)?; - api.register(set_v2p)?; - api.register(del_v2p)?; - 
api.register(list_v2p)?; - api.register(uplink_ensure)?; - api.register(read_network_bootstore_config)?; - api.register(write_network_bootstore_config)?; - api.register(inventory)?; - api.register(omicron_physical_disks_get)?; - api.register(omicron_physical_disks_put)?; - api.register(omicron_zones_get)?; - api.register(omicron_zones_put)?; - api.register(sled_add)?; - api.register(list_vpc_routes)?; - api.register(set_vpc_routes)?; - - Ok(()) - } - - let mut api = SledApiDescription::new(); - if let Err(err) = register_endpoints(&mut api) { - panic!("failed to register entrypoints: {}", err); - } - api -} -/// Path parameters for Instance requests (sled agent API) -#[derive(Deserialize, JsonSchema)] -struct InstancePathParam { - instance_id: InstanceUuid, -} + Ok(api) + } -#[endpoint { - method = PUT, - path = "/instances/{instance_id}", -}] -async fn instance_register( - rqctx: RequestContext>, - path_params: Path, - body: TypedBody, -) -> Result, HttpError> { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - let body_args = body.into_inner(); - Ok(HttpResponseOk( - sa.instance_register( - instance_id, - body_args.propolis_id, - body_args.hardware, - body_args.instance_runtime, - body_args.vmm_runtime, - body_args.metadata, + register_endpoints().expect("failed to register entrypoints") +} + +enum SledAgentSimImpl {} + +impl SledAgentApi for SledAgentSimImpl { + type Context = Arc; + + async fn instance_register( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result, HttpError> { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + let body_args = body.into_inner(); + Ok(HttpResponseOk( + sa.instance_register( + instance_id, + body_args.propolis_id, + body_args.hardware, + body_args.instance_runtime, + body_args.vmm_runtime, + body_args.metadata, + ) + .await?, + )) + } + + async fn instance_unregister( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError> { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + Ok(HttpResponseOk(sa.instance_unregister(instance_id).await?)) + } + + async fn instance_put_state( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result, HttpError> { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + let body_args = body.into_inner(); + Ok(HttpResponseOk( + sa.instance_ensure_state(instance_id, body_args.state).await?, + )) + } + + async fn instance_get_state( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError> { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + Ok(HttpResponseOk(sa.instance_get_state(instance_id).await?)) + } + + async fn instance_put_external_ip( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + let body_args = body.into_inner(); + sa.instance_put_external_ip(instance_id, &body_args).await?; + Ok(HttpResponseUpdatedNoContent()) + } + + async fn instance_delete_external_ip( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + let body_args = body.into_inner(); + sa.instance_delete_external_ip(instance_id, &body_args).await?; + Ok(HttpResponseUpdatedNoContent()) + } + + async fn disk_put( + rqctx: RequestContext, + path_params: Path, + body: 
TypedBody, + ) -> Result, HttpError> { + let sa = rqctx.context(); + let disk_id = path_params.into_inner().disk_id; + let body_args = body.into_inner(); + Ok(HttpResponseOk( + sa.disk_ensure( + disk_id, + body_args.initial_runtime.clone(), + body_args.target.clone(), + ) + .await?, + )) + } + + async fn update_artifact( + rqctx: RequestContext, + artifact: TypedBody, + ) -> Result { + let sa = rqctx.context(); + sa.updates() + .download_artifact( + artifact.into_inner(), + rqctx.context().nexus_client.as_ref(), + ) + .await + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseUpdatedNoContent()) + } + + async fn instance_issue_disk_snapshot_request( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result< + HttpResponseOk, + HttpError, + > { + let sa = rqctx.context(); + let path_params = path_params.into_inner(); + let body = body.into_inner(); + + sa.instance_issue_disk_snapshot_request( + InstanceUuid::from_untyped_uuid(path_params.instance_id), + path_params.disk_id, + body.snapshot_id, ) - .await?, - )) -} + .await + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; -#[endpoint { - method = DELETE, - path = "/instances/{instance_id}", -}] -async fn instance_unregister( - rqctx: RequestContext>, - path_params: Path, -) -> Result, HttpError> { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - Ok(HttpResponseOk(sa.instance_unregister(instance_id).await?)) -} + Ok(HttpResponseOk(InstanceIssueDiskSnapshotRequestResponse { + snapshot_id: body.snapshot_id, + })) + } -#[endpoint { - method = PUT, - path = "/instances/{instance_id}/state", -}] -async fn instance_put_state( - rqctx: RequestContext>, - path_params: Path, - body: TypedBody, -) -> Result, HttpError> { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - let body_args = body.into_inner(); - Ok(HttpResponseOk( - sa.instance_ensure_state(instance_id, body_args.state).await?, - )) -} + async fn vpc_firewall_rules_put( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result { + let _sa = rqctx.context(); + let _vpc_id = path_params.into_inner().vpc_id; + let _body_args = body.into_inner(); -#[endpoint { - method = GET, - path = "/instances/{instance_id}/state", -}] -async fn instance_get_state( - rqctx: RequestContext>, - path_params: Path, -) -> Result, HttpError> { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - Ok(HttpResponseOk(sa.instance_get_state(instance_id).await?)) -} + Ok(HttpResponseUpdatedNoContent()) + } -#[endpoint { - method = PUT, - path = "/instances/{instance_id}/external-ip", -}] -async fn instance_put_external_ip( - rqctx: RequestContext>, - path_params: Path, - body: TypedBody, -) -> Result { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - let body_args = body.into_inner(); - sa.instance_put_external_ip(instance_id, &body_args).await?; - Ok(HttpResponseUpdatedNoContent()) -} + async fn set_v2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); -#[endpoint { - method = DELETE, - path = "/instances/{instance_id}/external-ip", -}] -async fn instance_delete_external_ip( - rqctx: RequestContext>, - path_params: Path, - body: TypedBody, -) -> Result { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - let body_args = body.into_inner(); - 
sa.instance_delete_external_ip(instance_id, &body_args).await?; - Ok(HttpResponseUpdatedNoContent()) -} + sa.set_virtual_nic_host(&body_args) + .await + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; -#[endpoint { - method = POST, - path = "/instances/{instance_id}/poke", -}] -async fn instance_poke_post( - rqctx: RequestContext>, - path_params: Path, -) -> Result { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - sa.instance_poke(instance_id, PokeMode::Drain).await; - Ok(HttpResponseUpdatedNoContent()) -} + Ok(HttpResponseUpdatedNoContent()) + } -#[endpoint { - method = POST, - path = "/instances/{instance_id}/poke-single-step", -}] -async fn instance_poke_single_step_post( - rqctx: RequestContext>, - path_params: Path, -) -> Result { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - sa.instance_poke(instance_id, PokeMode::SingleStep).await; - Ok(HttpResponseUpdatedNoContent()) -} + async fn del_v2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); -#[endpoint { - method = POST, - path = "/instances/{instance_id}/sim-migration-source", -}] -async fn instance_post_sim_migration_source( - rqctx: RequestContext>, - path_params: Path, - body: TypedBody, -) -> Result { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - sa.instance_simulate_migration_source(instance_id, body.into_inner()) - .await?; - Ok(HttpResponseUpdatedNoContent()) -} + sa.unset_virtual_nic_host(&body_args) + .await + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; -/// Path parameters for Disk requests (sled agent API) -#[derive(Deserialize, JsonSchema)] -struct DiskPathParam { - disk_id: Uuid, -} + Ok(HttpResponseUpdatedNoContent()) + } -#[endpoint { - method = PUT, - path = "/disks/{disk_id}", -}] -async fn disk_put( - rqctx: RequestContext>, - path_params: Path, - body: TypedBody, -) -> Result, HttpError> { - let sa = rqctx.context(); - let disk_id = path_params.into_inner().disk_id; - let body_args = body.into_inner(); - Ok(HttpResponseOk( - sa.disk_ensure( - disk_id, - body_args.initial_runtime.clone(), - body_args.target.clone(), - ) - .await?, - )) -} + async fn list_v2p( + rqctx: RequestContext, + ) -> Result>, HttpError> + { + let sa = rqctx.context(); -#[endpoint { - method = POST, - path = "/disks/{disk_id}/poke", -}] -async fn disk_poke_post( - rqctx: RequestContext>, - path_params: Path, -) -> Result { - let sa = rqctx.context(); - let disk_id = path_params.into_inner().disk_id; - sa.disk_poke(disk_id).await; - Ok(HttpResponseUpdatedNoContent()) -} + let vnics = sa.list_virtual_nics().await.map_err(HttpError::from)?; -#[endpoint { - method = POST, - path = "/update" -}] -async fn update_artifact( - rqctx: RequestContext>, - artifact: TypedBody, -) -> Result { - let sa = rqctx.context(); - sa.updates() - .download_artifact( - artifact.into_inner(), - rqctx.context().nexus_client.as_ref(), - ) - .await - .map_err(|e| HttpError::for_internal_error(e.to_string()))?; - Ok(HttpResponseUpdatedNoContent()) -} + Ok(HttpResponseOk(vnics)) + } -#[derive(Deserialize, JsonSchema)] -pub struct InstanceIssueDiskSnapshotRequestPathParam { - instance_id: Uuid, - disk_id: Uuid, -} + async fn uplink_ensure( + _rqctx: RequestContext, + _body: TypedBody, + ) -> Result { + Ok(HttpResponseUpdatedNoContent()) + } -#[derive(Deserialize, JsonSchema)] -pub struct InstanceIssueDiskSnapshotRequestBody { - snapshot_id: Uuid, 
-} + async fn read_network_bootstore_config_cache( + rqctx: RequestContext, + ) -> Result, HttpError> { + let config = + rqctx.context().bootstore_network_config.lock().await.clone(); + Ok(HttpResponseOk(config)) + } -#[derive(Serialize, JsonSchema)] -pub struct InstanceIssueDiskSnapshotRequestResponse { - snapshot_id: Uuid, -} + async fn write_network_bootstore_config( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let mut config = rqctx.context().bootstore_network_config.lock().await; + *config = body.into_inner(); + Ok(HttpResponseUpdatedNoContent()) + } -/// Take a snapshot of a disk that is attached to an instance -#[endpoint { - method = POST, - path = "/instances/{instance_id}/disks/{disk_id}/snapshot", -}] -async fn instance_issue_disk_snapshot_request( - rqctx: RequestContext>, - path_params: Path, - body: TypedBody, -) -> Result, HttpError> -{ - let sa = rqctx.context(); - let path_params = path_params.into_inner(); - let body = body.into_inner(); - - sa.instance_issue_disk_snapshot_request( - InstanceUuid::from_untyped_uuid(path_params.instance_id), - path_params.disk_id, - body.snapshot_id, - ) - .await - .map_err(|e| HttpError::for_internal_error(e.to_string()))?; - - Ok(HttpResponseOk(InstanceIssueDiskSnapshotRequestResponse { - snapshot_id: body.snapshot_id, - })) -} + /// Fetch basic information about this sled + async fn inventory( + rqctx: RequestContext, + ) -> Result, HttpError> { + let sa = rqctx.context(); + Ok(HttpResponseOk( + sa.inventory(rqctx.server.local_addr).await.map_err(|e| { + HttpError::for_internal_error(format!("{:#}", e)) + })?, + )) + } -/// Path parameters for VPC requests (sled agent API) -#[derive(Deserialize, JsonSchema)] -struct VpcPathParam { - vpc_id: Uuid, -} + async fn omicron_physical_disks_put( + rqctx: RequestContext, + body: TypedBody, + ) -> Result, HttpError> { + let sa = rqctx.context(); + let body_args = body.into_inner(); + let result = sa.omicron_physical_disks_ensure(body_args).await?; + Ok(HttpResponseOk(result)) + } -#[endpoint { - method = PUT, - path = "/vpc/{vpc_id}/firewall/rules", -}] -async fn vpc_firewall_rules_put( - rqctx: RequestContext>, - path_params: Path, - body: TypedBody, -) -> Result { - let _sa = rqctx.context(); - let _vpc_id = path_params.into_inner().vpc_id; - let _body_args = body.into_inner(); + async fn omicron_physical_disks_get( + rqctx: RequestContext, + ) -> Result, HttpError> { + let sa = rqctx.context(); + Ok(HttpResponseOk(sa.omicron_physical_disks_list().await?)) + } - Ok(HttpResponseUpdatedNoContent()) -} + async fn omicron_zones_get( + rqctx: RequestContext, + ) -> Result, HttpError> { + let sa = rqctx.context(); + Ok(HttpResponseOk(sa.omicron_zones_list().await)) + } -/// Create a mapping from a virtual NIC to a physical host -#[endpoint { - method = PUT, - path = "/v2p/", -}] -async fn set_v2p( - rqctx: RequestContext>, - body: TypedBody, -) -> Result { - let sa = rqctx.context(); - let body_args = body.into_inner(); + async fn omicron_zones_put( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); + sa.omicron_zones_ensure(body_args).await; + Ok(HttpResponseUpdatedNoContent()) + } - sa.set_virtual_nic_host(&body_args) - .await - .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + async fn sled_add( + _rqctx: RequestContext, + _body: TypedBody, + ) -> Result { + Ok(HttpResponseUpdatedNoContent()) + } - Ok(HttpResponseUpdatedNoContent()) -} + async fn list_vpc_routes( + rqctx: RequestContext, + ) -> 
Result>, HttpError> { + let sa = rqctx.context(); + Ok(HttpResponseOk(sa.list_vpc_routes().await)) + } -/// Delete a mapping from a virtual NIC to a physical host -#[endpoint { - method = DELETE, - path = "/v2p/", -}] -async fn del_v2p( - rqctx: RequestContext>, - body: TypedBody, -) -> Result { - let sa = rqctx.context(); - let body_args = body.into_inner(); + async fn set_vpc_routes( + rqctx: RequestContext, + body: TypedBody>, + ) -> Result { + let sa = rqctx.context(); + sa.set_vpc_routes(body.into_inner()).await; + Ok(HttpResponseUpdatedNoContent()) + } - sa.unset_virtual_nic_host(&body_args) - .await - .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + // --- Unimplemented endpoints --- - Ok(HttpResponseUpdatedNoContent()) -} + async fn zone_bundle_list_all( + _rqctx: RequestContext, + _query: Query, + ) -> Result>, HttpError> { + method_unimplemented() + } -/// List v2p mappings present on sled -#[endpoint { - method = GET, - path = "/v2p/", -}] -async fn list_v2p( - rqctx: RequestContext>, -) -> Result>, HttpError> { - let sa = rqctx.context(); + async fn zone_bundle_list( + _rqctx: RequestContext, + _params: Path, + ) -> Result>, HttpError> { + method_unimplemented() + } - let vnics = sa.list_virtual_nics().await.map_err(HttpError::from)?; + async fn zone_bundle_create( + _rqctx: RequestContext, + _params: Path, + ) -> Result, HttpError> { + method_unimplemented() + } - Ok(HttpResponseOk(vnics)) -} + async fn zone_bundle_get( + _rqctx: RequestContext, + _params: Path, + ) -> Result>, HttpError> + { + method_unimplemented() + } -#[endpoint { - method = POST, - path = "/switch-ports", -}] -async fn uplink_ensure( - _rqctx: RequestContext>, - _body: TypedBody, -) -> Result { - Ok(HttpResponseUpdatedNoContent()) -} + async fn zone_bundle_delete( + _rqctx: RequestContext, + _params: Path, + ) -> Result { + method_unimplemented() + } -#[endpoint { - method = GET, - path = "/network-bootstore-config", -}] -async fn read_network_bootstore_config( - rqctx: RequestContext>, -) -> Result, HttpError> { - let config = rqctx.context().bootstore_network_config.lock().await.clone(); - Ok(HttpResponseOk(config)) -} + async fn zone_bundle_utilization( + _rqctx: RequestContext, + ) -> Result< + HttpResponseOk>, + HttpError, + > { + method_unimplemented() + } -#[endpoint { - method = PUT, - path = "/network-bootstore-config", -}] -async fn write_network_bootstore_config( - rqctx: RequestContext>, - body: TypedBody, -) -> Result { - let mut config = rqctx.context().bootstore_network_config.lock().await; - *config = body.into_inner(); - Ok(HttpResponseUpdatedNoContent()) -} + async fn zone_bundle_cleanup_context( + _rqctx: RequestContext, + ) -> Result, HttpError> { + method_unimplemented() + } -/// Fetch basic information about this sled -#[endpoint { - method = GET, - path = "/inventory", -}] -async fn inventory( - rqctx: RequestContext>, -) -> Result, HttpError> { - let sa = rqctx.context(); - Ok(HttpResponseOk( - sa.inventory(rqctx.server.local_addr) - .await - .map_err(|e| HttpError::for_internal_error(format!("{:#}", e)))?, - )) -} + async fn zone_bundle_cleanup_context_update( + _rqctx: RequestContext, + _body: TypedBody, + ) -> Result { + method_unimplemented() + } -#[endpoint { - method = PUT, - path = "/omicron-physical-disks", -}] -async fn omicron_physical_disks_put( - rqctx: RequestContext>, - body: TypedBody, -) -> Result, HttpError> { - let sa = rqctx.context(); - let body_args = body.into_inner(); - let result = sa.omicron_physical_disks_ensure(body_args).await?; - 
Ok(HttpResponseOk(result)) -} + async fn zone_bundle_cleanup( + _rqctx: RequestContext, + ) -> Result>, HttpError> + { + method_unimplemented() + } -#[endpoint { - method = GET, - path = "/omicron-physical-disks", -}] -async fn omicron_physical_disks_get( - rqctx: RequestContext>, -) -> Result, HttpError> { - let sa = rqctx.context(); - Ok(HttpResponseOk(sa.omicron_physical_disks_list().await?)) + async fn zones_list( + _rqctx: RequestContext, + ) -> Result>, HttpError> { + method_unimplemented() + } + + async fn zpools_get( + _rqctx: RequestContext, + ) -> Result>, HttpError> { + method_unimplemented() + } + + async fn sled_role_get( + _rqctx: RequestContext, + ) -> Result, HttpError> { + method_unimplemented() + } + + async fn cockroachdb_init( + _rqctx: RequestContext, + ) -> Result { + method_unimplemented() + } + + async fn timesync_get( + _rqctx: RequestContext, + ) -> Result, HttpError> { + method_unimplemented() + } + + async fn host_os_write_start( + _rqctx: RequestContext, + _path_params: Path, + _query_params: Query, + _body: StreamingBody, + ) -> Result { + method_unimplemented() + } + + async fn host_os_write_status_get( + _rqctx: RequestContext, + _path_params: Path, + ) -> Result, HttpError> { + method_unimplemented() + } + + async fn host_os_write_status_delete( + _rqctx: RequestContext, + _path_params: Path, + ) -> Result { + method_unimplemented() + } + + async fn sled_identifiers( + _rqctx: RequestContext, + ) -> Result, HttpError> { + method_unimplemented() + } + + async fn bootstore_status( + _rqctx: RequestContext, + ) -> Result, HttpError> { + method_unimplemented() + } } -#[endpoint { - method = GET, - path = "/omicron-zones", -}] -async fn omicron_zones_get( - rqctx: RequestContext>, -) -> Result, HttpError> { - let sa = rqctx.context(); - Ok(HttpResponseOk(sa.omicron_zones_list().await)) +fn method_unimplemented() -> Result { + Err(HttpError { + // Use a client error here (405 Method Not Allowed vs 501 Not + // Implemented) even though it isn't strictly accurate here, so tests + // get to see the error message. 
+ status_code: http::StatusCode::METHOD_NOT_ALLOWED, + error_code: None, + external_message: "Method not implemented in sled-agent-sim" + .to_string(), + internal_message: "Method not implemented in sled-agent-sim" + .to_string(), + }) } +// --- Extra endpoints only available in the sim implementation --- + #[endpoint { - method = PUT, - path = "/omicron-zones", + method = POST, + path = "/instances/{instance_id}/poke", }] -async fn omicron_zones_put( +async fn instance_poke_post( rqctx: RequestContext>, - body: TypedBody, + path_params: Path, ) -> Result { let sa = rqctx.context(); - let body_args = body.into_inner(); - sa.omicron_zones_ensure(body_args).await; + let instance_id = path_params.into_inner().instance_id; + sa.instance_poke(instance_id, PokeMode::Drain).await; Ok(HttpResponseUpdatedNoContent()) } #[endpoint { - method = PUT, - path = "/sleds" + method = POST, + path = "/instances/{instance_id}/poke-single-step", }] -async fn sled_add( - _rqctx: RequestContext>, - _body: TypedBody, +async fn instance_poke_single_step_post( + rqctx: RequestContext>, + path_params: Path, ) -> Result { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + sa.instance_poke(instance_id, PokeMode::SingleStep).await; Ok(HttpResponseUpdatedNoContent()) } #[endpoint { - method = GET, - path = "/vpc-routes", + method = POST, + path = "/instances/{instance_id}/sim-migration-source", }] -async fn list_vpc_routes( +async fn instance_post_sim_migration_source( rqctx: RequestContext>, -) -> Result>, HttpError> { + path_params: Path, + body: TypedBody, +) -> Result { let sa = rqctx.context(); - Ok(HttpResponseOk(sa.list_vpc_routes().await)) + let instance_id = path_params.into_inner().instance_id; + sa.instance_simulate_migration_source(instance_id, body.into_inner()) + .await?; + Ok(HttpResponseUpdatedNoContent()) } #[endpoint { - method = PUT, - path = "/vpc-routes", + method = POST, + path = "/disks/{disk_id}/poke", }] -async fn set_vpc_routes( +async fn disk_poke_post( rqctx: RequestContext>, - body: TypedBody>, + path_params: Path, ) -> Result { let sa = rqctx.context(); - sa.set_vpc_routes(body.into_inner()).await; + let disk_id = path_params.into_inner().disk_id; + sa.disk_poke(disk_id).await; Ok(HttpResponseUpdatedNoContent()) } From 6dd980251a26430466bcd5aff1edad5416cf94e5 Mon Sep 17 00:00:00 2001 From: Rain Date: Mon, 19 Aug 2024 12:13:05 -0700 Subject: [PATCH 76/91] [3/6] [nexus-auth] move some types into nexus-types (#6369) Put them here rather than in `nexus-auth` so that the upcoming nexus-external-api crate has fewer dependencies. 
--- Cargo.lock | 5 +++++ nexus/auth/src/authn/external/mod.rs | 1 - nexus/auth/src/authn/external/session_cookie.rs | 2 +- nexus/src/external_api/console_api.rs | 10 ++++------ nexus/types/Cargo.toml | 5 +++++ .../src/authn/external => types/src/authn}/cookies.rs | 0 nexus/types/src/authn/mod.rs | 7 +++++++ nexus/types/src/lib.rs | 1 + 8 files changed, 23 insertions(+), 8 deletions(-) rename nexus/{auth/src/authn/external => types/src/authn}/cookies.rs (100%) create mode 100644 nexus/types/src/authn/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 874b33134f..6bd71f6d38 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5443,17 +5443,22 @@ version = "0.1.0" dependencies = [ "anyhow", "api_identity", + "async-trait", "base64 0.22.1", "chrono", "clap", + "cookie 0.18.1", "derive-where", "derive_more", "dns-service-client", + "dropshot", "futures", "gateway-client", + "http 0.2.12", "humantime", "ipnetwork", "newtype-uuid", + "newtype_derive", "nexus-sled-agent-shared", "omicron-common", "omicron-passwords", diff --git a/nexus/auth/src/authn/external/mod.rs b/nexus/auth/src/authn/external/mod.rs index ccb7218285..5c7fc7af05 100644 --- a/nexus/auth/src/authn/external/mod.rs +++ b/nexus/auth/src/authn/external/mod.rs @@ -13,7 +13,6 @@ use slog::trace; use std::borrow::Borrow; use uuid::Uuid; -pub mod cookies; pub mod session_cookie; pub mod spoof; pub mod token; diff --git a/nexus/auth/src/authn/external/session_cookie.rs b/nexus/auth/src/authn/external/session_cookie.rs index 7811bf2826..f6b23308a0 100644 --- a/nexus/auth/src/authn/external/session_cookie.rs +++ b/nexus/auth/src/authn/external/session_cookie.rs @@ -4,7 +4,6 @@ //! authn scheme for console that looks up cookie values in a session table -use super::cookies::parse_cookies; use super::{HttpAuthnScheme, Reason, SchemeResult}; use crate::authn; use crate::authn::{Actor, Details}; @@ -13,6 +12,7 @@ use async_trait::async_trait; use chrono::{DateTime, Duration, Utc}; use dropshot::HttpError; use http::HeaderValue; +use nexus_types::authn::cookies::parse_cookies; use slog::debug; use uuid::Uuid; diff --git a/nexus/src/external_api/console_api.rs b/nexus/src/external_api/console_api.rs index fb0a47bbea..2169b631a7 100644 --- a/nexus/src/external_api/console_api.rs +++ b/nexus/src/external_api/console_api.rs @@ -35,15 +35,13 @@ use nexus_db_model::AuthenticationMode; use nexus_db_queries::authn::silos::IdentityProviderType; use nexus_db_queries::context::OpContext; use nexus_db_queries::{ - authn::external::{ - cookies::Cookies, - session_cookie::{ - clear_session_cookie_header_value, session_cookie_header_value, - SessionStore, SESSION_COOKIE_COOKIE_NAME, - }, + authn::external::session_cookie::{ + clear_session_cookie_header_value, session_cookie_header_value, + SessionStore, SESSION_COOKIE_COOKIE_NAME, }, db::identity::Asset, }; +use nexus_types::authn::cookies::Cookies; use nexus_types::external_api::params; use nexus_types::identity::Resource; use omicron_common::api::external::http_pagination::PaginatedBy; diff --git a/nexus/types/Cargo.toml b/nexus/types/Cargo.toml index 8dd6292d5c..124f0d42c9 100644 --- a/nexus/types/Cargo.toml +++ b/nexus/types/Cargo.toml @@ -9,14 +9,19 @@ workspace = true [dependencies] anyhow.workspace = true +async-trait.workspace = true chrono.workspace = true clap.workspace = true +cookie.workspace = true base64.workspace = true derive-where.workspace = true derive_more.workspace = true +dropshot.workspace = true futures.workspace = true +http.workspace = true humantime.workspace = true ipnetwork.workspace = 
true +newtype_derive.workspace = true omicron-uuid-kinds.workspace = true openssl.workspace = true oxql-types.workspace = true diff --git a/nexus/auth/src/authn/external/cookies.rs b/nexus/types/src/authn/cookies.rs similarity index 100% rename from nexus/auth/src/authn/external/cookies.rs rename to nexus/types/src/authn/cookies.rs diff --git a/nexus/types/src/authn/mod.rs b/nexus/types/src/authn/mod.rs new file mode 100644 index 0000000000..f87935428e --- /dev/null +++ b/nexus/types/src/authn/mod.rs @@ -0,0 +1,7 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Authentication types for the Nexus API. + +pub mod cookies; diff --git a/nexus/types/src/lib.rs b/nexus/types/src/lib.rs index 494573e834..8a0a3ec80e 100644 --- a/nexus/types/src/lib.rs +++ b/nexus/types/src/lib.rs @@ -29,6 +29,7 @@ //! rules, so our model layer knows about our views. That seems to be a //! relatively minor offense, so it's the way we leave things for now. +pub mod authn; pub mod deployment; pub mod external_api; pub mod identity; From a338e8a69ad8c9924856a20125d39b048c7cb8e1 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 19 Aug 2024 16:49:56 -0700 Subject: [PATCH 77/91] fix flaky test_demo_saga (#6388) --- nexus/src/app/saga.rs | 4 + nexus/src/app/sagas/demo.rs | 154 +++++++++++++++++++++++++++--------- 2 files changed, 119 insertions(+), 39 deletions(-) diff --git a/nexus/src/app/saga.rs b/nexus/src/app/saga.rs index 5bc69946ad..975df7fc3b 100644 --- a/nexus/src/app/saga.rs +++ b/nexus/src/app/saga.rs @@ -469,6 +469,10 @@ impl super::Nexus { // We don't need the handle that runnable_saga.start() returns because // we're not going to wait for the saga to finish here. let _ = runnable_saga.start().await?; + + let mut demo_sagas = self.demo_sagas()?; + demo_sagas.preregister(demo_saga_id); + Ok(DemoSaga { saga_id, demo_saga_id }) } diff --git a/nexus/src/app/sagas/demo.rs b/nexus/src/app/sagas/demo.rs index 4a8eda8b80..d76a48688d 100644 --- a/nexus/src/app/sagas/demo.rs +++ b/nexus/src/app/sagas/demo.rs @@ -21,56 +21,66 @@ use super::NexusActionContext; use super::{ActionRegistry, NexusSaga, SagaInitError}; use crate::app::sagas::declare_saga_actions; -use anyhow::ensure; +use anyhow::Context; use omicron_common::api::external::Error; use omicron_uuid_kinds::DemoSagaUuid; use serde::Deserialize; use serde::Serialize; use slog::info; use std::collections::BTreeMap; +use std::future::Future; +use std::sync::Arc; use steno::ActionError; -use tokio::sync::oneshot; +use tokio::sync::Semaphore; -/// Set of demo sagas that have been marked completed +/// Rendezvous point for demo sagas /// -/// Nexus maintains one of these at the top level. Individual demo sagas wait -/// until their id shows up here, then remove it and proceed. +/// This is where: +/// +/// - demo sagas wait for a completion message +/// - completion messages are recorded for demo sagas that haven't started +/// waiting yet +/// +/// Nexus maintains one of these structures at the top level. 
pub struct CompletingDemoSagas { - ids: BTreeMap>, + sagas: BTreeMap>, } impl CompletingDemoSagas { pub fn new() -> CompletingDemoSagas { - CompletingDemoSagas { ids: BTreeMap::new() } + CompletingDemoSagas { sagas: BTreeMap::new() } } - pub fn complete(&mut self, id: DemoSagaUuid) -> Result<(), Error> { - self.ids - .remove(&id) - .ok_or_else(|| { - Error::non_resourcetype_not_found(format!( - "demo saga with id {:?}", - id - )) - })? - .send(()) - .map_err(|_| { - Error::internal_error( - "saga stopped listening (Nexus shutting down?)", - ) - }) + pub fn preregister(&mut self, id: DemoSagaUuid) { + assert!(self.sagas.insert(id, Arc::new(Semaphore::new(0))).is_none()); } pub fn subscribe( &mut self, id: DemoSagaUuid, - ) -> Result, anyhow::Error> { - let (tx, rx) = oneshot::channel(); - ensure!( - self.ids.insert(id, tx).is_none(), - "multiple subscriptions for the same demo saga" - ); - Ok(rx) + ) -> impl Future> { + let sem = + self.sagas.entry(id).or_insert_with(|| Arc::new(Semaphore::new(0))); + let sem_clone = sem.clone(); + async move { + sem_clone + .acquire() + .await + // We don't need the Semaphore permit once we've acquired it. + .map(|_| ()) + .context("acquiring demo saga semaphore") + } + } + + pub fn complete(&mut self, id: DemoSagaUuid) -> Result<(), Error> { + let sem = self.sagas.get_mut(&id).ok_or_else(|| { + Error::non_resourcetype_not_found(format!( + "demo saga with demo saga id {:?}", + id + )) + })?; + sem.add_permits(1); + Ok(()) } } @@ -115,21 +125,87 @@ async fn demo_wait(sagactx: NexusActionContext) -> Result<(), ActionError> { .nexus() .demo_sagas() .map_err(ActionError::action_failed)?; - demo_sagas.subscribe(demo_id).map_err(|e| { - ActionError::action_failed(Error::internal_error(&format!( - "demo saga subscribe failed: {:#}", - e - ))) - })? + demo_sagas.subscribe(demo_id) }; match rx.await { Ok(_) => { info!(log, "demo saga: completing"; "id" => %demo_id); + Ok(()) } - Err(_) => { - info!(log, "demo saga: waiting failed (Nexus shutting down?)"; - "id" => %demo_id); + Err(error) => { + warn!(log, "demo saga: waiting failed (Nexus shutting down?)"; + "id" => %demo_id, + "error" => #?error, + ); + Err(ActionError::action_failed(Error::internal_error(&format!( + "demo saga wait failed: {:#}", + error + )))) } } - Ok(()) +} + +#[cfg(test)] +mod test { + use super::*; + use assert_matches::assert_matches; + + #[tokio::test] + async fn test_demo_saga_rendezvous() { + let mut hub = CompletingDemoSagas::new(); + + // The most straightforward sequence is: + // - create (preregister) demo saga + // - demo saga starts and waits for completion (subscribe) + // - complete demo saga + let demo_saga_id = DemoSagaUuid::new_v4(); + println!("demo saga: {demo_saga_id}"); + hub.preregister(demo_saga_id); + println!("demo saga: {demo_saga_id} preregistered"); + let subscribe = hub.subscribe(demo_saga_id); + println!("demo saga: {demo_saga_id} subscribed"); + assert!(hub.complete(demo_saga_id).is_ok()); + println!("demo saga: {demo_saga_id} marked completed"); + subscribe.await.unwrap(); + println!("demo saga: {demo_saga_id} done"); + + // It's also possible that the completion request arrives before the + // saga started waiting. In that case, the sequence is: + // + // - create (preregister) demo saga + // - complete demo saga + // - demo saga starts and waits for completion (subscribe) + // + // This should work, too, with no errors. 
+ let demo_saga_id = DemoSagaUuid::new_v4(); + println!("demo saga: {demo_saga_id}"); + hub.preregister(demo_saga_id); + println!("demo saga: {demo_saga_id} preregistered"); + assert!(hub.complete(demo_saga_id).is_ok()); + println!("demo saga: {demo_saga_id} marked completed"); + let subscribe = hub.subscribe(demo_saga_id); + println!("demo saga: {demo_saga_id} subscribed"); + subscribe.await.unwrap(); + println!("demo saga: {demo_saga_id} done"); + + // It's also possible to have no preregistration at all. This happens + // if the demo saga was recovered. That's fine, too, but then it will + // only work if the completion arrives after the saga starts waiting. + let demo_saga_id = DemoSagaUuid::new_v4(); + println!("demo saga: {demo_saga_id}"); + let subscribe = hub.subscribe(demo_saga_id); + println!("demo saga: {demo_saga_id} subscribed"); + assert!(hub.complete(demo_saga_id).is_ok()); + println!("demo saga: {demo_saga_id} marked completed"); + subscribe.await.unwrap(); + println!("demo saga: {demo_saga_id} done"); + + // If there's no preregistration and we get a completion request, then + // that request should fail. + let demo_saga_id = DemoSagaUuid::new_v4(); + println!("demo saga: {demo_saga_id}"); + let error = hub.complete(demo_saga_id).unwrap_err(); + assert_matches!(error, Error::NotFound { .. }); + println!("demo saga: {demo_saga_id} complete error: {:#}", error); + } } From 6bd999b4955a23ae883447a5b1ed0c5d19049425 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karen=20C=C3=A1rcamo?= Date: Tue, 20 Aug 2024 18:04:40 +1200 Subject: [PATCH 78/91] [reconfigurator] `clickhouse_server` SMF service and oximeter replicated mode (#6343) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Overview This commit introduces a few changes: - a new `clickhouse_server` smf service which runs the old "replicated" mode from the `clickhouse` service - a new `replicated` field for the oximeter configuration file which is consumed by the `oximeter` binary that runs the replicated SQL against a database. It now connects to the listen address from `ServiceName::ClickhouseServer` or `ServiceName::Clickhouse` depending which zone has been deployed. - a new `--clickhouse-topology` build target flag which builds artifacts based on either a `single-node` or `replicated-cluster` setup. The difference between the two is whether the `oximeter` SMF service is executing the `oximeter` CLI with the `--replicated` flag or not. __CAVEAT:__ It's still necessary to manually change the RSS [node count constants](https://github.com/oxidecomputer/omicron/blob/ffc8807caf04ca3f81b543c520ddbe26b3284264/sled-agent/src/rack_setup/plan/service.rs#L57-L77) to the specified amount for each clickhouse topology mode. This requirement will be short lived as we are moving to use reconfigurator. ## Usage To run single node ClickHouse nothing changes, artifacts can be built the same way as before. 
To run replicated ClickHouse set the [node count constants](https://github.com/oxidecomputer/omicron/blob/ffc8807caf04ca3f81b543c520ddbe26b3284264/sled-agent/src/rack_setup/plan/service.rs#L57-L77) to the specified amount, and set the build target in the following manner: ```console $ cargo run --locked --release --bin omicron-package -- -t target create -i standard -m non-gimlet -s softnpu -r single-sled -c replicated-cluster Finished `release` profile [optimized] target(s) in 1.03s Running `target/release/omicron-package -t target create -i standard -m non-gimlet -s softnpu -r single-sled -c replicated-cluster` Logging to: /home/coatlicue/src/omicron/out/LOG Created new build target 'centzon' and set it as active $ cargo run --locked --release --bin omicron-package -- -t package <...> $ pfexec ./target/release/omicron-package -t install ``` ## Purpose As laid out in [RFD 468](https://rfd.shared.oxide.computer/rfd/0468), to roll out replicated ClickHouse we will need the ability to roll out either replicated or single node ClickHouse for an undetermined amount of time. This commit is a step in that direction. We need to have separate services for running replicated or single-node ClickHouse servers. ## Testing Deploying omicron on a helios box with both modes. Single node: ```console $ cargo run --locked --release --bin omicron-package -- -t centzon target create -i standard -m non-gimlet -s softnpu -r single-sled Finished `release` profile [optimized] target(s) in 0.94s Running `target/release/omicron-package -t centzon target create -i standard -m non-gimlet -s softnpu -r single-sled` Logging to: /home/coatlicue/src/omicron/out/LOG Created new build target 'centzon' and set it as active $ cargo run --locked --release --bin omicron-package -- -t centzon package <...> $ pfexec ./target/release/omicron-package -t centzon install Logging to: /home/coatlicue/src/omicron/out/LOG $ zoneadm list | grep clickhouse oxz_clickhouse_7ce86c8b-2c9e-4d02-a857-269cb0a99c2e root@oxz_clickhouse_7ce86c8b:~# /opt/oxide/clickhouse/clickhouse client --host fd00:1122:3344:101::e ClickHouse client version 23.8.7.1. Connecting to fd00:1122:3344:101::e:9000 as user default. Connected to ClickHouse server version 23.8.7 revision 54465. oxz_clickhouse_7ce86c8b-2c9e-4d02-a857-269cb0a99c2e.local :) SHOW TABLES FROM oximeter SHOW TABLES FROM oximeter Query id: 5e91fafb-4d70-4a27-a188-75fb83bb7e5e ┌─name───────────────────────┐ │ fields_bool │ │ fields_i16 │ │ fields_i32 │ │ fields_i64 │ │ fields_i8 │ │ fields_ipaddr │ │ fields_string │ │ fields_u16 │ │ fields_u32 │ │ fields_u64 │ │ fields_u8 │ │ fields_uuid │ │ measurements_bool │ │ measurements_bytes │ │ measurements_cumulativef32 │ │ measurements_cumulativef64 │ │ measurements_cumulativei64 │ │ measurements_cumulativeu64 │ │ measurements_f32 │ │ measurements_f64 │ │ measurements_histogramf32 │ │ measurements_histogramf64 │ │ measurements_histogrami16 │ │ measurements_histogrami32 │ │ measurements_histogrami64 │ │ measurements_histogrami8 │ │ measurements_histogramu16 │ │ measurements_histogramu32 │ │ measurements_histogramu64 │ │ measurements_histogramu8 │ │ measurements_i16 │ │ measurements_i32 │ │ measurements_i64 │ │ measurements_i8 │ │ measurements_string │ │ measurements_u16 │ │ measurements_u32 │ │ measurements_u64 │ │ measurements_u8 │ │ timeseries_schema │ │ version │ └────────────────────────────┘ 41 rows in set. Elapsed: 0.014 sec. 
oxz_clickhouse_7ce86c8b-2c9e-4d02-a857-269cb0a99c2e.local :) SELECT * FROM oximeter.fields_i64 SELECT * FROM oximeter.fields_i64 Query id: 4bbcec72-101f-4cf4-9966-680381f5b62c ┌─timeseries_name────────────────────────┬───────timeseries_key─┬─field_name──┬─field_value─┐ │ http_service:request_latency_histogram │ 8326032694586838023 │ status_code │ 200 │ <...> $ pfexec zlogin oxz_oximeter_b235200f-f0ad-4218-9184-d995df5acaf0 [Connected to zone 'oxz_oximeter_b235200f-f0ad-4218-9184-d995df5acaf0' pts/3] The illumos Project helios-2.0.22784 July 2024 root@oxz_oximeter_b235200f:~# cat /var/svc/manifest/site/oximeter/config.toml # Example configuration file for running an oximeter collector server [db] batch_size = 1000 batch_interval = 5 # In seconds replicated = false [log] level = "debug" mode = "file" path = "/dev/stdout" if_exists = "append" ``` Replicated cluster: ```console $ cargo run --locked --release --bin omicron-package -- -t centzon target create -i standard -m non-gimlet -s softnpu -r single-sled -c replicated-cluster Finished `release` profile [optimized] target(s) in 1.03s Running `target/release/omicron-package -t centzon target create -i standard -m non-gimlet -s softnpu -r single-sled -c replicated-cluster` Logging to: /home/coatlicue/src/omicron/out/LOG Created new build target 'centzon' and set it as active $ cargo run --locked --release --bin omicron-package -- -t centzon package <...> $ pfexec ./target/release/omicron-package -t centzon install Logging to: /home/coatlicue/src/omicron/out/LOG $ zoneadm list | grep clickhouse oxz_clickhouse_keeper_73e7fda2-20af-4a90-9a61-c89ceed47c1a oxz_clickhouse_server_74876663-5337-4d9b-85cb-99d1e88bdf8a oxz_clickhouse_keeper_8eaac4f9-d9e0-4d56-b269-eab7da0c73a3 oxz_clickhouse_keeper_01f3a6af-5249-4dff-b9a4-f1076e467c9a oxz_clickhouse_server_bc6010bf-507c-4b5a-ad4c-3a7af889a6c0 $ pfexec zlogin oxz_clickhouse_server_74876663-5337-4d9b-85cb-99d1e88bdf8a [Connected to zone 'oxz_clickhouse_server_74876663-5337-4d9b-85cb-99d1e88bdf8a' pts/3] The illumos Project helios-2.0.22784 July 2024 root@oxz_clickhouse_server_74876663:~# /opt/oxide/clickhouse_server/clickhouse client --host fd00:1122:3344:101::e ClickHouse client version 23.8.7.1. Connecting to fd00:1122:3344:101::e:9000 as user default. Connected to ClickHouse server version 23.8.7 revision 54465. 
oximeter_cluster node 1 :) SHOW TABLES FROM oximeter SHOW TABLES FROM oximeter Query id: a5603063-1cbc-41a5-bfbd-33c986764e92 ┌─name─────────────────────────────┐ │ fields_bool │ │ fields_bool_local │ │ fields_i16 │ │ fields_i16_local │ │ fields_i32 │ │ fields_i32_local │ │ fields_i64 │ │ fields_i64_local │ │ fields_i8 │ │ fields_i8_local │ │ fields_ipaddr │ │ fields_ipaddr_local │ │ fields_string │ │ fields_string_local │ │ fields_u16 │ │ fields_u16_local │ │ fields_u32 │ │ fields_u32_local │ │ fields_u64 │ │ fields_u64_local │ │ fields_u8 │ │ fields_u8_local │ │ fields_uuid │ │ fields_uuid_local │ │ measurements_bool │ │ measurements_bool_local │ │ measurements_bytes │ │ measurements_bytes_local │ │ measurements_cumulativef32 │ │ measurements_cumulativef32_local │ │ measurements_cumulativef64 │ │ measurements_cumulativef64_local │ │ measurements_cumulativei64 │ │ measurements_cumulativei64_local │ │ measurements_cumulativeu64 │ │ measurements_cumulativeu64_local │ │ measurements_f32 │ │ measurements_f32_local │ │ measurements_f64 │ │ measurements_f64_local │ │ measurements_histogramf32 │ │ measurements_histogramf32_local │ │ measurements_histogramf64 │ │ measurements_histogramf64_local │ │ measurements_histogrami16 │ │ measurements_histogrami16_local │ │ measurements_histogrami32 │ │ measurements_histogrami32_local │ │ measurements_histogrami64 │ │ measurements_histogrami64_local │ │ measurements_histogrami8 │ │ measurements_histogrami8_local │ │ measurements_histogramu16 │ │ measurements_histogramu16_local │ │ measurements_histogramu32 │ │ measurements_histogramu32_local │ │ measurements_histogramu64 │ │ measurements_histogramu64_local │ │ measurements_histogramu8 │ │ measurements_histogramu8_local │ │ measurements_i16 │ │ measurements_i16_local │ │ measurements_i32 │ │ measurements_i32_local │ │ measurements_i64 │ │ measurements_i64_local │ │ measurements_i8 │ │ measurements_i8_local │ │ measurements_string │ │ measurements_string_local │ │ measurements_u16 │ │ measurements_u16_local │ │ measurements_u32 │ │ measurements_u32_local │ │ measurements_u64 │ │ measurements_u64_local │ │ measurements_u8 │ │ measurements_u8_local │ │ timeseries_schema │ │ timeseries_schema_local │ │ version │ └──────────────────────────────────┘ 81 rows in set. Elapsed: 0.010 sec. 
oximeter_cluster node 1 :) SELECT * FROM oximeter.fields_i64

SELECT * FROM oximeter.fields_i64

Query id: 14f07468-0e33-4de1-8893-df3e11eb7660

┌─timeseries_name────────────────────────┬───────timeseries_key─┬─field_name──┬─field_value─┐
│ http_service:request_latency_histogram │   436117616059041516 │ status_code │         200 │
<...>

$ pfexec zlogin oxz_oximeter_bcba1c06-1ca5-49cf-b277-8c2387975274
[Connected to zone 'oxz_oximeter_bcba1c06-1ca5-49cf-b277-8c2387975274' pts/3]
The illumos Project     helios-2.0.22784        July 2024
root@oxz_oximeter_bcba1c06:~# cat /var/svc/manifest/site/oximeter/config.toml
# Example configuration file for running an oximeter collector server

[db]
batch_size = 1000
batch_interval = 5 # In seconds
replicated = true

[log]
level = "debug"
mode = "file"
path = "/dev/stdout"
if_exists = "append"
```

Related: https://github.com/oxidecomputer/omicron/issues/5999
---
 internal-dns-cli/src/bin/dnswait.rs       |   6 +-
 internal-dns/src/config.rs                |   4 +
 nexus/test-utils/src/lib.rs               |   1 +
 oximeter/collector/src/agent.rs           |  14 +-
 oximeter/collector/src/lib.rs             |  13 +-
 package-manifest.toml                     |  38 ++++-
 package/src/bin/omicron-package.rs        |   9 +-
 package/src/lib.rs                        |  15 ++
 package/src/target.rs                     |  26 +++-
 sled-agent/src/rack_setup/plan/service.rs |  52 ++++++-
 sled-agent/src/services.rs                |  78 +++++++++-
 smf/clickhouse/method_script.sh           | 141 ++----------------
 .../config_replica.xml                    |   0
 smf/clickhouse_server/manifest.xml        |  46 ++++++
 smf/clickhouse_server/method_script.sh    | 124 +++++++++++++++
 .../{ => replicated-cluster}/config.toml  |   1 +
 smf/oximeter/single-node/config.toml      |  12 ++
 17 files changed, 428 insertions(+), 152 deletions(-)
 rename smf/{clickhouse => clickhouse_server}/config_replica.xml (100%)
 create mode 100644 smf/clickhouse_server/manifest.xml
 create mode 100755 smf/clickhouse_server/method_script.sh
 rename smf/oximeter/{ => replicated-cluster}/config.toml (91%)
 create mode 100644 smf/oximeter/single-node/config.toml

diff --git a/internal-dns-cli/src/bin/dnswait.rs b/internal-dns-cli/src/bin/dnswait.rs
index 8dbd675d64..f9875e71a0 100644
--- a/internal-dns-cli/src/bin/dnswait.rs
+++ b/internal-dns-cli/src/bin/dnswait.rs
@@ -36,15 +36,17 @@ struct Opt {
 #[value(rename_all = "kebab-case")]
 enum ServiceName {
     Cockroach,
-    Clickhouse,
     ClickhouseKeeper,
+    ClickhouseServer,
 }
 
 impl From<ServiceName> for internal_dns::ServiceName {
     fn from(value: ServiceName) -> Self {
         match value {
             ServiceName::Cockroach => internal_dns::ServiceName::Cockroach,
-            ServiceName::Clickhouse => internal_dns::ServiceName::Clickhouse,
+            ServiceName::ClickhouseServer => {
+                internal_dns::ServiceName::ClickhouseServer
+            }
             ServiceName::ClickhouseKeeper => {
                 internal_dns::ServiceName::ClickhouseKeeper
             }
diff --git a/internal-dns/src/config.rs b/internal-dns/src/config.rs
index a9ff664030..e9d7ed873d 100644
--- a/internal-dns/src/config.rs
+++ b/internal-dns/src/config.rs
@@ -510,6 +510,10 @@ mod test {
             ServiceName::ClickhouseKeeper.dns_name(),
             "_clickhouse-keeper._tcp",
         );
+        assert_eq!(
+            ServiceName::ClickhouseServer.dns_name(),
+            "_clickhouse-server._tcp",
+        );
         assert_eq!(ServiceName::Cockroach.dns_name(), "_cockroach._tcp",);
         assert_eq!(ServiceName::InternalDns.dns_name(), "_nameservice._tcp",);
         assert_eq!(ServiceName::Nexus.dns_name(), "_nexus._tcp",);
diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs
index ea46f2d017..acee46ce10 100644
--- a/nexus/test-utils/src/lib.rs
+++ b/nexus/test-utils/src/lib.rs
@@ -1428,6 +1428,7 @@ pub async fn start_oximeter(
         address: Some(SocketAddr::new(Ipv6Addr::LOCALHOST.into(),
db_port)),
         batch_size: 10,
         batch_interval: 1,
+        replicated: false,
     };
     let config = oximeter_collector::Config {
         nexus_address: Some(nexus_address),
diff --git a/oximeter/collector/src/agent.rs b/oximeter/collector/src/agent.rs
index 8271b2e068..b13fbd3938 100644
--- a/oximeter/collector/src/agent.rs
+++ b/oximeter/collector/src/agent.rs
@@ -17,7 +17,6 @@ use futures::TryStreamExt;
 use internal_dns::resolver::Resolver;
 use internal_dns::ServiceName;
 use nexus_client::types::IdSortMode;
-use omicron_common::address::CLICKHOUSE_PORT;
 use omicron_common::backoff;
 use omicron_common::backoff::BackoffError;
 use oximeter::types::ProducerResults;
@@ -380,6 +379,7 @@ impl OximeterAgent {
         db_config: DbConfig,
         resolver: &Resolver,
         log: &Logger,
+        replicated: bool,
     ) -> Result<Self, Error> {
         let (result_sender, result_receiver) = mpsc::channel(8);
         let log = log.new(o!(
@@ -393,10 +393,15 @@
         // database.
         let db_address = if let Some(address) = db_config.address {
             address
+        } else if replicated {
+            SocketAddr::V6(
+                resolver
+                    .lookup_socket_v6(ServiceName::ClickhouseServer)
+                    .await?,
+            )
         } else {
-            SocketAddr::new(
-                resolver.lookup_ip(ServiceName::Clickhouse).await?,
-                CLICKHOUSE_PORT,
+            SocketAddr::V6(
+                resolver.lookup_socket_v6(ServiceName::Clickhouse).await?,
             )
         };
 
@@ -422,7 +427,6 @@ impl OximeterAgent {
                 ..
             }) => {
                 debug!(log, "oximeter database does not exist, creating");
-                let replicated = client.is_oximeter_cluster().await?;
                 client
                     .initialize_db_with_version(
                         replicated,
diff --git a/oximeter/collector/src/lib.rs b/oximeter/collector/src/lib.rs
index 7dd423d074..0576c7d532 100644
--- a/oximeter/collector/src/lib.rs
+++ b/oximeter/collector/src/lib.rs
@@ -78,12 +78,18 @@ pub struct DbConfig {
     #[serde(default, skip_serializing_if = "Option::is_none")]
     pub address: Option<SocketAddr>,
 
-    /// Batch size of samples at which to insert
+    /// Batch size of samples at which to insert.
     pub batch_size: usize,
 
     /// Interval on which to insert data into the database, regardless of the number of collected
     /// samples. Value is in seconds.
     pub batch_interval: u64,
+
+    // TODO (https://github.com/oxidecomputer/omicron/issues/4148): This field
+    // should be removed if single node functionality is removed.
+    /// Whether ClickHouse is running as a replicated cluster or
+    /// single-node server.
+    pub replicated: bool,
 }
 
 impl DbConfig {
@@ -95,12 +101,16 @@ impl DbConfig {
     /// ClickHouse.
     pub const DEFAULT_BATCH_INTERVAL: u64 = 5;
 
+    /// Default ClickHouse topology.
+    pub const DEFAULT_REPLICATED: bool = false;
+
     // Construct config with an address, using the defaults for other fields
     fn with_address(address: SocketAddr) -> Self {
         Self {
             address: Some(address),
             batch_size: Self::DEFAULT_BATCH_SIZE,
             batch_interval: Self::DEFAULT_BATCH_INTERVAL,
+            replicated: Self::DEFAULT_REPLICATED,
         }
     }
 }
@@ -207,6 +217,7 @@ impl Oximeter {
                 config.db,
                 &resolver,
                 &log,
+                config.db.replicated,
             )
             .await?,
         ))
diff --git a/package-manifest.toml b/package-manifest.toml
index 9189ed09a0..0822225837 100644
--- a/package-manifest.toml
+++ b/package-manifest.toml
@@ -140,13 +140,15 @@ source.type = "local"
 source.rust.binary_names = ["oximeter", "clickhouse-schema-updater"]
 source.rust.release = true
 source.paths = [
-  { from = "smf/oximeter", to = "/var/svc/manifest/site/oximeter" },
   { from = "oximeter/db/schema", to = "/opt/oxide/oximeter/schema" },
+  { from = "smf/oximeter/{{clickhouse-topology}}/config.toml", to = "/var/svc/manifest/site/oximeter/config.toml" },
+  { from = "smf/oximeter/manifest.xml", to = "/var/svc/manifest/site/oximeter/manifest.xml" },
 ]
 output.type = "zone"
 output.intermediate_only = true
 
 [package.clickhouse]
+# This service runs a single-node ClickHouse server.
 service_name = "clickhouse"
 only_for_targets.image = "standard"
 source.type = "composite"
@@ -169,13 +171,45 @@ source.paths = [
   { from = "out/clickhouse", to = "/opt/oxide/clickhouse" },
   { from = "smf/clickhouse/manifest.xml", to = "/var/svc/manifest/site/clickhouse/manifest.xml" },
   { from = "smf/clickhouse/method_script.sh", to = "/opt/oxide/lib/svc/manifest/clickhouse.sh" },
-  { from = "smf/clickhouse/config_replica.xml", to = "/opt/oxide/clickhouse/config.d/config_replica.xml" },
+]
+output.type = "zone"
+output.intermediate_only = true
+setup_hint = "Run `cargo xtask download clickhouse` to download the necessary binaries"
+
+[package.clickhouse_server]
+# This service runs a server for a replicated ClickHouse cluster.
+# It is complementary to the clickhouse_keeper service.
+# One cannot be run without the other.
+service_name = "clickhouse_server"
+only_for_targets.image = "standard"
+source.type = "composite"
+source.packages = [
+  "clickhouse_server_svc.tar.gz",
+  "internal-dns-cli.tar.gz",
+  "omicron-clickhouse-admin.tar.gz",
+  "zone-setup.tar.gz",
+  "zone-network-install.tar.gz"
+]
+output.type = "zone"
+
+[package.clickhouse_server_svc]
+service_name = "clickhouse_server_svc"
+only_for_targets.image = "standard"
+source.type = "local"
+source.paths = [
+  { from = "out/clickhouse", to = "/opt/oxide/clickhouse_server" },
+  { from = "smf/clickhouse_server/manifest.xml", to = "/var/svc/manifest/site/clickhouse_server/manifest.xml" },
+  { from = "smf/clickhouse_server/method_script.sh", to = "/opt/oxide/lib/svc/manifest/clickhouse_server.sh" },
+  { from = "smf/clickhouse_server/config_replica.xml", to = "/opt/oxide/clickhouse_server/config.d/config_replica.xml" },
 ]
 output.type = "zone"
 output.intermediate_only = true
 setup_hint = "Run `cargo xtask download clickhouse` to download the necessary binaries"
 
 [package.clickhouse_keeper]
+# This service runs a keeper for a replicated ClickHouse cluster.
+# It is complementary to the clickhouse_server service.
+# One cannot be run without the other.
service_name = "clickhouse_keeper" only_for_targets.image = "standard" source.type = "composite" diff --git a/package/src/bin/omicron-package.rs b/package/src/bin/omicron-package.rs index b2b8703015..cd88345d0a 100644 --- a/package/src/bin/omicron-package.rs +++ b/package/src/bin/omicron-package.rs @@ -265,12 +265,19 @@ async fn do_target( format!("failed to create directory {}", target_dir) })?; match subcommand { - TargetCommand::Create { image, machine, switch, rack_topology } => { + TargetCommand::Create { + image, + machine, + switch, + rack_topology, + clickhouse_topology, + } => { let target = KnownTarget::new( image.clone(), machine.clone(), switch.clone(), rack_topology.clone(), + clickhouse_topology.clone(), )?; let path = get_single_target(&target_dir, name).await?; diff --git a/package/src/lib.rs b/package/src/lib.rs index 2009de9dfe..b37c1774fd 100644 --- a/package/src/lib.rs +++ b/package/src/lib.rs @@ -68,6 +68,21 @@ pub enum TargetCommand { /// fail in a single-sled environment. `single-sled` relaxes this /// requirement. rack_topology: crate::target::RackTopology, + + #[clap( + short, + long, + default_value = Some("single-node"), + required = false + )] + // TODO (https://github.com/oxidecomputer/omicron/issues/4148): Remove + // once single-node functionality is removed. + /// Specify whether clickhouse will be deployed as a replicated cluster + /// or single-node configuration. + /// + /// Replicated cluster configuration is an experimental feature to be + /// used only for testing. + clickhouse_topology: crate::target::ClickhouseTopology, }, /// List all existing targets List, diff --git a/package/src/target.rs b/package/src/target.rs index 589dba7870..6a6cbd32d8 100644 --- a/package/src/target.rs +++ b/package/src/target.rs @@ -62,6 +62,18 @@ pub enum RackTopology { SingleSled, } +/// Topology of the ClickHouse installation within the rack. +#[derive(Clone, Debug, strum::EnumString, strum::Display, ValueEnum)] +#[strum(serialize_all = "kebab-case")] +#[clap(rename_all = "kebab-case")] +pub enum ClickhouseTopology { + /// Use configurations suitable for a replicated ClickHouse cluster deployment. + ReplicatedCluster, + + /// Use configurations suitable for a single-node ClickHouse deployment. + SingleNode, +} + /// A strongly-typed variant of [Target]. 
#[derive(Clone, Debug)]
pub struct KnownTarget {
@@ -69,6 +81,7 @@ pub struct KnownTarget {
     machine: Option<Machine>,
     switch: Option<Switch>,
     rack_topology: RackTopology,
+    clickhouse_topology: ClickhouseTopology,
 }
 
 impl KnownTarget {
@@ -77,6 +90,7 @@ impl KnownTarget {
         machine: Option<Machine>,
         switch: Option<Switch>,
         rack_topology: RackTopology,
+        clickhouse_topology: ClickhouseTopology,
     ) -> Result<Self> {
         if matches!(image, Image::Trampoline) {
             if machine.is_some() {
@@ -93,7 +107,7 @@ impl KnownTarget {
             bail!("'switch=asic' is only valid with 'machine=gimlet'");
         }
 
-        Ok(Self { image, machine, switch, rack_topology })
+        Ok(Self { image, machine, switch, rack_topology, clickhouse_topology })
     }
 }
 
@@ -104,6 +118,7 @@ impl Default for KnownTarget {
             machine: Some(Machine::NonGimlet),
             switch: Some(Switch::Stub),
             rack_topology: RackTopology::MultiSled,
+            clickhouse_topology: ClickhouseTopology::SingleNode,
         }
     }
 }
@@ -119,6 +134,10 @@ impl From<KnownTarget> for Target {
             map.insert("switch".to_string(), switch.to_string());
         }
         map.insert("rack-topology".to_string(), kt.rack_topology.to_string());
+        map.insert(
+            "clickhouse-topology".to_string(),
+            kt.clickhouse_topology.to_string(),
+        );
         Target(map)
     }
 }
@@ -140,6 +159,7 @@ impl std::str::FromStr for KnownTarget {
         let mut machine = None;
         let mut switch = None;
         let mut rack_topology = None;
+        let mut clickhouse_topology = None;
 
         for (k, v) in target.0.into_iter() {
             match k.as_str() {
@@ -155,6 +175,9 @@ impl std::str::FromStr for KnownTarget {
                 "rack-topology" => {
                     rack_topology = Some(v.parse()?);
                 }
+                "clickhouse-topology" => {
+                    clickhouse_topology = Some(v.parse()?);
+                }
                 _ => {
                     bail!(
                         "Unknown target key {k}\nValid keys include: [{}]",
@@ -173,6 +196,7 @@ impl std::str::FromStr for KnownTarget {
             machine,
             switch,
             rack_topology.unwrap_or(RackTopology::MultiSled),
+            clickhouse_topology.unwrap_or(ClickhouseTopology::SingleNode),
         )
     }
 }
diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs
index c9ed0c2248..8c26d0bf58 100644
--- a/sled-agent/src/rack_setup/plan/service.rs
+++ b/sled-agent/src/rack_setup/plan/service.rs
@@ -58,14 +58,23 @@ const OXIMETER_COUNT: usize = 1;
 // when Nexus provisions Clickhouse.
 // TODO(https://github.com/oxidecomputer/omicron/issues/4000): Use
 // omicron_common::policy::CLICKHOUSE_SERVER_REDUNDANCY once we enable
-// replicated ClickHouse
+// replicated ClickHouse.
+// Set to 0 when testing replicated ClickHouse.
 const CLICKHOUSE_COUNT: usize = 1;
 // TODO(https://github.com/oxidecomputer/omicron/issues/732): Remove
 // when Nexus provisions Clickhouse keeper.
 // TODO(https://github.com/oxidecomputer/omicron/issues/4000): Use
 // omicron_common::policy::CLICKHOUSE_KEEPER_REDUNDANCY once we enable
 // replicated ClickHouse
+// Set to 3 when testing replicated ClickHouse.
 const CLICKHOUSE_KEEPER_COUNT: usize = 0;
+// TODO(https://github.com/oxidecomputer/omicron/issues/732): Remove
+// when Nexus provisions Clickhouse server.
+// TODO(https://github.com/oxidecomputer/omicron/issues/4000): Use
+// omicron_common::policy::CLICKHOUSE_SERVER_REDUNDANCY once we enable
+// replicated ClickHouse.
+// Set to 2 when testing replicated ClickHouse.
+const CLICKHOUSE_SERVER_COUNT: usize = 0;
 // TODO(https://github.com/oxidecomputer/omicron/issues/732): Remove.
 // when Nexus provisions Crucible.
 const MINIMUM_U2_COUNT: usize = 3;
@@ -628,6 +637,47 @@ impl Plan {
             });
         }
 
+        // Provision Clickhouse server zones, continuing to stripe across sleds.
+        // TODO(https://github.com/oxidecomputer/omicron/issues/732): Remove
+        // Temporary linter rule until replicated Clickhouse is enabled
+        #[allow(clippy::reversed_empty_ranges)]
+        for _ in 0..CLICKHOUSE_SERVER_COUNT {
+            let sled = {
+                let which_sled =
+                    sled_allocator.next().ok_or(PlanError::NotEnoughSleds)?;
+                &mut sled_info[which_sled]
+            };
+            let id = OmicronZoneUuid::new_v4();
+            let ip = sled.addr_alloc.next().expect("Not enough addrs");
+            // TODO: This may need to be a different port if/when we have
+            // single node and replicated running side by side as per stage 1
+            // of RFD 468.
+            let port = omicron_common::address::CLICKHOUSE_PORT;
+            let address = SocketAddrV6::new(ip, port, 0, 0);
+            dns_builder
+                .host_zone_with_one_backend(
+                    id,
+                    ip,
+                    ServiceName::ClickhouseServer,
+                    port,
+                )
+                .unwrap();
+            let dataset_name =
+                sled.alloc_dataset_from_u2s(DatasetType::ClickhouseServer)?;
+            let filesystem_pool = Some(dataset_name.pool().clone());
+            sled.request.zones.push(OmicronZoneConfig {
+                // TODO-cleanup use TypedUuid everywhere
+                id: id.into_untyped_uuid(),
+                underlay_address: ip,
+                zone_type: OmicronZoneType::ClickhouseServer {
+                    address,
+                    dataset: OmicronZoneDataset {
+                        pool_name: dataset_name.pool().clone(),
+                    },
+                },
+                filesystem_pool,
+            });
+        }
+
         // Provision Clickhouse Keeper zones, continuing to stripe across sleds.
         // TODO(https://github.com/oxidecomputer/omicron/issues/732): Remove
         // Temporary linter rule until replicated Clickhouse is enabled
diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs
index 32cf844e6d..abc50aa06c 100644
--- a/sled-agent/src/services.rs
+++ b/sled-agent/src/services.rs
@@ -1618,18 +1618,82 @@ impl ServiceManager {
                 zone: OmicronZoneConfig {
                     zone_type: OmicronZoneType::ClickhouseServer { .. },
-                    underlay_address: _,
+                    underlay_address,
                     ..
                 },
             ..
}) => { - // We aren't yet deploying this service - error!( - &self.inner.log, - "Deploying ClickhouseServer zones is not yet supported" - ); + let Some(info) = self.inner.sled_info.get() else { + return Err(Error::SledAgentNotReady); + }; + + let listen_addr = *underlay_address; + let listen_port = CLICKHOUSE_PORT.to_string(); + + let nw_setup_service = Self::zone_network_setup_install( + Some(&info.underlay_address), + &installed_zone, + &[listen_addr], + )?; + + let dns_service = Self::dns_install(info, None, &None).await?; + + let config = PropertyGroupBuilder::new("config") + .add_property( + "listen_addr", + "astring", + listen_addr.to_string(), + ) + .add_property("listen_port", "astring", listen_port) + .add_property("store", "astring", "/data"); + let clickhouse_server_service = + ServiceBuilder::new("oxide/clickhouse_server") + .add_instance( + ServiceInstanceBuilder::new("default") + .add_property_group(config), + ); + + let ch_address = + SocketAddr::new(IpAddr::V6(listen_addr), CLICKHOUSE_PORT) + .to_string(); + + let admin_address = SocketAddr::new( + IpAddr::V6(listen_addr), + CLICKHOUSE_ADMIN_PORT, + ) + .to_string(); + + let clickhouse_admin_config = + PropertyGroupBuilder::new("config") + .add_property( + "clickhouse_address", + "astring", + ch_address, + ) + .add_property("http_address", "astring", admin_address); + let clickhouse_admin_service = + ServiceBuilder::new("oxide/clickhouse-admin").add_instance( + ServiceInstanceBuilder::new("default") + .add_property_group(clickhouse_admin_config), + ); - todo!() + let profile = ProfileBuilder::new("omicron") + .add_service(nw_setup_service) + .add_service(disabled_ssh_service) + .add_service(clickhouse_server_service) + .add_service(dns_service) + .add_service(enabled_dns_client_service) + .add_service(clickhouse_admin_service); + profile + .add_to_zone(&self.inner.log, &installed_zone) + .await + .map_err(|err| { + Error::io( + "Failed to setup clickhouse server profile", + err, + ) + })?; + RunningZone::boot(installed_zone).await? } ZoneArgs::Omicron(OmicronZoneConfigLocal { diff --git a/smf/clickhouse/method_script.sh b/smf/clickhouse/method_script.sh index 224d759cf3..bb5dd960a1 100755 --- a/smf/clickhouse/method_script.sh +++ b/smf/clickhouse/method_script.sh @@ -10,136 +10,13 @@ LISTEN_ADDR="$(svcprop -c -p config/listen_addr "${SMF_FMRI}")" LISTEN_PORT="$(svcprop -c -p config/listen_port "${SMF_FMRI}")" DATASTORE="$(svcprop -c -p config/store "${SMF_FMRI}")" -# TEMPORARY: Racks will be set up with single node ClickHouse until -# Nexus provisions services so there is no divergence between racks -# https://github.com/oxidecomputer/omicron/issues/732 -single_node=true +args=( +"--log-file" "/var/tmp/clickhouse-server.log" +"--errorlog-file" "/var/tmp/clickhouse-server.errlog" +"--" +"--path" "${DATASTORE}" +"--listen_host" "$LISTEN_ADDR" +"--http_port" "$LISTEN_PORT" +) -command=() -# TODO((https://github.com/oxidecomputer/omicron/issues/4000)): Remove single node mode once all racks are running in replicated mode -if $single_node -then - command+=( - "/opt/oxide/clickhouse/clickhouse" "server" - "--log-file" "/var/tmp/clickhouse-server.log" - "--errorlog-file" "/var/tmp/clickhouse-server.errlog" - "--" - "--path" "${DATASTORE}" - "--listen_host" "$LISTEN_ADDR" - "--http_port" "$LISTEN_PORT" - ) -else - # Retrieve hostnames (SRV records in internal DNS) of the clickhouse nodes. 
- CH_ADDRS="$(/opt/oxide/internal-dns-cli/bin/dnswait clickhouse -H)" - - if [[ -z "$CH_ADDRS" ]]; then - printf 'ERROR: found no hostnames for other ClickHouse nodes\n' >&2 - exit "$SMF_EXIT_ERR_CONFIG" - fi - - declare -a nodes=($CH_ADDRS) - - for i in "${nodes[@]}" - do - if ! grep -q "host.control-plane.oxide.internal" <<< "${i}"; then - printf 'ERROR: retrieved ClickHouse hostname does not match the expected format\n' >&2 - exit "$SMF_EXIT_ERR_CONFIG" - fi - done - - # Assign hostnames to replicas - REPLICA_HOST_01="${nodes[0]}" - REPLICA_HOST_02="${nodes[1]}" - - # Retrieve hostnames (SRV records in internal DNS) of the keeper nodes. - K_ADDRS="$(/opt/oxide/internal-dns-cli/bin/dnswait clickhouse-keeper -H)" - - if [[ -z "$K_ADDRS" ]]; then - printf 'ERROR: found no hostnames for other ClickHouse Keeper nodes\n' >&2 - exit "$SMF_EXIT_ERR_CONFIG" - fi - - declare -a keepers=($K_ADDRS) - - for i in "${keepers[@]}" - do - if ! grep -q "host.control-plane.oxide.internal" <<< "${i}"; then - printf 'ERROR: retrieved ClickHouse Keeper hostname does not match the expected format\n' >&2 - exit "$SMF_EXIT_ERR_CONFIG" - fi - done - - if [[ "${#keepers[@]}" != 3 ]] - then - printf "ERROR: expected 3 ClickHouse Keeper hosts, found "${#keepers[@]}" instead\n" >&2 - exit "$SMF_EXIT_ERR_CONFIG" - fi - - # Identify the node type this is as this will influence how the config is constructed - # TODO(https://github.com/oxidecomputer/omicron/issues/3824): There are probably much - # better ways to do this service discovery, but this works for now. - # The services contain the same IDs as the hostnames. - CLICKHOUSE_SVC="$(zonename | tr -dc [:digit:])" - REPLICA_IDENTIFIER_01="$( echo "${REPLICA_HOST_01}" | tr -dc [:digit:])" - REPLICA_IDENTIFIER_02="$( echo "${REPLICA_HOST_02}" | tr -dc [:digit:])" - if [[ $REPLICA_IDENTIFIER_01 == $CLICKHOUSE_SVC ]] - then - REPLICA_DISPLAY_NAME="oximeter_cluster node 1" - REPLICA_NUMBER="01" - elif [[ $REPLICA_IDENTIFIER_02 == $CLICKHOUSE_SVC ]] - then - REPLICA_DISPLAY_NAME="oximeter_cluster node 2" - REPLICA_NUMBER="02" - else - printf 'ERROR: service name does not match any of the identified ClickHouse hostnames\n' >&2 - exit "$SMF_EXIT_ERR_CONFIG" - fi - - # Setting environment variables this way is best practice, but has the downside of - # obscuring the field values to anyone ssh-ing into the zone. 
To mitigate this, - # we will be saving them to ${DATASTORE}/config_env_vars - export CH_LOG="${DATASTORE}/clickhouse-server.log" - export CH_ERROR_LOG="${DATASTORE}/clickhouse-server.errlog" - export CH_REPLICA_DISPLAY_NAME=${REPLICA_DISPLAY_NAME} - export CH_LISTEN_ADDR=${LISTEN_ADDR} - export CH_LISTEN_PORT=${LISTEN_PORT} - export CH_DATASTORE=${DATASTORE} - export CH_TMP_PATH="${DATASTORE}/tmp/" - export CH_USER_FILES_PATH="${DATASTORE}/user_files/" - export CH_USER_LOCAL_DIR="${DATASTORE}/access/" - export CH_FORMAT_SCHEMA_PATH="${DATASTORE}/format_schemas/" - export CH_REPLICA_NUMBER=${REPLICA_NUMBER} - export CH_REPLICA_HOST_01=${REPLICA_HOST_01} - export CH_REPLICA_HOST_02=${REPLICA_HOST_02} - export CH_KEEPER_HOST_01="${keepers[0]}" - export CH_KEEPER_HOST_02="${keepers[1]}" - export CH_KEEPER_HOST_03="${keepers[2]}" - - content="CH_LOG="${CH_LOG}"\n\ - CH_ERROR_LOG="${CH_ERROR_LOG}"\n\ - CH_REPLICA_DISPLAY_NAME="${CH_REPLICA_DISPLAY_NAME}"\n\ - CH_LISTEN_ADDR="${CH_LISTEN_ADDR}"\n\ - CH_LISTEN_PORT="${CH_LISTEN_PORT}"\n\ - CH_DATASTORE="${CH_DATASTORE}"\n\ - CH_TMP_PATH="${CH_TMP_PATH}"\n\ - CH_USER_FILES_PATH="${CH_USER_FILES_PATH}"\n\ - CH_USER_LOCAL_DIR="${CH_USER_LOCAL_DIR}"\n\ - CH_FORMAT_SCHEMA_PATH="${CH_FORMAT_SCHEMA_PATH}"\n\ - CH_REPLICA_NUMBER="${CH_REPLICA_NUMBER}"\n\ - CH_REPLICA_HOST_01="${CH_REPLICA_HOST_01}"\n\ - CH_REPLICA_HOST_02="${CH_REPLICA_HOST_02}"\n\ - CH_KEEPER_HOST_01="${CH_KEEPER_HOST_01}"\n\ - CH_KEEPER_HOST_02="${CH_KEEPER_HOST_02}"\n\ - CH_KEEPER_HOST_03="${CH_KEEPER_HOST_03}"" - - echo $content >> "${DATASTORE}/config_env_vars" - - - # The clickhouse binary must be run from within the directory that contains it. - # Otherwise, it does not automatically detect the configuration files, nor does - # it append them when necessary - cd /opt/oxide/clickhouse/ - command+=("./clickhouse" "server") -fi - -exec "${command[@]}" & \ No newline at end of file +exec /opt/oxide/clickhouse/clickhouse server "${args[@]}" & \ No newline at end of file diff --git a/smf/clickhouse/config_replica.xml b/smf/clickhouse_server/config_replica.xml similarity index 100% rename from smf/clickhouse/config_replica.xml rename to smf/clickhouse_server/config_replica.xml diff --git a/smf/clickhouse_server/manifest.xml b/smf/clickhouse_server/manifest.xml new file mode 100644 index 0000000000..8ab4f78bcb --- /dev/null +++ b/smf/clickhouse_server/manifest.xml @@ -0,0 +1,46 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/smf/clickhouse_server/method_script.sh b/smf/clickhouse_server/method_script.sh new file mode 100755 index 0000000000..a0d61072ac --- /dev/null +++ b/smf/clickhouse_server/method_script.sh @@ -0,0 +1,124 @@ +#!/bin/bash + +set -x +set -o errexit +set -o pipefail + +. /lib/svc/share/smf_include.sh + +LISTEN_ADDR="$(svcprop -c -p config/listen_addr "${SMF_FMRI}")" +LISTEN_PORT="$(svcprop -c -p config/listen_port "${SMF_FMRI}")" +DATASTORE="$(svcprop -c -p config/store "${SMF_FMRI}")" + +# Retrieve hostnames (SRV records in internal DNS) of the clickhouse nodes. +CH_ADDRS="$(/opt/oxide/internal-dns-cli/bin/dnswait clickhouse-server -H)" + +if [[ -z "$CH_ADDRS" ]]; then + printf 'ERROR: found no hostnames for other ClickHouse server nodes\n' >&2 + exit "$SMF_EXIT_ERR_CONFIG" +fi + +declare -a nodes=($CH_ADDRS) + +for i in "${nodes[@]}" +do + if ! 
grep -q "host.control-plane.oxide.internal" <<< "${i}"; then + printf 'ERROR: retrieved ClickHouse hostname does not match the expected format\n' >&2 + exit "$SMF_EXIT_ERR_CONFIG" + fi +done + +# Assign hostnames to replicas +REPLICA_HOST_01="${nodes[0]}" +REPLICA_HOST_02="${nodes[1]}" + +# Retrieve hostnames (SRV records in internal DNS) of the keeper nodes. +K_ADDRS="$(/opt/oxide/internal-dns-cli/bin/dnswait clickhouse-keeper -H)" + +if [[ -z "$K_ADDRS" ]]; then + printf 'ERROR: found no hostnames for other ClickHouse Keeper nodes\n' >&2 + exit "$SMF_EXIT_ERR_CONFIG" +fi + +declare -a keepers=($K_ADDRS) + +for i in "${keepers[@]}" +do + if ! grep -q "host.control-plane.oxide.internal" <<< "${i}"; then + printf 'ERROR: retrieved ClickHouse Keeper hostname does not match the expected format\n' >&2 + exit "$SMF_EXIT_ERR_CONFIG" + fi +done + +if [[ "${#keepers[@]}" != 3 ]] +then + printf "ERROR: expected 3 ClickHouse Keeper hosts, found "${#keepers[@]}" instead\n" >&2 + exit "$SMF_EXIT_ERR_CONFIG" +fi + +# Identify the node type this is as this will influence how the config is constructed +# TODO(https://github.com/oxidecomputer/omicron/issues/3824): There are probably much +# better ways to do this service discovery, but this works for now. +# The services contain the same IDs as the hostnames. +CLICKHOUSE_SVC="$(zonename | tr -dc [:digit:])" +REPLICA_IDENTIFIER_01="$( echo "${REPLICA_HOST_01}" | tr -dc [:digit:])" +REPLICA_IDENTIFIER_02="$( echo "${REPLICA_HOST_02}" | tr -dc [:digit:])" +if [[ $REPLICA_IDENTIFIER_01 == $CLICKHOUSE_SVC ]] +then + REPLICA_DISPLAY_NAME="oximeter_cluster node 1" + REPLICA_NUMBER="01" +elif [[ $REPLICA_IDENTIFIER_02 == $CLICKHOUSE_SVC ]] +then + REPLICA_DISPLAY_NAME="oximeter_cluster node 2" + REPLICA_NUMBER="02" +else + printf 'ERROR: service name does not match any of the identified ClickHouse hostnames\n' >&2 + exit "$SMF_EXIT_ERR_CONFIG" +fi + +# Setting environment variables this way is best practice, but has the downside of +# obscuring the field values to anyone ssh-ing into the zone. 
To mitigate this, +# we will be saving them to ${DATASTORE}/config_env_vars +export CH_LOG="${DATASTORE}/clickhouse-server.log" +export CH_ERROR_LOG="${DATASTORE}/clickhouse-server.errlog" +export CH_REPLICA_DISPLAY_NAME=${REPLICA_DISPLAY_NAME} +export CH_LISTEN_ADDR=${LISTEN_ADDR} +export CH_LISTEN_PORT=${LISTEN_PORT} +export CH_DATASTORE=${DATASTORE} +export CH_TMP_PATH="${DATASTORE}/tmp/" +export CH_USER_FILES_PATH="${DATASTORE}/user_files/" +export CH_USER_LOCAL_DIR="${DATASTORE}/access/" +export CH_FORMAT_SCHEMA_PATH="${DATASTORE}/format_schemas/" +export CH_REPLICA_NUMBER=${REPLICA_NUMBER} +export CH_REPLICA_HOST_01=${REPLICA_HOST_01} +export CH_REPLICA_HOST_02=${REPLICA_HOST_02} +export CH_KEEPER_HOST_01="${keepers[0]}" +export CH_KEEPER_HOST_02="${keepers[1]}" +export CH_KEEPER_HOST_03="${keepers[2]}" + +content="CH_LOG="${CH_LOG}"\n\ +CH_ERROR_LOG="${CH_ERROR_LOG}"\n\ +CH_REPLICA_DISPLAY_NAME="${CH_REPLICA_DISPLAY_NAME}"\n\ +CH_LISTEN_ADDR="${CH_LISTEN_ADDR}"\n\ +CH_LISTEN_PORT="${CH_LISTEN_PORT}"\n\ +CH_DATASTORE="${CH_DATASTORE}"\n\ +CH_TMP_PATH="${CH_TMP_PATH}"\n\ +CH_USER_FILES_PATH="${CH_USER_FILES_PATH}"\n\ +CH_USER_LOCAL_DIR="${CH_USER_LOCAL_DIR}"\n\ +CH_FORMAT_SCHEMA_PATH="${CH_FORMAT_SCHEMA_PATH}"\n\ +CH_REPLICA_NUMBER="${CH_REPLICA_NUMBER}"\n\ +CH_REPLICA_HOST_01="${CH_REPLICA_HOST_01}"\n\ +CH_REPLICA_HOST_02="${CH_REPLICA_HOST_02}"\n\ +CH_KEEPER_HOST_01="${CH_KEEPER_HOST_01}"\n\ +CH_KEEPER_HOST_02="${CH_KEEPER_HOST_02}"\n\ +CH_KEEPER_HOST_03="${CH_KEEPER_HOST_03}"" + +echo $content >> "${DATASTORE}/config_env_vars" + + +# The clickhouse binary must be run from within the directory that contains it. +# Otherwise, it does not automatically detect the configuration files, nor does +# it append them when necessary +cd /opt/oxide/clickhouse_server/ + +exec ./clickhouse server & \ No newline at end of file diff --git a/smf/oximeter/config.toml b/smf/oximeter/replicated-cluster/config.toml similarity index 91% rename from smf/oximeter/config.toml rename to smf/oximeter/replicated-cluster/config.toml index ca14fe6ec8..f7958e5eb1 100644 --- a/smf/oximeter/config.toml +++ b/smf/oximeter/replicated-cluster/config.toml @@ -3,6 +3,7 @@ [db] batch_size = 1000 batch_interval = 5 # In seconds +replicated = true [log] level = "debug" diff --git a/smf/oximeter/single-node/config.toml b/smf/oximeter/single-node/config.toml new file mode 100644 index 0000000000..bc0418159c --- /dev/null +++ b/smf/oximeter/single-node/config.toml @@ -0,0 +1,12 @@ +# Example configuration file for running an oximeter collector server + +[db] +batch_size = 1000 +batch_interval = 5 # In seconds +replicated = false + +[log] +level = "debug" +mode = "file" +path = "/dev/stdout" +if_exists = "append" From 256c06663a6298f8fe7f33a003888c6dd923f3db Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 20 Aug 2024 09:52:33 -0700 Subject: [PATCH 79/91] [sp-sim] rudimentary simulation of sensors (#6313) In order to develop Oximeter metrics for SP sensor readings, emitted by MGS, we would like the `sp-sim` binary to be able to simulate the protocol for reading SP sensors. This branch adds a fairly rudimentary implementation of this: components configured in the `sp-sim` config file may now include an array of one or more `sensors`, like this: ```toml # ... 
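# (each sensor entry pairs a definition, its name and kind, with
# hard-coded last_data/last_error state)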
[[simulated_sps.gimlet.components]] id = "dev-0" device = "tmp117" description = "FAKE Southwest temperature sensor" capabilities = 2 presence = "Present" sensors = [ { name = "Southwest", kind = "Temperature", last_data.value = 41.7890625, last_data.timestamp = 1234 }, ] ``` Once this is added, the simulated SP will implement the `num_component_details`, `component_details`, and `read_sensor` functions for any such components: ```console eliza@noctis ~/Code/oxide/omicron $ curl -s http://127.0.0.1:11111/sp/sled/0/component | jq { "components": [ { "component": "sp3-host-cpu", "device": "sp3-host-cpu", "serial_number": null, "description": "FAKE host cpu", "capabilities": 0, "presence": "present" }, { "component": "dev-0", "device": "tmp117", "serial_number": null, "description": "FAKE Southwest temperature sensor", "capabilities": 2, "presence": "present" } ] } eliza@noctis ~/Code/oxide/omicron $ curl -s http://127.0.0.1:11111/sp/sled/0/component/dev-0 | jq [ { "type": "measurement", "name": "Southwest", "kind": { "kind": "temperature" }, "value": 41.789062 } ] ``` In the future, I would like to extend this functionality substantially: it would be nice to add a notion of a simulated global timestamp, and a mechanism for changing the values of sensor readings dynamically. I think this would be useful for testing the timebase synchronization code we will no doubt need to write eventually for this. But, for now, being able to hard-code sensor values is a start. --- sp-sim/examples/config.toml | 21 ++++ sp-sim/src/config.rs | 14 +++ sp-sim/src/gimlet.rs | 33 ++++-- sp-sim/src/lib.rs | 1 + sp-sim/src/sensors.rs | 218 ++++++++++++++++++++++++++++++++++++ sp-sim/src/sidecar.rs | 35 ++++-- 6 files changed, 307 insertions(+), 15 deletions(-) create mode 100644 sp-sim/src/sensors.rs diff --git a/sp-sim/examples/config.toml b/sp-sim/examples/config.toml index cf338ecf2e..f53ea7cfd8 100644 --- a/sp-sim/examples/config.toml +++ b/sp-sim/examples/config.toml @@ -24,6 +24,16 @@ capabilities = 0 presence = "Present" serial_console = "[::1]:33312" +[[simulated_sps.gimlet.components]] +id = "dev-0" +device = "tmp117" +description = "FAKE Southwest temperature sensor" +capabilities = 2 +presence = "Present" +sensors = [ + { name = "Southwest", kind = "Temperature", last_data.value = 41.7890625, last_data.timestamp = 1234 }, +] + [[simulated_sps.gimlet]] multicast_addr = "ff15:0:1de::2" bind_addrs = ["[::]:33320", "[::]:33321"] @@ -39,6 +49,17 @@ capabilities = 0 presence = "Present" serial_console = "[::1]:33322" +[[simulated_sps.gimlet.components]] +id = "dev-0" +device = "tmp117" +description = "FAKE Southwest temperature sensor" +capabilities = 2 +presence = "Present" +sensors = [ + { name = "Southwest", kind = "Temperature", last_data.value = 41.7890625, last_data.timestamp = 1234 }, +] + + [log] # Show log messages of this level and more severe level = "debug" diff --git a/sp-sim/src/config.rs b/sp-sim/src/config.rs index b64953e5ed..d45e956dee 100644 --- a/sp-sim/src/config.rs +++ b/sp-sim/src/config.rs @@ -5,6 +5,7 @@ //! Interfaces for parsing configuration files and working with a simulated SP //! configuration +use crate::sensors; use dropshot::ConfigLogging; use gateway_messages::DeviceCapabilities; use gateway_messages::DevicePresence; @@ -59,6 +60,9 @@ pub struct SpComponentConfig { /// /// Only supported for components inside a [`GimletConfig`]. 
pub serial_console: Option<SocketAddrV6>,
+
+    #[serde(skip_serializing_if = "Vec::is_empty", default)]
+    pub sensors: Vec<SensorConfig>,
 }
 
 /// Configuration of a simulated sidecar SP
@@ -93,6 +97,16 @@ pub struct Config {
     pub log: ConfigLogging,
 }
 
+/// Configuration for a component's sensor readings.
+#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
+pub struct SensorConfig {
+    #[serde(flatten)]
+    pub def: sensors::SensorDef,
+
+    #[serde(flatten)]
+    pub state: sensors::SensorState,
+}
+
 impl Config {
     /// Load a `Config` from the given TOML file
     ///
diff --git a/sp-sim/src/gimlet.rs b/sp-sim/src/gimlet.rs
index e980a4b67d..70c2e72fcb 100644
--- a/sp-sim/src/gimlet.rs
+++ b/sp-sim/src/gimlet.rs
@@ -6,6 +6,7 @@ use crate::config::GimletConfig;
 use crate::config::SpComponentConfig;
 use crate::helpers::rot_slot_id_from_u16;
 use crate::helpers::rot_slot_id_to_u16;
+use crate::sensors::Sensors;
 use crate::serial_number_padded;
 use crate::server;
 use crate::server::SimSpHandler;
@@ -630,6 +631,7 @@ struct Handler {
     startup_options: StartupOptions,
     update_state: SimSpUpdate,
     reset_pending: Option,
+    sensors: Sensors,
 
     last_request_handled: Option,
 
@@ -665,9 +667,12 @@ impl Handler {
                 .push(&*Box::leak(c.description.clone().into_boxed_str()));
         }
 
+        let sensors = Sensors::from_component_configs(&components);
+
         Self {
             log,
             components,
+            sensors,
             leaked_component_device_strings,
             leaked_component_description_strings,
             serial_number,
@@ -1206,13 +1211,16 @@ impl SpHandler for Handler {
         port: SpPort,
         component: SpComponent,
     ) -> Result<u32, SpError> {
+        let num_details =
+            self.sensors.num_component_details(&component).unwrap_or(0);
         debug!(
-            &self.log, "asked for component details (returning 0 details)";
+            &self.log, "asked for number of component details";
             "sender" => %sender,
             "port" => ?port,
             "component" => ?component,
+            "num_details" => num_details
         );
-        Ok(0)
+        Ok(num_details)
     }
 
     fn component_details(
@@ -1220,9 +1228,20 @@ impl SpHandler for Handler {
         component: SpComponent,
         index: BoundsChecked,
     ) -> ComponentDetails {
-        // We return 0 for all components, so we should never be called (`index`
-        // would have to have been bounds checked to live in 0..0).
-        unreachable!("asked for {component:?} details index {index:?}")
+        let Some(sensor_details) =
+            self.sensors.component_details(&component, index)
+        else {
+            unreachable!(
+                "this is a gimlet, so it should have no port status details"
+            );
+        };
+        debug!(
+            &self.log, "asked for component details for a sensor";
+            "component" => ?component,
+            "index" => index.0,
+            "details" => ?sensor_details
+        );
+        sensor_details
     }
 
     fn component_clear_status(
@@ -1445,9 +1464,9 @@ impl SpHandler for Handler {
 
     fn read_sensor(
         &mut self,
-        _request: gateway_messages::SensorRequest,
+        request: gateway_messages::SensorRequest,
     ) -> std::result::Result<SensorResponse, SpError> {
-        Err(SpError::RequestUnsupportedForSp)
+        self.sensors.read_sensor(request).map_err(SpError::Sensor)
     }
 
     fn current_time(&mut self) -> std::result::Result<u64, SpError> {
diff --git a/sp-sim/src/lib.rs b/sp-sim/src/lib.rs
index 0f340ed642..15f2034aa8 100644
--- a/sp-sim/src/lib.rs
+++ b/sp-sim/src/lib.rs
@@ -5,6 +5,7 @@
 pub mod config;
 mod gimlet;
 mod helpers;
+mod sensors;
 mod server;
 mod sidecar;
 mod update;
diff --git a/sp-sim/src/sensors.rs b/sp-sim/src/sensors.rs
new file mode 100644
index 0000000000..fc684af01b
--- /dev/null
+++ b/sp-sim/src/sensors.rs
@@ -0,0 +1,218 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0.
If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+use crate::config::SpComponentConfig;
+use gateway_messages::measurement::MeasurementError;
+use gateway_messages::measurement::MeasurementKind;
+use gateway_messages::sp_impl::BoundsChecked;
+use gateway_messages::ComponentDetails;
+use gateway_messages::DeviceCapabilities;
+use gateway_messages::Measurement;
+use gateway_messages::SensorDataMissing;
+use gateway_messages::SensorError;
+use gateway_messages::SensorReading;
+use gateway_messages::SensorRequest;
+use gateway_messages::SensorRequestKind;
+use gateway_messages::SensorResponse;
+use gateway_messages::SpComponent;
+
+use std::collections::HashMap;
+
+pub(crate) struct Sensors {
+    by_component: HashMap<SpComponent, Vec<u32>>,
+    sensors: Vec<Sensor>,
+}
+
+#[derive(Debug)]
+struct Sensor {
+    def: SensorDef,
+    state: SensorState,
+}
+
+#[derive(Clone, Debug, serde::Deserialize, serde::Serialize, PartialEq)]
+pub struct SensorDef {
+    pub name: String,
+    pub kind: MeasurementKind,
+}
+
+// TODO(eliza): note that currently, we just hardcode these in
+// `MeasurementConfig`. Eventually, it would be neat to allow the sensor to be
+// changed dynamically as part of a simulation.
+#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, PartialEq)]
+pub struct SensorState {
+    #[serde(default)]
+    pub last_error: Option<LastError>,
+
+    #[serde(default)]
+    pub last_data: Option<LastData>,
+}
+
+#[derive(
+    Clone, Copy, Debug, serde::Serialize, serde::Deserialize, PartialEq,
+)]
+pub struct LastError {
+    pub timestamp: u64,
+    pub value: SensorDataMissing,
+}
+
+#[derive(
+    Clone, Copy, Debug, serde::Serialize, serde::Deserialize, PartialEq,
+)]
+pub struct LastData {
+    pub timestamp: u64,
+    pub value: f32,
+}
+
+impl SensorState {
+    fn last_reading(&self) -> SensorReading {
+        match self {
+            Self { last_data: Some(data), last_error: Some(error) } => {
+                if data.timestamp >= error.timestamp {
+                    SensorReading {
+                        value: Ok(data.value),
+                        timestamp: data.timestamp,
+                    }
+                } else {
+                    SensorReading {
+                        value: Err(error.value),
+                        timestamp: error.timestamp,
+                    }
+                }
+            }
+            Self { last_data: Some(data), last_error: None } => SensorReading {
+                value: Ok(data.value),
+                timestamp: data.timestamp,
+            },
+            Self { last_data: None, last_error: Some(error) } => {
+                SensorReading {
+                    value: Err(error.value),
+                    timestamp: error.timestamp,
+                }
+            }
+            Self { last_data: None, last_error: None } => SensorReading {
+                value: Err(SensorDataMissing::DeviceNotPresent),
+                timestamp: 0, // TODO(eliza): what do?
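+                // (0 is just a placeholder; the simulator currently has no
+                // notion of a global timebase)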
+            },
+        }
+    }
+}
+
+impl Sensors {
+    pub(crate) fn from_component_configs<'a>(
+        cfgs: impl IntoIterator<Item = &'a SpComponentConfig>,
+    ) -> Self {
+        let mut sensors = Vec::new();
+        let mut by_component = HashMap::new();
+        for cfg in cfgs {
+            if cfg.sensors.is_empty() {
+                continue;
+            }
+            if !cfg
+                .capabilities
+                .contains(DeviceCapabilities::HAS_MEASUREMENT_CHANNELS)
+            {
+                panic!(
+                    "invalid component config: a device with sensors should \
+                     have the `HAS_MEASUREMENT_CHANNELS` capability:{cfg:#?}"
+                );
+            }
+
+            let mut ids = Vec::with_capacity(cfg.sensors.len());
+            for sensor in &cfg.sensors {
+                let sensor_id = sensors.len() as u32;
+                sensors.push(Sensor {
+                    def: sensor.def.clone(),
+                    state: sensor.state.clone(),
+                });
+                ids.push(sensor_id)
+            }
+
+            let component = SpComponent::try_from(cfg.id.as_str()).unwrap();
+            let prev = by_component.insert(component, ids);
+            assert!(prev.is_none(), "component ID {component} already exists!");
+        }
+        Self { sensors, by_component }
+    }
+
+    fn sensor_for_component<'sensors>(
+        &'sensors self,
+        component: &SpComponent,
+        index: BoundsChecked,
+    ) -> Option<&'sensors Sensor> {
+        let &id = self.by_component.get(component)?.get(index.0 as usize)?;
+        self.sensors.get(id as usize)
+    }
+
+    pub(crate) fn num_component_details(
+        &self,
+        component: &SpComponent,
+    ) -> Option<u32> {
+        let len = self
+            .by_component
+            .get(component)?
+            .len()
+            .try_into()
+            .expect("why would you have more than `u32::MAX` sensors?");
+        Some(len)
+    }
+
+    /// This method returns an `Option` because the component's details might
+    /// be a port status rather than a measurement, if we eventually decide to
+    /// implement port statuses in the simulated sidecar...
+    pub(crate) fn component_details(
+        &self,
+        component: &SpComponent,
+        index: BoundsChecked,
+    ) -> Option<ComponentDetails> {
+        let sensor = self.sensor_for_component(component, index)?;
+        let value =
+            sensor.state.last_reading().value.map_err(|err| match err {
+                SensorDataMissing::DeviceError => MeasurementError::DeviceError,
+                SensorDataMissing::DeviceNotPresent => {
+                    MeasurementError::NotPresent
+                }
+                SensorDataMissing::DeviceOff => MeasurementError::DeviceOff,
+                SensorDataMissing::DeviceTimeout => {
+                    MeasurementError::DeviceTimeout
+                }
+                SensorDataMissing::DeviceUnavailable => {
+                    MeasurementError::DeviceUnavailable
+                }
+            });
+        Some(ComponentDetails::Measurement(Measurement {
+            name: sensor.def.name.clone(),
+            kind: sensor.def.kind,
+            value,
+        }))
+    }
+
+    pub(crate) fn read_sensor(
+        &self,
+        SensorRequest { id, kind }: SensorRequest,
+    ) -> Result<SensorResponse, SensorError> {
+        let sensor =
+            self.sensors.get(id as usize).ok_or(SensorError::InvalidSensor)?;
+        match kind {
+            SensorRequestKind::LastReading => {
+                Ok(SensorResponse::LastReading(sensor.state.last_reading()))
+            }
+            SensorRequestKind::ErrorCount => {
+                let count =
+                    // TODO(eliza): simulate more than one error...
+                    if sensor.state.last_error.is_some() { 1 } else { 0 };
+                Ok(SensorResponse::ErrorCount(count))
+            }
+            SensorRequestKind::LastData => {
+                let LastData { timestamp, value } =
+                    sensor.state.last_data.ok_or(SensorError::NoReading)?;
+                Ok(SensorResponse::LastData { value, timestamp })
+            }
+            SensorRequestKind::LastError => {
+                let LastError { timestamp, value } =
+                    sensor.state.last_error.ok_or(SensorError::NoReading)?;
+                Ok(SensorResponse::LastError { value, timestamp })
+            }
+        }
+    }
+}
diff --git a/sp-sim/src/sidecar.rs b/sp-sim/src/sidecar.rs
index c2fb2467d8..bef1d26c78 100644
--- a/sp-sim/src/sidecar.rs
+++ b/sp-sim/src/sidecar.rs
@@ -8,6 +8,7 @@ use crate::config::SimulatedSpsConfig;
 use crate::config::SpComponentConfig;
 use crate::helpers::rot_slot_id_from_u16;
 use crate::helpers::rot_slot_id_to_u16;
+use crate::sensors::Sensors;
 use crate::serial_number_padded;
 use crate::server;
 use crate::server::SimSpHandler;
@@ -377,6 +378,7 @@ struct Handler {
     // our life as a simulator.
     leaked_component_device_strings: Vec<&'static str>,
     leaked_component_description_strings: Vec<&'static str>,
+    sensors: Sensors,
 
     serial_number: String,
     ignition: FakeIgnition,
@@ -417,9 +419,12 @@ impl Handler {
                 .push(&*Box::leak(c.description.clone().into_boxed_str()));
         }
 
+        let sensors = Sensors::from_component_configs(&components);
+
         Self {
             log,
             components,
+            sensors,
             leaked_component_device_strings,
             leaked_component_description_strings,
             serial_number,
@@ -929,13 +934,18 @@ impl SpHandler for Handler {
         port: SpPort,
         component: SpComponent,
     ) -> Result<u32, SpError> {
-        warn!(
-            &self.log, "asked for component details (returning 0 details)";
+        let num_sensor_details =
+            self.sensors.num_component_details(&component).unwrap_or(0);
+        // TODO: here is where we might also handle port statuses, if we decide
+        // to simulate that later...
+        debug!(
+            &self.log, "asked for number of component details";
             "sender" => %sender,
             "port" => ?port,
             "component" => ?component,
+            "num_details" => num_sensor_details
         );
-        Ok(0)
+        Ok(num_sensor_details)
     }
 
     fn component_details(
@@ -943,9 +953,18 @@ impl SpHandler for Handler {
         component: SpComponent,
         index: BoundsChecked,
     ) -> ComponentDetails {
-        // We return 0 for all components, so we should never be called (`index`
-        // would have to have been bounds checked to live in 0..0).
-        unreachable!("asked for {component:?} details index {index:?}")
+        let Some(sensor_details) =
+            self.sensors.component_details(&component, index)
+        else {
+            todo!("simulate port status details...");
+        };
+        debug!(
+            &self.log, "asked for component details for a sensor";
+            "component" => ?component,
+            "index" => index.0,
+            "details" => ?sensor_details
+        );
+        sensor_details
     }
 
     fn component_clear_status(
@@ -1163,9 +1182,9 @@ impl SpHandler for Handler {
 
     fn read_sensor(
         &mut self,
-        _request: gateway_messages::SensorRequest,
+        request: gateway_messages::SensorRequest,
     ) -> std::result::Result<SensorResponse, SpError> {
-        Err(SpError::RequestUnsupportedForSp)
+        self.sensors.read_sensor(request).map_err(SpError::Sensor)
     }
 
     fn current_time(&mut self) -> std::result::Result<u64, SpError> {

From 2e0025cc6b0c19e9024419a6d6096c5b49c8d720 Mon Sep 17 00:00:00 2001
From: Rain
Date: Tue, 20 Aug 2024 10:26:17 -0700
Subject: [PATCH 80/91] back out "Update Rust crate tokio to 1.39.2 (#6249)"
 (#6356)

Tokio 1.39 updated its mio dependency to 1.0, which changed the waker
impl on illumos from a self-pipe to eventfd.
That has caused several issues already: * https://github.com/oxidecomputer/helios/issues/169 * https://github.com/oxidecomputer/helios/pull/171 Based on these and the potential for other lurking issues, we're making a policy decision to roll back to 1.38 (mio 0.8) for r10. We can't be off of the train forever so we're aiming to land the 1.39 update early in the r11 cycle. This backs out commit d7d4beaf0dbfa82c6ae0da10a6ce43b3c5a89142. --- Cargo.lock | 15 ++++++++------- Cargo.toml | 2 +- workspace-hack/Cargo.toml | 20 ++++++++++---------- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6bd71f6d38..96a1db4983 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6494,7 +6494,7 @@ dependencies = [ "log", "managed", "memchr", - "mio 1.0.2", + "mio 0.8.11", "nix 0.28.0", "nom", "num-bigint-dig", @@ -10605,27 +10605,28 @@ dependencies = [ [[package]] name = "tokio" -version = "1.39.2" +version = "1.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daa4fb1bc778bd6f04cbfc4bb2d06a7396a8f299dc33ea1900cedaa316f467b1" +checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df" dependencies = [ "backtrace", "bytes", "libc", - "mio 1.0.2", + "mio 0.8.11", + "num_cpus", "parking_lot 0.12.2", "pin-project-lite", "signal-hook-registry", "socket2 0.5.7", "tokio-macros", - "windows-sys 0.52.0", + "windows-sys 0.48.0", ] [[package]] name = "tokio-macros" -version = "2.4.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" +checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 83aea83ddf..ce2b7f8eb0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -577,7 +577,7 @@ textwrap = "0.16.1" test-strategy = "0.3.1" thiserror = "1.0" tofino = { git = "https://github.com/oxidecomputer/tofino", branch = "main" } -tokio = "1.39.2" +tokio = "1.38.1" tokio-postgres = { version = "0.7", features = [ "with-chrono-0_4", "with-uuid-1" ] } tokio-stream = "0.1.15" tokio-tungstenite = "0.20" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 1c58626d2d..014444c542 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -107,7 +107,7 @@ string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.74", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time = { version = "0.3.36", features = ["formatting", "local-offset", "macros", "parsing"] } -tokio = { version = "1.39.2", features = ["full", "test-util"] } +tokio = { version = "1.38.1", features = ["full", "test-util"] } tokio-postgres = { version = "0.7.11", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } tokio-stream = { version = "0.1.15", features = ["net"] } tokio-util = { version = "0.7.11", features = ["codec", "io-util"] } @@ -217,7 +217,7 @@ syn-dff4ba8e3ae991db = { package = "syn", version = "1.0.109", features = ["extr syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.74", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time = { version = "0.3.36", features = ["formatting", "local-offset", "macros", "parsing"] } time-macros = { version = "0.2.18", default-features = false, features = ["formatting", "parsing"] } -tokio = { version = "1.39.2", features = ["full", "test-util"] } +tokio = { version = 
"1.38.1", features = ["full", "test-util"] } tokio-postgres = { version = "0.7.11", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } tokio-stream = { version = "0.1.15", features = ["net"] } tokio-util = { version = "0.7.11", features = ["codec", "io-util"] } @@ -237,7 +237,7 @@ zeroize = { version = "1.7.0", features = ["std", "zeroize_derive"] } [target.x86_64-unknown-linux-gnu.dependencies] dof = { version = "0.3.0", default-features = false, features = ["des"] } linux-raw-sys = { version = "0.4.13", default-features = false, features = ["elf", "errno", "general", "ioctl", "no_std", "std", "system"] } -mio = { version = "1.0.2", features = ["net", "os-ext"] } +mio = { version = "0.8.11", features = ["net", "os-ext"] } nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } @@ -246,35 +246,35 @@ signal-hook-mio = { version = "0.2.4", default-features = false, features = ["su [target.x86_64-unknown-linux-gnu.build-dependencies] dof = { version = "0.3.0", default-features = false, features = ["des"] } linux-raw-sys = { version = "0.4.13", default-features = false, features = ["elf", "errno", "general", "ioctl", "no_std", "std", "system"] } -mio = { version = "1.0.2", features = ["net", "os-ext"] } +mio = { version = "0.8.11", features = ["net", "os-ext"] } nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.x86_64-apple-darwin.dependencies] -mio = { version = "1.0.2", features = ["net", "os-ext"] } +mio = { version = "0.8.11", features = ["net", "os-ext"] } nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.x86_64-apple-darwin.build-dependencies] -mio = { version = "1.0.2", features = ["net", "os-ext"] } +mio = { version = "0.8.11", features = ["net", "os-ext"] } nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.aarch64-apple-darwin.dependencies] -mio = { version = "1.0.2", features = ["net", "os-ext"] } +mio = { version = "0.8.11", features = ["net", "os-ext"] } nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] } [target.aarch64-apple-darwin.build-dependencies] -mio = { version = "1.0.2", features = ["net", "os-ext"] } +mio = { version = "0.8.11", features = ["net", "os-ext"] } nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { 
version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } @@ -282,7 +282,7 @@ signal-hook-mio = { version = "0.2.4", default-features = false, features = ["su [target.x86_64-unknown-illumos.dependencies] dof = { version = "0.3.0", default-features = false, features = ["des"] } -mio = { version = "1.0.2", features = ["net", "os-ext"] } +mio = { version = "0.8.11", features = ["net", "os-ext"] } nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } @@ -291,7 +291,7 @@ toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", featu [target.x86_64-unknown-illumos.build-dependencies] dof = { version = "0.3.0", default-features = false, features = ["des"] } -mio = { version = "1.0.2", features = ["net", "os-ext"] } +mio = { version = "0.8.11", features = ["net", "os-ext"] } nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] } once_cell = { version = "1.19.0" } rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] } From ab29a029ba612722cc062f0d691a7b63deb0afe7 Mon Sep 17 00:00:00 2001 From: Rain Date: Tue, 20 Aug 2024 21:58:12 -0700 Subject: [PATCH 81/91] [reconfigurator] a couple of type improvements (#6395) * Use a typestate for the physical disk dependency. Just want to make sure the ordering is right once this moves to being expressed via the update engine. * Add strong typing around the output of `realize_blueprint`. --- .../execution/src/cockroachdb.rs | 22 ++-- nexus/reconfigurator/execution/src/dns.rs | 106 +++++++++--------- nexus/reconfigurator/execution/src/lib.rs | 28 +++-- .../execution/src/omicron_physical_disks.rs | 51 ++++++--- .../background/tasks/blueprint_execution.rs | 41 +++++-- 5 files changed, 156 insertions(+), 92 deletions(-) diff --git a/nexus/reconfigurator/execution/src/cockroachdb.rs b/nexus/reconfigurator/execution/src/cockroachdb.rs index 277f5f91c4..01baebfb57 100644 --- a/nexus/reconfigurator/execution/src/cockroachdb.rs +++ b/nexus/reconfigurator/execution/src/cockroachdb.rs @@ -34,6 +34,7 @@ pub(crate) async fn ensure_settings( mod test { use super::*; use crate::overridables::Overridables; + use crate::RealizeBlueprintOutput; use nexus_db_queries::authn; use nexus_db_queries::authz; use nexus_test_utils_macros::nexus_test; @@ -97,16 +98,17 @@ mod test { .await; // Execute the initial blueprint. let overrides = Overridables::for_test(cptestctx); - crate::realize_blueprint_with_overrides( - &opctx, - datastore, - resolver, - &blueprint, - Uuid::new_v4(), - &overrides, - ) - .await - .expect("failed to execute initial blueprint"); + let _: RealizeBlueprintOutput = + crate::realize_blueprint_with_overrides( + &opctx, + datastore, + resolver, + &blueprint, + Uuid::new_v4(), + &overrides, + ) + .await + .expect("failed to execute initial blueprint"); // The CockroachDB settings should not have changed. 
assert_eq!( settings, diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 4395944b25..9ca14f8e24 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -458,6 +458,7 @@ pub fn blueprint_nexus_external_ips(blueprint: &Blueprint) -> Vec { mod test { use super::*; use crate::overridables::Overridables; + use crate::RealizeBlueprintOutput; use crate::Sled; use dns_service_client::DnsDiff; use internal_dns::config::Host; @@ -1245,16 +1246,17 @@ mod test { // Now, execute the initial blueprint. let overrides = Overridables::for_test(cptestctx); - crate::realize_blueprint_with_overrides( - &opctx, - datastore, - resolver, - &blueprint, - Uuid::new_v4(), - &overrides, - ) - .await - .expect("failed to execute initial blueprint"); + let _: RealizeBlueprintOutput = + crate::realize_blueprint_with_overrides( + &opctx, + datastore, + resolver, + &blueprint, + Uuid::new_v4(), + &overrides, + ) + .await + .expect("failed to execute initial blueprint"); // DNS ought not to have changed. verify_dns_unchanged( @@ -1385,16 +1387,17 @@ mod test { .await .expect("failed to set blueprint as target"); - crate::realize_blueprint_with_overrides( - &opctx, - datastore, - resolver, - &blueprint2, - Uuid::new_v4(), - &overrides, - ) - .await - .expect("failed to execute second blueprint"); + let _: RealizeBlueprintOutput = + crate::realize_blueprint_with_overrides( + &opctx, + datastore, + resolver, + &blueprint2, + Uuid::new_v4(), + &overrides, + ) + .await + .expect("failed to execute second blueprint"); // Now fetch DNS again. Both should have changed this time. let dns_latest_internal = datastore @@ -1459,16 +1462,17 @@ mod test { } // If we execute it again, we should see no more changes. - crate::realize_blueprint_with_overrides( - &opctx, - datastore, - resolver, - &blueprint2, - Uuid::new_v4(), - &overrides, - ) - .await - .expect("failed to execute second blueprint again"); + let _: RealizeBlueprintOutput = + crate::realize_blueprint_with_overrides( + &opctx, + datastore, + resolver, + &blueprint2, + Uuid::new_v4(), + &overrides, + ) + .await + .expect("failed to execute second blueprint again"); verify_dns_unchanged( &opctx, datastore, @@ -1495,16 +1499,17 @@ mod test { // One more time, make sure that executing the blueprint does not do // anything. - crate::realize_blueprint_with_overrides( - &opctx, - datastore, - resolver, - &blueprint2, - Uuid::new_v4(), - &overrides, - ) - .await - .expect("failed to execute second blueprint again"); + let _: RealizeBlueprintOutput = + crate::realize_blueprint_with_overrides( + &opctx, + datastore, + resolver, + &blueprint2, + Uuid::new_v4(), + &overrides, + ) + .await + .expect("failed to execute second blueprint again"); verify_dns_unchanged( &opctx, datastore, @@ -1589,16 +1594,17 @@ mod test { ); // If we execute the blueprint, DNS should not be changed. 
- crate::realize_blueprint_with_overrides( - &opctx, - datastore, - resolver, - &blueprint, - Uuid::new_v4(), - &overrides, - ) - .await - .expect("failed to execute blueprint"); + let _: RealizeBlueprintOutput = + crate::realize_blueprint_with_overrides( + &opctx, + datastore, + resolver, + &blueprint, + Uuid::new_v4(), + &overrides, + ) + .await + .expect("failed to execute blueprint"); let dns_latest_internal = datastore .dns_config_read(&opctx, DnsGroup::Internal) .await diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index 8606187762..2c70c7acbb 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -70,6 +70,15 @@ impl From for Sled { } } +/// The result of calling [`realize_blueprint`] or +/// [`realize_blueprint_with_overrides`]. +#[derive(Debug)] +#[must_use = "the output of realize_blueprint should probably be used"] +pub struct RealizeBlueprintOutput { + /// Whether any sagas need to be reassigned to a new Nexus. + pub needs_saga_recovery: bool, +} + /// Make one attempt to realize the given blueprint, meaning to take actions to /// alter the real system to match the blueprint /// @@ -81,7 +90,7 @@ pub async fn realize_blueprint( resolver: &Resolver, blueprint: &Blueprint, nexus_id: Uuid, -) -> Result> { +) -> Result> { realize_blueprint_with_overrides( opctx, datastore, @@ -100,7 +109,7 @@ pub async fn realize_blueprint_with_overrides( blueprint: &Blueprint, nexus_id: Uuid, overrides: &Overridables, -) -> Result> { +) -> Result> { let opctx = opctx.child(BTreeMap::from([( "comment".to_string(), blueprint.comment.clone(), @@ -132,7 +141,7 @@ pub async fn realize_blueprint_with_overrides( }) .collect(); - omicron_physical_disks::deploy_disks( + let deploy_disks_done = omicron_physical_disks::deploy_disks( &opctx, &sleds_by_id, &blueprint.blueprint_disks, @@ -205,11 +214,12 @@ pub async fn realize_blueprint_with_overrides( ) .await?; - // This depends on the "deploy_disks" call earlier -- disk expungement is a - // statement of policy, but we need to be assured that the Sled Agent has - // stopped using that disk before we can mark its state as decommissioned. - omicron_physical_disks::decommission_expunged_disks(&opctx, datastore) - .await?; + omicron_physical_disks::decommission_expunged_disks( + &opctx, + datastore, + deploy_disks_done, + ) + .await?; // From this point on, we'll assume that any errors that we encounter do // *not* require stopping execution. 
We'll just accumulate them and return @@ -244,7 +254,7 @@ pub async fn realize_blueprint_with_overrides( } if errors.is_empty() { - Ok(needs_saga_recovery) + Ok(RealizeBlueprintOutput { needs_saga_recovery }) } else { Err(errors) } diff --git a/nexus/reconfigurator/execution/src/omicron_physical_disks.rs b/nexus/reconfigurator/execution/src/omicron_physical_disks.rs index 7adc41213e..af95eb8e77 100644 --- a/nexus/reconfigurator/execution/src/omicron_physical_disks.rs +++ b/nexus/reconfigurator/execution/src/omicron_physical_disks.rs @@ -25,7 +25,7 @@ pub(crate) async fn deploy_disks( opctx: &OpContext, sleds_by_id: &BTreeMap, sled_configs: &BTreeMap, -) -> Result<(), Vec> { +) -> Result> { let errors: Vec<_> = stream::iter(sled_configs) .filter_map(|(sled_id, config)| async move { let log = opctx.log.new(o!( @@ -92,16 +92,26 @@ pub(crate) async fn deploy_disks( .await; if errors.is_empty() { - Ok(()) + Ok(DeployDisksDone {}) } else { Err(errors) } } -/// Decommissions all disks which are currently expunged +/// Typestate indicating that the deploy disks step was performed. +#[derive(Debug)] +#[must_use = "this should be passed into decommission_expunged_disks"] +pub(crate) struct DeployDisksDone {} + +/// Decommissions all disks which are currently expunged. pub(crate) async fn decommission_expunged_disks( opctx: &OpContext, datastore: &DataStore, + // This is taken as a parameter to ensure that this depends on a + // "deploy_disks" call made earlier. Disk expungement is a statement of + // policy, but we need to be assured that the Sled Agent has stopped using + // that disk before we can mark its state as decommissioned. + _deploy_disks_done: DeployDisksDone, ) -> Result<(), Vec> { datastore .physical_disk_decommission_all_expunged(&opctx) @@ -113,6 +123,7 @@ pub(crate) async fn decommission_expunged_disks( #[cfg(test)] mod test { use super::deploy_disks; + use super::DeployDisksDone; use crate::DataStore; use crate::Sled; @@ -217,9 +228,13 @@ mod test { // Get a success result back when the blueprint has an empty set of // disks. let (_, blueprint) = create_blueprint(BTreeMap::new()); - deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks) - .await - .expect("failed to deploy no disks"); + // Use an explicit type here because not doing so can cause errors to + // be ignored (this behavior is genuinely terrible). Instead, ensure + // that the type has the right result. + let _: DeployDisksDone = + deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks) + .await + .expect("failed to deploy no disks"); // Disks are updated in a particular order, but each request contains // the full set of disks that must be running. @@ -272,9 +287,10 @@ mod test { } // Execute it. 
- deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks) - .await - .expect("failed to deploy initial disks"); + let _: DeployDisksDone = + deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks) + .await + .expect("failed to deploy initial disks"); s1.verify_and_clear(); s2.verify_and_clear(); @@ -293,9 +309,10 @@ mod test { )), ); } - deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks) - .await - .expect("failed to deploy same disks"); + let _: DeployDisksDone = + deploy_disks(&opctx, &sleds_by_id, &blueprint.blueprint_disks) + .await + .expect("failed to deploy same disks"); s1.verify_and_clear(); s2.verify_and_clear(); @@ -567,7 +584,15 @@ mod test { assert_eq!(d.disk_state, PhysicalDiskState::Active); assert_eq!(d.disk_policy, PhysicalDiskPolicy::InService); - super::decommission_expunged_disks(&opctx, &datastore).await.unwrap(); + super::decommission_expunged_disks( + &opctx, + &datastore, + // This is an internal test, and we're testing decommissioning in + // isolation, so it's okay to create the typestate here. + DeployDisksDone {}, + ) + .await + .unwrap(); // After decommissioning, we see the expunged disk become // decommissioned. The other disk remains in-service. diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index b430270ec9..d13e5428f8 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -10,6 +10,7 @@ use futures::FutureExt; use internal_dns::resolver::Resolver; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; +use nexus_reconfigurator_execution::RealizeBlueprintOutput; use nexus_types::deployment::{Blueprint, BlueprintTarget}; use serde_json::json; use std::sync::Arc; @@ -98,16 +99,23 @@ impl BlueprintExecutor { // Trigger anybody waiting for this to finish. self.tx.send_modify(|count| *count = *count + 1); - // If executing the blueprint requires activating the saga recovery - // background task, do that now. - info!(&opctx.log, "activating saga recovery task"); - if let Ok(true) = result { - self.saga_recovery.activate(); - } - // Return the result as a `serde_json::Value` match result { - Ok(_) => json!({}), + Ok(RealizeBlueprintOutput { needs_saga_recovery }) => { + // If executing the blueprint requires activating the saga + // recovery background task, do that now. 
+                if needs_saga_recovery {
+                    info!(&opctx.log, "activating saga recovery task");
+                    self.saga_recovery.activate();
+                }
+
+                json!({
+                    "target_id": blueprint.id.to_string(),
+                    "needs_saga_recovery": needs_saga_recovery,
+                })
+            }
             Err(errors) => {
                 let errors: Vec<_> =
                     errors.into_iter().map(|e| format!("{:#}", e)).collect();
@@ -302,10 +310,17 @@ mod test {
             )
             .await,
         );
+        let blueprint_id = blueprint.1.id;
         blueprint_tx.send(Some(blueprint)).unwrap();
         let value = task.activate(&opctx).await;
         println!("activating with no zones: {:?}", value);
-        assert_eq!(value, json!({}));
+        assert_eq!(
+            value,
+            json!({
+                "target_id": blueprint_id,
+                "needs_saga_recovery": false,
+            })
+        );
 
         // Create a non-empty blueprint describing two servers and verify that
         // the task correctly winds up making requests to both of them and
@@ -393,7 +408,13 @@ mod test {
         // Activate the task to trigger zone configuration on the sled-agents
         let value = task.activate(&opctx).await;
         println!("activating two sled agents: {:?}", value);
-        assert_eq!(value, json!({}));
+        assert_eq!(
+            value,
+            json!({
+                "target_id": blueprint.1.id.to_string(),
+                "needs_saga_recovery": false,
+            })
+        );
 
         s1.verify_and_clear();
         s2.verify_and_clear();

From 6dde3f86ca91a758d5a99dafb5a2ad40ec31e735 Mon Sep 17 00:00:00 2001
From: Rain
Date: Tue, 20 Aug 2024 22:31:46 -0700
Subject: [PATCH 82/91] [4/6] [openapi-manager] richer extra validation (#6370)

With the Nexus external API, the validator generates a `nexus_tags.txt`
file that must be kept up to date. Instead of the validation function
simply erroring out if the file is different, it is a better experience
for users if it records what it expects the file to be, and then the
OpenAPI manager simply treats it as an extra file similar to the
document itself. With this pattern, the check and generate functions
can both work on the extra file just like they work on the document.

In order for there to be a richer protocol for validation, the
interface needs to be split into its own crate. This way, the API
crates can depend on this minimal interface, and the OpenAPI manager
itself can depend on the API crates.

This isn't used yet, but will be in #6373.
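For illustration, a hook under the new interface might look like the
following sketch. The function name, the tag-derivation logic, and the
output path here are hypothetical placeholders; only
`ValidationContext::report_error` and
`ValidationContext::record_file_contents` come from this change:

    // Hypothetical extra-validation hook; names and paths are placeholders.
    use openapi_manager_types::ValidationContext;
    use openapiv3::OpenAPI;

    fn validate_nexus_external(doc: &OpenAPI, mut cx: ValidationContext<'_>) {
        if doc.tags.is_empty() {
            // Report problems instead of returning early, so several
            // errors can be surfaced in one pass.
            cx.report_error(anyhow::anyhow!("expected at least one tag"));
        }

        // Derive the expected contents of the extra file from the document.
        let mut names: Vec<&str> =
            doc.tags.iter().map(|tag| tag.name.as_str()).collect();
        names.sort_unstable();
        let contents = format!("{}\n", names.join("\n")).into_bytes();

        // In check mode the manager diffs this against the file on disk; in
        // generate mode it overwrites the file, just like the OpenAPI
        // document itself.
        cx.record_file_contents("nexus/tests/output/nexus_tags.txt", contents);
    }

Such a hook would then be wired up via the `extra_validation` field of the
corresponding `ApiSpec` entry.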
--- Cargo.lock | 11 + Cargo.toml | 3 + dev-tools/openapi-manager/Cargo.toml | 1 + dev-tools/openapi-manager/src/check.rs | 125 +++++--- dev-tools/openapi-manager/src/dispatch.rs | 8 +- dev-tools/openapi-manager/src/generate.rs | 26 +- dev-tools/openapi-manager/src/output.rs | 54 +++- dev-tools/openapi-manager/src/spec.rs | 276 ++++++++++++++---- dev-tools/openapi-manager/types/Cargo.toml | 13 + dev-tools/openapi-manager/types/src/lib.rs | 12 + .../openapi-manager/types/src/validation.rs | 47 +++ workspace-hack/Cargo.toml | 2 + 12 files changed, 453 insertions(+), 125 deletions(-) create mode 100644 dev-tools/openapi-manager/types/Cargo.toml create mode 100644 dev-tools/openapi-manager/types/src/lib.rs create mode 100644 dev-tools/openapi-manager/types/src/validation.rs diff --git a/Cargo.lock b/Cargo.lock index 96a1db4983..830ec523a3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6510,6 +6510,7 @@ dependencies = [ "postgres-types", "predicates", "proc-macro2", + "quote", "regex", "regex-automata 0.4.6", "regex-syntax 0.8.4", @@ -6637,6 +6638,7 @@ dependencies = [ "nexus-internal-api", "omicron-workspace-hack", "openapi-lint", + "openapi-manager-types", "openapiv3", "owo-colors", "oximeter-api", @@ -6647,6 +6649,15 @@ dependencies = [ "wicketd-api", ] +[[package]] +name = "openapi-manager-types" +version = "0.1.0" +dependencies = [ + "anyhow", + "camino", + "omicron-workspace-hack", +] + [[package]] name = "openapiv3" version = "2.0.0" diff --git a/Cargo.toml b/Cargo.toml index ce2b7f8eb0..55859ae9e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,6 +31,7 @@ members = [ "dev-tools/omicron-dev", "dev-tools/omicron-dev-lib", "dev-tools/openapi-manager", + "dev-tools/openapi-manager/types", "dev-tools/oxlog", "dev-tools/reconfigurator-cli", "dev-tools/releng", @@ -145,6 +146,7 @@ default-members = [ "dev-tools/omicron-dev", "dev-tools/omicron-dev-lib", "dev-tools/openapi-manager", + "dev-tools/openapi-manager/types", "dev-tools/oxlog", "dev-tools/reconfigurator-cli", "dev-tools/releng", @@ -453,6 +455,7 @@ oxlog = { path = "dev-tools/oxlog" } oxnet = { git = "https://github.com/oxidecomputer/oxnet" } once_cell = "1.19.0" openapi-lint = { git = "https://github.com/oxidecomputer/openapi-lint", branch = "main" } +openapi-manager-types = { path = "dev-tools/openapi-manager/types" } openapiv3 = "2.0.0" # must match samael's crate! 
openssl = "0.10" diff --git a/dev-tools/openapi-manager/Cargo.toml b/dev-tools/openapi-manager/Cargo.toml index fe90737d9e..2ca1bc3e4d 100644 --- a/dev-tools/openapi-manager/Cargo.toml +++ b/dev-tools/openapi-manager/Cargo.toml @@ -25,6 +25,7 @@ nexus-internal-api.workspace = true omicron-workspace-hack.workspace = true openapiv3.workspace = true openapi-lint.workspace = true +openapi-manager-types.workspace = true owo-colors.workspace = true oximeter-api.workspace = true serde_json.workspace = true diff --git a/dev-tools/openapi-manager/src/check.rs b/dev-tools/openapi-manager/src/check.rs index 182ed9fb19..b43e43e7e5 100644 --- a/dev-tools/openapi-manager/src/check.rs +++ b/dev-tools/openapi-manager/src/check.rs @@ -5,17 +5,16 @@ use std::{io::Write, process::ExitCode}; use anyhow::Result; -use camino::Utf8Path; use indent_write::io::IndentWriter; use owo_colors::OwoColorize; use similar::TextDiff; use crate::{ output::{ - display_api_spec, display_error, display_summary, headers::*, plural, - write_diff, OutputOpts, Styles, + display_api_spec, display_api_spec_file, display_error, + display_summary, headers::*, plural, write_diff, OutputOpts, Styles, }, - spec::{all_apis, CheckStatus}, + spec::{all_apis, CheckStale, Environment}, FAILURE_EXIT_CODE, NEEDS_UPDATE_EXIT_CODE, }; @@ -37,7 +36,7 @@ impl CheckResult { } pub(crate) fn check_impl( - dir: &Utf8Path, + env: &Environment, output: &OutputOpts, ) -> Result { let mut styles = Styles::default(); @@ -48,6 +47,7 @@ pub(crate) fn check_impl( let all_apis = all_apis(); let total = all_apis.len(); let count_width = total.to_string().len(); + let count_section_indent = count_section_indent(count_width); let continued_indent = continued_indent(count_width); eprintln!("{:>HEADER_WIDTH$}", SEPARATOR); @@ -58,57 +58,89 @@ pub(crate) fn check_impl( total.style(styles.bold), plural::documents(total), ); - let mut num_up_to_date = 0; + let mut num_fresh = 0; let mut num_stale = 0; - let mut num_missing = 0; let mut num_failed = 0; for (ix, spec) in all_apis.iter().enumerate() { let count = ix + 1; - match spec.check(&dir) { - Ok(status) => match status { - CheckStatus::Ok(summary) => { - eprintln!( - "{:>HEADER_WIDTH$} [{count:>count_width$}/{total}] {}: {}", - UP_TO_DATE.style(styles.success_header), - display_api_spec(spec, &styles), - display_summary(&summary, &styles), - ); + match spec.check(env) { + Ok(status) => { + let total_errors = status.total_errors(); + let total_errors_width = total_errors.to_string().len(); + + if total_errors == 0 { + // Success case. + let extra = if status.extra_files_len() > 0 { + format!( + ", {} extra files", + status.extra_files_len().style(styles.bold) + ) + } else { + "".to_string() + }; - num_up_to_date += 1; - } - CheckStatus::Stale { full_path, actual, expected } => { eprintln!( - "{:>HEADER_WIDTH$} [{count:>count_width$}/{total}] {}", - STALE.style(styles.warning_header), + "{:>HEADER_WIDTH$} [{count:>count_width$}/{total}] {}: {}{extra}", + FRESH.style(styles.success_header), display_api_spec(spec, &styles), + display_summary(&status.summary, &styles), ); - let diff = TextDiff::from_lines(&actual, &expected); - write_diff( - &diff, - &full_path, - &styles, - // Add an indent to align diff with the status message. 
- &mut IndentWriter::new( - &continued_indent, - std::io::stderr(), - ), - )?; - - num_stale += 1; + num_fresh += 1; + continue; } - CheckStatus::Missing => { - eprintln!( - "{:>HEADER_WIDTH$} [{count:>count_width$}/{total}] {}", - MISSING.style(styles.warning_header), - display_api_spec(spec, &styles), - ); - num_missing += 1; + // Out of date: print errors. + eprintln!( + "{:>HEADER_WIDTH$} [{count:>count_width$}/{total}] {}", + STALE.style(styles.warning_header), + display_api_spec(spec, &styles), + ); + num_stale += 1; + + for (error_ix, (spec_file, error)) in + status.iter_errors().enumerate() + { + let error_count = error_ix + 1; + + let display_heading = |heading: &str| { + eprintln!( + "{:>HEADER_WIDTH$}{count_section_indent}\ + ({error_count:>total_errors_width$}/{total_errors}) {}", + heading.style(styles.warning_header), + display_api_spec_file(spec, spec_file, &styles), + ); + }; + + match error { + CheckStale::Modified { + full_path, + actual, + expected, + } => { + display_heading(MODIFIED); + + let diff = + TextDiff::from_lines(&**actual, &**expected); + write_diff( + &diff, + &full_path, + &styles, + // Add an indent to align diff with the status message. + &mut IndentWriter::new( + &continued_indent, + std::io::stderr(), + ), + )?; + } + CheckStale::New => { + display_heading(NEW); + } + } } - }, + } Err(error) => { eprint!( "{:>HEADER_WIDTH$} [{count:>count_width$}/{total}] {}", @@ -138,13 +170,12 @@ pub(crate) fn check_impl( }; eprintln!( - "{:>HEADER_WIDTH$} {} {} checked: {} up-to-date, {} stale, {} missing, {} failed", + "{:>HEADER_WIDTH$} {} {} checked: {} fresh, {} stale, {} failed", status_header, total.style(styles.bold), plural::documents(total), - num_up_to_date.style(styles.bold), + num_fresh.style(styles.bold), num_stale.style(styles.bold), - num_missing.style(styles.bold), num_failed.style(styles.bold), ); if num_failed > 0 { @@ -170,14 +201,14 @@ pub(crate) fn check_impl( mod tests { use std::process::ExitCode; - use crate::spec::find_openapi_dir; + use crate::spec::Environment; use super::*; #[test] fn check_apis_up_to_date() -> Result { let output = OutputOpts { color: clap::ColorChoice::Auto }; - let dir = find_openapi_dir()?; + let dir = Environment::new(None)?; let result = check_impl(&dir, &output)?; Ok(result.to_exit_code()) diff --git a/dev-tools/openapi-manager/src/dispatch.rs b/dev-tools/openapi-manager/src/dispatch.rs index 937a8b485f..ca2989396f 100644 --- a/dev-tools/openapi-manager/src/dispatch.rs +++ b/dev-tools/openapi-manager/src/dispatch.rs @@ -10,7 +10,7 @@ use clap::{Args, Parser, Subcommand}; use crate::{ check::check_impl, generate::generate_impl, list::list_impl, - output::OutputOpts, spec::openapi_dir, + output::OutputOpts, spec::Environment, }; /// Manage OpenAPI specifications. 
@@ -73,7 +73,7 @@ pub struct GenerateArgs { impl GenerateArgs { fn exec(self, output: &OutputOpts) -> anyhow::Result { - let dir = openapi_dir(self.dir)?; + let dir = Environment::new(self.dir)?; Ok(generate_impl(&dir, output)?.to_exit_code()) } } @@ -87,8 +87,8 @@ pub struct CheckArgs { impl CheckArgs { fn exec(self, output: &OutputOpts) -> anyhow::Result { - let dir = openapi_dir(self.dir)?; - Ok(check_impl(&dir, output)?.to_exit_code()) + let env = Environment::new(self.dir)?; + Ok(check_impl(&env, output)?.to_exit_code()) } } diff --git a/dev-tools/openapi-manager/src/generate.rs b/dev-tools/openapi-manager/src/generate.rs index f776ff2709..1cf9ebbb61 100644 --- a/dev-tools/openapi-manager/src/generate.rs +++ b/dev-tools/openapi-manager/src/generate.rs @@ -5,7 +5,6 @@ use std::{io::Write, process::ExitCode}; use anyhow::Result; -use camino::Utf8Path; use indent_write::io::IndentWriter; use owo_colors::OwoColorize; @@ -14,7 +13,7 @@ use crate::{ display_api_spec, display_error, display_summary, headers::*, plural, OutputOpts, Styles, }, - spec::{all_apis, OverwriteStatus}, + spec::{all_apis, Environment}, FAILURE_EXIT_CODE, }; @@ -34,7 +33,7 @@ impl GenerateResult { } pub(crate) fn generate_impl( - dir: &Utf8Path, + env: &Environment, output: &OutputOpts, ) -> Result { let mut styles = Styles::default(); @@ -62,27 +61,30 @@ pub(crate) fn generate_impl( for (ix, spec) in all_apis.iter().enumerate() { let count = ix + 1; - match spec.overwrite(&dir) { - Ok((status, summary)) => match status { - OverwriteStatus::Updated => { + match spec.overwrite(env) { + Ok(status) => { + let updated_count = status.updated_count(); + + if updated_count > 0 { eprintln!( - "{:>HEADER_WIDTH$} [{count:>count_width$}/{total}] {}: {}", + "{:>HEADER_WIDTH$} [{count:>count_width$}/{total}] {}: {} ({} {} updated)", UPDATED.style(styles.success_header), display_api_spec(spec, &styles), - display_summary(&summary, &styles), + display_summary(&status.summary, &styles), + updated_count.style(styles.bold), + plural::files(updated_count), ); num_updated += 1; - } - OverwriteStatus::Unchanged => { + } else { eprintln!( "{:>HEADER_WIDTH$} [{count:>count_width$}/{total}] {}: {}", UNCHANGED.style(styles.unchanged_header), display_api_spec(spec, &styles), - display_summary(&summary, &styles), + display_summary(&status.summary, &styles), ); num_unchanged += 1; } - }, + } Err(err) => { eprintln!( "{:>HEADER_WIDTH$} [{count:>count_width$}/{total}] {}", diff --git a/dev-tools/openapi-manager/src/output.rs b/dev-tools/openapi-manager/src/output.rs index 6cd578e778..fee7f0f15c 100644 --- a/dev-tools/openapi-manager/src/output.rs +++ b/dev-tools/openapi-manager/src/output.rs @@ -10,7 +10,7 @@ use indent_write::fmt::IndentWriter; use owo_colors::{OwoColorize, Style}; use similar::{ChangeTag, DiffableStr, TextDiff}; -use crate::spec::{ApiSpec, DocumentSummary}; +use crate::spec::{ApiSpec, ApiSpecFile, DocumentSummary}; #[derive(Debug, Args)] #[clap(next_help_heading = "Global options")] @@ -123,6 +123,21 @@ pub(crate) fn display_api_spec(spec: &ApiSpec, styles: &Styles) -> String { ) } +pub(crate) fn display_api_spec_file( + spec: &ApiSpec, + spec_file: ApiSpecFile<'_>, + styles: &Styles, +) -> String { + match spec_file { + ApiSpecFile::Openapi => { + format!("OpenAPI document {}", spec.filename.style(styles.filename)) + } + ApiSpecFile::Extra(path) => { + format!("Extra file {}", path.style(styles.filename)) + } + } +} + pub(crate) fn display_summary( summary: &DocumentSummary, styles: &Styles, @@ -201,9 +216,14 @@ pub(crate) mod 
headers { pub(crate) static CHECKING: &str = "Checking"; pub(crate) static GENERATING: &str = "Generating"; - pub(crate) static UP_TO_DATE: &str = "Up-to-date"; + pub(crate) static FRESH: &str = "Fresh"; + + // Stale encompasses: + // - Stale: the file on disk is different from what we generated. + // - Missing: the file on disk does not exist. pub(crate) static STALE: &str = "Stale"; - pub(crate) static MISSING: &str = "Missing"; + pub(crate) static NEW: &str = "-> New"; + pub(crate) static MODIFIED: &str = "-> Modified"; pub(crate) static UPDATED: &str = "Updated"; pub(crate) static UNCHANGED: &str = "Unchanged"; @@ -211,22 +231,38 @@ pub(crate) mod headers { pub(crate) static SUCCESS: &str = "Success"; pub(crate) static FAILURE: &str = "Failure"; - pub(crate) fn continued_indent(count_width: usize) -> String { + fn count_section_width(count_width: usize) -> usize { // Status strings are of the form: // // Generated [ 1/12] api.json: 1 path, 1 schema + // ^^^^^^^^^ // - // So the continued indent is: - // - // HEADER_WIDTH for the status string - // + (count_width * 2) for current and total counts + // So the width of the count section is: + // (count_width * 2) for current and total counts // + 3 for '[/]' // + 2 for spaces on either side. - " ".repeat(HEADER_WIDTH + count_width * 2 + 3 + 2) + count_width * 2 + 3 + 2 + } + + pub(crate) fn count_section_indent(count_width: usize) -> String { + " ".repeat(count_section_width(count_width)) + } + + pub(crate) fn continued_indent(count_width: usize) -> String { + // HEADER_WIDTH for the status string + count_section_width + " ".repeat(HEADER_WIDTH + count_section_width(count_width)) } } pub(crate) mod plural { + pub(crate) fn files(count: usize) -> &'static str { + if count == 1 { + "file" + } else { + "files" + } + } + pub(crate) fn documents(count: usize) -> &'static str { if count == 1 { "document" diff --git a/dev-tools/openapi-manager/src/spec.rs b/dev-tools/openapi-manager/src/spec.rs index 29601a63d6..e74cf7ed7a 100644 --- a/dev-tools/openapi-manager/src/spec.rs +++ b/dev-tools/openapi-manager/src/spec.rs @@ -9,6 +9,7 @@ use atomicwrites::AtomicFile; use camino::{Utf8Path, Utf8PathBuf}; use dropshot::{ApiDescription, ApiDescriptionBuildErrors, StubContext}; use fs_err as fs; +use openapi_manager_types::{ValidationBackend, ValidationContext}; use openapiv3::OpenAPI; /// All APIs managed by openapi-manager. @@ -143,47 +144,64 @@ pub struct ApiSpec { pub filename: &'static str, /// Extra validation to perform on the OpenAPI spec, if any. 
-    pub extra_validation: Option<fn(&OpenAPI) -> anyhow::Result<()>>,
+    pub extra_validation: Option<fn(&OpenAPI, ValidationContext<'_>)>,
 }
 
 impl ApiSpec {
     pub(crate) fn overwrite(
         &self,
-        dir: &Utf8Path,
-    ) -> Result<(OverwriteStatus, DocumentSummary)> {
+        env: &Environment,
+    ) -> Result<SpecOverwriteStatus> {
         let contents = self.to_json_bytes()?;
-        let summary = self
+        let (summary, validation_result) = self
             .validate_json(&contents)
             .context("OpenAPI document validation failed")?;
 
-        let full_path = dir.join(&self.filename);
-        let status = overwrite_file(&full_path, &contents)?;
-
-        Ok((status, summary))
+        let full_path = env.openapi_dir.join(&self.filename);
+        let openapi_doc_status = overwrite_file(&full_path, &contents)?;
+
+        let extra_files = validation_result
+            .extra_files
+            .into_iter()
+            .map(|(path, contents)| {
+                let full_path = env.workspace_root.join(&path);
+                let status = overwrite_file(&full_path, &contents)?;
+                Ok((path, status))
+            })
+            .collect::<Result<_>>()?;
+
+        Ok(SpecOverwriteStatus {
+            summary,
+            openapi_doc: openapi_doc_status,
+            extra_files,
+        })
     }
 
-    pub(crate) fn check(&self, dir: &Utf8Path) -> Result<CheckStatus> {
+    pub(crate) fn check(&self, env: &Environment) -> Result<SpecCheckStatus> {
         let contents = self.to_json_bytes()?;
-        let summary = self
+        let (summary, validation_result) = self
             .validate_json(&contents)
             .context("OpenAPI document validation failed")?;
 
-        let full_path = dir.join(&self.filename);
-        let existing_contents =
-            read_opt(&full_path).context("failed to read contents on disk")?;
-
-        match existing_contents {
-            Some(existing_contents) if existing_contents == contents => {
-                Ok(CheckStatus::Ok(summary))
-            }
-            Some(existing_contents) => Ok(CheckStatus::Stale {
-                full_path,
-                actual: existing_contents,
-                expected: contents,
-            }),
-            None => Ok(CheckStatus::Missing),
-        }
+        let full_path = env.openapi_dir.join(&self.filename);
+        let openapi_doc_status = check_file(full_path, contents)?;
+
+        let extra_files = validation_result
+            .extra_files
+            .into_iter()
+            .map(|(path, contents)| {
+                let full_path = env.workspace_root.join(&path);
+                let status = check_file(full_path, contents)?;
+                Ok((path, status))
+            })
+            .collect::<Result<_>>()?;
+
+        Ok(SpecCheckStatus {
+            summary,
+            openapi_doc: openapi_doc_status,
+            extra_files,
+        })
     }
 
     pub(crate) fn to_openapi_doc(&self) -> Result<OpenAPI> {
@@ -216,7 +234,10 @@ impl ApiSpec {
         Ok(contents)
     }
 
-    fn validate_json(&self, contents: &[u8]) -> Result<DocumentSummary> {
+    fn validate_json(
+        &self,
+        contents: &[u8],
+    ) -> Result<(DocumentSummary, ValidationResult)> {
         let openapi_doc = contents_to_openapi(contents)
             .context("JSON returned by ApiDescription is not valid OpenAPI")?;
 
@@ -231,11 +252,51 @@ impl ApiSpec {
             return Err(anyhow::anyhow!("{}", errors.join("\n\n")));
         }
 
-        if let Some(extra_validation) = self.extra_validation {
-            extra_validation(&openapi_doc)?;
-        }
+        let extra_files = if let Some(extra_validation) = self.extra_validation
+        {
+            let mut validation_context =
+                ValidationContextImpl { errors: Vec::new(), files: Vec::new() };
+            extra_validation(
+                &openapi_doc,
+                ValidationContext::new(&mut validation_context),
+            );
+
+            if !validation_context.errors.is_empty() {
+                return Err(anyhow::anyhow!(
+                    "OpenAPI document extended validation failed:\n{}",
+                    validation_context
+                        .errors
+                        .iter()
+                        .map(|e| e.to_string())
+                        .collect::<Vec<_>>()
+                        .join("\n")
+                ));
+            }
+
+            validation_context.files
+        } else {
+            Vec::new()
+        };
+
+        Ok((
+            DocumentSummary::new(&openapi_doc),
+            ValidationResult { extra_files },
+        ))
+    }
+}
+
+struct ValidationContextImpl {
+    errors: Vec<anyhow::Error>,
+    files: Vec<(Utf8PathBuf, Vec<u8>)>,
+}
+
+impl ValidationBackend for ValidationContextImpl {
+    fn
report_error(&mut self, error: anyhow::Error) { + self.errors.push(error); + } - Ok(DocumentSummary::new(&openapi_doc)) + fn record_file_contents(&mut self, path: Utf8PathBuf, contents: Vec) { + self.files.push((path, contents)); } } @@ -260,6 +321,32 @@ impl fmt::Display for ApiBoundary { } } +#[derive(Debug)] +#[must_use] +pub(crate) struct SpecOverwriteStatus { + pub(crate) summary: DocumentSummary, + openapi_doc: OverwriteStatus, + extra_files: Vec<(Utf8PathBuf, OverwriteStatus)>, +} + +impl SpecOverwriteStatus { + pub(crate) fn updated_count(&self) -> usize { + self.iter() + .filter(|(_, status)| matches!(status, OverwriteStatus::Updated)) + .count() + } + + fn iter( + &self, + ) -> impl Iterator, &OverwriteStatus)> { + std::iter::once((ApiSpecFile::Openapi, &self.openapi_doc)).chain( + self.extra_files.iter().map(|(file_name, status)| { + (ApiSpecFile::Extra(file_name), status) + }), + ) + } +} + #[derive(Debug)] #[must_use] pub(crate) enum OverwriteStatus { @@ -267,12 +354,58 @@ pub(crate) enum OverwriteStatus { Unchanged, } +#[derive(Debug)] +#[must_use] +pub(crate) struct SpecCheckStatus { + pub(crate) summary: DocumentSummary, + pub(crate) openapi_doc: CheckStatus, + pub(crate) extra_files: Vec<(Utf8PathBuf, CheckStatus)>, +} + +impl SpecCheckStatus { + pub(crate) fn total_errors(&self) -> usize { + self.iter_errors().count() + } + + pub(crate) fn extra_files_len(&self) -> usize { + self.extra_files.len() + } + + pub(crate) fn iter_errors( + &self, + ) -> impl Iterator, &CheckStale)> { + std::iter::once((ApiSpecFile::Openapi, &self.openapi_doc)) + .chain(self.extra_files.iter().map(|(file_name, status)| { + (ApiSpecFile::Extra(file_name), status) + })) + .filter_map(|(spec_file, status)| { + if let CheckStatus::Stale(e) = status { + Some((spec_file, e)) + } else { + None + } + }) + } +} + +#[derive(Clone, Copy, Debug)] +pub(crate) enum ApiSpecFile<'a> { + Openapi, + Extra(&'a Utf8Path), +} + #[derive(Debug)] #[must_use] pub(crate) enum CheckStatus { - Ok(DocumentSummary), - Stale { full_path: Utf8PathBuf, actual: Vec, expected: Vec }, - Missing, + Fresh, + Stale(CheckStale), +} + +#[derive(Debug)] +#[must_use] +pub(crate) enum CheckStale { + Modified { full_path: Utf8PathBuf, actual: Vec, expected: Vec }, + New, } #[derive(Debug)] @@ -295,31 +428,45 @@ impl DocumentSummary { } } -pub(crate) fn openapi_dir(dir: Option) -> Result { - match dir { - Some(dir) => Ok(dir.canonicalize_utf8().with_context(|| { - format!("failed to canonicalize directory: {}", dir) - })?), - None => find_openapi_dir().context("failed to find openapi directory"), - } +#[derive(Debug)] +#[must_use] +struct ValidationResult { + // Extra files recorded by the validation context. + extra_files: Vec<(Utf8PathBuf, Vec)>, } -pub(crate) fn find_openapi_dir() -> Result { - let mut root = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")); - // This crate is two levels down from the root of omicron, so go up twice. - root.pop(); - root.pop(); +pub(crate) struct Environment { + pub(crate) workspace_root: Utf8PathBuf, + pub(crate) openapi_dir: Utf8PathBuf, +} - root.push("openapi"); - let root = root.canonicalize_utf8().with_context(|| { - format!("failed to canonicalize openapi directory: {}", root) - })?; +impl Environment { + pub(crate) fn new(openapi_dir: Option) -> Result { + let mut root = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")); + // This crate is two levels down from the root of omicron, so go up twice. 
+        root.pop();
+        root.pop();
 
-    root.push("openapi");
-    let root = root.canonicalize_utf8().with_context(|| {
-        format!("failed to canonicalize openapi directory: {}", root)
-    })?;
+        let workspace_root = root.canonicalize_utf8().with_context(|| {
+            format!("failed to canonicalize workspace root: {}", root)
+        })?;
 
-    if !root.is_dir() {
-        anyhow::bail!("openapi root is not a directory: {}", root);
-    }
+        let openapi_dir =
+            openapi_dir.unwrap_or_else(|| workspace_root.join("openapi"));
+        let openapi_dir =
+            openapi_dir.canonicalize_utf8().with_context(|| {
+                format!(
+                    "failed to canonicalize openapi directory: {}",
+                    openapi_dir
+                )
+            })?;
 
-    Ok(root)
+        if !openapi_dir.is_dir() {
+            anyhow::bail!("openapi root is not a directory: {}", root);
+        }
+
+        Ok(Self { workspace_root, openapi_dir })
+    }
 }
 
 /// Overwrite a file with new contents, if the contents are different.
@@ -344,6 +491,29 @@ fn overwrite_file(path: &Utf8Path, contents: &[u8]) -> Result<OverwriteStatus> {
     Ok(OverwriteStatus::Updated)
 }
 
+/// Check a file against expected contents.
+fn check_file(
+    full_path: Utf8PathBuf,
+    contents: Vec<u8>,
+) -> Result<CheckStatus> {
+    let existing_contents =
+        read_opt(&full_path).context("failed to read contents on disk")?;
+
+    match existing_contents {
+        Some(existing_contents) if existing_contents == contents => {
+            Ok(CheckStatus::Fresh)
+        }
+        Some(existing_contents) => {
+            Ok(CheckStatus::Stale(CheckStale::Modified {
+                full_path,
+                actual: existing_contents,
+                expected: contents,
+            }))
+        }
+        None => Ok(CheckStatus::Stale(CheckStale::New)),
+    }
+}
+
 fn read_opt(path: &Utf8Path) -> std::io::Result<Option<Vec<u8>>> {
     match fs::read(path) {
         Ok(contents) => Ok(Some(contents)),
diff --git a/dev-tools/openapi-manager/types/Cargo.toml b/dev-tools/openapi-manager/types/Cargo.toml
new file mode 100644
index 0000000000..262529f1a9
--- /dev/null
+++ b/dev-tools/openapi-manager/types/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "openapi-manager-types"
+version = "0.1.0"
+edition = "2021"
+license = "MPL-2.0"
+
+[lints]
+workspace = true
+
+[dependencies]
+anyhow.workspace = true
+camino.workspace = true
+omicron-workspace-hack.workspace = true
diff --git a/dev-tools/openapi-manager/types/src/lib.rs b/dev-tools/openapi-manager/types/src/lib.rs
new file mode 100644
index 0000000000..b48ea03e74
--- /dev/null
+++ b/dev-tools/openapi-manager/types/src/lib.rs
@@ -0,0 +1,12 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Shared types for the OpenAPI manager.
+//!
+//! API trait crates can depend on this crate to get access to interfaces
+//! exposed by the OpenAPI manager.
+
+mod validation;
+
+pub use validation::*;
diff --git a/dev-tools/openapi-manager/types/src/validation.rs b/dev-tools/openapi-manager/types/src/validation.rs
new file mode 100644
index 0000000000..6f22228f4d
--- /dev/null
+++ b/dev-tools/openapi-manager/types/src/validation.rs
@@ -0,0 +1,47 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+use camino::Utf8PathBuf;
+
+/// Context for validation of OpenAPI specifications.
+pub struct ValidationContext<'a> {
+    backend: &'a mut dyn ValidationBackend,
+}
+
+impl<'a> ValidationContext<'a> {
+    /// Not part of the public API -- only called by the OpenAPI manager.
+    #[doc(hidden)]
+    pub fn new(backend: &'a mut dyn ValidationBackend) -> Self {
+        Self { backend }
+    }
+
+    /// Reports a validation error.
+    pub fn report_error(&mut self, error: anyhow::Error) {
+        self.backend.report_error(error);
+    }
+
+    /// Records that the file has the given contents.
+    ///
+    /// In check mode, if the files differ, an error is logged.
+    ///
+    /// In generate mode, the file is overwritten with the given contents.
+    ///
+    /// The path is treated as relative to the root of the repository.
+    pub fn record_file_contents(
+        &mut self,
+        path: impl Into<Utf8PathBuf>,
+        contents: Vec<u8>,
+    ) {
+        self.backend.record_file_contents(path.into(), contents);
+    }
+}
+
+/// The backend for validation.
+///
+/// Not part of the public API -- only implemented by the OpenAPI manager.
+#[doc(hidden)]
+pub trait ValidationBackend {
+    fn report_error(&mut self, error: anyhow::Error);
+    fn record_file_contents(&mut self, path: Utf8PathBuf, contents: Vec<u8>);
+}
diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml
index 014444c542..a39daa5735 100644
--- a/workspace-hack/Cargo.toml
+++ b/workspace-hack/Cargo.toml
@@ -85,6 +85,7 @@ pkcs8 = { version = "0.10.2", default-features = false, features = ["encryption"
 postgres-types = { version = "0.2.7", default-features = false, features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] }
 predicates = { version = "3.1.2" }
 proc-macro2 = { version = "1.0.86" }
+quote = { version = "1.0.36" }
 regex = { version = "1.10.6" }
 regex-automata = { version = "0.4.6", default-features = false, features = ["dfa", "hybrid", "meta", "nfa", "perf", "unicode"] }
 regex-syntax = { version = "0.8.4" }
@@ -193,6 +194,7 @@ pkcs8 = { version = "0.10.2", default-features = false, features = ["encryption"
 postgres-types = { version = "0.2.7", default-features = false, features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] }
 predicates = { version = "3.1.2" }
 proc-macro2 = { version = "1.0.86" }
+quote = { version = "1.0.36" }
 regex = { version = "1.10.6" }
 regex-automata = { version = "0.4.6", default-features = false, features = ["dfa", "hybrid", "meta", "nfa", "perf", "unicode"] }
 regex-syntax = { version = "0.8.4" }

From 14b94f785820c68cd2972ae4058f1476c2de96b3 Mon Sep 17 00:00:00 2001
From: Ryan Goodfellow
Date: Tue, 20 Aug 2024 23:27:07 -0700
Subject: [PATCH 83/91] plumb static route local preference (#6361)

---
 Cargo.lock                                    |  4 +-
 Cargo.toml                                    |  4 +-
 common/src/api/external/mod.rs                |  3 +
 common/src/api/internal/shared.rs             |  3 +
 nexus/db-model/src/schema.rs                  |  1 +
 nexus/db-model/src/schema_versions.rs         |  3 +-
 nexus/db-model/src/switch_port.rs             |  5 +-
 .../src/db/datastore/switch_port.rs           |  1 +
 .../tasks/sync_switch_configuration.rs        | 90 +++++++++++--------
 nexus/src/app/rack.rs                         |  1 +
 nexus/tests/integration_tests/switch_port.rs  |  1 +
 nexus/types/src/external_api/params.rs        |  4 +
 openapi/bootstrap-agent.json                  |  8 ++
 openapi/nexus-internal.json                   |  8 ++
 openapi/nexus.json                            | 14 +++
 openapi/sled-agent.json                       |  8 ++
 openapi/wicketd.json                          |  8 ++
 package-manifest.toml                         | 12 +--
 schema/crdb/dbinit.sql                        |  3 +-
 schema/crdb/route-local-pref/up.sql           |  1 +
 schema/rss-sled-plan.json                     | 10 +++
 sled-agent/src/bootstrap/early_networking.rs  |  3 +-
 sled-agent/src/rack_setup/service.rs          |  1 +
 .../tests/integration_tests/early_network.rs  |  1 +
 .../madrid-rss-sled-plan.json                 |  6 +-
 sled-agent/types/src/early_networking.rs      |  6 ++
 tools/generate-nexus-api.sh                   |  1 -
 tools/maghemite_ddm_openapi_version           |  2 +-
 tools/maghemite_mg_openapi_version            |  4 +-
 tools/maghemite_mgd_checksums                 |  4 +-
 wicket-common/src/example.rs                  |  2 +
 wicket/src/cli/rack_setup/config_toml.rs      |  5 +-
 wicket/src/ui/panes/rack_setup.rs             | 10 ++-
 wicketd/src/rss_config.rs                     |
1 + 34 files changed, 175 insertions(+), 63 deletions(-) create mode 100644 schema/crdb/route-local-pref/up.sql diff --git a/Cargo.lock b/Cargo.lock index 830ec523a3..91617c2eb6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1786,7 +1786,7 @@ dependencies = [ [[package]] name = "ddm-admin-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/maghemite?rev=220dd026e83142b83bd93123f465a64dd4600201#220dd026e83142b83bd93123f465a64dd4600201" +source = "git+https://github.com/oxidecomputer/maghemite?rev=73e63eaae3fe616bd7c48a20c69736d7e025836b#73e63eaae3fe616bd7c48a20c69736d7e025836b" dependencies = [ "oxnet", "percent-encoding", @@ -4697,7 +4697,7 @@ dependencies = [ [[package]] name = "mg-admin-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/maghemite?rev=220dd026e83142b83bd93123f465a64dd4600201#220dd026e83142b83bd93123f465a64dd4600201" +source = "git+https://github.com/oxidecomputer/maghemite?rev=73e63eaae3fe616bd7c48a20c69736d7e025836b#73e63eaae3fe616bd7c48a20c69736d7e025836b" dependencies = [ "anyhow", "chrono", diff --git a/Cargo.toml b/Cargo.toml index 55859ae9e6..413990c9a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -408,8 +408,8 @@ macaddr = { version = "1.0.1", features = ["serde_std"] } maplit = "1.0.2" mockall = "0.13" newtype_derive = "0.1.6" -mg-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "220dd026e83142b83bd93123f465a64dd4600201" } -ddm-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "220dd026e83142b83bd93123f465a64dd4600201" } +mg-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "73e63eaae3fe616bd7c48a20c69736d7e025836b" } +ddm-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "73e63eaae3fe616bd7c48a20c69736d7e025836b" } multimap = "0.10.0" nexus-auth = { path = "nexus/auth" } nexus-client = { path = "clients/nexus-client" } diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index c7421aa5ee..986f45bfd1 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -2492,6 +2492,9 @@ pub struct SwitchPortRouteConfig { /// The VLAN identifier for the route. Use this if the gateway is reachable /// over an 802.1Q tagged L2 segment. pub vlan_id: Option, + + /// Local preference indicating priority within and across protocols. + pub local_pref: Option, } /* diff --git a/common/src/api/internal/shared.rs b/common/src/api/internal/shared.rs index 089ff9b324..395bc3d132 100644 --- a/common/src/api/internal/shared.rs +++ b/common/src/api/internal/shared.rs @@ -305,6 +305,9 @@ pub struct RouteConfig { /// The VLAN id associated with this route. #[serde(default)] pub vlan_id: Option, + /// The local preference associated with this route. + #[serde(default)] + pub local_pref: Option, } #[derive( diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 845da13a44..d1205dac65 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -195,6 +195,7 @@ table! { dst -> Inet, gw -> Inet, vid -> Nullable, + local_pref -> Nullable, } } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 1e0caabb02..ef9a11c330 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. 
-pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(87, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(88, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(88, "route-local-pref"), KnownVersion::new(87, "add-clickhouse-server-enum-variants"), KnownVersion::new(86, "snapshot-replacement"), KnownVersion::new(85, "add-migrations-by-time-created-index"), diff --git a/nexus/db-model/src/switch_port.rs b/nexus/db-model/src/switch_port.rs index f790d7d527..9b36cbda48 100644 --- a/nexus/db-model/src/switch_port.rs +++ b/nexus/db-model/src/switch_port.rs @@ -554,6 +554,7 @@ pub struct SwitchPortRouteConfig { pub dst: IpNetwork, pub gw: IpNetwork, pub vid: Option, + pub local_pref: Option, } impl SwitchPortRouteConfig { @@ -563,8 +564,9 @@ impl SwitchPortRouteConfig { dst: IpNetwork, gw: IpNetwork, vid: Option, + local_pref: Option, ) -> Self { - Self { port_settings_id, interface_name, dst, gw, vid } + Self { port_settings_id, interface_name, dst, gw, vid, local_pref } } } @@ -576,6 +578,7 @@ impl Into for SwitchPortRouteConfig { dst: self.dst.into(), gw: self.gw.into(), vlan_id: self.vid.map(Into::into), + local_pref: self.local_pref.map(Into::into), } } } diff --git a/nexus/db-queries/src/db/datastore/switch_port.rs b/nexus/db-queries/src/db/datastore/switch_port.rs index 159933dce0..f9c61147f3 100644 --- a/nexus/db-queries/src/db/datastore/switch_port.rs +++ b/nexus/db-queries/src/db/datastore/switch_port.rs @@ -1120,6 +1120,7 @@ async fn do_switch_port_settings_create( route.dst.into(), route.gw.into(), route.vid.map(Into::into), + route.local_pref.map(Into::into), )); } } diff --git a/nexus/src/app/background/tasks/sync_switch_configuration.rs b/nexus/src/app/background/tasks/sync_switch_configuration.rs index 20a12d1127..6ecdaa2e55 100644 --- a/nexus/src/app/background/tasks/sync_switch_configuration.rs +++ b/nexus/src/app/background/tasks/sync_switch_configuration.rs @@ -977,6 +977,7 @@ impl BackgroundTask for SwitchPortSettingsManager { destination: r.dst.into(), nexthop: r.gw.ip(), vlan_id: r.vid.map(|x| x.0), + local_pref: r.local_pref.map(|x| x.0), }) .collect(), switch: *location, @@ -1455,7 +1456,8 @@ fn build_sled_agent_clients( sled_agent_clients } -type SwitchStaticRoutes = HashSet<(Ipv4Addr, Prefix4, Option)>; +type SwitchStaticRoutes = + HashSet<(Ipv4Addr, Prefix4, Option, Option)>; fn static_routes_to_del( current_static_routes: HashMap, @@ -1471,10 +1473,11 @@ fn static_routes_to_del( // if it's on the switch but not desired (in our db), it should be removed let stale_routes = routes_on_switch .difference(routes_wanted) - .map(|(nexthop, prefix, vlan_id)| StaticRoute4 { + .map(|(nexthop, prefix, vlan_id, local_pref)| StaticRoute4 { nexthop: *nexthop, prefix: *prefix, vlan_id: *vlan_id, + local_pref: *local_pref, }) .collect::>(); @@ -1488,10 +1491,11 @@ fn static_routes_to_del( // if no desired routes are present, all routes on this switch should be deleted let stale_routes = routes_on_switch .iter() - .map(|(nexthop, prefix, vlan_id)| StaticRoute4 { + .map(|(nexthop, prefix, vlan_id, local_pref)| StaticRoute4 { nexthop: *nexthop, prefix: *prefix, vlan_id: *vlan_id, + local_pref: *local_pref, }) .collect::>(); @@ -1538,10 +1542,11 @@ fn static_routes_to_add( }; let missing_routes = routes_wanted 
         .difference(routes_on_switch)
-        .map(|(nexthop, prefix, vlan_id)| StaticRoute4 {
+        .map(|(nexthop, prefix, vlan_id, local_pref)| StaticRoute4 {
             nexthop: *nexthop,
             prefix: *prefix,
             vlan_id: *vlan_id,
+            local_pref: *local_pref,
         })
         .collect::>();

@@ -1590,7 +1595,12 @@ fn static_routes_in_db(
             }
             IpAddr::V6(_) => continue,
         };
-        routes.insert((nexthop, prefix, route.vid.map(|x| x.0)));
+        routes.insert((
+            nexthop,
+            prefix,
+            route.vid.map(|x| x.0),
+            route.local_pref.map(|x| x.0),
+        ));
     }

     match routes_from_db.entry(*location) {
@@ -1768,44 +1778,46 @@ async fn static_routes_on_switch<'a>(
     let mut routes_on_switch = HashMap::new();

     for (location, client) in mgd_clients {
-        let static_routes: SwitchStaticRoutes =
-            match client.static_list_v4_routes().await {
-                Ok(routes) => {
-                    let mut flattened = HashSet::new();
-                    for (destination, paths) in routes.iter() {
-                        let Ok(dst) = destination.parse() else {
-                            error!(
-                                log,
-                                "failed to parse static route destination: \
                                  {destination}"
-                            );
-                            continue;
+        let static_routes: SwitchStaticRoutes = match client
+            .static_list_v4_routes()
+            .await
+        {
+            Ok(routes) => {
+                let mut flattened = HashSet::new();
+                for (destination, paths) in routes.iter() {
+                    let Ok(dst) = destination.parse() else {
+                        error!(
+                            log,
+                            "failed to parse static route destination: \
                              {destination}"
+                        );
+                        continue;
+                    };
+                    for p in paths.iter() {
+                        let nh = match p.nexthop {
+                            IpAddr::V4(addr) => addr,
+                            IpAddr::V6(addr) => {
+                                error!(
+                                    log,
+                                    "ipv6 nexthops not supported: {addr}"
+                                );
+                                continue;
+                            }
                         };
-                        for p in paths.iter() {
-                            let nh = match p.nexthop {
-                                IpAddr::V4(addr) => addr,
-                                IpAddr::V6(addr) => {
-                                    error!(
-                                        log,
-                                        "ipv6 nexthops not supported: {addr}"
-                                    );
-                                    continue;
-                                }
-                            };
-                            flattened.insert((nh, dst, p.vlan_id));
-                        }
+                        flattened.insert((nh, dst, p.vlan_id, p.local_pref));
                    }
-                    flattened
-                }
-                Err(_) => {
-                    error!(
-                        &log,
-                        "unable to retrieve routes from switch";
-                        "switch_location" => ?location,
-                    );
-                    continue;
                }
-            };
+                flattened
+            }
+            Err(_) => {
+                error!(
+                    &log,
+                    "unable to retrieve routes from switch";
+                    "switch_location" => ?location,
+                );
+                continue;
+            }
+        };
         routes_on_switch.insert(*location, static_routes);
     }
     routes_on_switch
diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs
index 13b30fd47a..4eb9883bcc 100644
--- a/nexus/src/app/rack.rs
+++ b/nexus/src/app/rack.rs
@@ -570,6 +570,7 @@ impl super::Nexus {
                 dst: r.destination,
                 gw: r.nexthop,
                 vid: r.vlan_id,
+                local_pref: r.local_pref,
             })
             .collect();

diff --git a/nexus/tests/integration_tests/switch_port.rs b/nexus/tests/integration_tests/switch_port.rs
index 0b71ddb2cf..2485d82c45 100644
--- a/nexus/tests/integration_tests/switch_port.rs
+++ b/nexus/tests/integration_tests/switch_port.rs
@@ -140,6 +140,7 @@ async fn test_port_settings_basic_crud(ctx: &ControlPlaneTestContext) {
                 dst: "1.2.3.0/24".parse().unwrap(),
                 gw: "1.2.3.4".parse().unwrap(),
                 vid: None,
+                local_pref: None,
             }],
         },
     );
diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs
index a7dd0a72cc..effd067ec8 100644
--- a/nexus/types/src/external_api/params.rs
+++ b/nexus/types/src/external_api/params.rs
@@ -1581,6 +1581,10 @@ pub struct Route {

     /// VLAN id the gateway is reachable over.
     pub vid: Option<u16>,
+
+    /// Local preference for route. Higher preference indicates precedence
+    /// within and across protocols.
+    pub local_pref: Option<u32>,
 }

 /// Select a BGP config by a name or id.
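As a sketch of the intended semantics (the addresses and preference values
below are illustrative, not taken from this change), two static routes to
the same destination can now be ranked, with the higher `local_pref` taking
precedence:

    // Illustrative only: a preferred default route and a backup.
    let primary = Route {
        dst: "0.0.0.0/0".parse().unwrap(),
        gw: "192.0.2.1".parse().unwrap(),
        vid: None,
        local_pref: Some(200), // higher preference wins
    };
    let backup = Route {
        dst: "0.0.0.0/0".parse().unwrap(),
        gw: "192.0.2.2".parse().unwrap(),
        vid: None,
        local_pref: Some(100), // used if the preferred next hop is removed
    };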
diff --git a/openapi/bootstrap-agent.json b/openapi/bootstrap-agent.json index 7b4f257670..b109eaf43e 100644 --- a/openapi/bootstrap-agent.json +++ b/openapi/bootstrap-agent.json @@ -1183,6 +1183,14 @@ } ] }, + "local_pref": { + "nullable": true, + "description": "The local preference associated with this route.", + "default": null, + "type": "integer", + "format": "uint32", + "minimum": 0 + }, "nexthop": { "description": "The nexthop/gateway address.", "type": "string", diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index d054591f3a..6b9a63d7f2 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -4665,6 +4665,14 @@ } ] }, + "local_pref": { + "nullable": true, + "description": "The local preference associated with this route.", + "default": null, + "type": "integer", + "format": "uint32", + "minimum": 0 + }, "nexthop": { "description": "The nexthop/gateway address.", "type": "string", diff --git a/openapi/nexus.json b/openapi/nexus.json index c29cb8a95c..e622239fa2 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -17180,6 +17180,13 @@ "type": "string", "format": "ip" }, + "local_pref": { + "nullable": true, + "description": "Local preference for route. Higher preference indicates precedence within and across protocols.", + "type": "integer", + "format": "uint32", + "minimum": 0 + }, "vid": { "nullable": true, "description": "VLAN id the gateway is reachable over.", @@ -19238,6 +19245,13 @@ "description": "The interface name this route configuration is assigned to.", "type": "string" }, + "local_pref": { + "nullable": true, + "description": "Local preference indicating priority within and across protocols.", + "type": "integer", + "format": "uint32", + "minimum": 0 + }, "port_settings_id": { "description": "The port settings object this route configuration belongs to.", "type": "string", diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 1241248a5e..6459595b65 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -4379,6 +4379,14 @@ } ] }, + "local_pref": { + "nullable": true, + "description": "The local preference associated with this route.", + "default": null, + "type": "integer", + "format": "uint32", + "minimum": 0 + }, "nexthop": { "description": "The nexthop/gateway address.", "type": "string", diff --git a/openapi/wicketd.json b/openapi/wicketd.json index 757383897b..5041fb5e56 100644 --- a/openapi/wicketd.json +++ b/openapi/wicketd.json @@ -3062,6 +3062,14 @@ } ] }, + "local_pref": { + "nullable": true, + "description": "The local preference associated with this route.", + "default": null, + "type": "integer", + "format": "uint32", + "minimum": 0 + }, "nexthop": { "description": "The nexthop/gateway address.", "type": "string", diff --git a/package-manifest.toml b/package-manifest.toml index 0822225837..0f42025fba 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -628,10 +628,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`).
-source.commit = "0c4292fe5b3c8ac27d99b5a4502d595acdbf7441" +source.commit = "73e63eaae3fe616bd7c48a20c69736d7e025836b" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm-gz.sha256.txt -source.sha256 = "b0f08e754f7c834d7ca05093b13a574863f500cff56210591ef4cc7eaf20159b" +source.sha256 = "6b2b5b5fed0c8ea36d78138d8d9bb455e8768ae61e7443985ddea48535cfc2da" output.type = "tarball" [package.mg-ddm] @@ -644,10 +644,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "0c4292fe5b3c8ac27d99b5a4502d595acdbf7441" +source.commit = "73e63eaae3fe616bd7c48a20c69736d7e025836b" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm.sha256.txt -source.sha256 = "499962b57404626aff1ecd62d5045ba2ee06070d45f7cb2a8fc284e53eed17d6" +source.sha256 = "725a5b1eeed5bc34ad5473cb54b8df4b3993f3ed3808cc50304696082e490a4a" output.type = "zone" output.intermediate_only = true @@ -659,10 +659,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "0c4292fe5b3c8ac27d99b5a4502d595acdbf7441" +source.commit = "73e63eaae3fe616bd7c48a20c69736d7e025836b" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mgd.sha256.txt -source.sha256 = "e15db7d262b5b2f08a2e2799668c67d0cb883e84c72736a30d299688115bf055" +source.sha256 = "1f9833ce2d38bdb57099c3f7e7e9f2c414b17492fe0a3574e043b65756b78192" output.type = "zone" output.intermediate_only = true diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index ddc399d282..d0eba7847e 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2715,6 +2715,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.switch_port_settings_route_config ( dst INET, gw INET, vid INT4, + local_pref INT8, /* TODO https://github.com/oxidecomputer/omicron/issues/3013 */ PRIMARY KEY (port_settings_id, interface_name, dst, gw) @@ -4217,7 +4218,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '87.0.0', NULL) + (TRUE, NOW(), NOW(), '88.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/route-local-pref/up.sql b/schema/crdb/route-local-pref/up.sql new file mode 100644 index 0000000000..d1051ccd0c --- /dev/null +++ b/schema/crdb/route-local-pref/up.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.switch_port_settings_route_config ADD COLUMN IF NOT EXISTS local_pref INT8; diff --git a/schema/rss-sled-plan.json b/schema/rss-sled-plan.json index a3d3425870..f8dfb935ce 100644 --- a/schema/rss-sled-plan.json +++ b/schema/rss-sled-plan.json @@ -894,6 +894,16 @@ } ] }, + "local_pref": { + "description": "The local preference associated with this route.", + "default": null, + "type": [ + "integer", + "null" + ], + "format": "uint32", + "minimum": 0.0 + }, "nexthop": { "description": "The nexthop/gateway address.", "type": "string", diff --git a/sled-agent/src/bootstrap/early_networking.rs b/sled-agent/src/bootstrap/early_networking.rs index 95a1f873f6..abc88d67c1 100644 --- a/sled-agent/src/bootstrap/early_networking.rs +++ 
b/sled-agent/src/bootstrap/early_networking.rs @@ -631,7 +631,8 @@ impl<'a> EarlyNetworkSetup<'a> { IpAddr::V6(_) => continue, }; let vlan_id = r.vlan_id; - let sr = StaticRoute4 { nexthop, prefix, vlan_id }; + let local_pref = r.local_pref; + let sr = StaticRoute4 { nexthop, prefix, vlan_id, local_pref }; rq.routes.list.push(sr); } } diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index bead95be80..2505985101 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -757,6 +757,7 @@ impl ServiceInner { destination: r.destination, nexthop: r.nexthop, vlan_id: r.vlan_id, + local_pref: r.local_pref, }) .collect(), addresses: config diff --git a/sled-agent/tests/integration_tests/early_network.rs b/sled-agent/tests/integration_tests/early_network.rs index 6fa91e0e4a..8da67729da 100644 --- a/sled-agent/tests/integration_tests/early_network.rs +++ b/sled-agent/tests/integration_tests/early_network.rs @@ -126,6 +126,7 @@ fn current_config_example() -> (&'static str, EarlyNetworkConfig) { destination: "10.1.9.32/16".parse().unwrap(), nexthop: "10.1.9.32".parse().unwrap(), vlan_id: None, + local_pref: None, }], addresses: vec!["2001:db8::/96".parse().unwrap()], switch: SwitchLocation::Switch0, diff --git a/sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json b/sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json index efd1a3c167..7df143d41d 100644 --- a/sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json +++ b/sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json @@ -128,7 +128,8 @@ { "destination": "0.0.0.0/0", "nexthop": "172.20.15.33", - "vlan_id": null + "vlan_id": null, + "local_pref": null } ], "addresses": [ @@ -149,7 +150,8 @@ { "destination": "0.0.0.0/0", "nexthop": "172.20.15.33", - "vlan_id": null + "vlan_id": null, + "local_pref": null } ], "addresses": [ diff --git a/sled-agent/types/src/early_networking.rs b/sled-agent/types/src/early_networking.rs index dc93aa1300..c4afbd0adb 100644 --- a/sled-agent/types/src/early_networking.rs +++ b/sled-agent/types/src/early_networking.rs @@ -322,6 +322,8 @@ pub mod back_compat { pub uplink_cidr: Ipv4Net, /// VLAN id to use for uplink pub uplink_vid: Option, + /// Local preference + pub local_pref: Option, } impl From for PortConfigV2 { @@ -331,6 +333,7 @@ pub mod back_compat { destination: "0.0.0.0/0".parse().unwrap(), nexthop: value.gateway_ip.into(), vlan_id: value.uplink_vid, + local_pref: value.local_pref, }], addresses: vec![UplinkAddressConfig { address: value.uplink_cidr.into(), @@ -472,6 +475,7 @@ mod tests { uplink_port_fec: PortFec::None, uplink_cidr: "192.168.0.1/16".parse().unwrap(), uplink_vid: None, + local_pref: None, }], }), }; @@ -501,6 +505,7 @@ mod tests { destination: "0.0.0.0/0".parse().unwrap(), nexthop: uplink.gateway_ip.into(), vlan_id: None, + local_pref: None, }], addresses: vec![UplinkAddressConfig { address: uplink.uplink_cidr.into(), @@ -545,6 +550,7 @@ mod tests { destination: "0.0.0.0/0".parse().unwrap(), nexthop: "192.168.0.2".parse().unwrap(), vlan_id: None, + local_pref: None, }], addresses: vec!["192.168.0.1/16".parse().unwrap()], switch: SwitchLocation::Switch0, diff --git a/tools/generate-nexus-api.sh b/tools/generate-nexus-api.sh index a0c7d13165..9e3f8d63f6 100755 --- a/tools/generate-nexus-api.sh +++ b/tools/generate-nexus-api.sh @@ -1,4 +1,3 @@ #!/usr/bin/env bash ./target/debug/nexus nexus/examples/config.toml -O > openapi/nexus.json -./target/debug/nexus 
nexus/examples/config.toml -I > openapi/nexus-internal.json diff --git a/tools/maghemite_ddm_openapi_version b/tools/maghemite_ddm_openapi_version index c1e011e38d..3b07ab4e61 100644 --- a/tools/maghemite_ddm_openapi_version +++ b/tools/maghemite_ddm_openapi_version @@ -1,2 +1,2 @@ -COMMIT="0c4292fe5b3c8ac27d99b5a4502d595acdbf7441" +COMMIT="73e63eaae3fe616bd7c48a20c69736d7e025836b" SHA2="007bfb717ccbc077c0250dee3121aeb0c5bb0d1c16795429a514fa4f8635a5ef" diff --git a/tools/maghemite_mg_openapi_version b/tools/maghemite_mg_openapi_version index 1184f6e4fd..691df704d7 100644 --- a/tools/maghemite_mg_openapi_version +++ b/tools/maghemite_mg_openapi_version @@ -1,2 +1,2 @@ -COMMIT="0c4292fe5b3c8ac27d99b5a4502d595acdbf7441" -SHA2="e4b42ab9daad90f0c561a830b62a9d17e294b4d0da0a6d44b4030929b0c37b7e" +COMMIT="73e63eaae3fe616bd7c48a20c69736d7e025836b" +SHA2="34536d8f55fc054d0b8114b5654b38c968099aafc7770562e04d405168f5be95" diff --git a/tools/maghemite_mgd_checksums b/tools/maghemite_mgd_checksums index 7ca642fa70..f54745b92a 100644 --- a/tools/maghemite_mgd_checksums +++ b/tools/maghemite_mgd_checksums @@ -1,2 +1,2 @@ -CIDL_SHA256="e15db7d262b5b2f08a2e2799668c67d0cb883e84c72736a30d299688115bf055" -MGD_LINUX_SHA256="915e7b5cac8ff1deb6549b86e4ba49fd5c6adbdcc56ae5dc3c7b3e69555a7c2c" \ No newline at end of file +CIDL_SHA256="1f9833ce2d38bdb57099c3f7e7e9f2c414b17492fe0a3574e043b65756b78192" +MGD_LINUX_SHA256="3c47a55af8daa4dc2cd7da5ecc3c5043cef5e6890b60d070a2e8672101cdbd30" \ No newline at end of file diff --git a/wicket-common/src/example.rs b/wicket-common/src/example.rs index bb70273b45..63d12aea6d 100644 --- a/wicket-common/src/example.rs +++ b/wicket-common/src/example.rs @@ -176,6 +176,7 @@ impl ExampleRackSetupData { destination: "0.0.0.0/0".parse().unwrap(), nexthop: "172.30.0.10".parse().unwrap(), vlan_id: Some(1), + local_pref: None, }], bgp_peers: switch0_port0_bgp_peers, uplink_port_speed: PortSpeed::Speed400G, @@ -192,6 +193,7 @@ impl ExampleRackSetupData { destination: "0.0.0.0/0".parse().unwrap(), nexthop: "172.33.0.10".parse().unwrap(), vlan_id: Some(1), + local_pref: None, }], bgp_peers: switch1_port0_bgp_peers, uplink_port_speed: PortSpeed::Speed400G, diff --git a/wicket/src/cli/rack_setup/config_toml.rs b/wicket/src/cli/rack_setup/config_toml.rs index 68485815a8..198c740754 100644 --- a/wicket/src/cli/rack_setup/config_toml.rs +++ b/wicket/src/cli/rack_setup/config_toml.rs @@ -327,13 +327,16 @@ fn populate_uplink_table(cfg: &UserSpecifiedPortConfig) -> Table { // routes = [] let mut routes_out = Array::new(); for r in routes { - let RouteConfig { destination, nexthop, vlan_id } = r; + let RouteConfig { destination, nexthop, vlan_id, local_pref } = r; let mut route = InlineTable::new(); route.insert("nexthop", string_value(nexthop)); route.insert("destination", string_value(destination)); if let Some(vlan_id) = vlan_id { route.insert("vlan_id", i64_value(i64::from(*vlan_id))); } + if let Some(local_pref) = local_pref { + route.insert("local_pref", i64_value(i64::from(*local_pref))); + } routes_out.push(Value::InlineTable(route)); } uplink.insert("routes", Item::Value(Value::Array(routes_out))); diff --git a/wicket/src/ui/panes/rack_setup.rs b/wicket/src/ui/panes/rack_setup.rs index 7bb63b6b1b..76a240e981 100644 --- a/wicket/src/ui/panes/rack_setup.rs +++ b/wicket/src/ui/panes/rack_setup.rs @@ -771,7 +771,8 @@ fn rss_config_text<'a>( ]; let routes = routes.iter().map(|r| { - let RouteConfig { destination, nexthop, vlan_id } = r; + let RouteConfig { destination, nexthop, vlan_id, local_pref 
} = + r; let mut items = vec![ Span::styled(" • Route : ", label_style), @@ -787,6 +788,13 @@ fn rss_config_text<'a>( Span::styled(")", label_style), ]); } + if let Some(local_pref) = local_pref { + items.extend([ + Span::styled(" (local_pref=", label_style), + Span::styled(local_pref.to_string(), ok_style), + Span::styled(")", label_style), + ]); + } items }); diff --git a/wicketd/src/rss_config.rs b/wicketd/src/rss_config.rs index c6f2dd5892..cb40d56dd6 100644 --- a/wicketd/src/rss_config.rs +++ b/wicketd/src/rss_config.rs @@ -703,6 +703,7 @@ fn build_port_config( destination: r.destination, nexthop: r.nexthop, vlan_id: r.vlan_id, + local_pref: r.local_pref, }) .collect(), addresses: config From 5a63771941a629ef17b9234495bee4aebe78099c Mon Sep 17 00:00:00 2001 From: Ryan Goodfellow Date: Wed, 21 Aug 2024 00:54:54 -0700 Subject: [PATCH 84/91] API for showing exported prefixes (#6397) --- Cargo.lock | 4 +- Cargo.toml | 4 +- common/src/api/external/mod.rs | 10 +++ nexus/src/app/bgp.rs | 73 +++++++++++++++++++++- nexus/src/external_api/http_entrypoints.rs | 31 ++++++++- nexus/tests/integration_tests/endpoints.rs | 11 ++++ nexus/tests/output/nexus_tags.txt | 1 + openapi/nexus.json | 46 ++++++++++++++ package-manifest.toml | 12 ++-- tools/maghemite_ddm_openapi_version | 2 +- tools/maghemite_mg_openapi_version | 4 +- tools/maghemite_mgd_checksums | 4 +- 12 files changed, 182 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 91617c2eb6..f8699e62d6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1786,7 +1786,7 @@ dependencies = [ [[package]] name = "ddm-admin-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/maghemite?rev=73e63eaae3fe616bd7c48a20c69736d7e025836b#73e63eaae3fe616bd7c48a20c69736d7e025836b" +source = "git+https://github.com/oxidecomputer/maghemite?rev=9e0fe45ca3862176dc31ad8cc83f605f8a7e1a42#9e0fe45ca3862176dc31ad8cc83f605f8a7e1a42" dependencies = [ "oxnet", "percent-encoding", @@ -4697,7 +4697,7 @@ dependencies = [ [[package]] name = "mg-admin-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/maghemite?rev=73e63eaae3fe616bd7c48a20c69736d7e025836b#73e63eaae3fe616bd7c48a20c69736d7e025836b" +source = "git+https://github.com/oxidecomputer/maghemite?rev=9e0fe45ca3862176dc31ad8cc83f605f8a7e1a42#9e0fe45ca3862176dc31ad8cc83f605f8a7e1a42" dependencies = [ "anyhow", "chrono", diff --git a/Cargo.toml b/Cargo.toml index 413990c9a8..cfb097ef3c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -408,8 +408,8 @@ macaddr = { version = "1.0.1", features = ["serde_std"] } maplit = "1.0.2" mockall = "0.13" newtype_derive = "0.1.6" -mg-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "73e63eaae3fe616bd7c48a20c69736d7e025836b" } -ddm-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "73e63eaae3fe616bd7c48a20c69736d7e025836b" } +mg-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "9e0fe45ca3862176dc31ad8cc83f605f8a7e1a42" } +ddm-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "9e0fe45ca3862176dc31ad8cc83f605f8a7e1a42" } multimap = "0.10.0" nexus-auth = { path = "nexus/auth" } nexus-client = { path = "clients/nexus-client" } diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index 986f45bfd1..f3f5372749 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -23,6 +23,7 @@ pub use dropshot::PaginationOrder; pub use error::*; use futures::stream::BoxStream; use oxnet::IpNet; +use 
oxnet::Ipv4Net; use parse_display::Display; use parse_display::FromStr; use rand::thread_rng; @@ -2708,6 +2709,15 @@ pub struct BgpPeerStatus { pub switch: SwitchLocation, } +/// The exported routes for all BGP peers. +#[derive( + Clone, Debug, Deserialize, JsonSchema, Serialize, PartialEq, Default, +)] +pub struct BgpExported { + /// Exported routes indexed by peer address. + pub exports: HashMap>, +} + /// Opaque object representing BGP message history for a given BGP peer. The /// contents of this object are not yet stable. #[derive(Clone, Debug, Deserialize, Serialize)] diff --git a/nexus/src/app/bgp.rs b/nexus/src/app/bgp.rs index 118011500a..d192f1ccf9 100644 --- a/nexus/src/app/bgp.rs +++ b/nexus/src/app/bgp.rs @@ -9,8 +9,9 @@ use nexus_db_model::{BgpAnnounceSet, BgpAnnouncement, BgpConfig}; use nexus_db_queries::context::OpContext; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::{ - self, BgpImportedRouteIpv4, BgpMessageHistory, BgpPeerStatus, CreateResult, - DeleteResult, ListResultVec, LookupResult, NameOrId, SwitchBgpHistory, + self, BgpExported, BgpImportedRouteIpv4, BgpMessageHistory, BgpPeerStatus, + CreateResult, DeleteResult, ListResultVec, LookupResult, NameOrId, + SwitchBgpHistory, }; use std::net::IpAddr; @@ -145,6 +146,74 @@ impl super::Nexus { Ok(result) } + pub async fn bgp_exported( + &self, + opctx: &OpContext, + ) -> LookupResult { + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; + let mut result = BgpExported::default(); + for (switch, client) in &self.mg_clients().await.map_err(|e| { + external::Error::internal_error(&format!( + "failed to get mg clients: {e}" + )) + })? { + let router_info = match client.read_routers().await { + Ok(result) => result.into_inner(), + Err(e) => { + error!( + self.log, + "failed to get routers from {switch}: {e}" + ); + continue; + } + }; + for r in &router_info { + let asn = r.asn; + + let exported = match client + .get_exported(&mg_admin_client::types::AsnSelector { asn }) + .await + { + Ok(result) => result.into_inner(), + Err(e) => { + error!( + self.log, + "failed to get exports for asn {asn} from {switch}: {e}" + ); + continue; + } + }; + for (addr, exports) in exported { + let mut xps = Vec::new(); + for ex in exports.iter() { + let net = match ex { + mg_admin_client::types::Prefix::V4(v4) => { + oxnet::Ipv4Net::new_unchecked( + v4.value, v4.length, + ) + } + mg_admin_client::types::Prefix::V6(v6) => { + let v6 = oxnet::IpNet::V6( + oxnet::Ipv6Net::new_unchecked( + v6.value, v6.length, + ), + ); + warn!( + self.log, + "{v6}: ipv6 exports not supported yet" + ); + continue; + } + }; + xps.push(net); + } + result.exports.insert(addr.to_string(), xps); + } + } + } + Ok(result) + } + pub async fn bgp_message_history( &self, opctx: &OpContext, diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 5b80c973e3..015fe11e3a 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -41,6 +41,7 @@ use nexus_db_queries::db::lookup::ImageLookup; use nexus_db_queries::db::lookup::ImageParentLookup; use nexus_db_queries::db::model::Name; use nexus_types::external_api::shared::{BfdStatus, ProbeInfo}; +use omicron_common::api::external::http_pagination::data_page_params_for; use omicron_common::api::external::http_pagination::marker_for_name; use omicron_common::api::external::http_pagination::marker_for_name_or_id; use
omicron_common::api::external::http_pagination::name_or_id_pagination; @@ -55,9 +56,11 @@ use omicron_common::api::external::http_pagination::ScanParams; use omicron_common::api::external::AddressLot; use omicron_common::api::external::AddressLotBlock; use omicron_common::api::external::AddressLotCreateResponse; +use omicron_common::api::external::AggregateBgpMessageHistory; use omicron_common::api::external::BgpAnnounceSet; use omicron_common::api::external::BgpAnnouncement; use omicron_common::api::external::BgpConfig; +use omicron_common::api::external::BgpExported; use omicron_common::api::external::BgpImportedRouteIpv4; use omicron_common::api::external::BgpPeerStatus; use omicron_common::api::external::DataPageParams; @@ -78,9 +81,6 @@ use omicron_common::api::external::TufRepoGetResponse; use omicron_common::api::external::TufRepoInsertResponse; use omicron_common::api::external::VpcFirewallRuleUpdateParams; use omicron_common::api::external::VpcFirewallRules; -use omicron_common::api::external::{ - http_pagination::data_page_params_for, AggregateBgpMessageHistory, -}; use omicron_common::bail_unless; use omicron_uuid_kinds::GenericUuid; use parse_display::Display; @@ -277,6 +277,7 @@ pub(crate) fn external_api() -> NexusApiDescription { api.register(networking_bgp_config_create)?; api.register(networking_bgp_config_list)?; api.register(networking_bgp_status)?; + api.register(networking_bgp_exported)?; api.register(networking_bgp_imported_routes_ipv4)?; api.register(networking_bgp_config_delete)?; api.register(networking_bgp_announce_set_update)?; @@ -3937,6 +3938,30 @@ async fn networking_bgp_status( .await } +//TODO pagination? the normal by-name/by-id stuff does not work here +/// Get BGP exported routes +#[endpoint { + method = GET, + path = "/v1/system/networking/bgp-exported", + tags = ["system/networking"], +}] +async fn networking_bgp_exported( + rqctx: RequestContext, +) -> Result, HttpError> { + let apictx = rqctx.context(); + let opctx = crate::context::op_context_for_external_api(&rqctx).await?; + let handler = async { + let nexus = &apictx.context.nexus; + let result = nexus.bgp_exported(&opctx).await?; + Ok(HttpResponseOk(result)) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await +} + /// Get BGP router message history #[endpoint { method = GET, diff --git a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index 9097082a20..381d59e073 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -587,6 +587,8 @@ pub static DEMO_BGP_ANNOUNCE: Lazy = }); pub const DEMO_BGP_STATUS_URL: &'static str = "/v1/system/networking/bgp-status"; +pub const DEMO_BGP_EXPORTED_URL: &'static str = + "/v1/system/networking/bgp-exported"; pub const DEMO_BGP_ROUTES_IPV4_URL: &'static str = "/v1/system/networking/bgp-routes-ipv4?asn=47"; pub const DEMO_BGP_MESSAGE_HISTORY_URL: &'static str = @@ -2307,6 +2309,15 @@ pub static VERIFY_ENDPOINTS: Lazy> = Lazy::new(|| { ], }, + VerifyEndpoint { + url: &DEMO_BGP_EXPORTED_URL, + visibility: Visibility::Public, + unprivileged_access: UnprivilegedAccess::None, + allowed_methods: vec![ + AllowedMethod::GetNonexistent, + ], + }, + VerifyEndpoint { url: &DEMO_BGP_ROUTES_IPV4_URL, visibility: Visibility::Public, diff --git a/nexus/tests/output/nexus_tags.txt b/nexus/tests/output/nexus_tags.txt index 340d72569b..053f56cf5c 100644 --- a/nexus/tests/output/nexus_tags.txt +++ b/nexus/tests/output/nexus_tags.txt @@ 
-184,6 +184,7 @@ networking_bgp_announce_set_update PUT /v1/system/networking/bgp-anno networking_bgp_config_create POST /v1/system/networking/bgp networking_bgp_config_delete DELETE /v1/system/networking/bgp networking_bgp_config_list GET /v1/system/networking/bgp +networking_bgp_exported GET /v1/system/networking/bgp-exported networking_bgp_imported_routes_ipv4 GET /v1/system/networking/bgp-routes-ipv4 networking_bgp_message_history GET /v1/system/networking/bgp-message-history networking_bgp_status GET /v1/system/networking/bgp-status diff --git a/openapi/nexus.json b/openapi/nexus.json index e622239fa2..f6ba231c02 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -6640,6 +6640,33 @@ } } }, + "/v1/system/networking/bgp-exported": { + "get": { + "tags": [ + "system/networking" + ], + "summary": "Get BGP exported routes", + "operationId": "networking_bgp_exported", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/BgpExported" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/v1/system/networking/bgp-message-history": { "get": { "tags": [ @@ -10351,6 +10378,25 @@ "items" ] }, + "BgpExported": { + "description": "The exported routes for all BGP peers.", + "type": "object", + "properties": { + "exports": { + "description": "Exported routes indexed by peer address.", + "type": "object", + "additionalProperties": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Ipv4Net" + } + } + } + }, + "required": [ + "exports" + ] + }, "BgpImportedRouteIpv4": { "description": "A route imported from a BGP peer.", "type": "object", diff --git a/package-manifest.toml b/package-manifest.toml index 0f42025fba..e846e9da31 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -628,10 +628,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`.
Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "73e63eaae3fe616bd7c48a20c69736d7e025836b" +source.commit = "9e0fe45ca3862176dc31ad8cc83f605f8a7e1a42" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mgd.sha256.txt -source.sha256 = "1f9833ce2d38bdb57099c3f7e7e9f2c414b17492fe0a3574e043b65756b78192" +source.sha256 = "67856e05347304523b03e7ddfbd7ec18e833b7bf291f39105d5d8c4c5c383392" output.type = "zone" output.intermediate_only = true diff --git a/tools/maghemite_ddm_openapi_version b/tools/maghemite_ddm_openapi_version index 3b07ab4e61..8c549b3eb5 100644 --- a/tools/maghemite_ddm_openapi_version +++ b/tools/maghemite_ddm_openapi_version @@ -1,2 +1,2 @@ -COMMIT="73e63eaae3fe616bd7c48a20c69736d7e025836b" +COMMIT="9e0fe45ca3862176dc31ad8cc83f605f8a7e1a42" SHA2="007bfb717ccbc077c0250dee3121aeb0c5bb0d1c16795429a514fa4f8635a5ef" diff --git a/tools/maghemite_mg_openapi_version b/tools/maghemite_mg_openapi_version index 691df704d7..7befaae8db 100644 --- a/tools/maghemite_mg_openapi_version +++ b/tools/maghemite_mg_openapi_version @@ -1,2 +1,2 @@ -COMMIT="73e63eaae3fe616bd7c48a20c69736d7e025836b" -SHA2="34536d8f55fc054d0b8114b5654b38c968099aafc7770562e04d405168f5be95" +COMMIT="9e0fe45ca3862176dc31ad8cc83f605f8a7e1a42" +SHA2="5b327f213f8f341cf9072d428980f53757b2c6383f684ac80bbccfb1984ffe5f" diff --git a/tools/maghemite_mgd_checksums b/tools/maghemite_mgd_checksums index f54745b92a..e361263531 100644 --- a/tools/maghemite_mgd_checksums +++ b/tools/maghemite_mgd_checksums @@ -1,2 +1,2 @@ -CIDL_SHA256="1f9833ce2d38bdb57099c3f7e7e9f2c414b17492fe0a3574e043b65756b78192" -MGD_LINUX_SHA256="3c47a55af8daa4dc2cd7da5ecc3c5043cef5e6890b60d070a2e8672101cdbd30" \ No newline at end of file +CIDL_SHA256="67856e05347304523b03e7ddfbd7ec18e833b7bf291f39105d5d8c4c5c383392" +MGD_LINUX_SHA256="6e37daa25ddb8310a4dd215db590bbd18999d55decf0f8a9baf7b919cf101c52" \ No newline at end of file From 18ee28341d9c4eed80fe9966e7084e793022221b Mon Sep 17 00:00:00 2001 From: "oxide-reflector-bot[bot]" <130185838+oxide-reflector-bot[bot]@users.noreply.github.com> Date: Wed, 21 Aug 2024 15:11:44 +0000 Subject: [PATCH 85/91] Update maghemite to c92d6ff (#6282) Updated maghemite to commit c92d6ff. --------- Co-authored-by: reflector[bot] <130185838+reflector[bot]@users.noreply.github.com> --- package-manifest.toml | 12 ++++++------ tools/maghemite_ddm_openapi_version | 2 +- tools/maghemite_mg_openapi_version | 2 +- tools/maghemite_mgd_checksums | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/package-manifest.toml b/package-manifest.toml index e846e9da31..95017ca653 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -628,10 +628,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). 
-source.commit = "9e0fe45ca3862176dc31ad8cc83f605f8a7e1a42" +source.commit = "c92d6ff85db8992066f49da176cf686acfd8fe0f" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm-gz.sha256.txt -source.sha256 = "c53a87b6c08323ea58c1604e3db24df061b9ee457e7d2b1dc6168abda4a686bc" +source.sha256 = "c33915998894dd36a2d1078f7e13717aa20760924c30640d7647d4791dd5f2ee" output.type = "tarball" [package.mg-ddm] @@ -644,10 +644,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "9e0fe45ca3862176dc31ad8cc83f605f8a7e1a42" +source.commit = "c92d6ff85db8992066f49da176cf686acfd8fe0f" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm.sha256.txt -source.sha256 = "00b2433504cb4c984163c5cdfd455eee595858b125a29deadaa791628668e384" +source.sha256 = "be9d657ec22a69468b18f2b4d48e55621538eade8b8d3e367a1d8d5cc686cfbe" output.type = "zone" output.intermediate_only = true @@ -659,10 +659,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "9e0fe45ca3862176dc31ad8cc83f605f8a7e1a42" +source.commit = "c92d6ff85db8992066f49da176cf686acfd8fe0f" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mgd.sha256.txt -source.sha256 = "67856e05347304523b03e7ddfbd7ec18e833b7bf291f39105d5d8c4c5c383392" +source.sha256 = "e000485f7e04ac1cf9b3532b60bcf23598ab980331ba4f1c6788a7e95c1e9ef8" output.type = "zone" output.intermediate_only = true diff --git a/tools/maghemite_ddm_openapi_version b/tools/maghemite_ddm_openapi_version index 8c549b3eb5..0c223c85a8 100644 --- a/tools/maghemite_ddm_openapi_version +++ b/tools/maghemite_ddm_openapi_version @@ -1,2 +1,2 @@ -COMMIT="9e0fe45ca3862176dc31ad8cc83f605f8a7e1a42" +COMMIT="c92d6ff85db8992066f49da176cf686acfd8fe0f" SHA2="007bfb717ccbc077c0250dee3121aeb0c5bb0d1c16795429a514fa4f8635a5ef" diff --git a/tools/maghemite_mg_openapi_version b/tools/maghemite_mg_openapi_version index 7befaae8db..0db6a3b63d 100644 --- a/tools/maghemite_mg_openapi_version +++ b/tools/maghemite_mg_openapi_version @@ -1,2 +1,2 @@ -COMMIT="9e0fe45ca3862176dc31ad8cc83f605f8a7e1a42" +COMMIT="c92d6ff85db8992066f49da176cf686acfd8fe0f" SHA2="5b327f213f8f341cf9072d428980f53757b2c6383f684ac80bbccfb1984ffe5f" diff --git a/tools/maghemite_mgd_checksums b/tools/maghemite_mgd_checksums index e361263531..2e180a83db 100644 --- a/tools/maghemite_mgd_checksums +++ b/tools/maghemite_mgd_checksums @@ -1,2 +1,2 @@ -CIDL_SHA256="67856e05347304523b03e7ddfbd7ec18e833b7bf291f39105d5d8c4c5c383392" -MGD_LINUX_SHA256="6e37daa25ddb8310a4dd215db590bbd18999d55decf0f8a9baf7b919cf101c52" \ No newline at end of file +CIDL_SHA256="e000485f7e04ac1cf9b3532b60bcf23598ab980331ba4f1c6788a7e95c1e9ef8" +MGD_LINUX_SHA256="1c3d93bbfbe4ce97af7cb81c13e42a2eea464e18de6827794a55d5bfd971b66c" \ No newline at end of file From d24003b74d232d2ff643a6fc9b5ee10cb5f2b056 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 21 Aug 2024 09:29:21 -0700 Subject: [PATCH 86/91] Tiny cleanup: remove unnecessary `result` check (#6401) --- 
nexus/src/app/background/tasks/blueprint_execution.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index d13e5428f8..dbbfcb3b14 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -104,11 +104,9 @@ impl BlueprintExecutor { Ok(RealizeBlueprintOutput { needs_saga_recovery }) => { // If executing the blueprint requires activating the saga // recovery background task, do that now. - if let Ok(output) = &result { - if output.needs_saga_recovery { - info!(&opctx.log, "activating saga recovery task"); - self.saga_recovery.activate(); - } + if needs_saga_recovery { + info!(&opctx.log, "activating saga recovery task"); + self.saga_recovery.activate(); } json!({ From d8f46b2d45f2939e5a41f18ad5176d15412b9d96 Mon Sep 17 00:00:00 2001 From: David Crespo Date: Wed, 21 Aug 2024 17:47:40 -0500 Subject: [PATCH 87/91] Bump web console (#6403) https://github.com/oxidecomputer/console/compare/33b7a505...8dcddcef * [8dcddcef](https://github.com/oxidecomputer/console/commit/8dcddcef) oxidecomputer/console#2392 * [f53bb383](https://github.com/oxidecomputer/console/commit/f53bb383) oxidecomputer/console#2387 * [6a005a0b](https://github.com/oxidecomputer/console/commit/6a005a0b) oxidecomputer/console#2389 * [2f83a1fb](https://github.com/oxidecomputer/console/commit/2f83a1fb) tweak firewall rule copy (closes oxidecomputer/console#2348) * [22bdac78](https://github.com/oxidecomputer/console/commit/22bdac78) oxidecomputer/console#2386 * [5353a6ea](https://github.com/oxidecomputer/console/commit/5353a6ea) oxidecomputer/console#2385 * [7cf1ed54](https://github.com/oxidecomputer/console/commit/7cf1ed54) oxidecomputer/console#2383 * [c52cc37b](https://github.com/oxidecomputer/console/commit/c52cc37b) oxidecomputer/console#2382 * [e442f6bd](https://github.com/oxidecomputer/console/commit/e442f6bd) oxidecomputer/console#2378 --- tools/console_version | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/console_version b/tools/console_version index 994d30396b..ef59f6e40c 100644 --- a/tools/console_version +++ b/tools/console_version @@ -1,2 +1,2 @@ -COMMIT="33b7a505a222b258a155636e8ee79c7ee3c132d2" -SHA2="f9089e18d52d7a54149b364a0b3ae4efba421c13eca6f7752a23b74dc3fa1a8e" +COMMIT="8dcddcef62b8d10dfcd3adb470439212b23b3d5e" +SHA2="30a5ecc4d7b82dfc8bbd5ea59d5d92b8414d0362425c1ce1011da8c722a8ec4c" From 45fb57573685713350da05d908ca9ed760b60a86 Mon Sep 17 00:00:00 2001 From: Nils Nieuwejaar Date: Wed, 21 Aug 2024 21:34:04 -0400 Subject: [PATCH 88/91] Add LLDP configuration support (#6185) --- clients/sled-agent-client/src/lib.rs | 1 + common/src/api/external/mod.rs | 34 +++---- common/src/api/internal/shared.rs | 84 +++++++++++++++- illumos-utils/src/smf_helper.rs | 2 +- nexus/db-model/src/schema.rs | 23 ++--- nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-model/src/switch_port.rs | 97 ++++++++++--------- .../src/db/datastore/switch_port.rs | 56 +++++------ .../tasks/sync_switch_configuration.rs | 47 ++++++++- nexus/src/app/rack.rs | 26 ++++- nexus/tests/integration_tests/switch_port.rs | 40 ++++++-- nexus/types/src/external_api/params.rs | 29 ++++-- openapi/bootstrap-agent.json | 70 +++++++++++++ openapi/nexus-internal.json | 70 +++++++++++++ openapi/nexus.json | 86 ++++++++++++---- openapi/sled-agent.json | 78 +++++++++++++++ openapi/wicketd.json | 70 +++++++++++++ 
.../tests/output/self-stat-schema.json | 91 +++++++++++++++++ package-manifest.toml | 4 +- schema/crdb/collapse_lldp_settings/up1.sql | 4 + schema/crdb/collapse_lldp_settings/up2.sql | 4 + schema/crdb/collapse_lldp_settings/up3.sql | 5 + schema/crdb/collapse_lldp_settings/up4.sql | 4 + schema/crdb/collapse_lldp_settings/up5.sql | 13 +++ schema/crdb/collapse_lldp_settings/up6.sql | 1 + schema/crdb/dbinit.sql | 32 +++--- schema/rss-sled-plan.json | 84 ++++++++++++++++ sled-agent/src/rack_setup/plan/sled.rs | 2 +- sled-agent/src/rack_setup/service.rs | 86 +++++++++++----- sled-agent/src/services.rs | 77 +++++++++++++-- .../tests/integration_tests/early_network.rs | 1 + .../madrid-rss-sled-plan.json | 6 +- sled-agent/types/src/early_networking.rs | 4 + smf/sled-agent/non-gimlet/config-rss.toml | 16 +++ tools/update_lldp.sh | 4 +- wicket-common/src/example.rs | 31 +++++- wicket-common/src/rack_setup.rs | 3 + wicket/src/cli/rack_setup/config_toml.rs | 42 ++++++++ wicket/src/ui/panes/rack_setup.rs | 64 ++++++++++++ wicket/tests/output/example_non_empty.toml | 17 ++++ wicketd/src/rss_config.rs | 17 ++++ 41 files changed, 1212 insertions(+), 216 deletions(-) create mode 100644 oximeter/collector/tests/output/self-stat-schema.json create mode 100644 schema/crdb/collapse_lldp_settings/up1.sql create mode 100644 schema/crdb/collapse_lldp_settings/up2.sql create mode 100644 schema/crdb/collapse_lldp_settings/up3.sql create mode 100644 schema/crdb/collapse_lldp_settings/up4.sql create mode 100644 schema/crdb/collapse_lldp_settings/up5.sql create mode 100644 schema/crdb/collapse_lldp_settings/up6.sql diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 4ed5aaa1cb..ed96d762dc 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -29,6 +29,7 @@ progenitor::generate_api!( BfdPeerConfig = { derives = [Eq, Hash] }, BgpConfig = { derives = [Eq, Hash] }, BgpPeerConfig = { derives = [Eq, Hash] }, + LldpPortConfig = { derives = [Eq, Hash, PartialOrd, Ord] }, OmicronPhysicalDiskConfig = { derives = [Eq, Hash, PartialOrd, Ord] }, PortConfigV2 = { derives = [Eq, Hash] }, RouteConfig = { derives = [Eq, Hash] }, diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index f3f5372749..07e4fd0b83 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -2229,7 +2229,7 @@ pub struct SwitchPortSettingsView { pub links: Vec, /// Link-layer discovery protocol (LLDP) settings. - pub link_lldp: Vec, + pub link_lldp: Vec, /// Layer 3 interface settings. pub interfaces: Vec, @@ -2371,7 +2371,7 @@ pub struct SwitchPortLinkConfig { /// The link-layer discovery protocol service configuration id for this /// link. - pub lldp_service_config_id: Uuid, + pub lldp_link_config_id: Uuid, /// The name of this link. pub link_name: String, @@ -2391,34 +2391,30 @@ pub struct SwitchPortLinkConfig { /// A link layer discovery protocol (LLDP) service configuration. #[derive(Clone, Debug, Deserialize, JsonSchema, Serialize, PartialEq)] -pub struct LldpServiceConfig { +pub struct LldpLinkConfig { /// The id of this LLDP service instance. pub id: Uuid, - /// The link-layer discovery protocol configuration for this service. - pub lldp_config_id: Option, - /// Whether or not the LLDP service is enabled. pub enabled: bool, -} -/// A link layer discovery protocol (LLDP) base configuration. 
-#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize, PartialEq)] -pub struct LldpConfig { - #[serde(flatten)] - pub identity: IdentityMetadata, + /// The LLDP link name TLV. + pub link_name: Option, + + /// The LLDP link description TLV. + pub link_description: Option, /// The LLDP chassis identifier TLV. - pub chassis_id: String, + pub chassis_id: Option, - /// THE LLDP system name TLV. - pub system_name: String, + /// The LLDP system name TLV. + pub system_name: Option, - /// THE LLDP system description TLV. - pub system_description: String, + /// The LLDP system description TLV. + pub system_description: Option, - /// THE LLDP management IP TLV. - pub management_ip: oxnet::IpNet, + /// The LLDP management IP TLV. + pub management_ip: Option, } /// Describes the kind of an switch interface. diff --git a/common/src/api/internal/shared.rs b/common/src/api/internal/shared.rs index 395bc3d132..5945efe16d 100644 --- a/common/src/api/internal/shared.rs +++ b/common/src/api/internal/shared.rs @@ -379,6 +379,84 @@ impl FromStr for UplinkAddressConfig { } } +#[derive( + Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq, JsonSchema, +)] +#[serde(rename_all = "snake_case")] +/// To what extent should this port participate in LLDP +pub enum LldpAdminStatus { + #[default] + Enabled, + Disabled, + RxOnly, + TxOnly, +} + +impl fmt::Display for LldpAdminStatus { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + LldpAdminStatus::Enabled => write!(f, "enabled"), + LldpAdminStatus::Disabled => write!(f, "disabled"), + LldpAdminStatus::RxOnly => write!(f, "rx_only"), + LldpAdminStatus::TxOnly => write!(f, "tx_only"), + } + } +} + +#[derive(Debug, PartialEq, Eq, Deserialize, Serialize)] +pub struct ParseLldpAdminStatusError(String); + +impl std::fmt::Display for ParseLldpAdminStatusError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "LLDP admin status error: {}", self.0) + } +} + +impl FromStr for LldpAdminStatus { + type Err = ParseLldpAdminStatusError; + + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "enabled" => Ok(Self::Enabled), + "disabled" => Ok(Self::Disabled), + "rxonly" | "rx_only" => Ok(Self::RxOnly), + "txonly" | "tx_only" => Ok(Self::TxOnly), + _ => Err(ParseLldpAdminStatusError(format!( + "not a valid admin status: {s}" + ))), + } + } +} + +/// Per-port LLDP configuration settings. Only the "status" setting is +/// mandatory. All other fields have natural defaults or may be inherited from +/// the switch. +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq, JsonSchema)] +pub struct LldpPortConfig { + /// To what extent should this port participate in LLDP + pub status: LldpAdminStatus, + /// Chassis ID to advertise. If this is set, it will be advertised as a + /// LocallyAssigned ID type. If this is not set, it will be + /// inherited from the switch-level settings. + pub chassis_id: Option, + /// Port ID to advertise. If this is set, it will be advertised as a + /// LocallyAssigned ID type. If this is not set, it will be set to + /// the port name. e.g., qsfp0/0. + pub port_id: Option, + /// Port description to advertise. If this is not set, no + /// description will be advertised. + pub port_description: Option, + /// System name to advertise. If this is not set, it will be + /// inherited from the switch-level settings. + pub system_name: Option, + /// System description to advertise. If this is not set, it will be + /// inherited from the switch-level settings. 
+ pub system_description: Option, + /// Management IP addresses to advertise. If this is not set, it will be + /// inherited from the switch-level settings. + pub management_addrs: Option>, +} + #[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq, JsonSchema)] pub struct PortConfigV2 { /// The set of routes associated with this port. @@ -398,6 +476,8 @@ pub struct PortConfigV2 { /// Whether or not to set autonegotiation #[serde(default)] pub autoneg: bool, + /// LLDP configuration for this port + pub lldp: Option, } /// A set of switch uplinks. @@ -414,11 +494,13 @@ pub struct HostPortConfig { /// IP Address and prefix (e.g., `192.168.0.1/16`) to apply to switchport /// (must be in infra_ip pool). May also include an optional VLAN ID. pub addrs: Vec, + + pub lldp: Option, } impl From for HostPortConfig { fn from(x: PortConfigV2) -> Self { - Self { port: x.port, addrs: x.addresses } + Self { port: x.port, addrs: x.addresses, lldp: x.lldp.clone() } } } diff --git a/illumos-utils/src/smf_helper.rs b/illumos-utils/src/smf_helper.rs index 2c24ceaa4d..2d29376950 100644 --- a/illumos-utils/src/smf_helper.rs +++ b/illumos-utils/src/smf_helper.rs @@ -77,7 +77,7 @@ impl<'t> SmfHelper<'t> { "addpropvalue", &prop.to_string(), &format!("{}:", valtype.to_string()), - &val.to_string(), + &format!("\"{}\"", val.to_string()), ]) .map_err(|err| Error::ZoneCommand { intent: format!("add {} smf property value", prop.to_string()), diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index d1205dac65..1bcb3ac229 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -139,35 +139,28 @@ table! { table! { switch_port_settings_link_config (port_settings_id, link_name) { port_settings_id -> Uuid, - lldp_service_config_id -> Uuid, link_name -> Text, mtu -> Int4, fec -> crate::SwitchLinkFecEnum, speed -> crate::SwitchLinkSpeedEnum, autoneg -> Bool, + lldp_link_config_id -> Uuid, } } table! { - lldp_service_config (id) { + lldp_link_config (id) { id -> Uuid, enabled -> Bool, - lldp_config_id -> Nullable, - } -} - -table! { - lldp_config (id) { - id -> Uuid, - name -> Text, - description -> Text, + link_name -> Nullable, + link_description -> Nullable, + chassis_id -> Nullable, + system_name -> Nullable, + system_description -> Nullable, + management_ip -> Nullable, time_created -> Timestamptz, time_modified -> Timestamptz, time_deleted -> Nullable, - chassis_id -> Text, - system_name -> Text, - system_description -> Text, - management_ip -> Inet, } } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index ef9a11c330..649355f8e4 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(88, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(89, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(89, "collapse_lldp_settings"), KnownVersion::new(88, "route-local-pref"), KnownVersion::new(87, "add-clickhouse-server-enum-variants"), KnownVersion::new(86, "snapshot-replacement"), diff --git a/nexus/db-model/src/switch_port.rs b/nexus/db-model/src/switch_port.rs index 9b36cbda48..09f1327be2 100644 --- a/nexus/db-model/src/switch_port.rs +++ b/nexus/db-model/src/switch_port.rs @@ -3,7 +3,7 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use crate::schema::{ - lldp_config, lldp_service_config, switch_port, switch_port_settings, + lldp_link_config, switch_port, switch_port_settings, switch_port_settings_address_config, switch_port_settings_bgp_peer_config, switch_port_settings_bgp_peer_config_allow_export, switch_port_settings_bgp_peer_config_allow_import, @@ -14,6 +14,7 @@ use crate::schema::{ }; use crate::{impl_enum_type, SqlU32}; use crate::{SqlU16, SqlU8}; +use chrono::{DateTime, Utc}; use db_macros::Resource; use diesel::AsChangeset; use ipnetwork::IpNetwork; @@ -380,7 +381,7 @@ impl Into for SwitchPortConfig { #[diesel(table_name = switch_port_settings_link_config)] pub struct SwitchPortLinkConfig { pub port_settings_id: Uuid, - pub lldp_service_config_id: Uuid, + pub lldp_link_config_id: Uuid, pub link_name: String, pub mtu: SqlU16, pub fec: SwitchLinkFec, @@ -391,7 +392,7 @@ pub struct SwitchPortLinkConfig { impl SwitchPortLinkConfig { pub fn new( port_settings_id: Uuid, - lldp_service_config_id: Uuid, + lldp_link_config_id: Uuid, link_name: String, mtu: u16, fec: SwitchLinkFec, @@ -400,7 +401,7 @@ impl SwitchPortLinkConfig { ) -> Self { Self { port_settings_id, - lldp_service_config_id, + lldp_link_config_id, link_name, fec, speed, @@ -414,7 +415,7 @@ impl Into for SwitchPortLinkConfig { fn into(self) -> external::SwitchPortLinkConfig { external::SwitchPortLinkConfig { port_settings_id: self.port_settings_id, - lldp_service_config_id: self.lldp_service_config_id, + lldp_link_config_id: self.lldp_link_config_id, link_name: self.link_name.clone(), mtu: self.mtu.into(), fec: self.fec.into(), @@ -434,57 +435,61 @@ impl Into for SwitchPortLinkConfig { Deserialize, AsChangeset, )] -#[diesel(table_name = lldp_service_config)] -pub struct LldpServiceConfig { +#[diesel(table_name = lldp_link_config)] +pub struct LldpLinkConfig { pub id: Uuid, pub enabled: bool, - pub lldp_config_id: Option, -} - -impl LldpServiceConfig { - pub fn new(enabled: bool, lldp_config_id: Option) -> Self { - Self { id: Uuid::new_v4(), enabled, lldp_config_id } + pub link_name: Option, + pub link_description: Option, + pub chassis_id: Option, + pub system_name: Option, + pub system_description: Option, + pub management_ip: Option, + pub time_created: DateTime, + pub time_modified: DateTime, + pub time_deleted: Option>, +} + +impl LldpLinkConfig { + pub fn new( + enabled: bool, + link_name: Option, + link_description: Option, + chassis_id: Option, + system_name: Option, + system_description: Option, + management_ip: Option, + ) -> Self { + let now = Utc::now(); + Self { + id: Uuid::new_v4(), + enabled, + link_name, + link_description, + chassis_id, + system_name, + system_description, + management_ip, + time_created: now, + time_modified: now, + time_deleted: None, + } } } -impl Into for LldpServiceConfig { - fn into(self) -> external::LldpServiceConfig { - external::LldpServiceConfig { +// This converts the internal database version of the config into the +// user-facing version. 
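+// Note that management_ip is stored in the database as an IpNetwork and is mapped back to the external IpNet type below.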
+impl Into for LldpLinkConfig { + fn into(self) -> external::LldpLinkConfig { + external::LldpLinkConfig { id: self.id, - lldp_config_id: self.lldp_config_id, enabled: self.enabled, - } - } -} - -#[derive( - Queryable, - Insertable, - Selectable, - Clone, - Debug, - Resource, - Serialize, - Deserialize, -)] -#[diesel(table_name = lldp_config)] -pub struct LldpConfig { - #[diesel(embed)] - pub identity: LldpConfigIdentity, - pub chassis_id: String, - pub system_name: String, - pub system_description: String, - pub management_ip: IpNetwork, -} - -impl Into for LldpConfig { - fn into(self) -> external::LldpConfig { - external::LldpConfig { - identity: self.identity(), + link_name: self.link_name.clone(), + link_description: self.link_description.clone(), chassis_id: self.chassis_id.clone(), system_name: self.system_name.clone(), system_description: self.system_description.clone(), - management_ip: self.management_ip.into(), + management_ip: self.management_ip.map(|a| a.into()), } } } diff --git a/nexus/db-queries/src/db/datastore/switch_port.rs b/nexus/db-queries/src/db/datastore/switch_port.rs index f9c61147f3..504e6cf936 100644 --- a/nexus/db-queries/src/db/datastore/switch_port.rs +++ b/nexus/db-queries/src/db/datastore/switch_port.rs @@ -15,7 +15,7 @@ use crate::db::datastore::UpdatePrecondition; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::model::{ - LldpServiceConfig, Name, SwitchInterfaceConfig, SwitchPort, + LldpLinkConfig, Name, SwitchInterfaceConfig, SwitchPort, SwitchPortAddressConfig, SwitchPortBgpPeerConfig, SwitchPortConfig, SwitchPortLinkConfig, SwitchPortRouteConfig, SwitchPortSettings, SwitchPortSettingsGroup, SwitchPortSettingsGroups, @@ -101,7 +101,7 @@ pub struct SwitchPortSettingsCombinedResult { pub groups: Vec, pub port: SwitchPortConfig, pub links: Vec, - pub link_lldp: Vec, + pub link_lldp: Vec, pub interfaces: Vec, pub vlan_interfaces: Vec, pub routes: Vec, @@ -451,19 +451,18 @@ impl DataStore { .load_async::(&conn) .await?; - let lldp_svc_ids: Vec = result + let lldp_link_ids: Vec = result .links .iter() - .map(|link| link.lldp_service_config_id) + .map(|link| link.lldp_link_config_id) .collect(); - use db::schema::lldp_service_config as lldp_config; - use db::schema::lldp_service_config::dsl as lldp_dsl; - result.link_lldp = lldp_dsl::lldp_service_config - .filter(lldp_config::id.eq_any(lldp_svc_ids)) - .select(LldpServiceConfig::as_select()) + use db::schema::lldp_link_config; + result.link_lldp = lldp_link_config::dsl::lldp_link_config + .filter(lldp_link_config::id.eq_any(lldp_link_ids)) + .select(LldpLinkConfig::as_select()) .limit(1) - .load_async::(&conn) + .load_async::(&conn) .await?; // get the interface configs @@ -987,7 +986,7 @@ async fn do_switch_port_settings_create( ) -> Result { use db::schema::{ address_lot::dsl as address_lot_dsl, bgp_config::dsl as bgp_config_dsl, - lldp_service_config::dsl as lldp_config_dsl, + lldp_link_config::dsl as lldp_link_config_dsl, switch_port_settings::dsl as port_settings_dsl, switch_port_settings_address_config::dsl as address_config_dsl, switch_port_settings_bgp_peer_config::dsl as bgp_peer_dsl, @@ -1047,17 +1046,21 @@ async fn do_switch_port_settings_create( let mut link_config = Vec::with_capacity(params.links.len()); for (link_name, c) in ¶ms.links { - let lldp_config_id = match c.lldp.lldp_config { - Some(_) => todo!(), // TODO actual lldp support - None => None, - }; - let lldp_svc_config = - LldpServiceConfig::new(c.lldp.enabled, lldp_config_id); + let 
lldp_link_config = LldpLinkConfig::new( + c.lldp.enabled, + c.lldp.link_name.clone(), + c.lldp.link_description.clone(), + c.lldp.chassis_id.clone(), + c.lldp.system_name.clone(), + c.lldp.system_description.clone(), + c.lldp.management_ip.map(|a| a.into()), + ); + let lldp_config_id = lldp_link_config.id; + lldp_config.push(lldp_link_config); - lldp_config.push(lldp_svc_config.clone()); link_config.push(SwitchPortLinkConfig::new( psid, - lldp_svc_config.id, + lldp_config_id, link_name.clone(), c.mtu, c.fec.into(), @@ -1066,9 +1069,9 @@ async fn do_switch_port_settings_create( )); } result.link_lldp = - diesel::insert_into(lldp_config_dsl::lldp_service_config) + diesel::insert_into(lldp_link_config_dsl::lldp_link_config) .values(lldp_config.clone()) - .returning(LldpServiceConfig::as_returning()) + .returning(LldpLinkConfig::as_returning()) .get_results_async(conn) .await?; @@ -1390,13 +1393,12 @@ async fn do_switch_port_settings_delete( .returning(SwitchPortLinkConfig::as_returning()) .get_results_async(conn) .await?; - // delete lldp configs - use db::schema::lldp_service_config::{self, dsl as lldp_config_dsl}; - let lldp_svc_ids: Vec = - links.iter().map(|link| link.lldp_service_config_id).collect(); - diesel::delete(lldp_config_dsl::lldp_service_config) - .filter(lldp_service_config::id.eq_any(lldp_svc_ids)) + use db::schema::lldp_link_config; + let lldp_link_ids: Vec = + links.iter().map(|link| link.lldp_link_config_id).collect(); + diesel::delete(lldp_link_config::dsl::lldp_link_config) + .filter(lldp_link_config::id.eq_any(lldp_link_ids)) .execute_async(conn) .await?; diff --git a/nexus/src/app/background/tasks/sync_switch_configuration.rs b/nexus/src/app/background/tasks/sync_switch_configuration.rs index 6ecdaa2e55..154384dd5e 100644 --- a/nexus/src/app/background/tasks/sync_switch_configuration.rs +++ b/nexus/src/app/background/tasks/sync_switch_configuration.rs @@ -51,8 +51,9 @@ use omicron_common::{ use serde_json::json; use sled_agent_client::types::{ BgpConfig as SledBgpConfig, BgpPeerConfig as SledBgpPeerConfig, - EarlyNetworkConfig, EarlyNetworkConfigBody, HostPortConfig, PortConfigV2, - RackNetworkConfigV2, RouteConfig as SledRouteConfig, UplinkAddressConfig, + EarlyNetworkConfig, EarlyNetworkConfigBody, HostPortConfig, + LldpAdminStatus, LldpPortConfig, PortConfigV2, RackNetworkConfigV2, + RouteConfig as SledRouteConfig, UplinkAddressConfig, }; use std::{ collections::{hash_map::Entry, HashMap, HashSet}, @@ -993,7 +994,23 @@ impl BackgroundTask for SwitchPortSettingsManager { .map(|l| l.speed) .unwrap_or(SwitchLinkSpeed::Speed100G) .into(), - }; + lldp: info + .link_lldp + .get(0) //TODO https://github.com/oxidecomputer/omicron/issues/3062 + .map(|c| LldpPortConfig { + status: match c.enabled { + true => LldpAdminStatus::Enabled, + false=> LldpAdminStatus::Disabled, + }, + port_id: c.link_name.clone(), + port_description: c.link_description.clone(), + chassis_id: c.chassis_id.clone(), + system_name: c.system_name.clone(), + system_description: c.system_description.clone(), + management_addrs:c.management_ip.map(|a| vec![a.ip()]), + }) + } + ; for peer in port_config.bgp_peers.iter_mut() { peer.communities = match self @@ -1412,6 +1429,29 @@ fn uplinks( let PortSettingsChange::Apply(config) = change else { continue; }; + + let lldp = if config.link_lldp.is_empty() { + None + } else { + let x = &config.link_lldp[0]; + Some(LldpPortConfig { + status: if x.enabled { + LldpAdminStatus::Enabled + } else { + LldpAdminStatus::Disabled + }, + port_id: x.link_name.clone(), + 
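// if link_name is None, the advertised port ID later falls back to the physical port name, per the LldpPortConfig docs +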
port_description: x.link_description.clone(), + chassis_id: x.chassis_id.clone(), + system_name: x.system_name.clone(), + system_description: x.system_description.clone(), + management_addrs: x.management_ip.map(|a| { + let ip: oxnet::IpNet = a.into(); + vec![ip.addr()] + }), + }) + }; + let config = HostPortConfig { port: port.port_name.clone(), addrs: config @@ -1422,6 +1462,7 @@ fn uplinks( vlan_id: a.vlan_id.map(|v| v.into()), }) .collect(), + lldp, }; match uplinks.entry(*location) { diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 4eb9883bcc..09785b4cec 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -33,7 +33,7 @@ use nexus_types::external_api::params::BgpAnnounceSetCreate; use nexus_types::external_api::params::BgpAnnouncementCreate; use nexus_types::external_api::params::BgpConfigCreate; use nexus_types::external_api::params::LinkConfigCreate; -use nexus_types::external_api::params::LldpServiceConfigCreate; +use nexus_types::external_api::params::LldpLinkConfigCreate; use nexus_types::external_api::params::RouteConfig; use nexus_types::external_api::params::SwitchPortConfigCreate; use nexus_types::external_api::params::UninitializedSledId; @@ -61,6 +61,7 @@ use omicron_common::api::external::Name; use omicron_common::api::external::NameOrId; use omicron_common::api::external::ResourceType; use omicron_common::api::internal::shared::ExternalPortDiscovery; +use omicron_common::api::internal::shared::LldpAdminStatus; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::SledUuid; use oxnet::IpNet; @@ -609,15 +610,30 @@ impl super::Nexus { .bgp_peers .insert("phy0".to_string(), BgpPeerConfig { peers }); - let link = LinkConfigCreate { - mtu: 1500, //TODO https://github.com/oxidecomputer/omicron/issues/2274 - lldp: LldpServiceConfigCreate { + let lldp = match &uplink_config.lldp { + None => LldpLinkConfigCreate { enabled: false, - lldp_config: None, + ..Default::default() + }, + Some(l) => LldpLinkConfigCreate { + enabled: l.status == LldpAdminStatus::Enabled, + link_name: l.port_id.clone(), + link_description: l.port_description.clone(), + chassis_id: l.chassis_id.clone(), + system_name: l.system_name.clone(), + system_description: l.system_description.clone(), + management_ip: match &l.management_addrs { + Some(a) if !a.is_empty() => Some(a[0]), + _ => None, + }, }, + }; + let link = LinkConfigCreate { + mtu: 1500, //TODO https://github.com/oxidecomputer/omicron/issues/2274 fec: uplink_config.uplink_port_fec.into(), speed: uplink_config.uplink_port_speed.into(), autoneg: uplink_config.autoneg, + lldp, }; port_settings_params.links.insert("phy".to_string(), link); diff --git a/nexus/tests/integration_tests/switch_port.rs b/nexus/tests/integration_tests/switch_port.rs index 2485d82c45..e1d52e464a 100644 --- a/nexus/tests/integration_tests/switch_port.rs +++ b/nexus/tests/integration_tests/switch_port.rs @@ -11,9 +11,9 @@ use nexus_test_utils_macros::nexus_test; use nexus_types::external_api::params::{ Address, AddressConfig, AddressLotBlockCreate, AddressLotCreate, BgpAnnounceSetCreate, BgpAnnouncementCreate, BgpConfigCreate, - BgpPeerConfig, LinkConfigCreate, LldpServiceConfigCreate, Route, - RouteConfig, SwitchInterfaceConfigCreate, SwitchInterfaceKind, - SwitchPortApplySettings, SwitchPortSettingsCreate, + BgpPeerConfig, LinkConfigCreate, LldpLinkConfigCreate, Route, RouteConfig, + SwitchInterfaceConfigCreate, SwitchInterfaceKind, SwitchPortApplySettings, + SwitchPortSettingsCreate, }; use nexus_types::external_api::views::Rack; use 
omicron_common::api::external::ImportExportPolicy;
@@ -118,7 +118,15 @@ async fn test_port_settings_basic_crud(ctx: &ControlPlaneTestContext) {
 "phy0".into(),
 LinkConfigCreate {
 mtu: 4700,
- lldp: LldpServiceConfigCreate { enabled: false, lldp_config: None },
+ lldp: LldpLinkConfigCreate {
+ enabled: true,
+ link_name: Some("Link Name".into()),
+ link_description: Some("Link Description".into()),
+ chassis_id: Some("Chassis ID".into()),
+ system_name: Some("System Name".into()),
+ system_description: Some("System Description".into()),
+ management_ip: None,
+ },
 fec: LinkFec::None,
 speed: LinkSpeed::Speed100G,
 autoneg: false,
@@ -177,8 +185,16 @@ async fn test_port_settings_basic_crud(ctx: &ControlPlaneTestContext) {
 assert_eq!(link0.mtu, 4700);

 let lldp0 = &created.link_lldp[0];
- assert_eq!(lldp0.enabled, false);
- assert_eq!(lldp0.lldp_config_id, None);
+ assert_eq!(lldp0.enabled, true);
+ assert_eq!(lldp0.link_name, Some("Link Name".to_string()));
+ assert_eq!(lldp0.link_description, Some("Link Description".to_string()));
+ assert_eq!(lldp0.chassis_id, Some("Chassis ID".to_string()));
+ assert_eq!(lldp0.system_name, Some("System Name".to_string()));
+ assert_eq!(
+ lldp0.system_description,
+ Some("System Description".to_string())
+ );
+ assert_eq!(lldp0.management_ip, None);

 let ifx0 = &created.interfaces[0];
 assert_eq!(&ifx0.interface_name, "phy0");
@@ -213,8 +229,16 @@ async fn test_port_settings_basic_crud(ctx: &ControlPlaneTestContext) {
 assert_eq!(link0.mtu, 4700);

 let lldp0 = &roundtrip.link_lldp[0];
- assert_eq!(lldp0.enabled, false);
- assert_eq!(lldp0.lldp_config_id, None);
+ assert_eq!(lldp0.enabled, true);
+ assert_eq!(lldp0.link_name, Some("Link Name".to_string()));
+ assert_eq!(lldp0.link_description, Some("Link Description".to_string()));
+ assert_eq!(lldp0.chassis_id, Some("Chassis ID".to_string()));
+ assert_eq!(lldp0.system_name, Some("System Name".to_string()));
+ assert_eq!(
+ lldp0.system_description,
+ Some("System Description".to_string())
+ );
+ assert_eq!(lldp0.management_ip, None);

 let ifx0 = &roundtrip.interfaces[0];
 assert_eq!(&ifx0.interface_name, "phy0");
diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs
index effd067ec8..82fa01121e 100644
--- a/nexus/types/src/external_api/params.rs
+++ b/nexus/types/src/external_api/params.rs
@@ -1500,7 +1500,7 @@ pub struct LinkConfigCreate {
 pub mtu: u16,

 /// The link-layer discovery protocol (LLDP) configuration for the link.
- pub lldp: LldpServiceConfigCreate,
+ pub lldp: LldpLinkConfigCreate,

 /// The forward error correction mode of the link.
 pub fec: LinkFec,
@@ -1512,16 +1512,29 @@ pub struct LinkConfigCreate {
 pub autoneg: bool,
 }

-/// The LLDP configuration associated with a port. LLDP may be either enabled or
-/// disabled; if enabled, an LLDP configuration must be provided by name or id.
-#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
-pub struct LldpServiceConfigCreate {
+/// The LLDP configuration associated with a port.
+#[derive(Clone, Debug, Default, Deserialize, Serialize, JsonSchema)]
+pub struct LldpLinkConfigCreate {
 /// Whether or not LLDP is enabled.
 pub enabled: bool,

- /// A reference to the LLDP configuration used. Must not be `None` when
- /// `enabled` is `true`.
- pub lldp_config: Option<NameOrId>,
+ /// The LLDP link name TLV.
+ pub link_name: Option<String>,

+ /// The LLDP link description TLV.
+ pub link_description: Option<String>,

+ /// The LLDP chassis identifier TLV.
+ pub chassis_id: Option<String>,

+ /// The LLDP system name TLV.
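+ // As an illustrative JSON request body for this struct (all values
+ // assumed, and every field other than `enabled` optional):
+ //
+ //   { "enabled": true, "system_name": "oxide-switch0",
+ //     "management_ip": "192.0.2.1" }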
+ pub system_name: Option<String>,

+ /// The LLDP system description TLV.
+ pub system_description: Option<String>,

+ /// The LLDP management IP TLV.
+ pub management_ip: Option<IpAddr>,
 }

 /// A layer-3 switch interface configuration. When IPv6 is enabled, a link local
diff --git a/openapi/bootstrap-agent.json b/openapi/bootstrap-agent.json
index b109eaf43e..bd928001bb 100644
--- a/openapi/bootstrap-agent.json
+++ b/openapi/bootstrap-agent.json
@@ -732,6 +732,67 @@
 "last"
 ]
 },
+ "LldpAdminStatus": {
+ "description": "To what extent should this port participate in LLDP",
+ "type": "string",
+ "enum": [
+ "enabled",
+ "disabled",
+ "rx_only",
+ "tx_only"
+ ]
+ },
+ "LldpPortConfig": {
+ "description": "Per-port LLDP configuration settings. Only the \"status\" setting is mandatory. All other fields have natural defaults or may be inherited from the switch.",
+ "type": "object",
+ "properties": {
+ "chassis_id": {
+ "nullable": true,
+ "description": "Chassis ID to advertise. If this is set, it will be advertised as a LocallyAssigned ID type. If this is not set, it will be inherited from the switch-level settings.",
+ "type": "string"
+ },
+ "management_addrs": {
+ "nullable": true,
+ "description": "Management IP addresses to advertise. If this is not set, it will be inherited from the switch-level settings.",
+ "type": "array",
+ "items": {
+ "type": "string",
+ "format": "ip"
+ }
+ },
+ "port_description": {
+ "nullable": true,
+ "description": "Port description to advertise. If this is not set, no description will be advertised.",
+ "type": "string"
+ },
+ "port_id": {
+ "nullable": true,
+ "description": "Port ID to advertise. If this is set, it will be advertised as a LocallyAssigned ID type. If this is not set, it will be set to the port name. e.g., qsfp0/0.",
+ "type": "string"
+ },
+ "status": {
+ "description": "To what extent should this port participate in LLDP",
+ "allOf": [
+ {
+ "$ref": "#/components/schemas/LldpAdminStatus"
+ }
+ ]
+ },
+ "system_description": {
+ "nullable": true,
+ "description": "System description to advertise. If this is not set, it will be inherited from the switch-level settings.",
+ "type": "string"
+ },
+ "system_name": {
+ "nullable": true,
+ "description": "System name to advertise. If this is not set, it will be inherited from the switch-level settings.",
+ "type": "string"
+ }
+ },
+ "required": [
+ "status"
+ ]
+ },
 "Name": {
 "title": "A name unique within the parent collection",
 "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. Names cannot be a UUID, but they may contain a UUID. They can be at most 63 characters long.",
@@ -767,6 +828,15 @@
 "$ref": "#/components/schemas/BgpPeerConfig"
 }
 },
+ "lldp": {
+ "nullable": true,
+ "description": "LLDP configuration for this port",
+ "allOf": [
+ {
+ "$ref": "#/components/schemas/LldpPortConfig"
+ }
+ ]
+ },
 "port": {
 "description": "Nmae of the port this config applies to.",
 "type": "string"
diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json
index 6b9a63d7f2..54b4822e51 100644
--- a/openapi/nexus-internal.json
+++ b/openapi/nexus-internal.json
@@ -3749,6 +3749,67 @@
 "start_time"
 ]
 },
+ "LldpAdminStatus": {
+ "description": "To what extent should this port participate in LLDP",
+ "type": "string",
+ "enum": [
+ "enabled",
+ "disabled",
+ "rx_only",
+ "tx_only"
+ ]
+ },
+ "LldpPortConfig": {
+ "description": "Per-port LLDP configuration settings.
Only the \"status\" setting is mandatory. All other fields have natural defaults or may be inherited from the switch.", + "type": "object", + "properties": { + "chassis_id": { + "nullable": true, + "description": "Chassis ID to advertise. If this is set, it will be advertised as a LocallyAssigned ID type. If this is not set, it will be inherited from the switch-level settings.", + "type": "string" + }, + "management_addrs": { + "nullable": true, + "description": "Management IP addresses to advertise. If this is not set, it will be inherited from the switch-level settings.", + "type": "array", + "items": { + "type": "string", + "format": "ip" + } + }, + "port_description": { + "nullable": true, + "description": "Port description to advertise. If this is not set, no description will be advertised.", + "type": "string" + }, + "port_id": { + "nullable": true, + "description": "Port ID to advertise. If this is set, it will be advertised as a LocallyAssigned ID type. If this is not set, it will be set to the port name. e.g., qsfp0/0.", + "type": "string" + }, + "status": { + "description": "To what extent should this port participate in LLDP", + "allOf": [ + { + "$ref": "#/components/schemas/LldpAdminStatus" + } + ] + }, + "system_description": { + "nullable": true, + "description": "System description to advertise. If this is not set, it will be inherited from the switch-level settings.", + "type": "string" + }, + "system_name": { + "nullable": true, + "description": "System name to advertise. If this is not set, it will be inherited from the switch-level settings.", + "type": "string" + } + }, + "required": [ + "status" + ] + }, "MacAddr": { "example": "ff:ff:ff:ff:ff:ff", "title": "A MAC address", @@ -4154,6 +4215,15 @@ "$ref": "#/components/schemas/BgpPeerConfig" } }, + "lldp": { + "nullable": true, + "description": "LLDP configuration for this port", + "allOf": [ + { + "$ref": "#/components/schemas/LldpPortConfig" + } + ] + }, "port": { "description": "Nmae of the port this config applies to.", "type": "string" diff --git a/openapi/nexus.json b/openapi/nexus.json index f6ba231c02..1e4113face 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -16026,7 +16026,7 @@ "description": "The link-layer discovery protocol (LLDP) configuration for the link.", "allOf": [ { - "$ref": "#/components/schemas/LldpServiceConfigCreate" + "$ref": "#/components/schemas/LldpLinkConfigCreate" } ] }, @@ -16147,10 +16147,15 @@ } ] }, - "LldpServiceConfig": { + "LldpLinkConfig": { "description": "A link layer discovery protocol (LLDP) service configuration.", "type": "object", "properties": { + "chassis_id": { + "nullable": true, + "description": "The LLDP chassis identifier TLV.", + "type": "string" + }, "enabled": { "description": "Whether or not the LLDP service is enabled.", "type": "boolean" @@ -16160,11 +16165,34 @@ "type": "string", "format": "uuid" }, - "lldp_config_id": { + "link_description": { "nullable": true, - "description": "The link-layer discovery protocol configuration for this service.", - "type": "string", - "format": "uuid" + "description": "The LLDP link description TLV.", + "type": "string" + }, + "link_name": { + "nullable": true, + "description": "The LLDP link name TLV.", + "type": "string" + }, + "management_ip": { + "nullable": true, + "description": "The LLDP management IP TLV.", + "allOf": [ + { + "$ref": "#/components/schemas/IpNet" + } + ] + }, + "system_description": { + "nullable": true, + "description": "The LLDP system description TLV.", + "type": "string" + }, + 
"system_name": { + "nullable": true, + "description": "The LLDP system name TLV.", + "type": "string" } }, "required": [ @@ -16172,22 +16200,44 @@ "id" ] }, - "LldpServiceConfigCreate": { - "description": "The LLDP configuration associated with a port. LLDP may be either enabled or disabled, if enabled, an LLDP configuration must be provided by name or id.", + "LldpLinkConfigCreate": { + "description": "The LLDP configuration associated with a port.", "type": "object", "properties": { + "chassis_id": { + "nullable": true, + "description": "The LLDP chassis identifier TLV.", + "type": "string" + }, "enabled": { "description": "Whether or not LLDP is enabled.", "type": "boolean" }, - "lldp_config": { + "link_description": { "nullable": true, - "description": "A reference to the LLDP configuration used. Must not be `None` when `enabled` is `true`.", - "allOf": [ - { - "$ref": "#/components/schemas/NameOrId" - } - ] + "description": "The LLDP link description TLV.", + "type": "string" + }, + "link_name": { + "nullable": true, + "description": "The LLDP link name TLV.", + "type": "string" + }, + "management_ip": { + "nullable": true, + "description": "The LLDP management IP TLV.", + "type": "string", + "format": "ip" + }, + "system_description": { + "nullable": true, + "description": "The LLDP system description TLV.", + "type": "string" + }, + "system_name": { + "nullable": true, + "description": "The LLDP system name TLV.", + "type": "string" } }, "required": [ @@ -19211,7 +19261,7 @@ "description": "The name of this link.", "type": "string" }, - "lldp_service_config_id": { + "lldp_link_config_id": { "description": "The link-layer discovery protocol service configuration id for this link.", "type": "string", "format": "uuid" @@ -19240,7 +19290,7 @@ "autoneg", "fec", "link_name", - "lldp_service_config_id", + "lldp_link_config_id", "mtu", "port_settings_id", "speed" @@ -19502,7 +19552,7 @@ "description": "Link-layer discovery protocol (LLDP) settings.", "type": "array", "items": { - "$ref": "#/components/schemas/LldpServiceConfig" + "$ref": "#/components/schemas/LldpLinkConfig" } }, "links": { diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 6459595b65..4c40fb5da0 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -2752,6 +2752,14 @@ "$ref": "#/components/schemas/UplinkAddressConfig" } }, + "lldp": { + "nullable": true, + "allOf": [ + { + "$ref": "#/components/schemas/LldpPortConfig" + } + ] + }, "port": { "description": "Switchport to use for external connectivity", "type": "string" @@ -3404,6 +3412,67 @@ "minLength": 1, "maxLength": 11 }, + "LldpAdminStatus": { + "description": "To what extent should this port participate in LLDP", + "type": "string", + "enum": [ + "enabled", + "disabled", + "rx_only", + "tx_only" + ] + }, + "LldpPortConfig": { + "description": "Per-port LLDP configuration settings. Only the \"status\" setting is mandatory. All other fields have natural defaults or may be inherited from the switch.", + "type": "object", + "properties": { + "chassis_id": { + "nullable": true, + "description": "Chassis ID to advertise. If this is set, it will be advertised as a LocallyAssigned ID type. If this is not set, it will be inherited from the switch-level settings.", + "type": "string" + }, + "management_addrs": { + "nullable": true, + "description": "Management IP addresses to advertise. 
If this is not set, it will be inherited from the switch-level settings.", + "type": "array", + "items": { + "type": "string", + "format": "ip" + } + }, + "port_description": { + "nullable": true, + "description": "Port description to advertise. If this is not set, no description will be advertised.", + "type": "string" + }, + "port_id": { + "nullable": true, + "description": "Port ID to advertise. If this is set, it will be advertised as a LocallyAssigned ID type. If this is not set, it will be set to the port name. e.g., qsfp0/0.", + "type": "string" + }, + "status": { + "description": "To what extent should this port participate in LLDP", + "allOf": [ + { + "$ref": "#/components/schemas/LldpAdminStatus" + } + ] + }, + "system_description": { + "nullable": true, + "description": "System description to advertise. If this is not set, it will be inherited from the switch-level settings.", + "type": "string" + }, + "system_name": { + "nullable": true, + "description": "System name to advertise. If this is not set, it will be inherited from the switch-level settings.", + "type": "string" + } + }, + "required": [ + "status" + ] + }, "MacAddr": { "example": "ff:ff:ff:ff:ff:ff", "title": "A MAC address", @@ -4104,6 +4173,15 @@ "$ref": "#/components/schemas/BgpPeerConfig" } }, + "lldp": { + "nullable": true, + "description": "LLDP configuration for this port", + "allOf": [ + { + "$ref": "#/components/schemas/LldpPortConfig" + } + ] + }, "port": { "description": "Nmae of the port this config applies to.", "type": "string" diff --git a/openapi/wicketd.json b/openapi/wicketd.json index 5041fb5e56..87cfe045d3 100644 --- a/openapi/wicketd.json +++ b/openapi/wicketd.json @@ -1773,6 +1773,67 @@ "last" ] }, + "LldpAdminStatus": { + "description": "To what extent should this port participate in LLDP", + "type": "string", + "enum": [ + "enabled", + "disabled", + "rx_only", + "tx_only" + ] + }, + "LldpPortConfig": { + "description": "Per-port LLDP configuration settings. Only the \"status\" setting is mandatory. All other fields have natural defaults or may be inherited from the switch.", + "type": "object", + "properties": { + "chassis_id": { + "nullable": true, + "description": "Chassis ID to advertise. If this is set, it will be advertised as a LocallyAssigned ID type. If this is not set, it will be inherited from the switch-level settings.", + "type": "string" + }, + "management_addrs": { + "nullable": true, + "description": "Management IP addresses to advertise. If this is not set, it will be inherited from the switch-level settings.", + "type": "array", + "items": { + "type": "string", + "format": "ip" + } + }, + "port_description": { + "nullable": true, + "description": "Port description to advertise. If this is not set, no description will be advertised.", + "type": "string" + }, + "port_id": { + "nullable": true, + "description": "Port ID to advertise. If this is set, it will be advertised as a LocallyAssigned ID type. If this is not set, it will be set to the port name. e.g., qsfp0/0.", + "type": "string" + }, + "status": { + "description": "To what extent should this port participate in LLDP", + "allOf": [ + { + "$ref": "#/components/schemas/LldpAdminStatus" + } + ] + }, + "system_description": { + "nullable": true, + "description": "System description to advertise. If this is not set, it will be inherited from the switch-level settings.", + "type": "string" + }, + "system_name": { + "nullable": true, + "description": "System name to advertise. 
If this is not set, it will be inherited from the switch-level settings.", + "type": "string" + } + }, + "required": [ + "status" + ] + }, "Name": { "title": "A name unique within the parent collection", "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. Names cannot be a UUID, but they may contain a UUID. They can be at most 63 characters long.", @@ -6304,6 +6365,15 @@ "$ref": "#/components/schemas/UserSpecifiedBgpPeerConfig" } }, + "lldp": { + "nullable": true, + "default": null, + "allOf": [ + { + "$ref": "#/components/schemas/LldpPortConfig" + } + ] + }, "routes": { "type": "array", "items": { diff --git a/oximeter/collector/tests/output/self-stat-schema.json b/oximeter/collector/tests/output/self-stat-schema.json new file mode 100644 index 0000000000..5d325281ab --- /dev/null +++ b/oximeter/collector/tests/output/self-stat-schema.json @@ -0,0 +1,91 @@ +{ + "oximeter_collector:collections": { + "timeseries_name": "oximeter_collector:collections", + "field_schema": [ + { + "name": "base_route", + "field_type": "string", + "source": "metric" + }, + { + "name": "collector_id", + "field_type": "uuid", + "source": "target" + }, + { + "name": "collector_ip", + "field_type": "ip_addr", + "source": "target" + }, + { + "name": "collector_port", + "field_type": "u16", + "source": "target" + }, + { + "name": "producer_id", + "field_type": "uuid", + "source": "metric" + }, + { + "name": "producer_ip", + "field_type": "ip_addr", + "source": "metric" + }, + { + "name": "producer_port", + "field_type": "u16", + "source": "metric" + } + ], + "datum_type": "cumulative_u64", + "created": "2024-06-24T17:15:06.069658599Z" + }, + "oximeter_collector:failed_collections": { + "timeseries_name": "oximeter_collector:failed_collections", + "field_schema": [ + { + "name": "base_route", + "field_type": "string", + "source": "metric" + }, + { + "name": "collector_id", + "field_type": "uuid", + "source": "target" + }, + { + "name": "collector_ip", + "field_type": "ip_addr", + "source": "target" + }, + { + "name": "collector_port", + "field_type": "u16", + "source": "target" + }, + { + "name": "producer_id", + "field_type": "uuid", + "source": "metric" + }, + { + "name": "producer_ip", + "field_type": "ip_addr", + "source": "metric" + }, + { + "name": "producer_port", + "field_type": "u16", + "source": "metric" + }, + { + "name": "reason", + "field_type": "string", + "source": "metric" + } + ], + "datum_type": "cumulative_u64", + "created": "2024-06-24T17:15:06.070765692Z" + } +} \ No newline at end of file diff --git a/package-manifest.toml b/package-manifest.toml index 95017ca653..125861f610 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -670,8 +670,8 @@ output.intermediate_only = true service_name = "lldp" source.type = "prebuilt" source.repo = "lldp" -source.commit = "30e5d89fae9190c69258ca77d5d5a1acec064742" -source.sha256 = "f58bfd1b77748544b5b1a99a07e52bab8dc5673b9bd3a745ebbfdd614d492328" +source.commit = "188f0f6d4c066f1515bd707050407cedd790fcf1" +source.sha256 = "132d0760be5208f60b58bcaed98fa6384b09f41dd5febf51970f5cbf46138ecf" output.type = "zone" output.intermediate_only = true diff --git a/schema/crdb/collapse_lldp_settings/up1.sql b/schema/crdb/collapse_lldp_settings/up1.sql new file mode 100644 index 0000000000..f7fb05d726 --- /dev/null +++ b/schema/crdb/collapse_lldp_settings/up1.sql @@ -0,0 +1,4 @@ +/* + * The old lldp_service_config_id is being replaced with 
lldp_link_config_id.
+ */
+ALTER TABLE omicron.public.switch_port_settings_link_config DROP COLUMN IF EXISTS lldp_service_config_id;
diff --git a/schema/crdb/collapse_lldp_settings/up2.sql b/schema/crdb/collapse_lldp_settings/up2.sql
new file mode 100644
index 0000000000..b2d884d068
--- /dev/null
+++ b/schema/crdb/collapse_lldp_settings/up2.sql
@@ -0,0 +1,4 @@
+/*
+ * Add a pointer to this link's LLDP config settings.
+ */
+ALTER TABLE omicron.public.switch_port_settings_link_config ADD COLUMN IF NOT EXISTS lldp_link_config_id UUID NOT NULL;
diff --git a/schema/crdb/collapse_lldp_settings/up3.sql b/schema/crdb/collapse_lldp_settings/up3.sql
new file mode 100644
index 0000000000..9c4ef8549b
--- /dev/null
+++ b/schema/crdb/collapse_lldp_settings/up3.sql
@@ -0,0 +1,5 @@
+/*
+ * Drop the old lldp_service_config table, which has been incorporated into the
+ * new lldp_link_config.
+ */
+DROP TABLE IF EXISTS omicron.public.lldp_service_config;
diff --git a/schema/crdb/collapse_lldp_settings/up4.sql b/schema/crdb/collapse_lldp_settings/up4.sql
new file mode 100644
index 0000000000..3c8d4e86cf
--- /dev/null
+++ b/schema/crdb/collapse_lldp_settings/up4.sql
@@ -0,0 +1,4 @@
+/*
+ * Drop the old lldp_config table, which has been replaced by lldp_link_config.
+ */
+DROP TABLE IF EXISTS omicron.public.lldp_config;
diff --git a/schema/crdb/collapse_lldp_settings/up5.sql b/schema/crdb/collapse_lldp_settings/up5.sql
new file mode 100644
index 0000000000..50dcd618d8
--- /dev/null
+++ b/schema/crdb/collapse_lldp_settings/up5.sql
@@ -0,0 +1,13 @@
+CREATE TABLE IF NOT EXISTS omicron.public.lldp_link_config (
+ id UUID PRIMARY KEY,
+ enabled BOOL NOT NULL,
+ link_name STRING(63),
+ link_description STRING(512),
+ chassis_id STRING(63),
+ system_name STRING(63),
+ system_description STRING(512),
+ management_ip TEXT,
+ time_created TIMESTAMPTZ NOT NULL,
+ time_modified TIMESTAMPTZ NOT NULL,
+ time_deleted TIMESTAMPTZ
+);
diff --git a/schema/crdb/collapse_lldp_settings/up6.sql b/schema/crdb/collapse_lldp_settings/up6.sql
new file mode 100644
index 0000000000..3b16af6f4b
--- /dev/null
+++ b/schema/crdb/collapse_lldp_settings/up6.sql
@@ -0,0 +1 @@
+DROP INDEX IF EXISTS lldp_config_by_name;
diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql
index d0eba7847e..5e1ba048b6 100644
--- a/schema/crdb/dbinit.sql
+++ b/schema/crdb/dbinit.sql
@@ -2650,40 +2650,30 @@ CREATE TYPE IF NOT EXISTS omicron.public.switch_link_speed AS ENUM (
 CREATE TABLE IF NOT EXISTS omicron.public.switch_port_settings_link_config (
 port_settings_id UUID,
- lldp_service_config_id UUID NOT NULL,
 link_name TEXT,
 mtu INT4,
 fec omicron.public.switch_link_fec,
 speed omicron.public.switch_link_speed,
 autoneg BOOL NOT NULL DEFAULT false,
+ lldp_link_config_id UUID NOT NULL,

 PRIMARY KEY (port_settings_id, link_name)
 );

-CREATE TABLE IF NOT EXISTS omicron.public.lldp_service_config (
+CREATE TABLE IF NOT EXISTS omicron.public.lldp_link_config (
 id UUID PRIMARY KEY,
- lldp_config_id UUID,
- enabled BOOL NOT NULL
-);
-
-CREATE TABLE IF NOT EXISTS omicron.public.lldp_config (
- id UUID PRIMARY KEY,
- name STRING(63) NOT NULL,
- description STRING(512) NOT NULL,
+ enabled BOOL NOT NULL,
+ link_name STRING(63),
+ link_description STRING(512),
+ chassis_id STRING(63),
+ system_name STRING(63),
+ system_description STRING(512),
+ management_ip TEXT,
 time_created TIMESTAMPTZ NOT NULL,
 time_modified TIMESTAMPTZ NOT NULL,
- time_deleted TIMESTAMPTZ,
- chassis_id TEXT,
- system_name TEXT,
- system_description TEXT,
- management_ip TEXT
+ time_deleted
TIMESTAMPTZ ); -CREATE UNIQUE INDEX IF NOT EXISTS lldp_config_by_name ON omicron.public.lldp_config ( - name -) WHERE - time_deleted IS NULL; - CREATE TYPE IF NOT EXISTS omicron.public.switch_interface_kind AS ENUM ( 'primary', 'vlan', @@ -4218,7 +4208,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '88.0.0', NULL) + (TRUE, NOW(), NOW(), '89.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/rss-sled-plan.json b/schema/rss-sled-plan.json index f8dfb935ce..b0abc8c67e 100644 --- a/schema/rss-sled-plan.json +++ b/schema/rss-sled-plan.json @@ -604,6 +604,79 @@ } } }, + "LldpAdminStatus": { + "description": "To what extent should this port participate in LLDP", + "type": "string", + "enum": [ + "enabled", + "disabled", + "rx_only", + "tx_only" + ] + }, + "LldpPortConfig": { + "description": "Per-port LLDP configuration settings. Only the \"status\" setting is mandatory. All other fields have natural defaults or may be inherited from the switch.", + "type": "object", + "required": [ + "status" + ], + "properties": { + "chassis_id": { + "description": "Chassis ID to advertise. If this is set, it will be advertised as a LocallyAssigned ID type. If this is not set, it will be inherited from the switch-level settings.", + "type": [ + "string", + "null" + ] + }, + "management_addrs": { + "description": "Management IP addresses to advertise. If this is not set, it will be inherited from the switch-level settings.", + "type": [ + "array", + "null" + ], + "items": { + "type": "string", + "format": "ip" + } + }, + "port_description": { + "description": "Port description to advertise. If this is not set, no description will be advertised.", + "type": [ + "string", + "null" + ] + }, + "port_id": { + "description": "Port ID to advertise. If this is set, it will be advertised as a LocallyAssigned ID type. If this is not set, it will be set to the port name. e.g., qsfp0/0.", + "type": [ + "string", + "null" + ] + }, + "status": { + "description": "To what extent should this port participate in LLDP", + "allOf": [ + { + "$ref": "#/definitions/LldpAdminStatus" + } + ] + }, + "system_description": { + "description": "System description to advertise. If this is not set, it will be inherited from the switch-level settings.", + "type": [ + "string", + "null" + ] + }, + "system_name": { + "description": "System name to advertise. If this is not set, it will be inherited from the switch-level settings.", + "type": [ + "string", + "null" + ] + } + } + }, "Name": { "title": "A name unique within the parent collection", "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. Names cannot be a UUID, but they may contain a UUID. 
They can be at most 63 characters long.", @@ -648,6 +721,17 @@ "$ref": "#/definitions/BgpPeerConfig" } }, + "lldp": { + "description": "LLDP configuration for this port", + "anyOf": [ + { + "$ref": "#/definitions/LldpPortConfig" + }, + { + "type": "null" + } + ] + }, "port": { "description": "Nmae of the port this config applies to.", "type": "string" diff --git a/sled-agent/src/rack_setup/plan/sled.rs b/sled-agent/src/rack_setup/plan/sled.rs index c511cf1447..32906d0195 100644 --- a/sled-agent/src/rack_setup/plan/sled.rs +++ b/sled-agent/src/rack_setup/plan/sled.rs @@ -172,7 +172,7 @@ impl Plan { let mut ledger = Ledger::::new_with(log, paths, plan.clone()); ledger.commit().await?; - info!(log, "Sled plan written to storage"); + info!(log, "Sled plan written to storage: {plan:#?}"); Ok(plan) } } diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 2505985101..20cd5646c0 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -99,6 +99,7 @@ use nexus_types::external_api::views::SledState; use omicron_common::address::get_sled_address; use omicron_common::api::external::Generation; use omicron_common::api::internal::shared::ExternalPortDiscovery; +use omicron_common::api::internal::shared::LldpAdminStatus; use omicron_common::backoff::{ retry_notify, retry_policy_internal_service_aggressive, BackoffError, }; @@ -750,7 +751,7 @@ impl ServiceInner { .iter() .map(|config| NexusTypes::PortConfigV2 { port: config.port.clone(), - routes: config + routes: config .routes .iter() .map(|r| NexusTypes::RouteConfig { @@ -760,14 +761,14 @@ impl ServiceInner { local_pref: r.local_pref, }) .collect(), - addresses: config - .addresses - .iter() - .map(|a| NexusTypes::UplinkAddressConfig { - address: a.address, - vlan_id: a.vlan_id - }) - .collect(), + addresses: config + .addresses + .iter() + .map(|a| NexusTypes::UplinkAddressConfig { + address: a.address, + vlan_id: a.vlan_id, + }) + .collect(), switch: config.switch.into(), uplink_port_speed: config.uplink_port_speed.into(), uplink_port_fec: config.uplink_port_fec.into(), @@ -787,7 +788,8 @@ impl ServiceInner { remote_asn: b.remote_asn, min_ttl: b.min_ttl, md5_auth_key: b.md5_auth_key.clone(), - multi_exit_discriminator: b.multi_exit_discriminator, + multi_exit_discriminator: b + .multi_exit_discriminator, local_pref: b.local_pref, enforce_first_as: b.enforce_first_as, communities: b.communities.clone(), @@ -796,6 +798,32 @@ impl ServiceInner { vlan_id: b.vlan_id, }) .collect(), + lldp: config.lldp.as_ref().map(|lp| { + NexusTypes::LldpPortConfig { + status: match lp.status { + LldpAdminStatus::Enabled => { + NexusTypes::LldpAdminStatus::Enabled + } + LldpAdminStatus::Disabled => { + NexusTypes::LldpAdminStatus::Disabled + } + LldpAdminStatus::TxOnly => { + NexusTypes::LldpAdminStatus::TxOnly + } + LldpAdminStatus::RxOnly => { + NexusTypes::LldpAdminStatus::RxOnly + } + }, + chassis_id: lp.chassis_id.clone(), + port_id: lp.port_id.clone(), + system_name: lp.system_name.clone(), + system_description: lp + .system_description + .clone(), + port_description: lp.port_description.clone(), + management_addrs: lp.management_addrs.clone(), + } + }), }) .collect(), bgp: config @@ -803,7 +831,12 @@ impl ServiceInner { .iter() .map(|config| NexusTypes::BgpConfig { asn: config.asn, - originate: config.originate.iter().cloned().map(Into::into).collect(), + originate: config + .originate + .iter() + .cloned() + .map(Into::into) + .collect(), shaper: config.shaper.clone(), checker: 
config.checker.clone(), }) @@ -811,25 +844,26 @@ impl ServiceInner { bfd: config .bfd .iter() - .map(|spec| NexusTypes::BfdPeerConfig { - detection_threshold: spec.detection_threshold, - local: spec.local, - mode: match spec.mode { - omicron_common::api::external::BfdMode::SingleHop => { - nexus_client::types::BfdMode::SingleHop - } - omicron_common::api::external::BfdMode::MultiHop => { - nexus_client::types::BfdMode::MultiHop - } - }, - remote: spec.remote, - required_rx: spec.required_rx, - switch: spec.switch.into(), + .map(|spec| { + NexusTypes::BfdPeerConfig { + detection_threshold: spec.detection_threshold, + local: spec.local, + mode: match spec.mode { + omicron_common::api::external::BfdMode::SingleHop => { + nexus_client::types::BfdMode::SingleHop + } + omicron_common::api::external::BfdMode::MultiHop => { + nexus_client::types::BfdMode::MultiHop + } + }, + remote: spec.remote, + required_rx: spec.required_rx, + switch: spec.switch.into(), + } }) .collect(), } }; - info!(self.log, "rack_network_config: {:#?}", rack_network_config); let physical_disks: Vec<_> = sled_configs_by_id diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index abc50aa06c..22cbb62f70 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -3995,6 +3995,19 @@ impl ServiceManager { &self, our_ports: Vec, ) -> Result<(), Error> { + // Helper function to add a property-value pair + // if the config actually has a value set. + fn apv( + smfh: &SmfHelper, + prop: &str, + val: &Option, + ) -> Result<(), Error> { + if let Some(v) = val { + smfh.addpropvalue_type(prop, v, "astring")? + } + Ok(()) + } + // We expect the switch zone to be running, as we're called immediately // after `ensure_zone()` above and we just successfully configured // uplinks via DPD running in our switch zone. If somehow we're in any @@ -4017,26 +4030,76 @@ impl ServiceManager { } }; - info!(self.inner.log, "Setting up uplinkd service"); - let smfh = SmfHelper::new(&zone, &SwitchService::Uplink); + info!(self.inner.log, "ensuring scrimlet uplinks"); + let usmfh = SmfHelper::new(&zone, &SwitchService::Uplink); + let lsmfh = SmfHelper::new( + &zone, + &SwitchService::Lldpd { baseboard: Baseboard::Unknown }, + ); // We want to delete all the properties in the `uplinks` group, but we // don't know their names, so instead we'll delete and recreate the // group, then add all our properties. 
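// As a sketch of the end state (the group and property names match the
// code below; the port name and values are illustrative, not from this
// change), the uplink service ends up with properties like
//
//   uplinks/qsfp0_0 = "198.51.100.7/24"
//
// and lldpd with a per-port group like
//
//   port_qsfp0/status = "enabled"
//   port_qsfp0/system_name = "switch0"
//   port_qsfp0/management_addrs = "198.51.100.7"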
- smfh.delpropgroup("uplinks")?; - smfh.addpropgroup("uplinks", "application")?; + let _ = usmfh.delpropgroup("uplinks"); + usmfh.addpropgroup("uplinks", "application")?; for port_config in &our_ports { for addr in &port_config.addrs { - info!(self.inner.log, "configuring port: {port_config:?}"); - smfh.addpropvalue_type( + usmfh.addpropvalue_type( &format!("uplinks/{}_0", port_config.port,), &addr.to_string(), "astring", )?; } + + if let Some(lldp_config) = &port_config.lldp { + let group_name = format!("port_{}", port_config.port); + info!(self.inner.log, "setting up {group_name}"); + let _ = lsmfh.delpropgroup(&group_name); + lsmfh.addpropgroup(&group_name, "application")?; + apv( + &lsmfh, + &format!("{group_name}/status"), + &Some(lldp_config.status.to_string()), + )?; + apv( + &lsmfh, + &format!("{group_name}/chassis_id"), + &lldp_config.chassis_id, + )?; + apv( + &lsmfh, + &format!("{group_name}/system_name"), + &lldp_config.system_name, + )?; + apv( + &lsmfh, + &format!("{group_name}/system_description"), + &lldp_config.system_description, + )?; + apv( + &lsmfh, + &format!("{group_name}/port_description"), + &lldp_config.port_description, + )?; + apv( + &lsmfh, + &format!("{group_name}/port_id"), + &lldp_config.port_id, + )?; + if let Some(a) = &lldp_config.management_addrs { + for address in a { + apv( + &lsmfh, + &format!("{group_name}/management_addrs"), + &Some(address.to_string()), + )?; + } + } + } } - smfh.refresh()?; + usmfh.refresh()?; + lsmfh.refresh()?; Ok(()) } diff --git a/sled-agent/tests/integration_tests/early_network.rs b/sled-agent/tests/integration_tests/early_network.rs index 8da67729da..9b69975054 100644 --- a/sled-agent/tests/integration_tests/early_network.rs +++ b/sled-agent/tests/integration_tests/early_network.rs @@ -154,6 +154,7 @@ fn current_config_example() -> (&'static str, EarlyNetworkConfig) { vlan_id: None, }], autoneg: true, + lldp: None, }], bgp: vec![BgpConfig { asn: 20000, diff --git a/sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json b/sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json index 7df143d41d..2da814042d 100644 --- a/sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json +++ b/sled-agent/tests/output/new-rss-sled-plans/madrid-rss-sled-plan.json @@ -143,7 +143,8 @@ "uplink_port_speed": "speed40_g", "uplink_port_fec": "none", "bgp_peers": [], - "autoneg": false + "autoneg": false, + "lldp": null }, { "routes": [ @@ -165,7 +166,8 @@ "uplink_port_speed": "speed40_g", "uplink_port_fec": "none", "bgp_peers": [], - "autoneg": false + "autoneg": false, + "lldp": null } ], "bgp": [], diff --git a/sled-agent/types/src/early_networking.rs b/sled-agent/types/src/early_networking.rs index c4afbd0adb..755033dc23 100644 --- a/sled-agent/types/src/early_networking.rs +++ b/sled-agent/types/src/early_networking.rs @@ -299,6 +299,7 @@ pub mod back_compat { uplink_port_fec: v1.uplink_port_fec, bgp_peers: v1.bgp_peers.clone(), autoneg: v1.autoneg, + lldp: None, } } } @@ -345,6 +346,7 @@ pub mod back_compat { uplink_port_fec: value.uplink_port_fec, bgp_peers: vec![], autoneg: false, + lldp: None, } } } @@ -517,6 +519,7 @@ mod tests { uplink_port_fec: uplink.uplink_port_fec, autoneg: false, bgp_peers: vec![], + lldp: None, }], bgp: vec![], bfd: vec![], @@ -598,6 +601,7 @@ mod tests { uplink_port_fec: port.uplink_port_fec, autoneg: false, bgp_peers: vec![], + lldp: None, }], bgp: vec![], bfd: vec![], diff --git a/smf/sled-agent/non-gimlet/config-rss.toml b/smf/sled-agent/non-gimlet/config-rss.toml index 
90f5339e84..a61ac81d91 100644 --- a/smf/sled-agent/non-gimlet/config-rss.toml +++ b/smf/sled-agent/non-gimlet/config-rss.toml @@ -118,6 +118,22 @@ switch = "switch0" # Neighbors we expect to peer with over BGP on this port. bgp_peers = [] +# LLDP settings for this port +#[rack_network_config.switch0.qsfp0.lldp] +#status = "Enabled" +# Optional Port ID, overriding default of qsfpX/0 +#port_id = "" +## Optional port description +#port_description = "uplink 0" +# Optional chassid ID, overriding the switch-level setting +#chassis_id = "" +# Optional system name, overriding the switch-level setting +#system_name = "" +# Optional system description, overriding the switch-level setting +#system_description = "" +# Optional management addresses to advertise, overriding switch-level setting +#management_addrs = [] + # An allowlist of source IPs that can make requests to user-facing services can # be specified here. It can be written as the string "any" ... [allowed_source_ips] diff --git a/tools/update_lldp.sh b/tools/update_lldp.sh index bf7f19eb02..2a9d1d6bae 100755 --- a/tools/update_lldp.sh +++ b/tools/update_lldp.sh @@ -47,7 +47,9 @@ function main { esac done - TARGET_COMMIT=$(get_latest_commit_from_gh "$REPO" "$TARGET_COMMIT") + if [[ -z "$TARGET_COMMIT" ]]; then + TARGET_COMMIT=$(get_latest_commit_from_gh "$REPO" "$TARGET_BRANCH") + fi install_toml2json do_update_packages "$TARGET_COMMIT" "$DRY_RUN" "$REPO" "${PACKAGES[@]}" do_update_crates "$TARGET_COMMIT" "$DRY_RUN" "$REPO" "${CRATES[@]}" diff --git a/wicket-common/src/example.rs b/wicket-common/src/example.rs index 63d12aea6d..34af11e906 100644 --- a/wicket-common/src/example.rs +++ b/wicket-common/src/example.rs @@ -12,7 +12,8 @@ use omicron_common::{ api::{ external::AllowedSourceIps, internal::shared::{ - BgpConfig, BgpPeerConfig, PortFec, PortSpeed, RouteConfig, + BgpConfig, BgpPeerConfig, LldpAdminStatus, LldpPortConfig, PortFec, + PortSpeed, RouteConfig, }, }, }; @@ -166,13 +167,33 @@ impl ExampleRackSetupData { vlan_id: None, }]; + let switch0_port0_lldp = Some(LldpPortConfig { + status: LldpAdminStatus::Enabled, + chassis_id: Some("chassid id override".to_string()), + port_id: Some("port id override".to_string()), + system_name: Some("system name override".to_string()), + system_description: Some("system description override".to_string()), + port_description: Some("port description override".to_string()), + management_addrs: None, + }); + + let switch1_port0_lldp = Some(LldpPortConfig { + status: LldpAdminStatus::Enabled, + chassis_id: Some("chassid id override".to_string()), + port_id: Some("port id override".to_string()), + system_name: Some("system name override".to_string()), + system_description: Some("system description override".to_string()), + port_description: Some("port description override".to_string()), + management_addrs: Some(vec!["172.32.0.4".parse().unwrap()]), + }); + let rack_network_config = UserSpecifiedRackNetworkConfig { infra_ip_first: "172.30.0.1".parse().unwrap(), infra_ip_last: "172.30.0.10".parse().unwrap(), switch0: btreemap! 
{ "port0".to_owned() => UserSpecifiedPortConfig { - addresses: vec!["172.30.0.1/24".parse().unwrap()], - routes: vec![RouteConfig { + addresses: vec!["172.30.0.1/24".parse().unwrap()], + routes: vec![RouteConfig { destination: "0.0.0.0/0".parse().unwrap(), nexthop: "172.30.0.10".parse().unwrap(), vlan_id: Some(1), @@ -181,9 +202,10 @@ impl ExampleRackSetupData { bgp_peers: switch0_port0_bgp_peers, uplink_port_speed: PortSpeed::Speed400G, uplink_port_fec: PortFec::Firecode, + lldp: switch0_port0_lldp, autoneg: true, }, - }, + }, switch1: btreemap! { // Use the same port name as in switch0 to test that it doesn't // collide. @@ -198,6 +220,7 @@ impl ExampleRackSetupData { bgp_peers: switch1_port0_bgp_peers, uplink_port_speed: PortSpeed::Speed400G, uplink_port_fec: PortFec::Firecode, + lldp: switch1_port0_lldp, autoneg: true, }, }, diff --git a/wicket-common/src/rack_setup.rs b/wicket-common/src/rack_setup.rs index 7fd83e522a..cb6b13422b 100644 --- a/wicket-common/src/rack_setup.rs +++ b/wicket-common/src/rack_setup.rs @@ -11,6 +11,7 @@ use omicron_common::api::external::SwitchLocation; use omicron_common::api::internal::shared::AllowedSourceIps; use omicron_common::api::internal::shared::BgpConfig; use omicron_common::api::internal::shared::BgpPeerConfig; +use omicron_common::api::internal::shared::LldpPortConfig; use omicron_common::api::internal::shared::PortFec; use omicron_common::api::internal::shared::PortSpeed; use omicron_common::api::internal::shared::RouteConfig; @@ -185,6 +186,8 @@ pub struct UserSpecifiedPortConfig { pub autoneg: bool, #[serde(default)] pub bgp_peers: Vec, + #[serde(default)] + pub lldp: Option, } /// User-specified version of [`BgpPeerConfig`]. diff --git a/wicket/src/cli/rack_setup/config_toml.rs b/wicket/src/cli/rack_setup/config_toml.rs index 198c740754..17b31e7730 100644 --- a/wicket/src/cli/rack_setup/config_toml.rs +++ b/wicket/src/cli/rack_setup/config_toml.rs @@ -8,6 +8,7 @@ use omicron_common::address::IpRange; use omicron_common::api::external::AllowedSourceIps; use omicron_common::api::internal::shared::BgpConfig; +use omicron_common::api::internal::shared::LldpPortConfig; use omicron_common::api::internal::shared::RouteConfig; use omicron_common::api::internal::shared::UplinkAddressConfig; use serde::Serialize; @@ -320,6 +321,7 @@ fn populate_uplink_table(cfg: &UserSpecifiedPortConfig) -> Table { uplink_port_fec, autoneg, bgp_peers, + lldp, } = cfg; let mut uplink = Table::new(); @@ -491,6 +493,46 @@ fn populate_uplink_table(cfg: &UserSpecifiedPortConfig) -> Table { uplink.insert("bgp_peers", Item::ArrayOfTables(peers)); + if let Some(l) = lldp { + let LldpPortConfig { + status, + chassis_id, + port_id, + system_name, + system_description, + port_description, + management_addrs, + } = l; + let mut lldp = Table::new(); + lldp.insert("status", string_item(status)); + if let Some(x) = chassis_id { + lldp.insert("chassis_id", string_item(x)); + } + if let Some(x) = port_id { + lldp.insert("port_id", string_item(x)); + } + if let Some(x) = system_name { + lldp.insert("system_name", string_item(x)); + } + if let Some(x) = system_description { + lldp.insert("system_description", string_item(x)); + } + if let Some(x) = port_description { + lldp.insert("port_description", string_item(x)); + } + if let Some(addrs) = management_addrs { + let mut addresses_out = Array::new(); + for a in addrs { + addresses_out.push(string_value(a)); + } + lldp.insert( + "management_addrs", + Item::Value(Value::Array(addresses_out)), + ); + } + uplink.insert("lldp", 
Item::Table(lldp)); + } + uplink } diff --git a/wicket/src/ui/panes/rack_setup.rs b/wicket/src/ui/panes/rack_setup.rs index 76a240e981..f23bc3c816 100644 --- a/wicket/src/ui/panes/rack_setup.rs +++ b/wicket/src/ui/panes/rack_setup.rs @@ -21,6 +21,7 @@ use itertools::Itertools; use omicron_common::address::IpRange; use omicron_common::api::internal::shared::AllowedSourceIps; use omicron_common::api::internal::shared::BgpConfig; +use omicron_common::api::internal::shared::LldpPortConfig; use omicron_common::api::internal::shared::RouteConfig; use ratatui::layout::Constraint; use ratatui::layout::Direction; @@ -740,6 +741,7 @@ fn rss_config_text<'a>( uplink_port_fec, autoneg, bgp_peers, + lldp, } = uplink; let mut items = vec![ @@ -1035,6 +1037,68 @@ fn rss_config_text<'a>( items.extend(addresses); items.extend(peers); + if let Some(lp) = lldp { + let LldpPortConfig { + status, + chassis_id, + port_id, + system_name, + system_description, + port_description, + management_addrs, + } = lp; + + let mut lldp = vec![ + vec![Span::styled(" • LLDP port settings: ", label_style)], + vec![ + Span::styled(" • Admin status : ", label_style), + Span::styled(status.to_string(), ok_style), + ], + ]; + + if let Some(c) = chassis_id { + lldp.push(vec![ + Span::styled(" • Chassis ID : ", label_style), + Span::styled(c.to_string(), ok_style), + ]) + } + if let Some(s) = system_name { + lldp.push(vec![ + Span::styled(" • System name : ", label_style), + Span::styled(s.to_string(), ok_style), + ]) + } + if let Some(s) = system_description { + lldp.push(vec![ + Span::styled(" • System description: ", label_style), + Span::styled(s.to_string(), ok_style), + ]) + } + if let Some(p) = port_id { + lldp.push(vec![ + Span::styled(" • Port ID : ", label_style), + Span::styled(p.to_string(), ok_style), + ]) + } + if let Some(p) = port_description { + lldp.push(vec![ + Span::styled(" • Port description : ", label_style), + Span::styled(p.to_string(), ok_style), + ]) + } + if let Some(addrs) = management_addrs { + let mut label = " • Management addrs : "; + for a in addrs { + lldp.push(vec![ + Span::styled(label, label_style), + Span::styled(a.to_string(), ok_style), + ]); + label = " : "; + } + } + items.extend(lldp); + } + append_list( &mut spans, Cow::from(format!("Uplink {}: ", i + 1)), diff --git a/wicket/tests/output/example_non_empty.toml b/wicket/tests/output/example_non_empty.toml index 717e940ca5..fafb31048d 100644 --- a/wicket/tests/output/example_non_empty.toml +++ b/wicket/tests/output/example_non_empty.toml @@ -111,6 +111,14 @@ allowed_export = [] local_pref = 80 enforce_first_as = true +[rack_network_config.switch0.port0.lldp] +status = "enabled" +chassis_id = "chassid id override" +port_id = "port id override" +system_name = "system name override" +system_description = "system description override" +port_description = "port description override" + [rack_network_config.switch1.port0] routes = [{ nexthop = "172.33.0.10", destination = "0.0.0.0/0", vlan_id = 1 }] addresses = [{ address = "172.32.0.1/24" }] @@ -131,6 +139,15 @@ auth_key_id = "bgp-key-1" allowed_import = ["224.0.0.0/4"] enforce_first_as = false +[rack_network_config.switch1.port0.lldp] +status = "enabled" +chassis_id = "chassid id override" +port_id = "port id override" +system_name = "system name override" +system_description = "system description override" +port_description = "port description override" +management_addrs = ["172.32.0.4"] + [[rack_network_config.bgp]] asn = 47 originate = ["10.0.0.0/16"] diff --git a/wicketd/src/rss_config.rs 
b/wicketd/src/rss_config.rs index cb40d56dd6..56e83fcd41 100644 --- a/wicketd/src/rss_config.rs +++ b/wicketd/src/rss_config.rs @@ -686,11 +686,14 @@ fn build_port_config( bgp_auth_keys: &BTreeMap>, ) -> BaPortConfigV2 { use bootstrap_agent_client::types::BgpPeerConfig as BaBgpPeerConfig; + use bootstrap_agent_client::types::LldpAdminStatus as BaLldpAdminStatus; + use bootstrap_agent_client::types::LldpPortConfig as BaLldpPortConfig; use bootstrap_agent_client::types::PortFec as BaPortFec; use bootstrap_agent_client::types::PortSpeed as BaPortSpeed; use bootstrap_agent_client::types::RouteConfig as BaRouteConfig; use bootstrap_agent_client::types::SwitchLocation as BaSwitchLocation; use bootstrap_agent_client::types::UplinkAddressConfig as BaUplinkAddressConfig; + use omicron_common::api::internal::shared::LldpAdminStatus; use omicron_common::api::internal::shared::PortFec; use omicron_common::api::internal::shared::PortSpeed; @@ -780,6 +783,20 @@ fn build_port_config( PortFec::Rs => BaPortFec::Rs, }, autoneg: config.autoneg, + lldp: config.lldp.as_ref().map(|c| BaLldpPortConfig { + status: match c.status { + LldpAdminStatus::Enabled => BaLldpAdminStatus::Enabled, + LldpAdminStatus::Disabled => BaLldpAdminStatus::Disabled, + LldpAdminStatus::TxOnly => BaLldpAdminStatus::TxOnly, + LldpAdminStatus::RxOnly => BaLldpAdminStatus::RxOnly, + }, + chassis_id: c.chassis_id.clone(), + port_id: c.port_id.clone(), + system_name: c.system_name.clone(), + system_description: c.system_description.clone(), + port_description: c.port_description.clone(), + management_addrs: c.management_addrs.clone(), + }), } } From 1903eee4c8dc060b5b0fb23f51ed2479957fa8b2 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Wed, 21 Aug 2024 19:56:05 -0700 Subject: [PATCH 89/91] cockroachdb cluster version fixups (#6396) This is a followup to #5603, which contains various fixes I needed to write when testing an actual cluster upgrade from v22.1 to v22.2. We're currently deciding to park that plan, but I don't want these fixups to be forgotten. 
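For context, the setting these fixups revolve around is CockroachDB's
`cluster.preserve_downgrade_option`. A rough sketch of the underlying flow,
with illustrative version numbers:

    -- Pin the current major version before rolling new binaries, so the
    -- cluster can still be downgraded if the upgrade goes badly:
    SET CLUSTER SETTING cluster.preserve_downgrade_option = '22.1';

    -- Once every node runs the new binaries and looks healthy, clear the
    -- pin; CockroachDB then finalizes the upgrade in the background:
    RESET CLUSTER SETTING cluster.preserve_downgrade_option;

    -- After finalization, the active cluster version has advanced:
    SHOW CLUSTER SETTING version;

In Omicron, Nexus drives these steps through blueprints rather than an
operator running SQL by hand.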
--- Cargo.lock | 1 + dev-tools/downloader/src/lib.rs | 5 +- dev-tools/omdb/Cargo.toml | 3 +- dev-tools/omdb/tests/successes.out | 12 ++--- dev-tools/omdb/tests/test_all_output.rs | 36 +++++++++++-- docs/crdb-upgrades.adoc | 12 +++-- .../src/db/datastore/cockroachdb_settings.rs | 54 +++++++++++++------ .../execution/src/cockroachdb.rs | 28 +++++----- nexus/src/app/rack.rs | 50 ++++++++++++----- nexus/types/src/deployment/planning_input.rs | 7 +++ 10 files changed, 147 insertions(+), 61 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f8699e62d6..837015f3bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6153,6 +6153,7 @@ dependencies = [ "gateway-client", "gateway-messages", "gateway-test-utils", + "http 0.2.12", "humantime", "indicatif", "internal-dns", diff --git a/dev-tools/downloader/src/lib.rs b/dev-tools/downloader/src/lib.rs index d5b436244c..c3d6e165ff 100644 --- a/dev-tools/downloader/src/lib.rs +++ b/dev-tools/downloader/src/lib.rs @@ -586,7 +586,10 @@ impl<'a> Downloader<'a> { let version = version.trim(); let (url_base, suffix) = match os { - Os::Illumos => ("https://illumos.org/downloads", "tar.gz"), + Os::Illumos => ( + "https://oxide-cockroachdb-build.s3.us-west-2.amazonaws.com", + "tar.gz", + ), Os::Linux | Os::Mac => ("https://binaries.cockroachdb.com", "tgz"), }; let build = match os { diff --git a/dev-tools/omdb/Cargo.toml b/dev-tools/omdb/Cargo.toml index a92de1b6a9..4cc484b9a9 100644 --- a/dev-tools/omdb/Cargo.toml +++ b/dev-tools/omdb/Cargo.toml @@ -62,13 +62,14 @@ multimap.workspace = true indicatif.workspace = true [dev-dependencies] +camino-tempfile.workspace = true expectorate.workspace = true +http.workspace = true nexus-test-utils.workspace = true nexus-test-utils-macros.workspace = true omicron-nexus.workspace = true omicron-test-utils.workspace = true subprocess.workspace = true -camino-tempfile.workspace = true # Disable doc builds by default for our binaries to work around issue # rust-lang/cargo#8373. These docs would not be very useful anyway. diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 9d432525c3..2a9c9c8051 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -1135,8 +1135,8 @@ WARNING: Zones exist without physical disks! COCKROACHDB SETTINGS: - state fingerprint::::::::::::::::: d4d87aa2ad877a4cc2fddd0573952362739110de - cluster.preserve_downgrade_option: "22.1" + state fingerprint::::::::::::::::: + cluster.preserve_downgrade_option: METADATA: created by::::::::::: nexus-test-utils @@ -1173,8 +1173,8 @@ WARNING: Zones exist without physical disks! COCKROACHDB SETTINGS: - state fingerprint::::::::::::::::: d4d87aa2ad877a4cc2fddd0573952362739110de - cluster.preserve_downgrade_option: "22.1" + state fingerprint::::::::::::::::: + cluster.preserve_downgrade_option: METADATA: created by::::::::::: nexus-test-utils @@ -1214,8 +1214,8 @@ to: blueprint ............. COCKROACHDB SETTINGS: - state fingerprint::::::::::::::::: d4d87aa2ad877a4cc2fddd0573952362739110de (unchanged) - cluster.preserve_downgrade_option: "22.1" (unchanged) + state fingerprint::::::::::::::::: (unchanged) + cluster.preserve_downgrade_option: (unchanged) METADATA: internal DNS version: 1 (unchanged) diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index 1afee71122..d266e59ce8 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -7,9 +7,12 @@ //! Feel free to change the tool's output. 
This test just makes it easy to make //! sure you're only breaking what you intend. +use dropshot::Method; use expectorate::assert_contents; +use http::StatusCode; use nexus_test_utils::{OXIMETER_UUID, PRODUCER_UUID}; use nexus_test_utils_macros::nexus_test; +use nexus_types::deployment::Blueprint; use nexus_types::deployment::SledFilter; use nexus_types::deployment::UnstableReconfiguratorState; use omicron_test_utils::dev::test_cmds::path_to_executable; @@ -134,6 +137,20 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { let tmppath = tmpdir.path().join("reconfigurator-save.out"); let initial_blueprint_id = cptestctx.initial_blueprint_id.to_string(); + // Get the CockroachDB metadata from the blueprint so we can redact it + let initial_blueprint: Blueprint = dropshot::test_util::read_json( + &mut cptestctx + .internal_client + .make_request_no_body( + Method::GET, + &format!("/deployment/blueprints/all/{initial_blueprint_id}"), + StatusCode::OK, + ) + .await + .unwrap(), + ) + .await; + let mut output = String::new(); let invocations: &[&[&str]] = &[ @@ -186,6 +203,19 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { // ControlPlaneTestContext. ]; + let mut redactions = ExtraRedactions::new(); + redactions + .variable_length("tmp_path", tmppath.as_str()) + .fixed_length("blueprint_id", &initial_blueprint_id) + .variable_length( + "cockroachdb_fingerprint", + &initial_blueprint.cockroachdb_fingerprint, + ); + let crdb_version = + initial_blueprint.cockroachdb_setting_preserve_downgrade.to_string(); + if initial_blueprint.cockroachdb_setting_preserve_downgrade.is_set() { + redactions.variable_length("cockroachdb_version", &crdb_version); + } for args in invocations { println!("running commands with args: {:?}", args); let p = postgres_url.to_string(); @@ -204,11 +234,7 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { }, &cmd_path, args, - Some( - ExtraRedactions::new() - .variable_length("tmp_path", tmppath.as_str()) - .fixed_length("blueprint_id", &initial_blueprint_id), - ), + Some(&redactions), ) .await; } diff --git a/docs/crdb-upgrades.adoc b/docs/crdb-upgrades.adoc index eecfa9194e..52231ee199 100644 --- a/docs/crdb-upgrades.adoc +++ b/docs/crdb-upgrades.adoc @@ -60,13 +60,15 @@ a tick, but they must occur in that order.) . Add an enum variant for the new version to `CockroachDbClusterVersion` in `nexus/types/src/deployment/planning_input.rs`, and change the associated constant `NEWLY_INITIALIZED` to that value. -. Run the test suite, which should catch any unexpected SQL +. Regenerate the Nexus internal OpenAPI document, which contains an enum + of CockroachDB versions: ++ +.... +EXPECTORATE=overwrite cargo nextest run -p omicron-nexus -- integration_tests::commands::test_nexus_openapi_internal +.... +. Run the full test suite, which should catch any unexpected SQL compatibility issues between releases and help validate that your build works. - * You will need to run the `test_omdb_success_cases` test from - omicron-omdb with `EXPECTORATE=overwrite`; this file contains the - expected output of various omdb commands, including a fingerprint of - CockroachDB's cluster state. . Submit a PR for your changes to garbage-compactor; when merged, publish the final build to the `oxide-cockroachdb-build` S3 bucket. . Update `tools/cockroachdb_checksums`. 
For non-illumos checksums, use diff --git a/nexus/db-queries/src/db/datastore/cockroachdb_settings.rs b/nexus/db-queries/src/db/datastore/cockroachdb_settings.rs index e7a975fa69..a38cfb8935 100644 --- a/nexus/db-queries/src/db/datastore/cockroachdb_settings.rs +++ b/nexus/db-queries/src/db/datastore/cockroachdb_settings.rs @@ -153,10 +153,22 @@ mod test { ); let settings = datastore.cockroachdb_settings(&opctx).await.unwrap(); - // With a fresh cluster, this is the expected state - let version = CockroachDbClusterVersion::NEWLY_INITIALIZED.to_string(); - assert_eq!(settings.version, version); - assert_eq!(settings.preserve_downgrade, ""); + let version: CockroachDbClusterVersion = + settings.version.parse().expect("unexpected cluster version"); + if settings.preserve_downgrade == "" { + // This is the expected value while running tests normally. + assert_eq!(version, CockroachDbClusterVersion::NEWLY_INITIALIZED); + } else if settings.preserve_downgrade == version.to_string() { + // This is the expected value if the cluster was created on a + // previous version and `cluster.preserve_downgrade_option` was set. + assert_eq!(version, CockroachDbClusterVersion::POLICY); + } else { + panic!( + "`cluster.preserve_downgrade_option` is {:?}, + but it should be empty or \"{}\"", + settings.preserve_downgrade, version + ); + } // Verify that if a fingerprint is wrong, we get the expected SQL error // back. @@ -165,7 +177,7 @@ mod test { &opctx, String::new(), "cluster.preserve_downgrade_option", - version.clone(), + version.to_string(), ) .await else { @@ -190,7 +202,7 @@ mod test { &opctx, settings.state_fingerprint.clone(), "cluster.preserve_downgrade_option", - version.clone(), + version.to_string(), ) .await .unwrap(); @@ -198,8 +210,8 @@ mod test { datastore.cockroachdb_settings(&opctx).await.unwrap(), CockroachDbSettings { state_fingerprint: settings.state_fingerprint.clone(), - version: version.clone(), - preserve_downgrade: version.clone(), + version: version.to_string(), + preserve_downgrade: version.to_string(), } ); } @@ -215,14 +227,24 @@ mod test { ) .await .unwrap(); - assert_eq!( - datastore.cockroachdb_settings(&opctx).await.unwrap(), - CockroachDbSettings { - state_fingerprint: settings.state_fingerprint.clone(), - version: version.clone(), - preserve_downgrade: String::new(), - } - ); + let settings = + datastore.cockroachdb_settings(&opctx).await.unwrap(); + if version == CockroachDbClusterVersion::NEWLY_INITIALIZED { + assert_eq!( + settings, + CockroachDbSettings { + state_fingerprint: settings.state_fingerprint.clone(), + version: version.to_string(), + preserve_downgrade: String::new(), + } + ); + } else { + // Resetting it permits auto-finalization, so the state + // fingerprint and version are not predictable until that + // completes, but we can still verify that the variable was + // reset. 
+ assert!(settings.preserve_downgrade.is_empty()); + } } db.cleanup().await.unwrap(); diff --git a/nexus/reconfigurator/execution/src/cockroachdb.rs b/nexus/reconfigurator/execution/src/cockroachdb.rs index 01baebfb57..12ff896d9d 100644 --- a/nexus/reconfigurator/execution/src/cockroachdb.rs +++ b/nexus/reconfigurator/execution/src/cockroachdb.rs @@ -38,7 +38,7 @@ mod test { use nexus_db_queries::authn; use nexus_db_queries::authz; use nexus_test_utils_macros::nexus_test; - use nexus_types::deployment::CockroachDbClusterVersion; + use nexus_types::deployment::CockroachDbPreserveDowngrade; use std::sync::Arc; use uuid::Uuid; @@ -71,24 +71,26 @@ mod test { .await .expect("failed to get blueprint from datastore"); eprintln!("blueprint: {}", blueprint.display()); - // The initial blueprint should already have these filled in. + // The initial blueprint should already have the state fingerprint + // filled in. assert_eq!( blueprint.cockroachdb_fingerprint, settings.state_fingerprint ); - assert_eq!( - blueprint.cockroachdb_setting_preserve_downgrade, - CockroachDbClusterVersion::NEWLY_INITIALIZED.into() - ); - // The cluster version, preserve downgrade setting, and - // `NEWLY_INITIALIZED` should all match. - assert_eq!( - settings.version, - CockroachDbClusterVersion::NEWLY_INITIALIZED.to_string() - ); + // The initial blueprint should already have the preserve downgrade + // setting filled in. (It might be the current or previous version, but + // it should be `Set` regardless.) + let CockroachDbPreserveDowngrade::Set(bp_preserve_downgrade) = + blueprint.cockroachdb_setting_preserve_downgrade + else { + panic!("blueprint does not set preserve downgrade option"); + }; + // The cluster version, preserve downgrade setting, and the value in the + // blueprint should all match. + assert_eq!(settings.version, bp_preserve_downgrade.to_string()); assert_eq!( settings.preserve_downgrade, - CockroachDbClusterVersion::NEWLY_INITIALIZED.to_string() + bp_preserve_downgrade.to_string() ); // Record the zpools so we don't fail to ensure datasets (unrelated to // crdb settings) during blueprint execution. diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 09785b4cec..019314c527 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -242,22 +242,44 @@ impl super::Nexus { .internal_context( "fetching cockroachdb settings for rack initialization", )?; - self.datastore() - .cockroachdb_setting_set_string( - opctx, - cockroachdb_settings.state_fingerprint.clone(), - "cluster.preserve_downgrade_option", - CockroachDbClusterVersion::NEWLY_INITIALIZED.to_string(), - ) - .await - .internal_context( - "setting `cluster.preserve_downgrade_option` \ - for rack initialization", - )?; + blueprint.cockroachdb_setting_preserve_downgrade = + if cockroachdb_settings.preserve_downgrade.is_empty() { + // Set the option to the current policy in both the database and + // the blueprint. + self.datastore() + .cockroachdb_setting_set_string( + opctx, + cockroachdb_settings.state_fingerprint.clone(), + "cluster.preserve_downgrade_option", + CockroachDbClusterVersion::NEWLY_INITIALIZED + .to_string(), + ) + .await + .internal_context( + "setting `cluster.preserve_downgrade_option` \ + for rack initialization", + )?; + CockroachDbClusterVersion::NEWLY_INITIALIZED + } else { + // `cluster.preserve_downgrade_option` is set, so fill in the + // blueprint with the current value. 
This branch should never
+                    // be hit during normal rack initialization; it's here for
+                    // eventual test cases where `cluster.preserve_downgrade_option`
+                    // is set by a test harness prior to rack initialization.
+                    CockroachDbClusterVersion::from_str(
+                        &cockroachdb_settings.preserve_downgrade,
+                    )
+                    .map_err(|_| {
+                        Error::internal_error(&format!(
+                            "database has `cluster.preserve_downgrade_option` \
+                            set to invalid version {}",
+                            cockroachdb_settings.preserve_downgrade
+                        ))
+                    })?
+                }
+                .into();
         blueprint.cockroachdb_fingerprint =
             cockroachdb_settings.state_fingerprint;
-        blueprint.cockroachdb_setting_preserve_downgrade =
-            CockroachDbClusterVersion::NEWLY_INITIALIZED.into();

         // Administrators of the Recovery Silo are automatically made
         // administrators of the Fleet.
diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs
index c6a61aac78..dabb47066e 100644
--- a/nexus/types/src/deployment/planning_input.rs
+++ b/nexus/types/src/deployment/planning_input.rs
@@ -280,6 +280,13 @@ pub enum CockroachDbPreserveDowngrade {
 }

 impl CockroachDbPreserveDowngrade {
+    pub fn is_set(self) -> bool {
+        match self {
+            CockroachDbPreserveDowngrade::Set(_) => true,
+            _ => false,
+        }
+    }
+
     pub fn from_optional_string(
         value: &Option<String>,
     ) -> Result {

From f1e1aff423fd309cb7cbd4994be599c16c7351b2 Mon Sep 17 00:00:00 2001
From: Levon Tarver <11586085+internet-diglett@users.noreply.github.com>
Date: Wed, 21 Aug 2024 22:48:07 -0500
Subject: [PATCH 90/91] Network Operator BGP API improvements (#6362)

#6081
---
Refactored the existing API to list BGP announce sets, and added a new API
to list the announcements for an announce set.

#6244 and #6245
---
Refactored the BGP datastore methods to propagate additional error context,
and to emit error types suitable for the external API (404, 403, etc.).

#6246
---
Two-part fix:
1. Emit an error if an operator attempts to create more than one BGP
   configuration for the same ASN.
2. Emit an error if the operator tries to configure BGP peers for the same
   switch that point to BGP configs with _different_ ASNs.

#6349
---
Updated the BGP datastore methods to always look up the parent record (not
just when a name needs to be resolved to a UUID). This guards us against
situations where an invalid UUID is accidentally submitted.
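As a rough illustration of that lookup rule, here is a minimal sketch using
a hypothetical in-memory map rather than the real Diesel-backed datastore
(`Uuid`, `AnnounceSet`, `resolve_announce_set`, and the error strings are
simplified stand-ins, not the actual omicron code):

```rust
use std::collections::HashMap;

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct Uuid(u128);

enum NameOrId {
    Name(String),
    Id(Uuid),
}

struct AnnounceSet {
    id: Uuid,
    name: String,
}

// Both selector forms are verified against the parent table; a
// caller-supplied id is no longer trusted blindly.
fn resolve_announce_set(
    store: &HashMap<Uuid, AnnounceSet>,
    selector: &NameOrId,
) -> Result<Uuid, String> {
    match selector {
        // A name always had to be resolved to an id...
        NameOrId::Name(name) => store
            .values()
            .find(|set| &set.name == name)
            .map(|set| set.id)
            .ok_or_else(|| format!("announce set {name:?} not found")),
        // ...and a submitted id is now also looked up, so a bogus UUID
        // yields a not-found error instead of a dangling reference.
        NameOrId::Id(id) => store
            .get(id)
            .map(|set| set.id)
            .ok_or_else(|| format!("announce set {id:?} not found")),
    }
}

fn main() {
    let mut store = HashMap::new();
    store.insert(
        Uuid(1),
        AnnounceSet { id: Uuid(1), name: "a-bag-of-addrs".to_string() },
    );
    assert!(resolve_announce_set(&store, &NameOrId::Id(Uuid(1))).is_ok());
    // An invalid UUID is rejected up front.
    assert!(resolve_announce_set(&store, &NameOrId::Id(Uuid(2))).is_err());
}
```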
Related
---
Resolving these issues for BGP
- [x] closes #6081
- [x] closes #6244
- [x] closes #6245
- [x] closes #6246
- [x] #6349
---
 nexus/db-model/src/schema.rs                  |   3 +-
 nexus/db-model/src/schema_versions.rs         |   3 +-
 nexus/db-queries/src/db/datastore/bgp.rs      | 663 +++++++++++++-----
 nexus/db-queries/src/db/datastore/mod.rs      |   1 +
 .../src/db/datastore/switch_port.rs           | 287 +++++---
 .../tasks/sync_switch_configuration.rs        |   2 +-
 nexus/src/app/bgp.rs                          |  21 +-
 nexus/src/app/rack.rs                         |   2 +-
 nexus/src/external_api/http_entrypoints.rs    |  64 +-
 nexus/tests/integration_tests/endpoints.rs    |  26 +-
 nexus/tests/integration_tests/switch_port.rs  |   2 +-
 nexus/tests/output/nexus_tags.txt             |   7 +-
 nexus/types/src/external_api/params.rs        |   7 +
 openapi/nexus.json                            |  89 ++-
 schema/crdb/dbinit.sql                        |   6 +-
 schema/crdb/lookup-bgp-config-by-asn/up01.sql |   3 +
 16 files changed, 885 insertions(+), 301 deletions(-)
 create mode 100644 schema/crdb/lookup-bgp-config-by-asn/up01.sql

diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs
index 1bcb3ac229..f630bbbeac 100644
--- a/nexus/db-model/src/schema.rs
+++ b/nexus/db-model/src/schema.rs
@@ -1889,7 +1889,8 @@ allow_tables_to_appear_in_same_query!(

 allow_tables_to_appear_in_same_query!(
     switch_port,
-    switch_port_settings_bgp_peer_config
+    switch_port_settings_bgp_peer_config,
+    bgp_config
 );

 allow_tables_to_appear_in_same_query!(disk, virtual_provisioning_resource);
diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs
index 649355f8e4..d0542874fb 100644
--- a/nexus/db-model/src/schema_versions.rs
+++ b/nexus/db-model/src/schema_versions.rs
@@ -17,7 +17,7 @@ use std::collections::BTreeMap;
 ///
 /// This must be updated when you change the database schema. Refer to
 /// schema/crdb/README.adoc in the root of this repository for details.
-pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(89, 0, 0);
+pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(90, 0, 0);

 /// List of all past database schema versions, in *reverse* order
 ///
@@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
     // |  leaving the first copy as an example for the next person.
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(90, "lookup-bgp-config-by-asn"), KnownVersion::new(89, "collapse_lldp_settings"), KnownVersion::new(88, "route-local-pref"), KnownVersion::new(87, "add-clickhouse-server-enum-variants"), diff --git a/nexus/db-queries/src/db/datastore/bgp.rs b/nexus/db-queries/src/db/datastore/bgp.rs index f4bea0f605..fdb9629543 100644 --- a/nexus/db-queries/src/db/datastore/bgp.rs +++ b/nexus/db-queries/src/db/datastore/bgp.rs @@ -28,7 +28,7 @@ use ref_cast::RefCast; use uuid::Uuid; impl DataStore { - pub async fn bgp_config_set( + pub async fn bgp_config_create( &self, opctx: &OpContext, config: ¶ms::BgpConfigCreate, @@ -37,80 +37,187 @@ impl DataStore { use db::schema::{ bgp_announce_set, bgp_announce_set::dsl as announce_set_dsl, }; - use diesel::sql_types; - use diesel::IntoSql; let conn = self.pool_connection_authorized(opctx).await?; - self.transaction_retry_wrapper("bgp_config_set") - .transaction(&conn, |conn| async move { - let announce_set_id: Uuid = match &config.bgp_announce_set_id { - NameOrId::Name(name) => { - announce_set_dsl::bgp_announce_set + let err = OptionalError::new(); + self.transaction_retry_wrapper("bgp_config_create") + .transaction(&conn, |conn| { + + let err = err.clone(); + async move { + let announce_set_id = match config.bgp_announce_set_id.clone() { + // Resolve Name to UUID + NameOrId::Name(name) => announce_set_dsl::bgp_announce_set .filter(bgp_announce_set::time_deleted.is_null()) .filter(bgp_announce_set::name.eq(name.to_string())) .select(bgp_announce_set::id) .limit(1) .first_async::(&conn) - .await? + .await + .map_err(|e| { + let msg = "failed to lookup announce set by name"; + error!(opctx.log, "{msg}"; "error" => ?e); + + match e { + diesel::result::Error::NotFound => { + err.bail(Error::not_found_by_name( + ResourceType::BgpAnnounceSet, + &name, + )) + } + _ => err.bail(Error::internal_error(msg)), + + } + }), + + // We cannot assume that the provided UUID is actually real. + // Lookup the parent record by UUID to verify that it is valid. 
+ NameOrId::Id(id) => announce_set_dsl::bgp_announce_set + .filter(bgp_announce_set::time_deleted.is_null()) + .filter(bgp_announce_set::id.eq(id)) + .select(bgp_announce_set::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|e| { + let msg = "failed to lookup announce set by id"; + error!(opctx.log, "{msg}"; "error" => ?e); + + match e { + diesel::result::Error::NotFound => { + err.bail(Error::not_found_by_id( + ResourceType::BgpAnnounceSet, + &id, + )) + } + _ => err.bail(Error::internal_error(msg)), + + } + }), + }?; + + let config = + BgpConfig::from_config_create(config, announce_set_id); + + // Idempotency: + // Check to see if an exact match for the config already exists + let query = dsl::bgp_config + .filter(dsl::name.eq(config.name().to_string())) + .filter(dsl::asn.eq(config.asn)) + .filter(dsl::bgp_announce_set_id.eq(config.bgp_announce_set_id)) + .into_boxed(); + + let query = match config.vrf.clone() { + Some(v) => query.filter(dsl::vrf.eq(v)), + None => query.filter(dsl::vrf.is_null()), + }; + + let query = match config.shaper.clone() { + Some(v) => query.filter(dsl::shaper.eq(v)), + None => query.filter(dsl::shaper.is_null()), + }; + + let query = match config.checker.clone() { + Some(v) => query.filter(dsl::checker.eq(v)), + None => query.filter(dsl::checker.is_null()), + }; + + let matching_config = match query + .filter(dsl::time_deleted.is_null()) + .select(BgpConfig::as_select()) + .first_async::(&conn) + .await { + Ok(v) => Ok(Some(v)), + Err(e) => { + match e { + diesel::result::Error::NotFound => { + info!(opctx.log, "no matching bgp config found"); + Ok(None) + } + _ => { + let msg = "error while checking if bgp config exists"; + error!(opctx.log, "{msg}"; "error" => ?e); + Err(err.bail(Error::internal_error(msg))) + } + } + } + }?; + + // If so, we're done! + if let Some(existing_config) = matching_config { + return Ok(existing_config); } - NameOrId::Id(id) => *id, - }; - let config = - BgpConfig::from_config_create(config, announce_set_id); - - let matching_entry_subquery = dsl::bgp_config - .filter(dsl::name.eq(Name::from(config.name().clone()))) - .filter(dsl::time_deleted.is_null()) - .select(dsl::name); - - // SELECT exactly the values we're trying to INSERT, but only - // if it does not already exist. - let new_entry_subquery = diesel::dsl::select(( - config.id().into_sql::(), - config.name().to_string().into_sql::(), - config - .description() - .to_string() - .into_sql::(), - config.asn.into_sql::(), - config.bgp_announce_set_id.into_sql::(), - config - .vrf - .clone() - .into_sql::>(), - Utc::now().into_sql::(), - Utc::now().into_sql::(), - )) - .filter(diesel::dsl::not(diesel::dsl::exists( - matching_entry_subquery, - ))); - - diesel::insert_into(dsl::bgp_config) - .values(new_entry_subquery) - .into_columns(( - dsl::id, - dsl::name, - dsl::description, - dsl::asn, - dsl::bgp_announce_set_id, - dsl::vrf, - dsl::time_created, - dsl::time_modified, - )) - .execute_async(&conn) - .await?; + // TODO: remove once per-switch-multi-asn support is added + // Bail if a conflicting config for this ASN already exists. + // This is a temporary measure until multi-asn-per-switch is supported. 
+ let configs_with_asn: Vec = dsl::bgp_config + .filter(dsl::asn.eq(config.asn)) + .filter(dsl::time_deleted.is_null()) + .select(BgpConfig::as_select()) + .load_async(&conn) + .await?; + + if !configs_with_asn.is_empty() { + error!( + opctx.log, + "different config for asn already exists"; + "asn" => ?config.asn, + "requested_config" => ?config, + "conflicting_configs" => ?configs_with_asn + ); + return Err(err.bail(Error::conflict("cannot have more than one configuration per ASN"))); + } - dsl::bgp_config - .filter(dsl::name.eq(Name::from(config.name().clone()))) - .filter(dsl::time_deleted.is_null()) - .select(BgpConfig::as_select()) - .limit(1) - .first_async(&conn) - .await + diesel::insert_into(dsl::bgp_config) + .values(config.clone()) + .returning(BgpConfig::as_returning()) + .get_result_async(&conn) + .await + .map_err(|e | { + let msg = "failed to insert bgp config"; + error!(opctx.log, "{msg}"; "error" => ?e); + + match e { + diesel::result::Error::DatabaseError(kind, _) => { + match kind { + diesel::result::DatabaseErrorKind::UniqueViolation => { + err.bail(Error::conflict("a field that must be unique conflicts with an existing record")) + }, + // technically we don't use Foreign Keys but it doesn't hurt to match on them + // instead of returning a 500 by default in the event that we do switch to Foreign Keys + diesel::result::DatabaseErrorKind::ForeignKeyViolation => { + err.bail(Error::conflict("an id field references an object that does not exist")) + } + diesel::result::DatabaseErrorKind::NotNullViolation => { + err.bail(Error::invalid_request("a required field was not provided")) + } + diesel::result::DatabaseErrorKind::CheckViolation => { + err.bail(Error::invalid_request("one or more fields are not valid values")) + }, + _ => err.bail(Error::internal_error(msg)), + } + } + _ => err.bail(Error::internal_error(msg)), + } + }) + } }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e|{ + let msg = "bgp_config_create failed"; + if let Some(err) = err.take() { + error!(opctx.log, "{msg}"; "error" => ?err); + err + } else { + // The transaction handler errors along with any errors emitted via "?" + // will fall through to here. These errors should truly be 500s + // because they are an internal hiccup that likely was not triggered by + // user input. 
+ error!(opctx.log, "{msg}"; "error" => ?e); + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn bgp_config_delete( @@ -124,11 +231,6 @@ impl DataStore { use db::schema::switch_port_settings_bgp_peer_config as sps_bgp_peer_config; use db::schema::switch_port_settings_bgp_peer_config::dsl as sps_bgp_peer_config_dsl; - #[derive(Debug)] - enum BgpConfigDeleteError { - ConfigInUse, - } - let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; self.transaction_retry_wrapper("bgp_config_delete") @@ -138,26 +240,60 @@ impl DataStore { let name_or_id = sel.name_or_id.clone(); let id: Uuid = match name_or_id { - NameOrId::Id(id) => id, - NameOrId::Name(name) => { + NameOrId::Id(id) => bgp_config_dsl::bgp_config + .filter(bgp_config::id.eq(id)) + .select(bgp_config::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|e| { + let msg = "failed to lookup bgp config by id"; + error!(opctx.log, "{msg}"; "error" => ?e); + + match e { + diesel::result::Error::NotFound => { + err.bail(Error::not_found_by_id( + ResourceType::BgpConfig, + &id, + )) + } + _ => err.bail(Error::internal_error(msg)), + + } + }), + NameOrId::Name(name) => bgp_config_dsl::bgp_config - .filter(bgp_config::name.eq(name.to_string())) - .select(bgp_config::id) - .limit(1) - .first_async::(&conn) - .await? - } - }; + .filter(bgp_config::name.eq(name.to_string())) + .select(bgp_config::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|e| { + let msg = "failed to lookup bgp config by name"; + error!(opctx.log, "{msg}"; "error" => ?e); + + match e { + diesel::result::Error::NotFound => { + err.bail(Error::not_found_by_name( + ResourceType::BgpConfig, + &name, + )) + } + _ => err.bail(Error::internal_error(msg)), + + } + }), + }?; let count = sps_bgp_peer_config_dsl::switch_port_settings_bgp_peer_config - .filter(sps_bgp_peer_config::bgp_config_id.eq(id)) - .count() - .execute_async(&conn) - .await?; + .filter(sps_bgp_peer_config::bgp_config_id.eq(id)) + .count() + .execute_async(&conn) + .await?; if count > 0 { - return Err(err.bail(BgpConfigDeleteError::ConfigInUse)); + return Err(err.bail(Error::conflict("BGP Config is in use and cannot be deleted"))); } diesel::update(bgp_config_dsl::bgp_config) @@ -171,13 +307,12 @@ impl DataStore { }) .await .map_err(|e| { + let msg = "bgp_config_delete failed"; if let Some(err) = err.take() { - match err { - BgpConfigDeleteError::ConfigInUse => { - Error::invalid_request("BGP config in use") - } - } + error!(opctx.log, "{msg}"; "error" => ?err); + err } else { + error!(opctx.log, "{msg}"; "error" => ?e); public_error_from_diesel(e, ErrorHandler::Server) } }) @@ -194,24 +329,45 @@ impl DataStore { let name_or_id = name_or_id.clone(); - let config = match name_or_id { + match name_or_id { NameOrId::Name(name) => dsl::bgp_config .filter(bgp_config::name.eq(name.to_string())) .select(BgpConfig::as_select()) .limit(1) .first_async::(&*conn) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)), + .map_err(|e| { + let msg = "failed to lookup bgp config by name"; + error!(opctx.log, "{msg}"; "error" => ?e); + + match e { + diesel::result::Error::NotFound => { + Error::not_found_by_name( + ResourceType::BgpConfig, + &name, + ) + } + _ => Error::internal_error(msg), + } + }), NameOrId::Id(id) => dsl::bgp_config .filter(bgp_config::id.eq(id)) .select(BgpConfig::as_select()) .limit(1) .first_async::(&*conn) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)), - }?; + .map_err(|e| { + let msg = 
"failed to lookup bgp config by id"; + error!(opctx.log, "{msg}"; "error" => ?e); - Ok(config) + match e { + diesel::result::Error::NotFound => { + Error::not_found_by_id(ResourceType::BgpConfig, &id) + } + _ => Error::internal_error(msg), + } + }), + } } pub async fn bgp_config_list( @@ -237,10 +393,42 @@ impl DataStore { .select(BgpConfig::as_select()) .load_async(&*conn) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| { + error!(opctx.log, "bgp_config_list failed"; "error" => ?e); + public_error_from_diesel(e, ErrorHandler::Server) + }) + } + + pub async fn bgp_announce_set_list( + &self, + opctx: &OpContext, + pagparams: &PaginatedBy<'_>, + ) -> ListResultVec { + use db::schema::bgp_announce_set::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + + match pagparams { + PaginatedBy::Id(pagparams) => { + paginated(dsl::bgp_announce_set, dsl::id, &pagparams) + } + PaginatedBy::Name(pagparams) => paginated( + dsl::bgp_announce_set, + dsl::name, + &pagparams.map_name(|n| Name::ref_cast(n)), + ), + } + .filter(dsl::time_deleted.is_null()) + .select(BgpAnnounceSet::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + error!(opctx.log, "bgp_announce_set_list failed"; "error" => ?e); + public_error_from_diesel(e, ErrorHandler::Server) + }) } - pub async fn bgp_announce_list( + pub async fn bgp_announcement_list( &self, opctx: &OpContext, sel: ¶ms::BgpAnnounceSetSelector, @@ -250,11 +438,6 @@ impl DataStore { bgp_announcement::dsl as announce_dsl, }; - #[derive(Debug)] - enum BgpAnnounceListError { - AnnounceSetNotFound(Name), - } - let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; self.transaction_retry_wrapper("bgp_announce_list") @@ -264,7 +447,26 @@ impl DataStore { let name_or_id = sel.name_or_id.clone(); let announce_id: Uuid = match name_or_id { - NameOrId::Id(id) => id, + NameOrId::Id(id) => announce_set_dsl::bgp_announce_set + .filter(bgp_announce_set::time_deleted.is_null()) + .filter(bgp_announce_set::id.eq(id)) + .select(bgp_announce_set::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|e| { + let msg = "failed to lookup announce set by id"; + error!(opctx.log, "{msg}"; "error" => ?e); + + match e { + diesel::result::Error::NotFound => err + .bail(Error::not_found_by_id( + ResourceType::BgpAnnounceSet, + &id, + )), + _ => err.bail(Error::internal_error(msg)), + } + }), NameOrId::Name(name) => { announce_set_dsl::bgp_announce_set .filter( @@ -278,15 +480,23 @@ impl DataStore { .first_async::(&conn) .await .map_err(|e| { - err.bail_retryable_or( - e, - BgpAnnounceListError::AnnounceSetNotFound( - Name::from(name.clone()), - ) - ) - })? 
+ let msg = + "failed to lookup announce set by name"; + error!(opctx.log, "{msg}"; "error" => ?e); + + match e { + diesel::result::Error::NotFound => err + .bail(Error::not_found_by_name( + ResourceType::BgpAnnounceSet, + &name, + )), + _ => { + err.bail(Error::internal_error(msg)) + } + } + }) } - }; + }?; let result = announce_dsl::bgp_announcement .filter(announce_dsl::announce_set_id.eq(announce_id)) @@ -299,21 +509,18 @@ impl DataStore { }) .await .map_err(|e| { + error!(opctx.log, "bgp_announce_list failed"; "error" => ?e); if let Some(err) = err.take() { - match err { - BgpAnnounceListError::AnnounceSetNotFound(name) => { - Error::not_found_by_name( - ResourceType::BgpAnnounceSet, - &name, - ) - } - } + err } else { public_error_from_diesel(e, ErrorHandler::Server) } }) } + // TODO: it seems this logic actually performs a find OR create for an announce set, and then replaces its child announcements. + // This might be changed in omicron#6016 to an api that creates an announce set then allows adding / removal of announcements + // to match how our other APIs work. pub async fn bgp_update_announce_set( &self, opctx: &OpContext, @@ -383,9 +590,16 @@ impl DataStore { Ok((db_as, db_annoucements)) }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| { + let msg = "bgp_update_announce_set failed"; + error!(opctx.log, "{msg}"; "error" => ?e); + public_error_from_diesel(e, ErrorHandler::Server) + }) } + // TODO: it seems this logic actually performs a create OR update of an announce set and its child announcements + // (for example, it will add missing announcements). This might be changed in omicron#6016 to an api that creates an announce set + // then allows adding / removal of announcements to match how our other APIs work. pub async fn bgp_create_announce_set( &self, opctx: &OpContext, @@ -466,7 +680,11 @@ impl DataStore { Ok((db_as, db_annoucements)) }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| { + let msg = "bgp_create_announce_set failed"; + error!(opctx.log, "{msg}"; "error" => ?e); + public_error_from_diesel(e, ErrorHandler::Server) + }) } pub async fn bgp_delete_announce_set( @@ -481,11 +699,6 @@ impl DataStore { use db::schema::bgp_config; use db::schema::bgp_config::dsl as bgp_config_dsl; - #[derive(Debug)] - enum BgpAnnounceSetDeleteError { - AnnounceSetInUse, - } - let conn = self.pool_connection_authorized(opctx).await?; let name_or_id = sel.name_or_id.clone(); @@ -496,18 +709,56 @@ impl DataStore { let name_or_id = name_or_id.clone(); async move { let id: Uuid = match name_or_id { + NameOrId::Id(id) => announce_set_dsl::bgp_announce_set + .filter(bgp_announce_set::time_deleted.is_null()) + .filter(bgp_announce_set::id.eq(id)) + .select(bgp_announce_set::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|e| { + let msg = "failed to lookup announce set by id"; + error!(opctx.log, "{msg}"; "error" => ?e); + + match e { + diesel::result::Error::NotFound => err + .bail(Error::not_found_by_id( + ResourceType::BgpAnnounceSet, + &id, + )), + _ => err.bail(Error::internal_error(msg)), + } + }), NameOrId::Name(name) => { announce_set_dsl::bgp_announce_set + .filter( + bgp_announce_set::time_deleted.is_null(), + ) .filter( bgp_announce_set::name.eq(name.to_string()), ) .select(bgp_announce_set::id) .limit(1) .first_async::(&conn) - .await? 
+ .await + .map_err(|e| { + let msg = + "failed to lookup announce set by name"; + error!(opctx.log, "{msg}"; "error" => ?e); + + match e { + diesel::result::Error::NotFound => err + .bail(Error::not_found_by_name( + ResourceType::BgpAnnounceSet, + &name, + )), + _ => { + err.bail(Error::internal_error(msg)) + } + } + }) } - NameOrId::Id(id) => id, - }; + }?; let count = bgp_config_dsl::bgp_config .filter(bgp_config::bgp_announce_set_id.eq(id)) @@ -516,9 +767,9 @@ impl DataStore { .await?; if count > 0 { - return Err(err.bail( - BgpAnnounceSetDeleteError::AnnounceSetInUse, - )); + return Err( + err.bail(Error::conflict("announce set in use")) + ); } diesel::update(announce_set_dsl::bgp_announce_set) @@ -537,13 +788,12 @@ impl DataStore { }) .await .map_err(|e| { + let msg = "bgp_delete_announce_set failed"; if let Some(err) = err.take() { - match err { - BgpAnnounceSetDeleteError::AnnounceSetInUse => { - Error::invalid_request("BGP announce set in use") - } - } + error!(opctx.log, "{msg}"; "error" => ?err); + err } else { + error!(opctx.log, "{msg}"; "error" => ?e); public_error_from_diesel(e, ErrorHandler::Server) } }) @@ -563,7 +813,11 @@ impl DataStore { .select(BgpPeerView::as_select()) .load_async(&*self.pool_connection_authorized(opctx).await?) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + .map_err(|e| { + let msg = "bgp_peer_configs failed"; + error!(opctx.log, "{msg}"; "error" => ?e); + public_error_from_diesel(e, ErrorHandler::Server) + })?; Ok(results) } @@ -583,7 +837,11 @@ impl DataStore { .filter(dsl::addr.eq(addr)) .load_async(&*self.pool_connection_authorized(opctx).await?) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + .map_err(|e| { + let msg = "communities_for_peer failed"; + error!(opctx.log, "{msg}"; "error" => ?e); + public_error_from_diesel(e, ErrorHandler::Server) + })?; Ok(results) } @@ -601,24 +859,40 @@ impl DataStore { use db::schema::switch_port_settings_bgp_peer_config_allow_export::dsl; let conn = self.pool_connection_authorized(opctx).await?; - let result = self - .transaction_retry_wrapper("bgp_allow_export_for_peer") - .transaction(&conn, |conn| async move { - let active = peer_dsl::switch_port_settings_bgp_peer_config - .filter(db_peer::port_settings_id.eq(port_settings_id)) - .filter(db_peer::addr.eq(addr)) - .select(db_peer::allow_export_list_active) - .limit(1) - .first_async::(&conn) - .await?; - - if !active { - return Ok(None); - } + let err = OptionalError::new(); + self.transaction_retry_wrapper("bgp_allow_export_for_peer") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + let active = peer_dsl::switch_port_settings_bgp_peer_config + .filter(db_peer::port_settings_id.eq(port_settings_id)) + .filter(db_peer::addr.eq(addr)) + .select(db_peer::allow_export_list_active) + .limit(1) + .first_async::(&conn) + .await + .map_err(|e| { + let msg = "failed to lookup export settings for peer"; + error!(opctx.log, "{msg}"; "error" => ?e); + + match e { + diesel::result::Error::NotFound => { + let not_found_msg = format!("peer with {addr} not found for port settings {port_settings_id}"); + err.bail(Error::non_resourcetype_not_found(not_found_msg)) + }, + _ => err.bail(Error::internal_error(msg)), + } + })?; + + if !active { + return Ok(None); + } - let list = - dsl::switch_port_settings_bgp_peer_config_allow_export - .filter(db_allow::port_settings_id.eq(port_settings_id)) + let list = + dsl::switch_port_settings_bgp_peer_config_allow_export + .filter( + 
db_allow::port_settings_id.eq(port_settings_id), + ) .filter( db_allow::interface_name .eq(interface_name.to_owned()), @@ -627,12 +901,20 @@ impl DataStore { .load_async(&conn) .await?; - Ok(Some(list)) + Ok(Some(list)) + } }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - - Ok(result) + .map_err(|e| { + let msg = "allow_export_for_peer failed"; + if let Some(err) = err.take() { + error!(opctx.log, "{msg}"; "error" => ?err); + err + } else { + error!(opctx.log, "{msg}"; "error" => ?e); + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn allow_import_for_peer( @@ -647,25 +929,42 @@ impl DataStore { use db::schema::switch_port_settings_bgp_peer_config_allow_import as db_allow; use db::schema::switch_port_settings_bgp_peer_config_allow_import::dsl; + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; - let result = self - .transaction_retry_wrapper("bgp_allow_export_for_peer") - .transaction(&conn, |conn| async move { - let active = peer_dsl::switch_port_settings_bgp_peer_config - .filter(db_peer::port_settings_id.eq(port_settings_id)) - .filter(db_peer::addr.eq(addr)) - .select(db_peer::allow_import_list_active) - .limit(1) - .first_async::(&conn) - .await?; - - if !active { - return Ok(None); - } + self + .transaction_retry_wrapper("bgp_allow_import_for_peer") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + let active = peer_dsl::switch_port_settings_bgp_peer_config + .filter(db_peer::port_settings_id.eq(port_settings_id)) + .filter(db_peer::addr.eq(addr)) + .select(db_peer::allow_import_list_active) + .limit(1) + .first_async::(&conn) + .await + .map_err(|e| { + let msg = "failed to lookup import settings for peer"; + error!(opctx.log, "{msg}"; "error" => ?e); + + match e { + diesel::result::Error::NotFound => { + let not_found_msg = format!("peer with {addr} not found for port settings {port_settings_id}"); + err.bail(Error::non_resourcetype_not_found(not_found_msg)) + }, + _ => err.bail(Error::internal_error(msg)), + } + })?; + + if !active { + return Ok(None); + } - let list = - dsl::switch_port_settings_bgp_peer_config_allow_import - .filter(db_allow::port_settings_id.eq(port_settings_id)) + let list = + dsl::switch_port_settings_bgp_peer_config_allow_import + .filter( + db_allow::port_settings_id.eq(port_settings_id), + ) .filter( db_allow::interface_name .eq(interface_name.to_owned()), @@ -674,11 +973,19 @@ impl DataStore { .load_async(&conn) .await?; - Ok(Some(list)) + Ok(Some(list)) + } }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - - Ok(result) + .map_err(|e| { + let msg = "allow_import_for_peer failed"; + if let Some(err) = err.take() { + error!(opctx.log, "{msg}"; "error" => ?err); + err + } else { + error!(opctx.log, "{msg}"; "error" => ?e); + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } } diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 13c3708e4a..2cd21754f8 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -366,6 +366,7 @@ impl DataStore { } } +#[derive(Clone, Copy, Debug)] pub enum UpdatePrecondition { DontCare, Null, diff --git a/nexus/db-queries/src/db/datastore/switch_port.rs b/nexus/db-queries/src/db/datastore/switch_port.rs index 504e6cf936..2e09c1ac13 100644 --- a/nexus/db-queries/src/db/datastore/switch_port.rs +++ b/nexus/db-queries/src/db/datastore/switch_port.rs @@ -31,7 +31,7 @@ use diesel::{ 
use diesel_dtrace::DTraceConnection; use ipnetwork::IpNetwork; use nexus_db_model::{ - SqlU16, SqlU32, SqlU8, SwitchPortBgpPeerConfigAllowExport, + BgpConfig, SqlU16, SqlU32, SqlU8, SwitchPortBgpPeerConfigAllowExport, SwitchPortBgpPeerConfigAllowImport, SwitchPortBgpPeerConfigCommunity, }; use nexus_types::external_api::params; @@ -333,6 +333,7 @@ impl DataStore { SwitchPortSettingsCreateError::ReserveBlock( ReserveBlockError::AddressNotInLot, ) => Error::invalid_request("address not in lot"), + } } else { @@ -828,45 +829,158 @@ impl DataStore { port_settings_id: Option, current: UpdatePrecondition, ) -> UpdateResult<()> { + use db::schema::bgp_config::dsl as bgp_config_dsl; use db::schema::switch_port; use db::schema::switch_port::dsl as switch_port_dsl; + use db::schema::switch_port_settings_bgp_peer_config::dsl as bgp_peer_dsl; let conn = self.pool_connection_authorized(opctx).await?; - match current { - UpdatePrecondition::DontCare => { - diesel::update(switch_port_dsl::switch_port) - .filter(switch_port::id.eq(switch_port_id)) - .set(switch_port::port_settings_id.eq(port_settings_id)) - .execute_async(&*conn) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - })?; - } - UpdatePrecondition::Null => { - diesel::update(switch_port_dsl::switch_port) - .filter(switch_port::id.eq(switch_port_id)) - .filter(switch_port::port_settings_id.is_null()) - .set(switch_port::port_settings_id.eq(port_settings_id)) - .execute_async(&*conn) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - })?; - } - UpdatePrecondition::Value(current_id) => { - diesel::update(switch_port_dsl::switch_port) - .filter(switch_port::id.eq(switch_port_id)) - .filter(switch_port::port_settings_id.eq(current_id)) - .set(switch_port::port_settings_id.eq(port_settings_id)) - .execute_async(&*conn) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - })?; - } - } + let err = OptionalError::new(); + self.transaction_retry_wrapper("switch_port_set_settings_id") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + // TODO: remove once per-switch-multi-asn support is added + // Bail if user attempts to assign multiple ASNs to a switch via switch port settings + // This is a temporary measure until multi-asn-per-switch is supported. + + // what switch are we adding a configuration to? 
+ let switch = switch_port_dsl::switch_port + .filter(switch_port_dsl::id.eq(switch_port_id)) + .select(switch_port_dsl::switch_location) + .limit(1) + .first_async::(&conn) + .await + .map_err(|e: diesel::result::Error| { + let msg = "failed to look up switch port by id"; + error!(opctx.log, "{msg}"; "error" => ?e); + match e { + diesel::result::Error::NotFound => { + err.bail(Error::not_found_by_id( + ResourceType::SwitchPort, + &switch_port_id, + )) + } + _ => err.bail(Error::internal_error(msg)), + } + })?; + + // if we're setting a port settings id (and therefore activating a configuration + // on a port) we need to make sure there aren't any conflicting bgp configurations + if let Some(psid) = port_settings_id { + let bgp_config: Option = + match bgp_peer_dsl::switch_port_settings_bgp_peer_config + .inner_join( + bgp_config_dsl::bgp_config + .on(bgp_peer_dsl::bgp_config_id + .eq(bgp_config_dsl::id)), + ) + .filter( + bgp_peer_dsl::port_settings_id + .eq(psid), + ) + .select(BgpConfig::as_select()) + .limit(1) + .first_async::(&conn) + .await { + Ok(v) => Ok(Some(v)), + Err(e) => { + let msg = "failed to check if bgp peer exists in switch port settings"; + error!(opctx.log, "{msg}"; "error" => ?e); + match e { + diesel::result::Error::NotFound => { + Ok(None) + } + _ => Err(err.bail(Error::internal_error(msg))), + } + } + }?; + + // find all port settings for the targeted switch + // switch port + // inner join bgp peer on port settings id + // inner join bgp config on bgp config id + // filter switch location eq switch + // filter port settings id not null + // filter asn doesn't equal our asn + + if let Some(config) = bgp_config { + let conflicting_bgp_configs: Vec = switch_port_dsl::switch_port + .inner_join( + bgp_peer_dsl::switch_port_settings_bgp_peer_config + .on(bgp_peer_dsl::port_settings_id + .nullable() + .eq(switch_port_dsl::port_settings_id)), + ) + .inner_join(bgp_config_dsl::bgp_config.on( + bgp_peer_dsl::bgp_config_id.eq(bgp_config_dsl::id), + )) + .filter(switch_port_dsl::switch_location.eq(switch)) + .filter(switch_port_dsl::port_settings_id.is_not_null()) + .filter(bgp_config_dsl::asn.ne(config.asn)) + .select(BgpConfig::as_select()) + .load_async(&conn) + .await?; + + if !conflicting_bgp_configs.is_empty() { + return Err(err.bail(Error::conflict("a different asn is already configured on this switch"))); + } + } + + } + + // perform the requested update + match current { + UpdatePrecondition::DontCare => { + diesel::update(switch_port_dsl::switch_port) + .filter(switch_port::id.eq(switch_port_id)) + .set( + switch_port::port_settings_id + .eq(port_settings_id), + ) + .execute_async(&conn) + .await + } + UpdatePrecondition::Null => { + diesel::update(switch_port_dsl::switch_port) + .filter(switch_port::id.eq(switch_port_id)) + .filter(switch_port::port_settings_id.is_null()) + .set( + switch_port::port_settings_id + .eq(port_settings_id), + ) + .execute_async(&conn) + .await + } + UpdatePrecondition::Value(current_id) => { + diesel::update(switch_port_dsl::switch_port) + .filter(switch_port::id.eq(switch_port_id)) + .filter( + switch_port::port_settings_id + .eq(current_id), + ) + .set( + switch_port::port_settings_id + .eq(port_settings_id), + ) + .execute_async(&conn) + .await + } + } + } + }) + .await + .map_err(|e| { + let msg = "switch_port_set_settings_id failed"; + if let Some(err) = err.take() { + error!(opctx.log, "{msg}"; "error" => ?err); + err + } else { + error!(opctx.log, "{msg}"; "error" => ?e); + public_error_from_diesel(e, ErrorHandler::Server) + } + 
})?; Ok(()) } @@ -945,10 +1059,10 @@ impl DataStore { .eq(route_config_dsl::port_settings_id.nullable())), ) .select(SwitchPort::as_select()) - // TODO: #3592 Correctness - // In single rack deployments there are only 64 ports. We'll need - // pagination in the future, or maybe a way to constrain the query to - // a rack? + // TODO: #3592 Correctness + // In single rack deployments there are only 64 ports. We'll need + // pagination in the future, or maybe a way to constrain the query to + // a rack? .limit(64) .union( switch_port_dsl::switch_port @@ -957,7 +1071,7 @@ impl DataStore { bgp_peer_config_dsl::switch_port_settings_bgp_peer_config .on(switch_port_dsl::port_settings_id .eq(bgp_peer_config_dsl::port_settings_id.nullable()), - ), + ), ) .select(SwitchPort::as_select()) .limit(64), @@ -1148,18 +1262,18 @@ async fn do_switch_port_settings_create( NameOrId::Name(name) => { let name = name.to_string(); bgp_config_dsl::bgp_config - .filter(bgp_config::time_deleted.is_null()) - .filter(bgp_config::name.eq(name)) - .select(bgp_config::id) - .limit(1) - .first_async::(conn) - .await - .map_err(|diesel_error| { - err.bail_retryable_or( - diesel_error, - SwitchPortSettingsCreateError::BgpConfigNotFound - ) - })? + .filter(bgp_config::time_deleted.is_null()) + .filter(bgp_config::name.eq(name)) + .select(bgp_config::id) + .limit(1) + .first_async::(conn) + .await + .map_err(|diesel_error| { + err.bail_retryable_or( + diesel_error, + SwitchPortSettingsCreateError::BgpConfigNotFound + ) + })? } }; @@ -1177,9 +1291,9 @@ async fn do_switch_port_settings_create( .collect(); diesel::insert_into(allow_import_dsl::switch_port_settings_bgp_peer_config_allow_import) - .values(to_insert) - .execute_async(conn) - .await?; + .values(to_insert) + .execute_async(conn) + .await?; } if let ImportExportPolicy::Allow(list) = &p.allowed_export { @@ -1196,9 +1310,9 @@ async fn do_switch_port_settings_create( .collect(); diesel::insert_into(allow_export_dsl::switch_port_settings_bgp_peer_config_allow_export) - .values(to_insert) - .execute_async(conn) - .await?; + .values(to_insert) + .execute_async(conn) + .await?; } if !p.communities.is_empty() { @@ -1216,9 +1330,9 @@ async fn do_switch_port_settings_create( .collect(); diesel::insert_into(bgp_communities_dsl::switch_port_settings_bgp_peer_config_communities) - .values(to_insert) - .execute_async(conn) - .await?; + .values(to_insert) + .execute_async(conn) + .await?; } bgp_peer_config.push(SwitchPortBgpPeerConfig::new( @@ -1229,6 +1343,7 @@ async fn do_switch_port_settings_create( )); } } + let db_bgp_peers: Vec = diesel::insert_into(bgp_peer_dsl::switch_port_settings_bgp_peer_config) .values(bgp_peer_config) @@ -1282,18 +1397,18 @@ async fn do_switch_port_settings_create( NameOrId::Name(name) => { let name = name.to_string(); address_lot_dsl::address_lot - .filter(address_lot::time_deleted.is_null()) - .filter(address_lot::name.eq(name)) - .select(address_lot::id) - .limit(1) - .first_async::(conn) - .await - .map_err(|diesel_error| { - err.bail_retryable_or( - diesel_error, - SwitchPortSettingsCreateError::AddressLotNotFound - ) - })? + .filter(address_lot::time_deleted.is_null()) + .filter(address_lot::name.eq(name)) + .select(address_lot::id) + .limit(1) + .first_async::(conn) + .await + .map_err(|diesel_error| { + err.bail_retryable_or( + diesel_error, + SwitchPortSettingsCreateError::AddressLotNotFound + ) + })? 
} }; // TODO: Reduce DB round trips needed for reserving ip blocks @@ -1353,18 +1468,18 @@ async fn do_switch_port_settings_delete( NameOrId::Name(name) => { let name = name.to_string(); port_settings_dsl::switch_port_settings - .filter(switch_port_settings::time_deleted.is_null()) - .filter(switch_port_settings::name.eq(name)) - .select(switch_port_settings::id) - .limit(1) - .first_async::(conn) - .await - .map_err(|diesel_error| { - err.bail_retryable_or( - diesel_error, - SwitchPortSettingsDeleteError::SwitchPortSettingsNotFound - ) - })? + .filter(switch_port_settings::time_deleted.is_null()) + .filter(switch_port_settings::name.eq(name)) + .select(switch_port_settings::id) + .limit(1) + .first_async::(conn) + .await + .map_err(|diesel_error| { + err.bail_retryable_or( + diesel_error, + SwitchPortSettingsDeleteError::SwitchPortSettingsNotFound + ) + })? } }; @@ -1559,7 +1674,7 @@ mod test { shaper: None, }; - datastore.bgp_config_set(&opctx, &bgp_config).await.unwrap(); + datastore.bgp_config_create(&opctx, &bgp_config).await.unwrap(); let settings = SwitchPortSettingsCreate { identity: IdentityMetadataCreateParams { diff --git a/nexus/src/app/background/tasks/sync_switch_configuration.rs b/nexus/src/app/background/tasks/sync_switch_configuration.rs index 154384dd5e..f86bb1a782 100644 --- a/nexus/src/app/background/tasks/sync_switch_configuration.rs +++ b/nexus/src/app/background/tasks/sync_switch_configuration.rs @@ -565,7 +565,7 @@ impl BackgroundTask for SwitchPortSettingsManager { if !bgp_announce_prefixes.contains_key(&bgp_config.bgp_announce_set_id) { let announcements = match self .datastore - .bgp_announce_list( + .bgp_announcement_list( opctx, ¶ms::BgpAnnounceSetSelector { name_or_id: bgp_config diff --git a/nexus/src/app/bgp.rs b/nexus/src/app/bgp.rs index d192f1ccf9..31a0faa663 100644 --- a/nexus/src/app/bgp.rs +++ b/nexus/src/app/bgp.rs @@ -16,13 +16,13 @@ use omicron_common::api::external::{ use std::net::IpAddr; impl super::Nexus { - pub async fn bgp_config_set( + pub async fn bgp_config_create( &self, opctx: &OpContext, config: ¶ms::BgpConfigCreate, ) -> CreateResult { opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; - let result = self.db_datastore.bgp_config_set(opctx, config).await?; + let result = self.db_datastore.bgp_config_create(opctx, config).await?; Ok(result) } @@ -69,13 +69,13 @@ impl super::Nexus { Ok(result) } - pub async fn bgp_announce_list( + pub async fn bgp_announce_set_list( &self, opctx: &OpContext, - sel: ¶ms::BgpAnnounceSetSelector, - ) -> ListResultVec { + pagparams: &PaginatedBy<'_>, + ) -> ListResultVec { opctx.authorize(authz::Action::Read, &authz::FLEET).await?; - self.db_datastore.bgp_announce_list(opctx, sel).await + self.db_datastore.bgp_announce_set_list(opctx, pagparams).await } pub async fn bgp_delete_announce_set( @@ -89,6 +89,15 @@ impl super::Nexus { Ok(result) } + pub async fn bgp_announcement_list( + &self, + opctx: &OpContext, + sel: ¶ms::BgpAnnounceSetSelector, + ) -> ListResultVec { + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; + self.db_datastore.bgp_announcement_list(opctx, sel).await + } + pub async fn bgp_peer_status( &self, opctx: &OpContext, diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 019314c527..f3c0031327 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -510,7 +510,7 @@ impl super::Nexus { match self .db_datastore - .bgp_config_set( + .bgp_config_create( &opctx, &BgpConfigCreate { identity: IdentityMetadataCreateParams { diff --git 
a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 015fe11e3a..e11256f06e 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -285,6 +285,8 @@ pub(crate) fn external_api() -> NexusApiDescription { api.register(networking_bgp_announce_set_delete)?; api.register(networking_bgp_message_history)?; + api.register(networking_bgp_announcement_list)?; + api.register(networking_bfd_enable)?; api.register(networking_bfd_disable)?; api.register(networking_bfd_status)?; @@ -3866,7 +3868,7 @@ async fn networking_bgp_config_create( let nexus = &apictx.context.nexus; let config = config.into_inner(); let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - let result = nexus.bgp_config_set(&opctx, &config).await?; + let result = nexus.bgp_config_create(&opctx, &config).await?; Ok(HttpResponseCreated::(result.into())) }; apictx @@ -4044,7 +4046,7 @@ async fn networking_bgp_config_delete( /// set with the one specified. #[endpoint { method = PUT, - path = "/v1/system/networking/bgp-announce", + path = "/v1/system/networking/bgp-announce-set", tags = ["system/networking"], }] async fn networking_bgp_announce_set_update( @@ -4066,24 +4068,28 @@ async fn networking_bgp_announce_set_update( .await } -//TODO pagination? the normal by-name/by-id stuff does not work here -/// Get originated routes for a BGP configuration +/// List BGP announce sets #[endpoint { method = GET, - path = "/v1/system/networking/bgp-announce", + path = "/v1/system/networking/bgp-announce-set", tags = ["system/networking"], }] async fn networking_bgp_announce_set_list( rqctx: RequestContext, - query_params: Query, -) -> Result>, HttpError> { + query_params: Query< + PaginatedByNameOrId, + >, +) -> Result>, HttpError> { let apictx = rqctx.context(); let handler = async { let nexus = &apictx.context.nexus; - let sel = query_params.into_inner(); + let query = query_params.into_inner(); + let pag_params = data_page_params_for(&rqctx, &query)?; + let scan_params = ScanByNameOrId::from_query(&query)?; + let paginated_by = name_or_id_pagination(&pag_params, scan_params)?; let opctx = crate::context::op_context_for_external_api(&rqctx).await?; let result = nexus - .bgp_announce_list(&opctx, &sel) + .bgp_announce_set_list(&opctx, &paginated_by) .await? .into_iter() .map(|p| p.into()) @@ -4100,17 +4106,17 @@ async fn networking_bgp_announce_set_list( /// Delete BGP announce set #[endpoint { method = DELETE, - path = "/v1/system/networking/bgp-announce", + path = "/v1/system/networking/bgp-announce-set/{name_or_id}", tags = ["system/networking"], }] async fn networking_bgp_announce_set_delete( rqctx: RequestContext, - selector: Query, + path_params: Path, ) -> Result { let apictx = rqctx.context(); let handler = async { let nexus = &apictx.context.nexus; - let sel = selector.into_inner(); + let sel = path_params.into_inner(); let opctx = crate::context::op_context_for_external_api(&rqctx).await?; nexus.bgp_delete_announce_set(&opctx, &sel).await?; Ok(HttpResponseUpdatedNoContent {}) @@ -4122,6 +4128,40 @@ async fn networking_bgp_announce_set_delete( .await } +// TODO: is pagination necessary here? How large do we expect the list of +// announcements to become in real usage? 
+/// Get originated routes for a specified BGP announce set +#[endpoint { + method = GET, + path = "/v1/system/networking/bgp-announce-set/{name_or_id}/announcement", + tags = ["system/networking"], +}] +async fn networking_bgp_announcement_list( + rqctx: RequestContext, + path_params: Path, +) -> Result>, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let nexus = &apictx.context.nexus; + let sel = path_params.into_inner(); + let opctx = crate::context::op_context_for_external_api(&rqctx).await?; + + let result = nexus + .bgp_announcement_list(&opctx, &sel) + .await? + .into_iter() + .map(|p| p.into()) + .collect(); + + Ok(HttpResponseOk(result)) + }; + apictx + .context + .external_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await +} + /// Enable a BFD session #[endpoint { method = POST, diff --git a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index 381d59e073..9703004c73 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -573,7 +573,7 @@ pub static DEMO_BGP_CONFIG: Lazy = shaper: None, }); pub const DEMO_BGP_ANNOUNCE_SET_URL: &'static str = - "/v1/system/networking/bgp-announce?name_or_id=a-bag-of-addrs"; + "/v1/system/networking/bgp-announce-set"; pub static DEMO_BGP_ANNOUNCE: Lazy = Lazy::new(|| params::BgpAnnounceSetCreate { identity: IdentityMetadataCreateParams { @@ -585,6 +585,10 @@ pub static DEMO_BGP_ANNOUNCE: Lazy = network: "10.0.0.0/16".parse().unwrap(), }], }); +pub const DEMO_BGP_ANNOUNCE_SET_DELETE_URL: &'static str = + "/v1/system/networking/bgp-announce-set/a-bag-of-addrs"; +pub const DEMO_BGP_ANNOUNCEMENT_URL: &'static str = + "/v1/system/networking/bgp-announce-set/a-bag-of-addrs/announcement"; pub const DEMO_BGP_STATUS_URL: &'static str = "/v1/system/networking/bgp-status"; pub const DEMO_BGP_EXPORTED_URL: &'static str = @@ -2274,6 +2278,7 @@ pub static VERIFY_ENDPOINTS: Lazy> = Lazy::new(|| { AllowedMethod::GetNonexistent ], }, + VerifyEndpoint { url: &DEMO_BGP_CONFIG_CREATE_URL, visibility: Visibility::Public, @@ -2295,11 +2300,28 @@ pub static VERIFY_ENDPOINTS: Lazy> = Lazy::new(|| { AllowedMethod::Put( serde_json::to_value(&*DEMO_BGP_ANNOUNCE).unwrap(), ), - AllowedMethod::GetNonexistent, + AllowedMethod::Get, + ], + }, + + VerifyEndpoint { + url: &DEMO_BGP_ANNOUNCE_SET_DELETE_URL, + visibility: Visibility::Public, + unprivileged_access: UnprivilegedAccess::None, + allowed_methods: vec![ AllowedMethod::Delete ], }, + VerifyEndpoint { + url: &DEMO_BGP_ANNOUNCEMENT_URL, + visibility: Visibility::Public, + unprivileged_access: UnprivilegedAccess::None, + allowed_methods: vec![ + AllowedMethod::GetNonexistent, + ], + }, + VerifyEndpoint { url: &DEMO_BGP_STATUS_URL, visibility: Visibility::Public, diff --git a/nexus/tests/integration_tests/switch_port.rs b/nexus/tests/integration_tests/switch_port.rs index e1d52e464a..92c44eddad 100644 --- a/nexus/tests/integration_tests/switch_port.rs +++ b/nexus/tests/integration_tests/switch_port.rs @@ -76,7 +76,7 @@ async fn test_port_settings_basic_crud(ctx: &ControlPlaneTestContext) { NexusRequest::objects_post( client, - "/v1/system/networking/bgp-announce", + "/v1/system/networking/bgp-announce-set", &announce_set, ) .authn_as(AuthnMode::PrivilegedUser) diff --git a/nexus/tests/output/nexus_tags.txt b/nexus/tests/output/nexus_tags.txt index 053f56cf5c..bde11e2de3 100644 --- a/nexus/tests/output/nexus_tags.txt +++ b/nexus/tests/output/nexus_tags.txt @@ -178,9 +178,10 @@ 
networking_allow_list_view                 GET      /v1/system/networking/allow-list
 networking_bfd_disable                     POST     /v1/system/networking/bfd-disable
 networking_bfd_enable                      POST     /v1/system/networking/bfd-enable
 networking_bfd_status                      GET      /v1/system/networking/bfd-status
-networking_bgp_announce_set_delete         DELETE   /v1/system/networking/bgp-announce
-networking_bgp_announce_set_list           GET      /v1/system/networking/bgp-announce
-networking_bgp_announce_set_update         PUT      /v1/system/networking/bgp-announce
+networking_bgp_announce_set_delete         DELETE   /v1/system/networking/bgp-announce-set/{name_or_id}
+networking_bgp_announce_set_list           GET      /v1/system/networking/bgp-announce-set
+networking_bgp_announce_set_update         PUT      /v1/system/networking/bgp-announce-set
+networking_bgp_announcement_list           GET      /v1/system/networking/bgp-announce-set/{name_or_id}/announcement
 networking_bgp_config_create               POST     /v1/system/networking/bgp
 networking_bgp_config_delete               DELETE   /v1/system/networking/bgp
 networking_bgp_config_list                 GET      /v1/system/networking/bgp
diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs
index 82fa01121e..83897cbd1d 100644
--- a/nexus/types/src/external_api/params.rs
+++ b/nexus/types/src/external_api/params.rs
@@ -1629,6 +1629,13 @@ pub struct BgpAnnounceSetCreate {
     pub announcement: Vec<BgpAnnouncementCreate>,
 }

+/// Optionally select a BGP announce set by a name or id.
+#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)]
+pub struct OptionalBgpAnnounceSetSelector {
+    /// A name or id to use when selecting BGP port settings
+    pub name_or_id: Option<NameOrId>,
+}
+
 /// Select a BGP announce set by a name or id.
 #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)]
 pub struct BgpAnnounceSetSelector {
diff --git a/openapi/nexus.json b/openapi/nexus.json
index 1e4113face..285dcd82bb 100644
--- a/openapi/nexus.json
+++ b/openapi/nexus.json
@@ -6533,22 +6533,48 @@
         }
       }
     },
-    "/v1/system/networking/bgp-announce": {
+    "/v1/system/networking/bgp-announce-set": {
       "get": {
         "tags": [
           "system/networking"
         ],
-        "summary": "Get originated routes for a BGP configuration",
+        "summary": "List BGP announce sets",
         "operationId": "networking_bgp_announce_set_list",
         "parameters": [
+          {
+            "in": "query",
+            "name": "limit",
+            "description": "Maximum number of items returned by a single call",
+            "schema": {
+              "nullable": true,
+              "type": "integer",
+              "format": "uint32",
+              "minimum": 1
+            }
+          },
           {
             "in": "query",
             "name": "name_or_id",
             "description": "A name or id to use when selecting BGP port settings",
             "schema": {
               "$ref": "#/components/schemas/NameOrId"
             }
+          },
+          {
+            "in": "query",
+            "name": "page_token",
+            "description": "Token returned by previous call to retrieve the subsequent page",
+            "schema": {
+              "nullable": true,
+              "type": "string"
+            }
+          },
+          {
+            "in": "query",
+            "name": "sort_by",
+            "schema": {
+              "$ref": "#/components/schemas/NameOrIdSortMode"
+            }
           }
         ],
         "responses": {
           "200": {
             "description": "successful operation",
             "content": {
               "application/json": {
                 "schema": {
-                  "title": "Array_of_BgpAnnouncement",
+                  "title": "Array_of_BgpAnnounceSet",
                   "type": "array",
                   "items": {
-                    "$ref": "#/components/schemas/BgpAnnouncement"
+                    "$ref": "#/components/schemas/BgpAnnounceSet"
                   }
                 }
               }
             }
           },
           "4XX": {
             "$ref": "#/components/responses/Error"
           },
           "5XX": {
             "$ref": "#/components/responses/Error"
           }
+        },
+        "x-dropshot-pagination": {
+          "required": []
         }
       },
       "put": {
@@ -6609,7 +6638,9 @@
             "$ref": "#/components/responses/Error"
           }
         }
-      },
+      }
+    },
+    "/v1/system/networking/bgp-announce-set/{name_or_id}": {
       "delete": {
"tags": [ "system/networking" @@ -6618,7 +6649,7 @@ "operationId": "networking_bgp_announce_set_delete", "parameters": [ { - "in": "query", + "in": "path", "name": "name_or_id", "description": "A name or id to use when selecting BGP port settings", "required": true, @@ -6640,6 +6671,48 @@ } } }, + "/v1/system/networking/bgp-announce-set/{name_or_id}/announcement": { + "get": { + "tags": [ + "system/networking" + ], + "summary": "Get originated routes for a specified BGP announce set", + "operationId": "networking_bgp_announcement_list", + "parameters": [ + { + "in": "path", + "name": "name_or_id", + "description": "A name or id to use when selecting BGP port settings", + "required": true, + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Array_of_BgpAnnouncement", + "type": "array", + "items": { + "$ref": "#/components/schemas/BgpAnnouncement" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/v1/system/networking/bgp-exported": { "get": { "tags": [ diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 5e1ba048b6..baef38e44f 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2781,6 +2781,10 @@ CREATE UNIQUE INDEX IF NOT EXISTS lookup_bgp_config_by_name ON omicron.public.bg ) WHERE time_deleted IS NULL; +CREATE INDEX IF NOT EXISTS lookup_bgp_config_by_asn ON omicron.public.bgp_config ( + asn +) WHERE time_deleted IS NULL; + CREATE TABLE IF NOT EXISTS omicron.public.bgp_announce_set ( id UUID PRIMARY KEY, name STRING(63) NOT NULL, @@ -4208,7 +4212,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '89.0.0', NULL) + (TRUE, NOW(), NOW(), '90.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/lookup-bgp-config-by-asn/up01.sql b/schema/crdb/lookup-bgp-config-by-asn/up01.sql new file mode 100644 index 0000000000..e886015a29 --- /dev/null +++ b/schema/crdb/lookup-bgp-config-by-asn/up01.sql @@ -0,0 +1,3 @@ +CREATE INDEX IF NOT EXISTS lookup_bgp_config_by_asn ON omicron.public.bgp_config ( + asn +) WHERE time_deleted IS NULL; From 7c8c2c30304b763d99fc3ded4a31e532082646a8 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 22 Aug 2024 10:20:27 -0700 Subject: [PATCH 91/91] [gateway] Separate enum for SP lookup errors (#6413) Currently, MGS has a `SpCommsError` enum that represents pretty much every error that may occur, including failures to look up a SP's address and communication failures while talking to that SP. This means that almost all of the `ManagementSwitch` and `SingleSp` functions return all of the possible errors of any of those operations. Perhaps more importantly, this means that the *authors* of code that calls these functions cannot easily determine which errors will actually be returned by the functions they're calling. For example, while working on #6354, I had incorrectly believed that the `ManagementSwitch::sp` function, which returns a `SingleSp` for a `SpIdentifier`, could fail if there were transient network issues, and that my code would need to handle this. In fact, it turns out that this function only returns an error if discovery hasn't completed yet, or if the `SpIdentifier` is invalid, both of which are fatal errors (and should all never be returned at the point where my code calls that function). 
Therefore, this branch refactors the error types a bit to separate an
`SpLookupError` from `SpCommsError`, and changes the `ManagementSwitch`
functions that only return errors if discovery hasn't completed
successfully or an `SpIdentifier` is invalid to return an
`SpLookupError` instead. The `SpCommsError` type has a `Discovery`
variant which is produced from an `SpLookupError`, so this is actually a
fairly minimal change overall: functions returning an `SpCommsError` via
the `?` operator can continue doing so exactly as they do today.

I'd like to merge this before #6354, so that I can use it to clean up
some error handling code in that branch.
---
 gateway/src/error.rs             | 56 ++++++++++++++++++++------------
 gateway/src/management_switch.rs | 19 ++++++-----
 2 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/gateway/src/error.rs b/gateway/src/error.rs
index 5933daa340..ee148e0c98 100644
--- a/gateway/src/error.rs
+++ b/gateway/src/error.rs
@@ -26,12 +26,8 @@ pub enum StartupError {
 
 #[derive(Debug, Error, SlogInlineError)]
 pub enum SpCommsError {
-    #[error("discovery process not yet complete")]
-    DiscoveryNotYetComplete,
-    #[error("location discovery failed: {reason}")]
-    DiscoveryFailed { reason: String },
-    #[error("nonexistent SP {0:?}")]
-    SpDoesNotExist(SpIdentifier),
+    #[error(transparent)]
+    Discovery(#[from] SpLookupError),
     #[error("unknown socket address for SP {0:?}")]
     SpAddressUnknown(SpIdentifier),
     #[error(
@@ -52,13 +48,22 @@ pub enum SpCommsError {
     },
 }
 
+/// Errors returned by attempts to look up an SP in the management switch's
+/// discovery map.
+#[derive(Debug, Error, SlogInlineError)]
+pub enum SpLookupError {
+    #[error("discovery process not yet complete")]
+    DiscoveryNotYetComplete,
+    #[error("location discovery failed: {reason}")]
+    DiscoveryFailed { reason: String },
+    #[error("nonexistent SP {0:?}")]
+    SpDoesNotExist(SpIdentifier),
+}
+
 impl From<SpCommsError> for HttpError {
     fn from(error: SpCommsError) -> Self {
         match error {
-            SpCommsError::SpDoesNotExist(_) => HttpError::for_bad_request(
-                Some("InvalidSp".to_string()),
-                InlineErrorChain::new(&error).to_string(),
-            ),
+            SpCommsError::Discovery(err) => HttpError::from(err),
             SpCommsError::SpCommunicationFailed {
                 err:
                     CommunicationError::SpError(
@@ -124,21 +129,11 @@ impl From<SpCommsError> for HttpError {
                 "UpdateInProgress",
                 InlineErrorChain::new(&error).to_string(),
             ),
-            SpCommsError::DiscoveryNotYetComplete => http_err_with_message(
-                http::StatusCode::SERVICE_UNAVAILABLE,
-                "DiscoveryNotYetComplete",
-                InlineErrorChain::new(&error).to_string(),
-            ),
             SpCommsError::SpAddressUnknown(_) => http_err_with_message(
                 http::StatusCode::SERVICE_UNAVAILABLE,
                 "SpAddressUnknown",
                 InlineErrorChain::new(&error).to_string(),
             ),
-            SpCommsError::DiscoveryFailed { .. } => http_err_with_message(
-                http::StatusCode::SERVICE_UNAVAILABLE,
-                "DiscoveryFailed ",
-                InlineErrorChain::new(&error).to_string(),
-            ),
             SpCommsError::Timeout { .. } => http_err_with_message(
                 http::StatusCode::SERVICE_UNAVAILABLE,
                 "Timeout ",
                 InlineErrorChain::new(&error).to_string(),
@@ -160,6 +155,27 @@ impl From<SpCommsError> for HttpError {
         }
     }
 }
 
+impl From<SpLookupError> for HttpError {
+    fn from(error: SpLookupError) -> Self {
+        match error {
+            SpLookupError::SpDoesNotExist(_) => HttpError::for_bad_request(
+                Some("InvalidSp".to_string()),
+                InlineErrorChain::new(&error).to_string(),
+            ),
+            SpLookupError::DiscoveryNotYetComplete => http_err_with_message(
+                http::StatusCode::SERVICE_UNAVAILABLE,
+                "DiscoveryNotYetComplete",
+                InlineErrorChain::new(&error).to_string(),
+            ),
+            SpLookupError::DiscoveryFailed { .. } => http_err_with_message(
+                http::StatusCode::SERVICE_UNAVAILABLE,
+                "DiscoveryFailed ",
+                InlineErrorChain::new(&error).to_string(),
+            ),
+        }
+    }
+}
+
 // Helper function to return an `HttpError` with the same internal and external
 // message. MGS is an "internal" service - even when we return a 500-level
 // status code, we want to give our caller some information about what is going
diff --git a/gateway/src/management_switch.rs b/gateway/src/management_switch.rs
index a93c44d62c..23dfbe01a8 100644
--- a/gateway/src/management_switch.rs
+++ b/gateway/src/management_switch.rs
@@ -20,6 +20,7 @@ pub use self::location_map::SwitchPortConfig;
 pub use self::location_map::SwitchPortDescription;
 use self::location_map::ValidatedLocationConfig;
 use crate::error::SpCommsError;
+use crate::error::SpLookupError;
 use crate::error::StartupError;
 use gateway_messages::IgnitionState;
 use gateway_sp_comms::default_discovery_addr;
@@ -316,18 +317,18 @@ impl ManagementSwitch {
         self.location_map.get().is_some()
     }
 
-    fn location_map(&self) -> Result<&LocationMap, SpCommsError> {
+    fn location_map(&self) -> Result<&LocationMap, SpLookupError> {
         let discovery_result = self
             .location_map
             .get()
-            .ok_or(SpCommsError::DiscoveryNotYetComplete)?;
+            .ok_or(SpLookupError::DiscoveryNotYetComplete)?;
         discovery_result
             .as_ref()
-            .map_err(|s| SpCommsError::DiscoveryFailed { reason: s.clone() })
+            .map_err(|s| SpLookupError::DiscoveryFailed { reason: s.clone() })
     }
 
     /// Get the identifier of our local switch.
-    pub fn local_switch(&self) -> Result<SpIdentifier, SpCommsError> {
+    pub fn local_switch(&self) -> Result<SpIdentifier, SpLookupError> {
         let location_map = self.location_map()?;
         Ok(location_map.port_to_id(self.local_ignition_controller_port))
     }
@@ -347,11 +348,11 @@ impl ManagementSwitch {
     /// This method will fail if discovery is not yet complete (i.e., we don't
     /// know the logical identifiers of any SP yet!) or if `id` specifies an SP
     /// that doesn't exist in our discovered location map.
-    fn get_port(&self, id: SpIdentifier) -> Result<SwitchPort, SpCommsError> {
+    fn get_port(&self, id: SpIdentifier) -> Result<SwitchPort, SpLookupError> {
         let location_map = self.location_map()?;
         let port = location_map
             .id_to_port(id)
-            .ok_or(SpCommsError::SpDoesNotExist(id))?;
+            .ok_or(SpLookupError::SpDoesNotExist(id))?;
         Ok(port)
     }
 
@@ -362,7 +363,7 @@ impl ManagementSwitch {
     /// This method will fail if discovery is not yet complete (i.e., we don't
     /// know the logical identifiers of any SP yet!) or if `id` specifies an SP
     /// that doesn't exist in our discovered location map.
-    pub fn sp(&self, id: SpIdentifier) -> Result<&SingleSp, SpCommsError> {
+    pub fn sp(&self, id: SpIdentifier) -> Result<&SingleSp, SpLookupError> {
         let port = self.get_port(id)?;
         Ok(self.port_to_sp(port))
     }
@@ -377,7 +378,7 @@ impl ManagementSwitch {
     pub fn ignition_target(
         &self,
         id: SpIdentifier,
-    ) -> Result<u8, SpCommsError> {
+    ) -> Result<u8, SpLookupError> {
         let port = self.get_port(id)?;
         Ok(self.port_to_ignition_target[port.0])
     }
@@ -389,7 +390,7 @@ impl ManagementSwitch {
     /// therefore can't map our switch ports to SP identities).
     pub(crate) fn all_sps(
         &self,
-    ) -> Result<impl Iterator<Item = (SpIdentifier, &SingleSp)>, SpCommsError>
+    ) -> Result<impl Iterator<Item = (SpIdentifier, &SingleSp)>, SpLookupError>
     {
         let location_map = self.location_map()?;
         Ok(location_map