From a193490ca7418099e2d8f342aac77d414d1c160f Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Mon, 12 Aug 2024 16:19:04 -0400 Subject: [PATCH 01/13] Oxide rot 1 v1.0.13 (#6295) --- tools/permslip_production | 2 +- tools/permslip_staging | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/permslip_production b/tools/permslip_production index 5e9b76f980..4f82e4d6ed 100644 --- a/tools/permslip_production +++ b/tools/permslip_production @@ -1,2 +1,2 @@ -905d38cb8298c72ecac5cf31f792919fbcd69a4ad656c40e53b3ce2d80140111 manifest-oxide-rot-1-v1.0.12.toml +55336a274d0f100d5ef51cb653ec285b651eaba139c35a533e300e6d7d46032c manifest-oxide-rot-1-v1.0.13.toml 74e754e68705cf6fed4152a92bc1ee9667d1d98a21fc12993a2232dbe34bfccb manifest-bootleby-v1.3.0.toml diff --git a/tools/permslip_staging b/tools/permslip_staging index 2d0603c3d8..d886cc4246 100644 --- a/tools/permslip_staging +++ b/tools/permslip_staging @@ -1,5 +1,5 @@ 34cf117633f82cc8f665dc3b6c78dc2aff61ca87d2b2687290605080265dda30 manifest-gimlet-v1.0.23.toml -201ff5580bb4b0b01419d7c5e580af9926103e2b6d3024e6b49cee6fab415519 manifest-oxide-rot-1-v1.0.12.toml +85553dd164933a9b9e4f22409abd1190b1d632d192b5f7527129acaa778a671a manifest-oxide-rot-1-v1.0.13.toml db995edfe91959df3cb20ea8156f75b9dcff5ec5e77f98a28766617a8ed2e0c5 manifest-psc-v1.0.22.toml 26b6096a377edb3d7da50b1b499af104e6195bc7c7c6eb1b2751b32434d7ac9e manifest-sidecar-v1.0.23.toml c0fecaefac7674138337f3bd4ce4ce5b884053dead5ec27b575701471631ea2f manifest-bootleby-v1.3.0.toml From 9c04ddc5246d2102adb5db233a43023fd94be2d4 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Mon, 12 Aug 2024 13:49:59 -0700 Subject: [PATCH 02/13] Update Rust crate syn to v2.0.74 (#6288) --- Cargo.lock | 154 +++++++++++++++++++------------------- Cargo.toml | 2 +- workspace-hack/Cargo.toml | 4 +- 3 files changed, 80 insertions(+), 80 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index eae64d1728..74a7405e57 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -166,7 +166,7 @@ dependencies = [ "omicron-workspace-hack", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -274,7 +274,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -296,7 +296,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -307,7 +307,7 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -360,7 +360,7 @@ dependencies = [ "quote", "serde", "serde_tokenstream", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -518,7 +518,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.72", + "syn 2.0.74", "which", ] @@ -551,7 +551,7 @@ checksum = "1657dce144574f921af10a92876a96f0ca05dd830900598d21d91c8e4cf78f74" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1099,7 +1099,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1643,7 +1643,7 @@ checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1667,7 +1667,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.72", + 
"syn 2.0.74", ] [[package]] @@ -1678,7 +1678,7 @@ checksum = "733cabb43482b1a1b53eee8583c2b9e8684d592215ea83efd305dd31bc2f0178" dependencies = [ "darling_core", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1733,7 +1733,7 @@ dependencies = [ "quote", "serde", "serde_tokenstream", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1777,7 +1777,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1810,7 +1810,7 @@ checksum = "5fe87ce4529967e0ba1dcf8450bab64d97dfd5010a6256187ffe2e43e6f0e049" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1831,7 +1831,7 @@ checksum = "62d671cc41a825ebabc75757b62d3d168c577f9149b2d49ece1dad1f72119d25" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1852,7 +1852,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1862,7 +1862,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b" dependencies = [ "derive_builder_core", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1875,7 +1875,7 @@ dependencies = [ "proc-macro2", "quote", "rustc_version 0.4.0", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1940,7 +1940,7 @@ dependencies = [ "dsl_auto_type", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1949,7 +1949,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25" dependencies = [ - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -2219,7 +2219,7 @@ dependencies = [ "quote", "serde", "serde_tokenstream", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -2233,7 +2233,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -2640,7 +2640,7 @@ checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -2752,7 +2752,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3750,7 +3750,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b23a0c8dfe501baac4adf6ebbfa6eddf8f0c07f56b058cc1288017e32397846c" dependencies = [ "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -4062,7 +4062,7 @@ version = "0.1.0" source = "git+https://github.com/oxidecomputer/opte?rev=3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d#3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d" dependencies = [ "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -4583,7 +4583,7 @@ dependencies = [ "cfg-if", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -4964,7 +4964,7 @@ dependencies = [ "omicron-workspace-hack", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -5210,7 +5210,7 @@ version = "0.1.0" dependencies = [ "omicron-workspace-hack", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -5381,7 +5381,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -6258,7 +6258,7 @@ dependencies = [ "string_cache", "subtle", "syn 1.0.109", - "syn 2.0.72", + "syn 2.0.74", "time", "time-macros", "tokio", @@ 
-6406,7 +6406,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -6557,7 +6557,7 @@ dependencies = [ "oximeter-macro-impl", "oximeter-timeseries-macro", "prettyplease", - "syn 2.0.72", + "syn 2.0.74", "toml 0.8.19", "uuid", ] @@ -6706,7 +6706,7 @@ dependencies = [ "serde_json", "slog-error-chain", "strum", - "syn 2.0.72", + "syn 2.0.74", "thiserror", "toml 0.8.19", "trybuild", @@ -6745,7 +6745,7 @@ dependencies = [ "omicron-workspace-hack", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -6781,7 +6781,7 @@ dependencies = [ "oximeter-impl", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -6937,7 +6937,7 @@ dependencies = [ "regex", "regex-syntax 0.8.4", "structmeta 0.3.0", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -7105,7 +7105,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -7175,7 +7175,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -7439,7 +7439,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f12335488a2f3b0a83b14edad48dca9879ce89b2edd10e80237e4e852dd645e" dependencies = [ "proc-macro2", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -7535,7 +7535,7 @@ dependencies = [ "schemars", "serde", "serde_json", - "syn 2.0.72", + "syn 2.0.74", "thiserror", "typify", "unicode-ident", @@ -7555,7 +7555,7 @@ dependencies = [ "serde_json", "serde_tokenstream", "serde_yaml", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -8047,7 +8047,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -8301,7 +8301,7 @@ dependencies = [ "regex", "relative-path", "rustc_version 0.4.0", - "syn 2.0.72", + "syn 2.0.74", "unicode-ident", ] @@ -8693,7 +8693,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -8719,7 +8719,7 @@ checksum = "7f81c2fde025af7e69b1d1420531c8a8811ca898919db177141a85313b1cb932" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -8839,7 +8839,7 @@ checksum = "fabfb6138d2383ea8208cf98ccf69cdfb1aff4088460681d84189aa259762f97" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -8850,7 +8850,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -8901,7 +8901,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -8922,7 +8922,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -8964,7 +8964,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9334,7 +9334,7 @@ source = "git+https://github.com/oxidecomputer/slog-error-chain?branch=main#15f6 dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9461,7 +9461,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9559,7 +9559,7 @@ checksum = 
"01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9645,7 +9645,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive 0.2.0", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9657,7 +9657,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive 0.3.0", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9668,7 +9668,7 @@ checksum = "a60bcaff7397072dca0017d1db428e30d5002e00b6847703e2e42005c95fbe00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9679,7 +9679,7 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9714,7 +9714,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9727,7 +9727,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9774,9 +9774,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.72" +version = "2.0.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc4b9b9bf2add8093d3f2c0204471e951b2285580335de42f9d2534f3ae7a8af" +checksum = "1fceb41e3d546d0bd83421d3409b1460cc7444cd389341a4c880fe7a042cb3d7" dependencies = [ "proc-macro2", "quote", @@ -9950,7 +9950,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta 0.2.0", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -9981,7 +9981,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -10118,7 +10118,7 @@ checksum = "8d9ef545650e79f30233c0003bcc2504d7efac6dad25fca40744de773fe2049c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -10189,7 +10189,7 @@ checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -10466,7 +10466,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -10743,7 +10743,7 @@ dependencies = [ "semver 1.0.23", "serde", "serde_json", - "syn 2.0.72", + "syn 2.0.74", "thiserror", "unicode-ident", ] @@ -10760,7 +10760,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.72", + "syn 2.0.74", "typify-impl", ] @@ -10977,7 +10977,7 @@ dependencies = [ "proc-macro2", "quote", "serde_tokenstream", - "syn 2.0.72", + "syn 2.0.74", "usdt-impl", ] @@ -10995,7 +10995,7 @@ dependencies = [ "quote", "serde", "serde_json", - "syn 2.0.72", + "syn 2.0.74", "thiserror", "thread-id", "version_check", @@ -11011,7 +11011,7 @@ dependencies = [ "proc-macro2", "quote", "serde_tokenstream", - "syn 2.0.72", + "syn 2.0.74", "usdt-impl", ] @@ -11190,7 +11190,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", "wasm-bindgen-shared", ] @@ -11224,7 +11224,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -11848,7 +11848,7 @@ checksum = "125139de3f6b9d625c39e2efdd73d41bdac468ccd556556440e322be0e1bbd91" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -11859,7 
+11859,7 @@ checksum = "15e934569e47891f7d9411f1a451d947a60e000ab3bd24fbb970f000387d1b3b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -11879,7 +11879,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 326a7a285e..cf78ea4a79 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -267,7 +267,7 @@ anstyle = "1.0.8" api_identity = { path = "api_identity" } approx = "0.5.1" assert_matches = "1.5.0" -assert_cmd = "2.0.15" +assert_cmd = "2.0.16" async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", rev = "ed7ab5ef0513ba303d33efd41d3e9e381169d59b" } async-trait = "0.1.81" atomicwrites = "0.4.3" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 470cd1d621..f5562a279f 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -100,7 +100,7 @@ smallvec = { version = "1.13.2", default-features = false, features = ["const_ne spin = { version = "0.9.8" } string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } -syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.72", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } +syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.74", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time = { version = "0.3.36", features = ["formatting", "local-offset", "macros", "parsing"] } tokio = { version = "1.38.1", features = ["full", "test-util"] } tokio-postgres = { version = "0.7.11", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } @@ -204,7 +204,7 @@ spin = { version = "0.9.8" } string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } syn-dff4ba8e3ae991db = { package = "syn", version = "1.0.109", features = ["extra-traits", "fold", "full", "visit"] } -syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.72", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } +syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.74", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time = { version = "0.3.36", features = ["formatting", "local-offset", "macros", "parsing"] } time-macros = { version = "0.2.18", default-features = false, features = ["formatting", "parsing"] } tokio = { version = "1.38.1", features = ["full", "test-util"] } From 914f5fd7d51f9b060dcc0382a30b607e25df49b2 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 12 Aug 2024 14:49:41 -0700 Subject: [PATCH 03/13] add internal demo saga (#6281) --- clients/nexus-client/src/lib.rs | 1 + dev-tools/omdb/src/bin/omdb/nexus.rs | 147 ++++++++++++++++ dev-tools/omdb/tests/env.out | 16 ++ dev-tools/omdb/tests/successes.out | 35 ++++ dev-tools/omdb/tests/test_all_output.rs | 23 +++ dev-tools/omdb/tests/usage_errors.out | 38 ++++ docs/demo-saga.adoc | 195 +++++++++++++++++++++ nexus/internal-api/src/lib.rs | 36 +++- nexus/src/app/mod.rs | 27 ++- nexus/src/app/saga.rs | 24 ++- nexus/src/app/sagas/demo.rs | 135 ++++++++++++++ nexus/src/app/sagas/mod.rs | 2 + nexus/src/internal_api/http_entrypoints.rs | 35 ++++ nexus/tests/integration_tests/demo_saga.rs | 74 ++++++++ nexus/tests/integration_tests/mod.rs | 1 + nexus/types/src/internal_api/views.rs | 8 + openapi/nexus-internal.json | 74 ++++++++ uuid-kinds/src/lib.rs | 1 + 18 files changed, 864 insertions(+), 8 deletions(-) create mode 100644 docs/demo-saga.adoc create mode 100644 
nexus/src/app/sagas/demo.rs create mode 100644 nexus/tests/integration_tests/demo_saga.rs diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index b7722144fe..62366c45e1 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -42,6 +42,7 @@ progenitor::generate_api!( OmicronPhysicalDisksConfig = nexus_types::disk::OmicronPhysicalDisksConfig, RecoverySiloConfig = nexus_sled_agent_shared::recovery_silo::RecoverySiloConfig, TypedUuidForCollectionKind = omicron_uuid_kinds::CollectionUuid, + TypedUuidForDemoSagaKind = omicron_uuid_kinds::DemoSagaUuid, TypedUuidForDownstairsKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::DownstairsKind>, TypedUuidForPropolisKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::PropolisKind>, TypedUuidForSledKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::SledKind>, diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index ec3e519cbc..9aae6b2205 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -25,6 +25,7 @@ use nexus_client::types::BackgroundTasksActivateRequest; use nexus_client::types::CurrentStatus; use nexus_client::types::LastResult; use nexus_client::types::PhysicalDiskPath; +use nexus_client::types::SagaState; use nexus_client::types::SledSelector; use nexus_client::types::UninitializedSledId; use nexus_db_queries::db::lookup::LookupPath; @@ -34,6 +35,7 @@ use nexus_types::internal_api::background::LookupRegionPortStatus; use nexus_types::internal_api::background::RegionReplacementDriverStatus; use nexus_types::inventory::BaseboardId; use omicron_uuid_kinds::CollectionUuid; +use omicron_uuid_kinds::DemoSagaUuid; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; @@ -71,6 +73,8 @@ enum NexusCommands { BackgroundTasks(BackgroundTasksArgs), /// interact with blueprints Blueprints(BlueprintsArgs), + /// view sagas, create and complete demo sagas + Sagas(SagasArgs), /// interact with sleds Sleds(SledsArgs), } @@ -244,6 +248,36 @@ struct BlueprintImportArgs { input: Utf8PathBuf, } +#[derive(Debug, Args)] +struct SagasArgs { + #[command(subcommand)] + command: SagasCommands, +} + +#[derive(Debug, Subcommand)] +enum SagasCommands { + /// List sagas run by this Nexus + /// + /// Note that this is reporting in-memory state about sagas run by *this* + /// Nexus instance. You'll get different answers if you ask different Nexus + /// instances. List, + + /// Create a "demo" saga + /// + /// This saga will wait until it's explicitly completed using the + /// "demo-complete" subcommand. + DemoCreate, + + /// Complete a demo saga started with "demo-create". + DemoComplete(DemoSagaIdArgs), +} + +#[derive(Debug, Args)] +struct DemoSagaIdArgs { + demo_saga_id: DemoSagaUuid, +} + #[derive(Debug, Args)] struct SledsArgs { #[command(subcommand)] @@ -402,6 +436,34 @@ impl NexusArgs { cmd_nexus_blueprints_import(&client, token, args).await } + NexusCommands::Sagas(SagasArgs { command }) => { + if self.nexus_internal_url.is_none() { + eprintln!( + "{}", + textwrap::wrap( + "WARNING: A Nexus instance was selected from DNS \ because a specific one was not specified. 
But \ + the `omdb nexus sagas` commands usually only make \ + sense when targeting a specific Nexus instance.", + 80 + ) + .join("\n") + ); + } + match command { + SagasCommands::List => cmd_nexus_sagas_list(&client).await, + SagasCommands::DemoCreate => { + let token = omdb.check_allow_destructive()?; + cmd_nexus_sagas_demo_create(&client, token).await + } + SagasCommands::DemoComplete(args) => { + let token = omdb.check_allow_destructive()?; + cmd_nexus_sagas_demo_complete(&client, args, token) + .await + } + } + } + NexusCommands::Sleds(SledsArgs { command: SledsCommands::ListUninitialized, }) => cmd_nexus_sleds_list_uninitialized(&client).await, @@ -1626,6 +1688,91 @@ async fn cmd_nexus_blueprints_import( Ok(()) } +/// Runs `omdb nexus sagas list` +async fn cmd_nexus_sagas_list( + client: &nexus_client::Client, +) -> Result<(), anyhow::Error> { + // We don't want users to confuse this with a general way to list all sagas. + // Such a command would read database state and it would go under "omdb db". + eprintln!( + "{}", + textwrap::wrap( + "NOTE: This command only reads in-memory state from the targeted \ + Nexus instance. Sagas may be missing if they were run by a \ + different Nexus instance or if they finished before this Nexus \ + instance last started up.", + 80 + ) + .join("\n") + ); + + let saga_stream = client.saga_list_stream(None, None); + let sagas = + saga_stream.try_collect::>().await.context("listing sagas")?; + + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct SagaRow { + saga_id: Uuid, + state: &'static str, + } + let rows = sagas.into_iter().map(|saga| SagaRow { + saga_id: saga.id, + state: match saga.state { + SagaState::Running => "running", + SagaState::Succeeded => "succeeded", + SagaState::Failed { .. } => "failed", + SagaState::Stuck { .. } => "stuck", + }, + }); + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + println!("{}", table); + Ok(()) +} + +/// Runs `omdb nexus sagas demo-create` +async fn cmd_nexus_sagas_demo_create( + client: &nexus_client::Client, + _destruction_token: DestructiveOperationToken, +) -> Result<(), anyhow::Error> { + let demo_saga = + client.saga_demo_create().await.context("creating demo saga")?; + println!("saga id: {}", demo_saga.saga_id); + println!( + "demo saga id: {} (use this with `demo-complete`)", + demo_saga.demo_saga_id, + ); + Ok(()) +} + +/// Runs `omdb nexus sagas demo-complete` +async fn cmd_nexus_sagas_demo_complete( + client: &nexus_client::Client, + args: &DemoSagaIdArgs, + _destruction_token: DestructiveOperationToken, +) -> Result<(), anyhow::Error> { + if let Err(error) = client + .saga_demo_complete(&args.demo_saga_id) + .await + .context("completing demo saga") + { + eprintln!("error: {:#}", error); + eprintln!( + "note: `demo-complete` must be run against the same Nexus \ + instance that is currently running that saga." + ); + eprintln!( + "note: Be sure that you're using the demo_saga_id, not the saga_id." 
+ ); + Err(error) + } else { + Ok(()) + } +} + /// Runs `omdb nexus sleds list-uninitialized` async fn cmd_nexus_sleds_list_uninitialized( client: &nexus_client::Client, diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 67f113a801..5755df9488 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -472,6 +472,22 @@ note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=d note: database schema version matches expected () note: listing all commissioned sleds (use -F to filter, e.g. -F in-service) ============================================= +EXECUTING COMMAND: omdb ["nexus", "sagas", "list"] +termination: Exited(0) +--------------------------------------------- +stdout: +SAGA_ID STATE +--------------------------------------------- +stderr: +note: Nexus URL not specified. Will pick one from DNS. +note: using Nexus URL http://[::ffff:127.0.0.1]:REDACTED_PORT +WARNING: A Nexus instance was selected from DNS because a specific one was not +specified. But the `omdb nexus sagas` commands usually only make sense when +targeting a specific Nexus instance. +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +============================================= EXECUTING COMMAND: omdb ["oximeter", "--oximeter-url", "junk", "list-producers"] termination: Exited(1) --------------------------------------------- diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index d4c07899f4..66f07cb2f0 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -625,6 +625,41 @@ warning: unknown background task: "vpc_route_manager" (don't know how to interpr stderr: note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ ============================================= +EXECUTING COMMAND: omdb ["nexus", "sagas", "list"] +termination: Exited(0) +--------------------------------------------- +stdout: +SAGA_ID STATE +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +============================================= +EXECUTING COMMAND: omdb ["--destructive", "nexus", "sagas", "demo-create"] +termination: Exited(0) +--------------------------------------------- +stdout: +saga id: ..................... +demo saga id: ..................... (use this with `demo-complete`) +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +============================================= +EXECUTING COMMAND: omdb ["nexus", "sagas", "list"] +termination: Exited(0) +--------------------------------------------- +stdout: +SAGA_ID STATE +..................... running +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. 
+============================================= EXECUTING COMMAND: omdb ["--destructive", "nexus", "background-tasks", "activate", "inventory_collection"] termination: Exited(0) --------------------------------------------- diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index 6a959d726a..d0258aeaed 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -81,6 +81,16 @@ async fn test_omdb_usage_errors() { &["nexus"], &["nexus", "background-tasks"], &["nexus", "blueprints"], + &["nexus", "sagas"], + // Missing "--destructive" flag. The URL is bogus but just ensures that + // we get far enough to hit the error we care about. + &[ + "nexus", + "--nexus-internal-url", + "http://[::1]:111", + "sagas", + "demo-create", + ], &["nexus", "sleds"], &["sled-agent"], &["sled-agent", "zones"], @@ -134,6 +144,9 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { &["mgs", "inventory"], &["nexus", "background-tasks", "doc"], &["nexus", "background-tasks", "show"], + &["nexus", "sagas", "list"], + &["--destructive", "nexus", "sagas", "demo-create"], + &["nexus", "sagas", "list"], &[ "--destructive", "nexus", @@ -326,6 +339,16 @@ async fn test_omdb_env_settings(cptestctx: &ControlPlaneTestContext) { let args = &["--dns-server", &dns_sockaddr.to_string(), "db", "sleds"]; do_run(&mut output, move |exec| exec, &cmd_path, args).await; + // That said, the "sagas" command prints an extra warning in this case. + let args = &["nexus", "sagas", "list"]; + do_run( + &mut output, + move |exec| exec.env("OMDB_DNS_SERVER", &dns_sockaddr.to_string()), + &cmd_path, + args, + ) + .await; + // Case: specified in multiple places (command-line argument wins) let args = &["oximeter", "--oximeter-url", "junk", "list-producers"]; let ox = ox_url.clone(); diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index ca70412d84..1ee07410bf 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -446,6 +446,7 @@ Usage: omdb nexus [OPTIONS] Commands: background-tasks print information about background tasks blueprints interact with blueprints + sagas view sagas, create and complete demo sagas sleds interact with sleds help Print this message or the help of the given subcommand(s) @@ -522,6 +523,43 @@ Connection Options: Safety Options: -w, --destructive Allow potentially-destructive subcommands ============================================= +EXECUTING COMMAND: omdb ["nexus", "sagas"] +termination: Exited(2) +--------------------------------------------- +stdout: +--------------------------------------------- +stderr: +view sagas, create and complete demo sagas + +Usage: omdb nexus sagas [OPTIONS] + +Commands: + list List sagas run by this Nexus + demo-create Create a "demo" saga + demo-complete Complete a demo saga started with "demo-create" + help Print this message or the help of the given subcommand(s) + +Options: + --log-level log level filter [env: LOG_LEVEL=] [default: warn] + -h, --help Print help + +Connection Options: + --nexus-internal-url URL of the Nexus internal API [env: + OMDB_NEXUS_URL=] + --dns-server [env: OMDB_DNS_SERVER=] + +Safety Options: + -w, --destructive Allow potentially-destructive subcommands +============================================= +EXECUTING COMMAND: omdb ["nexus", "--nexus-internal-url", "http://[::1]:111", "sagas", "demo-create"] +termination: Exited(1) +--------------------------------------------- 
+stdout: +--------------------------------------------- +stderr: +note: using Nexus URL http://[::1]:111 +Error: This command is potentially destructive. Pass the `-w` / `--destructive` flag to allow it. +============================================= EXECUTING COMMAND: omdb ["nexus", "sleds"] termination: Exited(2) --------------------------------------------- diff --git a/docs/demo-saga.adoc b/docs/demo-saga.adoc new file mode 100644 index 0000000000..316050fc23 --- /dev/null +++ b/docs/demo-saga.adoc @@ -0,0 +1,195 @@ +:showtitle: +:numbered: +:toc: left + += Demo saga + +Nexus ships with a "demo" saga that can be used to interactively experiment with sagas, saga recovery, and saga transfer (after Nexus zone expungement). The demo saga consists of a single action that blocks until it's instructed to proceed. You instruct it to proceed using a request to the Nexus _internal_ API. + +In the example below, we'll: + +. Use `omicron-dev run-all` to run a simulated control plane stack +. Start a second Nexus whose execution we can control precisely +. Use the `omdb nexus sagas demo-create` command to kick off a demo saga +. Use the `omdb nexus sagas demo-complete` command to instruct that saga to finish + +For steps 1-2, we're just following the https://github.com/oxidecomputer/omicron/blob/main/docs/how-to-run-simulated.adoc#using-both-omicron-dev-run-all-and-running-nexus-manually[docs for running a simulated stack and a second Nexus]. First, run `omicron-dev run-all`: + +```terminal +$ cargo xtask omicron-dev run-all +... +omicron-dev: setting up all services ... +log file: /dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.0.log +note: configured to log to "/dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.0.log" +DB URL: postgresql://root@[::1]:43428/omicron?sslmode=disable +DB address: [::1]:43428 +log file: /dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.2.log +note: configured to log to "/dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.2.log" +log file: /dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.3.log +note: configured to log to "/dangerzone/omicron_tmp/omicron-dev-omicron-dev.7162.3.log" +omicron-dev: services are running. +omicron-dev: nexus external API: 127.0.0.1:12220 +omicron-dev: nexus internal API: [::1]:12221 +omicron-dev: cockroachdb pid: 7166 +omicron-dev: cockroachdb URL: postgresql://root@[::1]:43428/omicron?sslmode=disable +omicron-dev: cockroachdb directory: /dangerzone/omicron_tmp/.tmpkzPi6h +omicron-dev: internal DNS HTTP: http://[::1]:55952 +omicron-dev: internal DNS: [::1]:36474 +omicron-dev: external DNS name: oxide-dev.test +omicron-dev: external DNS HTTP: http://[::1]:64396 +omicron-dev: external DNS: [::1]:35977 +omicron-dev: e.g. `dig @::1 -p 35977 test-suite-silo.sys.oxide-dev.test` +omicron-dev: management gateway: http://[::1]:33325 (switch0) +omicron-dev: management gateway: http://[::1]:61144 (switch1) +omicron-dev: silo name: test-suite-silo +omicron-dev: privileged user name: test-privileged +``` + +Then follow those docs to configure and start a second Nexus: + +```terminal +$ cargo run --bin=nexus -- config-second.toml +... +Aug 12 20:16:25.405 INFO listening, local_addr: [::1]:12223, component: dropshot_internal, name: a4ef738a-1fb0-47b1-9da2-4919c7ec7c7f, file: /home/dap/.cargo/git/checkouts/dropshot-a4a923d29dccc492/52d900a/dropshot/src/server.rs:205 +... 
+``` + +The rest of these instructions will use `omdb` pointed at the second Nexus instance, so we'll set OMDB_NEXUS_URL in the environment: + +```terminal +$ export OMDB_NEXUS_URL=http://[::1]:12223 +``` + +Now we can use `omdb nexus sagas list` to list the sagas that have run _in that second Nexus process_ only: + +```terminal +$ cargo run --bin=omdb -- nexus sagas list +... +note: using Nexus URL http://[::1]:12223 +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +SAGA_ID STATE +``` + +Now we can create a demo saga: + +```terminal +$ cargo run --bin=omdb -- --destructive nexus sagas demo-create +... +note: using Nexus URL http://[::1]:12223 +saga id: f7765d6a-6e45-4c13-8904-2677b79a97eb +demo saga id: 88eddf09-dda3-4d70-8d99-1d3b441c57da (use this with `demo-complete`) +``` + +We have to use the `--destructive` option because this command by nature changes state in Nexus and `omdb` won't allow commands that change state by default. + +We can see the new saga in the list of sagas now. It's running: + +```terminal +$ cargo run --bin=omdb -- nexus sagas list +... +note: using Nexus URL http://[::1]:12223 +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +SAGA_ID STATE +f7765d6a-6e45-4c13-8904-2677b79a97eb running +``` + +and it will stay running indefinitely until we run `demo-complete`. Let's do that: + +```terminal +$ cargo run --bin=omdb -- --destructive nexus sagas demo-complete 88eddf09-dda3-4d70-8d99-1d3b441c57da +... +note: using Nexus URL http://[::1]:12223 +``` + +and then list sagas again: + +```terminal +$ cargo run --bin=omdb -- nexus sagas list +... +note: using Nexus URL http://[::1]:12223 +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +SAGA_ID STATE +f7765d6a-6e45-4c13-8904-2677b79a97eb succeeded +``` + +It works across recovery, too. You can go through the same loop again, but this time kill Nexus and start it again: + +```terminal +$ cargo run --bin=omdb -- --destructive nexus sagas demo-create +... +note: using Nexus URL http://[::1]:12223 +saga id: 65253cb6-4428-4aa7-9afc-bf9b42166cb5 +demo saga id: 208ebc89-acc6-42d3-9f40-7f5567c8a39b (use this with `demo-complete`) +``` + +Now restart Nexus (^C the second invocation and run it again). Now if we use `omdb` we don't see the earlier saga because it was finished when this new Nexus process started. But we see the one we created later because it was recovered: + +```terminal +$ cargo run --bin=omdb -- nexus sagas list +... +note: using Nexus URL http://[::1]:12223 +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +SAGA_ID STATE +65253cb6-4428-4aa7-9afc-bf9b42166cb5 running +``` + +Side note: we can see it was recovered: + +```terminal +$ cargo run --bin=omdb -- nexus background-tasks show +... 
+task: "saga_recovery" + configured period: every 10m + currently executing: no + last completed activation: iter 1, triggered by a periodic timer firing + started at 2024-08-12T20:20:41.714Z (44s ago) and ran for 79ms + since Nexus started: + sagas recovered: 1 + sagas recovery errors: 0 + sagas observed started: 0 + sagas inferred finished: 0 + missing from SEC: 0 + bad state in SEC: 0 + last pass: + found sagas: 1 (in-progress, assigned to this Nexus) + recovered: 1 (successfully) + failed: 0 + skipped: 0 (already running) + removed: 0 (newly finished) + recently recovered sagas (1): + TIME SAGA_ID + 2024-08-12T20:20:41Z 65253cb6-4428-4aa7-9afc-bf9b42166cb5 + no saga recovery failures +... +``` + +Now we can complete that saga: + +```terminal +$ cargo run --bin=omdb -- --destructive nexus sagas demo-complete 208ebc89-acc6-42d3-9f40-7f5567c8a39b +... +note: using Nexus URL http://[::1]:12223 +``` + +and see it finish: + +``` +$ cargo run --bin=omdb -- nexus sagas list +... +note: using Nexus URL http://[::1]:12223 +NOTE: This command only reads in-memory state from the targeted Nexus instance. +Sagas may be missing if they were run by a different Nexus instance or if they +finished before this Nexus instance last started up. +SAGA_ID STATE +65253cb6-4428-4aa7-9afc-bf9b42166cb5 succeeded +``` + +Note too that the completion is not synchronous with the `demo-complete` command, though it usually _is_ pretty quick. It's possible you'll catch it `running` if you run `nexus sagas list` right after running `nexus sagas demo-complete`, but you should quickly see it `succeeded` if you keep running `nexus sagas list`. diff --git a/nexus/internal-api/src/lib.rs b/nexus/internal-api/src/lib.rs index c6ade3b1a2..6a98c44614 100644 --- a/nexus/internal-api/src/lib.rs +++ b/nexus/internal-api/src/lib.rs @@ -23,7 +23,7 @@ use nexus_types::{ OximeterInfo, RackInitializationRequest, SledAgentInfo, SwitchPutRequest, SwitchPutResponse, }, - views::{BackgroundTask, Ipv4NatEntryView, Saga}, + views::{BackgroundTask, DemoSaga, Ipv4NatEntryView, Saga}, }, }; use omicron_common::{ @@ -39,7 +39,8 @@ use omicron_common::{ update::ArtifactId, }; use omicron_uuid_kinds::{ - DownstairsKind, SledUuid, TypedUuid, UpstairsKind, UpstairsRepairKind, + DemoSagaUuid, DownstairsKind, SledUuid, TypedUuid, UpstairsKind, + UpstairsRepairKind, }; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -282,6 +283,31 @@ pub trait NexusInternalApi { path_params: Path, ) -> Result, HttpError>; + /// Kick off an instance of the "demo" saga + /// + /// This saga is used for demo and testing. The saga just waits until you + /// complete using the `saga_demo_complete` API. + #[endpoint { + method = POST, + path = "/demo-saga", + }] + async fn saga_demo_create( + rqctx: RequestContext, + ) -> Result, HttpError>; + + /// Complete a waiting demo saga + /// + /// Note that the id used here is not the same as the id of the saga. It's + /// the one returned by the `saga_demo_create` API. 
+ #[endpoint { + method = POST, + path = "/demo-saga/{demo_saga_id}/complete", + }] + async fn saga_demo_complete( + rqctx: RequestContext<Self::Context>, + path_params: Path<DemoSagaPathParam>, + ) -> Result<HttpResponseUpdatedNoContent, HttpError>; + // Background Tasks /// List background tasks @@ -565,6 +591,12 @@ pub struct SagaPathParam { pub saga_id: Uuid, } +/// Path parameters for DemoSaga requests +#[derive(Deserialize, JsonSchema)] +pub struct DemoSagaPathParam { + pub demo_saga_id: DemoSagaUuid, +} + /// Path parameters for Background Task requests #[derive(Deserialize, JsonSchema)] pub struct BackgroundTaskPathParam { diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 60ed611bd7..5cfacd0c9c 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -7,6 +7,7 @@ use self::external_endpoints::NexusCertResolver; use self::saga::SagaExecutor; use crate::app::background::BackgroundTasksData; +use crate::app::background::SagaRecoveryHelpers; use crate::app::oximeter::LazyTimeseriesClient; use crate::populate::populate_start; use crate::populate::PopulateArgs; @@ -19,6 +20,7 @@ use nexus_config::NexusConfig; use nexus_config::RegionAllocationStrategy; use nexus_config::Tunables; use nexus_config::UpdatesConfig; +use nexus_db_model::AllSchemaVersions; use nexus_db_queries::authn; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; @@ -35,6 +37,7 @@ use std::net::SocketAddrV6; use std::net::{IpAddr, Ipv6Addr}; use std::sync::Arc; use std::sync::OnceLock; +use tokio::sync::mpsc; use uuid::Uuid; // The implementation of Nexus is large, and split into a number of submodules @@ -89,12 +92,9 @@ pub(crate) mod sagas; // TODO: When referring to API types, we should try to include // the prefix unless it is unambiguous. -pub(crate) use nexus_db_queries::db::queries::disk::MAX_DISKS_PER_INSTANCE; - -use crate::app::background::SagaRecoveryHelpers; -use nexus_db_model::AllSchemaVersions; pub(crate) use nexus_db_model::MAX_NICS_PER_INSTANCE; -use tokio::sync::mpsc; +pub(crate) use nexus_db_queries::db::queries::disk::MAX_DISKS_PER_INSTANCE; +use sagas::demo::CompletingDemoSagas; // XXX: Might want to recast as max *floating* IPs, we have at most one // ephemeral (so bounded in saga by design). 
@@ -204,6 +204,9 @@ pub struct Nexus { /// Default Crucible region allocation strategy default_region_allocation_strategy: RegionAllocationStrategy, + + /// List of demo sagas awaiting a request to complete them + demo_sagas: Arc<std::sync::Mutex<CompletingDemoSagas>>, } impl Nexus { @@ -480,6 +483,9 @@ impl Nexus { .pkg .default_region_allocation_strategy .clone(), + demo_sagas: Arc::new(std::sync::Mutex::new( + CompletingDemoSagas::new(), + )), }; // TODO-cleanup all the extra Arcs here seems wrong @@ -955,6 +961,17 @@ impl Nexus { } Ok(clients.into_iter().collect::<Vec<_>>()) } + + pub(crate) fn demo_sagas( + &self, + ) -> Result<std::sync::MutexGuard<CompletingDemoSagas>, Error> { + self.demo_sagas.lock().map_err(|error| { + Error::internal_error(&format!( + "failed to acquire demo_sagas lock: {:#}", + error + )) + }) + } } /// For unimplemented endpoints, indicates whether the resource identified diff --git a/nexus/src/app/saga.rs b/nexus/src/app/saga.rs index fcdbb0db59..5bc69946ad 100644 --- a/nexus/src/app/saga.rs +++ b/nexus/src/app/saga.rs @@ -58,12 +58,14 @@ use futures::FutureExt; use futures::StreamExt; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; +use nexus_types::internal_api::views::DemoSaga; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; use omicron_common::api::external::ListResult; use omicron_common::api::external::LookupResult; use omicron_common::api::external::ResourceType; use omicron_common::bail_unless; +use omicron_uuid_kinds::DemoSagaUuid; use std::sync::Arc; use std::sync::OnceLock; use steno::SagaDag; @@ -296,7 +298,6 @@ pub(crate) struct RunnableSaga { } impl RunnableSaga { - #[cfg(test)] pub(crate) fn id(&self) -> SagaId { self.id } @@ -457,4 +458,25 @@ impl super::Nexus { pub(crate) fn sec(&self) -> &steno::SecClient { &self.sagas.sec_client } + + pub(crate) async fn saga_demo_create(&self) -> Result<DemoSaga, Error> { + use crate::app::sagas::demo; + let demo_saga_id = DemoSagaUuid::new_v4(); + let saga_params = demo::Params { id: demo_saga_id }; + let saga_dag = create_saga_dag::<demo::SagaDemo>(saga_params)?; + let runnable_saga = self.sagas.saga_prepare(saga_dag).await?; + let saga_id = runnable_saga.id().0; + // We don't need the handle that runnable_saga.start() returns because + // we're not going to wait for the saga to finish here. + let _ = runnable_saga.start().await?; + Ok(DemoSaga { saga_id, demo_saga_id }) + } + + pub(crate) fn saga_demo_complete( + &self, + demo_saga_id: DemoSagaUuid, + ) -> Result<(), Error> { + let mut demo_sagas = self.demo_sagas()?; + demo_sagas.complete(demo_saga_id) + } } diff --git a/nexus/src/app/sagas/demo.rs b/nexus/src/app/sagas/demo.rs new file mode 100644 index 0000000000..4a8eda8b80 --- /dev/null +++ b/nexus/src/app/sagas/demo.rs @@ -0,0 +1,135 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Demo saga, used for testing and interactive debugging +//! +//! The "Demo" saga exists so that developers and automated tests can create a +//! saga that will not complete until they take some action to complete it. The +//! saga just waits until it gets the message that it should finish. Users +//! create demo sagas and complete them using requests to the internal API. +//! +//! The implementation is entirely in-memory, which means you have to send the +//! completion message to the Nexus that's running the saga. However, it does +//! work across Nexus restarts, so this can be used to exercise the saga +//! 
recovery path. +//! +//! It's tempting to build this only for development and not official releases, +//! but that'd be more work, there's little downside to always including it, and +//! it's conceivable that it'd be useful for production systems, too. + +use super::NexusActionContext; +use super::{ActionRegistry, NexusSaga, SagaInitError}; +use crate::app::sagas::declare_saga_actions; +use anyhow::ensure; +use omicron_common::api::external::Error; +use omicron_uuid_kinds::DemoSagaUuid; +use serde::Deserialize; +use serde::Serialize; +use slog::info; +use std::collections::BTreeMap; +use steno::ActionError; +use tokio::sync::oneshot; + +/// Set of demo sagas that have been marked completed +/// +/// Nexus maintains one of these at the top level. Individual demo sagas wait +/// until their id shows up here, then remove it and proceed. +pub struct CompletingDemoSagas { + ids: BTreeMap<DemoSagaUuid, oneshot::Sender<()>>, +} + +impl CompletingDemoSagas { + pub fn new() -> CompletingDemoSagas { + CompletingDemoSagas { ids: BTreeMap::new() } + } + + pub fn complete(&mut self, id: DemoSagaUuid) -> Result<(), Error> { + self.ids + .remove(&id) + .ok_or_else(|| { + Error::non_resourcetype_not_found(format!( + "demo saga with id {:?}", + id + )) + })? + .send(()) + .map_err(|_| { + Error::internal_error( + "saga stopped listening (Nexus shutting down?)", + ) + }) + } + + pub fn subscribe( + &mut self, + id: DemoSagaUuid, + ) -> Result<oneshot::Receiver<()>, anyhow::Error> { + let (tx, rx) = oneshot::channel(); + ensure!( + self.ids.insert(id, tx).is_none(), + "multiple subscriptions for the same demo saga" + ); + Ok(rx) + } +} + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct Params { + pub id: DemoSagaUuid, +} + +declare_saga_actions! { + demo; + DEMO_WAIT -> "demo_wait" { + + demo_wait + } +} + +#[derive(Debug)] +pub(crate) struct SagaDemo; +impl NexusSaga for SagaDemo { + const NAME: &'static str = "demo"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + demo_register_actions(registry); + } + + fn make_saga_dag( + _params: &Self::Params, + mut builder: steno::DagBuilder, + ) -> Result<steno::Dag, SagaInitError> { + builder.append(demo_wait_action()); + Ok(builder.build()?) + } +} + +async fn demo_wait(sagactx: NexusActionContext) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let demo_id = sagactx.saga_params::<Params>()?.id; + let log = osagactx.log(); + info!(log, "demo saga: begin wait"; "id" => %demo_id); + let rx = { + let mut demo_sagas = osagactx + .nexus() + .demo_sagas() + .map_err(ActionError::action_failed)?; + demo_sagas.subscribe(demo_id).map_err(|e| { + ActionError::action_failed(Error::internal_error(&format!( + "demo saga subscribe failed: {:#}", + e + ))) + })? 
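+        // The MutexGuard from demo_sagas() is dropped at the end of this
+        // block, so the lock is not held while we wait on the channel below.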
+ }; match rx.await { + Ok(_) => { + info!(log, "demo saga: completing"; "id" => %demo_id); + } + Err(_) => { + info!(log, "demo saga: waiting failed (Nexus shutting down?)"; + "id" => %demo_id); + } + } + Ok(()) +} diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index 0c57a5b2dc..b944fb4d2b 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -22,6 +22,7 @@ use steno::SagaType; use thiserror::Error; use uuid::Uuid; +pub mod demo; pub mod disk_create; pub mod disk_delete; pub mod finalize_disk; @@ -134,6 +135,7 @@ fn make_action_registry() -> ActionRegistry { let mut registry = steno::ActionRegistry::new(); registry.register(Arc::clone(&*ACTION_GENERATE_ID)); + <demo::SagaDemo as NexusSaga>::register_actions(&mut registry); <disk_create::SagaDiskCreate as NexusSaga>::register_actions(&mut registry); <disk_delete::SagaDiskDelete as NexusSaga>::register_actions(&mut registry); <finalize_disk::SagaFinalizeDisk as NexusSaga>::register_actions( diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 33b626a7fc..c5322e3930 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -35,6 +35,7 @@ use nexus_types::internal_api::params::SwitchPutRequest; use nexus_types::internal_api::params::SwitchPutResponse; use nexus_types::internal_api::views::to_list; use nexus_types::internal_api::views::BackgroundTask; +use nexus_types::internal_api::views::DemoSaga; use nexus_types::internal_api::views::Ipv4NatEntryView; use nexus_types::internal_api::views::Saga; use omicron_common::api::external::http_pagination::data_page_params_for; @@ -530,6 +531,40 @@ impl NexusInternalApi for NexusInternalApiImpl { .await } + async fn saga_demo_create( + rqctx: RequestContext<Self::Context>, + ) -> Result<HttpResponseOk<DemoSaga>, HttpError> { + let apictx = &rqctx.context().context; + let handler = async { + let nexus = &apictx.nexus; + let demo_saga = nexus.saga_demo_create().await?; + Ok(HttpResponseOk(demo_saga)) + }; + + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn saga_demo_complete( + rqctx: RequestContext<Self::Context>, + path_params: Path<DemoSagaPathParam>, + ) -> Result<HttpResponseUpdatedNoContent, HttpError> { + let apictx = &rqctx.context().context; + let handler = async { + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + nexus.saga_demo_complete(path.demo_saga_id)?; + Ok(HttpResponseUpdatedNoContent()) + }; + + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + // Background Tasks async fn bgtask_list( diff --git a/nexus/tests/integration_tests/demo_saga.rs b/nexus/tests/integration_tests/demo_saga.rs new file mode 100644 index 0000000000..888fa35965 --- /dev/null +++ b/nexus/tests/integration_tests/demo_saga.rs @@ -0,0 +1,74 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Smoke test for the demo saga + +use futures::TryStreamExt; +use nexus_client::types::Saga; +use nexus_client::types::SagaState; +use nexus_test_interface::NexusServer; +use nexus_test_utils_macros::nexus_test; +use omicron_test_utils::dev::poll::wait_for_condition; +use omicron_test_utils::dev::poll::CondCheckError; +use std::time::Duration; + +type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext<omicron_nexus::Server>; + +// Tests that we can create a demo saga, then mark it completed, and the actual +// saga's state matches what we expect along the way. 
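+// (Completion is asynchronous: marking the saga completed only unblocks it,
+// so the test polls below until the saga reports success.)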
+#[nexus_test] +async fn test_demo_saga(cptestctx: &ControlPlaneTestContext) { + let log = &cptestctx.logctx.log; + let nexus_internal_url = format!( + "http://{}", + cptestctx.server.get_http_server_internal_address().await + ); + let nexus_client = + nexus_client::Client::new(&nexus_internal_url, log.clone()); + + let sagas_before = list_sagas(&nexus_client).await; + eprintln!("found sagas (before): {:?}", sagas_before); + let demo_saga = nexus_client.saga_demo_create().await.unwrap(); + let saga_id = demo_saga.saga_id; + assert!(!sagas_before.into_iter().any(|s| s.id == saga_id)); + + let sagas_after = list_sagas(&nexus_client).await; + eprintln!("found sagas (after): {:?}", sagas_after); + let found = sagas_after.into_iter().find(|s| s.id == saga_id).unwrap(); + assert!(matches!(found.state, SagaState::Running)); + + // It is hard to verify that the saga is not going to complete by itself. + // No matter how long we wait and make sure it didn't complete, it might + // have completed after that. And then we've made the test suite take that + // much longer. But we can at least make sure that completing the saga + // does cause it to finish. + nexus_client.saga_demo_complete(&demo_saga.demo_saga_id).await.unwrap(); + + // Completion is not synchronous -- that just unblocked the saga. So we + // need to poll a bit to wait for it to actually finish. + let found = wait_for_condition( + || async { + let sagas = list_sagas(&nexus_client).await; + eprintln!("found sagas (last): {:?}", sagas); + let found = sagas.into_iter().find(|s| s.id == saga_id).unwrap(); + if matches!(found.state, SagaState::Succeeded) { + Ok(found) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &Duration::from_millis(50), + &Duration::from_secs(30), + ) + .await + .unwrap(); + + assert_eq!(found.id, saga_id); + assert!(matches!(found.state, SagaState::Succeeded)); +} + +async fn list_sagas(client: &nexus_client::Client) -> Vec<Saga> { + client.saga_list_stream(None, None).try_collect::<Vec<_>>().await.unwrap() +} diff --git a/nexus/tests/integration_tests/mod.rs b/nexus/tests/integration_tests/mod.rs index 5054527c63..fdf14dbd07 100644 --- a/nexus/tests/integration_tests/mod.rs +++ b/nexus/tests/integration_tests/mod.rs @@ -11,6 +11,7 @@ mod basic; mod certificates; mod commands; mod console_api; +mod demo_saga; mod device_auth; mod disks; mod external_ips; diff --git a/nexus/types/src/internal_api/views.rs b/nexus/types/src/internal_api/views.rs index b71fd04779..a4557ffd31 100644 --- a/nexus/types/src/internal_api/views.rs +++ b/nexus/types/src/internal_api/views.rs @@ -9,6 +9,7 @@ use futures::stream::StreamExt; use omicron_common::api::external::MacAddr; use omicron_common::api::external::ObjectStream; use omicron_common::api::external::Vni; +use omicron_uuid_kinds::DemoSagaUuid; use schemars::JsonSchema; use serde::Serialize; use std::net::Ipv4Addr; @@ -152,6 +153,13 @@ impl From<steno::SagaStateView> for SagaState { } } +/// Identifies an instance of the demo saga +#[derive(Clone, Debug, Serialize, JsonSchema)] +pub struct DemoSaga { + pub saga_id: Uuid, + pub demo_saga_id: DemoSagaUuid, +} + /// Background tasks /// /// These are currently only intended for observability by developers. We will diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 7e4d6e6c02..a181d14540 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -364,6 +364,59 @@ } } }, + "/demo-saga": { + "post": { + "summary": "Kick off an instance of the \"demo\" saga", + "description": "This saga is used for demo and testing. 
The saga just waits until you complete using the `saga_demo_complete` API.", + "operationId": "saga_demo_create", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DemoSaga" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/demo-saga/{demo_saga_id}/complete": { + "post": { + "summary": "Complete a waiting demo saga", + "description": "Note that the id used here is not the same as the id of the saga. It's the one returned by the `saga_demo_create` API.", + "operationId": "saga_demo_complete", + "parameters": [ + { + "in": "path", + "name": "demo_saga_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForDemoSagaKind" + } + } + ], + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/deployment/blueprints/all": { "get": { "summary": "Lists blueprints", @@ -2624,6 +2677,23 @@ "kind" ] }, + "DemoSaga": { + "description": "Identifies an instance of the demo saga", + "type": "object", + "properties": { + "demo_saga_id": { + "$ref": "#/components/schemas/TypedUuidForDemoSagaKind" + }, + "saga_id": { + "type": "string", + "format": "uuid" + } + }, + "required": [ + "demo_saga_id", + "saga_id" + ] + }, "DiskIdentity": { "description": "Uniquely identifies a disk.", "type": "object", @@ -4897,6 +4967,10 @@ "SwitchPutResponse": { "type": "object" }, + "TypedUuidForDemoSagaKind": { + "type": "string", + "format": "uuid" + }, "TypedUuidForDownstairsRegionKind": { "type": "string", "format": "uuid" diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 8f695d2399..ba586c03a5 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -51,6 +51,7 @@ macro_rules! impl_typed_uuid_kind { impl_typed_uuid_kind! { Collection => "collection", Dataset => "dataset", + DemoSaga => "demo_saga", Downstairs => "downstairs", DownstairsRegion => "downstairs_region", ExternalIp => "external_ip", From 3ad79c1cbdc6fb77515bc10ce5f4a7d7c8687624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karen=20C=C3=A1rcamo?= Date: Tue, 13 Aug 2024 10:15:00 +1200 Subject: [PATCH 04/13] Remove outdated TODO file (#6291) --- TODO.adoc | 113 ------------------------------------------------------ 1 file changed, 113 deletions(-) delete mode 100644 TODO.adoc diff --git a/TODO.adoc b/TODO.adoc deleted file mode 100644 index 40c38e14b3..0000000000 --- a/TODO.adoc +++ /dev/null @@ -1,113 +0,0 @@ -:showtitle: -:icons: font - -= TODO - -API endpoints: - -* RFD 24: regions, AZs, etc -* (lots more) - -Work queue (see also: existing GitHub issues): - -* use CARGO_BIN_EXE for paths to binaries -https://doc.rust-lang.org/cargo/reference/environment-variables.html#environment-variables-cargo-sets-for-crates -* dropshot: allow consumers to provide error codes for dropshot errors -* general maintenance and cleanup -** replace &Arc with &T, and some instances of Arc as well -** all identifiers could be newtypes, with a prefix for the type (like AWS - "i-123" for instances) -** rethinking ApiError a bit -- should it use thiserror, or at least impl - std::error::Error? 
-** scope out switching to sync (see RFD 79) -** proper config + logging for sled agent -* settle on an approach for modification of resources and implement it once -* implement behavior of server restarting (e.g., sled agent starting up) -** This would help validate some of the architectural choices. Current thinking - is that this will notify OXCP of the restart, and OXCP will find instances - that are supposed to be on that server and run instance_ensure(). It will - also want to do that for the disks associated with those instances. - IMPORTANT: this process should also _remove_ any resources that are currently - on that system, so the notification to OXCP about a restart may need to - include the list of resources that the SA knows about and their current - states. -* implement audit log -* implement alerts -* implement external user authentication -* implement external user authorization mechanism -* implement throttling and load shedding described in RFD 6 -* implement hardening in RFD 10 -* implement ETag / If-Match / If-None-Match -* implement limits for all types of resources -* implement scheme for API versioning -** how to identify the requested version -- header or URI? -** translators for older versions? -** integration of supported API versions into build artifact? -** Should all the uses of serde_json disallow unrecognized fields? Should any? -* debugging/monitoring: Prometheus? -* debugging/monitoring: OpenTracing? OpenTelemetry? -* debugging/monitoring: Dynamic tracing? -* debugging/monitoring: Core files? -* Automated testing -** General API testing: there's a lot of boilerplate in hand-generated tests - for each kind of resource. Would it be reasonable / possible to have a sort - of omnibus test that's given the OpenAPI spec (or something like it), - creates a hierarchy with at least one of every possible resource, and does - things like: For each possible resource -*** attempt to (create, get, put, delete) one with an invalid name -*** attempt to (GET, DELETE, PUT) one that does not exist -*** attempt to create one with invalid JSON -*** attempt to create one with a duplicate name of the one we know about -*** exercise list operation with marker and limit (may need to create many of them) -*** for each required input property: -**** attempt to create a resource without that property -*** for each input property: attempt to create a resource with invalid values - for that property -*** list instances of that resource and expect to find the one we know about -*** GET the one instance we know about -*** DELETE the one instance we know about -*** GET the one instance we know about again and expect it to fail -*** list instances again and expect to find nothing -* We will need archivers for deleted records -- especially saga logs - -External dependencies / open questions: - -* Should we create a more first-class notion of objects in the API? -** This would be a good way to enforce built-in limits. -** This would be a good way to enforce uniformity of pagination. -** If each resource provides a way to construct ETags, we could provide - automatic implementation of If-Match, etc. -** With the right interface, we could provide automatic implementations of PUT - or PATCH with JSON Merge Patch and JSON Patch given any one of these. -* would like to require that servers have unique, immutable uuids -* TLS: -** How will we do TLS termination? -** How will we manage server certificates? -** How will we manage client certificates? 
-* what does bootstrapping / key management look like? -* what does internal authorization look like? - -Other activities: - -* Performance testing -* Stress testing -* Fault testing / under load -* Fuzz testing -* Security review - -Nice-to-haves: - -* API consistency checks: e.g., camel case every where - -Things we're going to want to build once: - -* metric export -* structured event reporting (e.g., audit log, alert log, fault log) -* opentracing-type reporting -* client-side circuit breakers -* service discovery -* client connection pooling -* server-side throttling -* command-line utilities - -Check out linkerd (for inspiration -- it looks K8s-specific) From b83f6094acdbce8e32878b201632a0b9a3a84966 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Tue, 13 Aug 2024 04:30:15 +0000 Subject: [PATCH 05/13] Update taiki-e/install-action digest to 7f737c1 (#6301) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [taiki-e/install-action](https://togithub.com/taiki-e/install-action) | action | digest | [`8efaa9b` -> `7f737c1`](https://togithub.com/taiki-e/install-action/compare/8efaa9b...7f737c1) | --- ### Configuration 📅 **Schedule**: Branch creation - "after 8pm,before 6am" in timezone America/Los_Angeles, Automerge - "after 8pm,before 6am" in timezone America/Los_Angeles. 🚦 **Automerge**: Enabled. ♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [Renovate Bot](https://togithub.com/renovatebot/renovate). 
Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- .github/workflows/hakari.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index 7de7cb0ee1..e310c011e7 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -24,7 +24,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@8efaa9bb37d22aefc9d331dfbd45e2d230acfc33 # v2 + uses: taiki-e/install-action@7f737c1056bae14d45b3daec1a2d26ad480e50f7 # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date From a3062747f03bdcf91dc80ee98ab26f48a6364267 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Tue, 13 Aug 2024 04:39:15 +0000 Subject: [PATCH 06/13] Update Rust crate assert_cmd to 2.0.16 (#6302) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index cf78ea4a79..326a7a285e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -267,7 +267,7 @@ anstyle = "1.0.8" api_identity = { path = "api_identity" } approx = "0.5.1" assert_matches = "1.5.0" -assert_cmd = "2.0.15" +assert_cmd = "2.0.16" async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", rev = "ed7ab5ef0513ba303d33efd41d3e9e381169d59b" } async-trait = "0.1.81" atomicwrites = "0.4.3" From 8592a6bdb56500f2c920bf07d49c85ed8c1ecef6 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Tue, 13 Aug 2024 05:36:04 +0000 Subject: [PATCH 07/13] Update Rust crate serde to v1.0.207 (#6303) --- Cargo.lock | 8 ++++---- workspace-hack/Cargo.toml | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 74a7405e57..a686b41823 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8795,9 +8795,9 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.206" +version = "1.0.207" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b3e4cd94123dd520a128bcd11e34d9e9e423e7e3e50425cb1b4b1e3549d0284" +checksum = "5665e14a49a4ea1b91029ba7d3bca9f299e1f7cfa194388ccc20f14743e784f2" dependencies = [ "serde_derive", ] @@ -8833,9 +8833,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.206" +version = "1.0.207" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fabfb6138d2383ea8208cf98ccf69cdfb1aff4088460681d84189aa259762f97" +checksum = "6aea2634c86b0e8ef2cfdc0c340baede54ec27b1e46febd7f80dffb2aa44a00e" dependencies = [ "proc-macro2", "quote", diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index f5562a279f..ad845e8073 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -91,7 +91,7 @@ ring = { version = "0.17.8", features = ["std"] } schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } scopeguard = { version = "1.2.0" } semver = { version = "1.0.23", features = ["serde"] } -serde = { version = "1.0.206", features = ["alloc", "derive", "rc"] } +serde = { version = "1.0.207", features = ["alloc", "derive", "rc"] } serde_json = { version = "1.0.122", features = ["raw_value", "unbounded_depth"] } sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } @@ -194,7 +194,7 @@ ring = { version = "0.17.8", features = ["std"] } schemars = { version = "0.8.21", features = ["bytes", 
"chrono", "uuid1"] } scopeguard = { version = "1.2.0" } semver = { version = "1.0.23", features = ["serde"] } -serde = { version = "1.0.206", features = ["alloc", "derive", "rc"] } +serde = { version = "1.0.207", features = ["alloc", "derive", "rc"] } serde_json = { version = "1.0.122", features = ["raw_value", "unbounded_depth"] } sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } From b08cce78804c94e19a8404d58beafa3d6d296d4a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 13 Aug 2024 09:20:46 -0700 Subject: [PATCH 08/13] [oximeter] Add some units from physical reality (#6296) In service of future changes to record data from power, temperature, and fan speed sensors in Oximeter, this branch adds some physical quantities to the `Units` enum: `Volts`, `Amps`, `DegreesCelcius`, and `Rpm`. I've added all of these as whole numbers of the measured quantities, with the expectation that they will probably be recorded as floating-point. We could consider instead using a smaller unit like `MilliAmps`, and recording them as integers, but that introduces a bunch of dimensional analysis that I'm not sure if we want to be doing. --- openapi/nexus.json | 12 +++++++++++- oximeter/impl/src/schema/codegen.rs | 6 ++++++ oximeter/impl/src/schema/mod.rs | 6 +++++- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/openapi/nexus.json b/openapi/nexus.json index ae5eaeae64..da77eec2a8 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -19805,7 +19805,10 @@ "count", "bytes", "seconds", - "nanoseconds" + "nanoseconds", + "volts", + "amps", + "degrees_celcius" ] }, { @@ -19814,6 +19817,13 @@ "enum": [ "none" ] + }, + { + "description": "Rotations per minute.", + "type": "string", + "enum": [ + "rpm" + ] } ] }, diff --git a/oximeter/impl/src/schema/codegen.rs b/oximeter/impl/src/schema/codegen.rs index d433441718..4778cf4970 100644 --- a/oximeter/impl/src/schema/codegen.rs +++ b/oximeter/impl/src/schema/codegen.rs @@ -522,6 +522,12 @@ impl quote::ToTokens for Units { Units::Nanoseconds => { quote! { ::oximeter::schema::Units::Nanoseconds } } + Units::Amps => quote! { ::oximeter::schema::Units::Amps }, + Units::Volts => quote! { ::oximeter::schema::Units::Volts }, + Units::DegreesCelcius => { + quote! { ::oximeter::schema::Units::DegreesCelcius } + } + Units::Rpm => quote! { ::oximeter::schema::Units::Rpm }, }; toks.to_tokens(tokens); } diff --git a/oximeter/impl/src/schema/mod.rs b/oximeter/impl/src/schema/mod.rs index 7743034e31..250604d7be 100644 --- a/oximeter/impl/src/schema/mod.rs +++ b/oximeter/impl/src/schema/mod.rs @@ -179,7 +179,6 @@ pub struct TimeseriesDescription { /// Measurement units for timeseries samples. #[derive(Clone, Copy, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] #[serde(rename_all = "snake_case")] -// TODO-completeness: Include more units, such as power / temperature. // TODO-completeness: Decide whether and how to handle dimensional analysis // during queries, if needed. pub enum Units { @@ -189,6 +188,11 @@ pub enum Units { Bytes, Seconds, Nanoseconds, + Volts, + Amps, + DegreesCelcius, + /// Rotations per minute. + Rpm, } /// The schema for a timeseries. 
From 33ab24fc05c2755c352641f8265094af792bac75 Mon Sep 17 00:00:00 2001
From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com>
Date: Tue, 13 Aug 2024 09:22:29 -0700
Subject: [PATCH 09/13] Update Rust crate serde_json to 1.0.124 (#6305)

---
 Cargo.lock                | 4 ++--
 Cargo.toml                | 2 +-
 workspace-hack/Cargo.toml | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index a686b41823..5b38a4905e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -8864,9 +8864,9 @@ dependencies = [
 
 [[package]]
 name = "serde_json"
-version = "1.0.122"
+version = "1.0.124"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "784b6203951c57ff748476b126ccb5e8e2959a5c19e5c617ab1956be3dbc68da"
+checksum = "66ad62847a56b3dba58cc891acd13884b9c61138d330c0d7b6181713d4fce38d"
 dependencies = [
  "itoa",
  "memchr",
diff --git a/Cargo.toml b/Cargo.toml
index 326a7a285e..bb899f8825 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -498,7 +498,7 @@ secrecy = "0.8.0"
 semver = { version = "1.0.23", features = ["std", "serde"] }
 serde = { version = "1.0", default-features = false, features = [ "derive", "rc" ] }
 serde_human_bytes = { git = "https://github.com/oxidecomputer/serde_human_bytes", branch = "main" }
-serde_json = "1.0.122"
+serde_json = "1.0.124"
 serde_path_to_error = "0.1.16"
 serde_tokenstream = "0.2"
 serde_urlencoded = "0.7.1"
diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml
index ad845e8073..854a020167 100644
--- a/workspace-hack/Cargo.toml
+++ b/workspace-hack/Cargo.toml
@@ -92,7 +92,7 @@ schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] }
 scopeguard = { version = "1.2.0" }
 semver = { version = "1.0.23", features = ["serde"] }
 serde = { version = "1.0.207", features = ["alloc", "derive", "rc"] }
-serde_json = { version = "1.0.122", features = ["raw_value", "unbounded_depth"] }
+serde_json = { version = "1.0.124", features = ["raw_value", "unbounded_depth"] }
 sha2 = { version = "0.10.8", features = ["oid"] }
 similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] }
 slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] }
@@ -195,7 +195,7 @@ schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] }
 scopeguard = { version = "1.2.0" }
 semver = { version = "1.0.23", features = ["serde"] }
 serde = { version = "1.0.207", features = ["alloc", "derive", "rc"] }
-serde_json = { version = "1.0.122", features = ["raw_value", "unbounded_depth"] }
+serde_json = { version = "1.0.124", features = ["raw_value", "unbounded_depth"] }
 sha2 = { version = "0.10.8", features = ["oid"] }
 similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] }
 slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] }

From d25b10258ff479c2081a67e4e523bf161f44b7b7 Mon Sep 17 00:00:00 2001
From: "Andrew J. Stone"
Date: Tue, 13 Aug 2024 13:45:02 -0400
Subject: [PATCH 10/13] Add a new `ClickhouseServer` Omicron Zone (#6297)

The new zone type reflects the zone with which we'll deploy clickhouse
server nodes in a replicated setup. In last Tuesday's (Aug 6, 2024)
update huddle, we decided to use a new zone type rather than add a
boolean field to the existing `Clickhouse` zone type that is used for
single-server deployments.

This is the first part of the work to be done for replicated clickhouse
deployments that are automated via reconfigurator.
As such, the actual zone deployment is left as a `todo`. --- common/src/api/internal/shared.rs | 5 ++ dev-tools/omdb/src/bin/omdb/db.rs | 1 + internal-dns/src/names.rs | 3 + nexus-sled-agent-shared/src/inventory.rs | 24 +++++++ nexus/db-model/src/dataset_kind.rs | 4 ++ nexus/db-model/src/inventory.rs | 4 ++ nexus/db-model/src/omicron_zone_config.rs | 13 ++++ nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-model/src/service_kind.rs | 1 + .../deployment/external_networking.rs | 1 + nexus/db-queries/src/db/datastore/rack.rs | 1 + nexus/reconfigurator/execution/src/dns.rs | 3 + .../execution/src/omicron_zones.rs | 1 + .../src/planner/omicron_zone_placement.rs | 1 + .../background/tasks/sync_service_zone_nat.rs | 1 + nexus/types/src/deployment.rs | 5 ++ nexus/types/src/deployment/zone_type.rs | 25 ++++++++ openapi/nexus-internal.json | 63 ++++++++++++++++--- openapi/sled-agent.json | 25 ++++++++ schema/all-zones-requests.json | 25 ++++++++ .../up1.sql | 1 + .../up2.sql | 1 + .../up3.sql | 1 + schema/crdb/dbinit.sql | 5 +- schema/rss-service-plan-v3.json | 25 ++++++++ sled-agent/src/params.rs | 3 + sled-agent/src/services.rs | 18 ++++++ sled-storage/src/dataset.rs | 3 + 28 files changed, 256 insertions(+), 10 deletions(-) create mode 100644 schema/crdb/add-clickhouse-server-enum-variants/up1.sql create mode 100644 schema/crdb/add-clickhouse-server-enum-variants/up2.sql create mode 100644 schema/crdb/add-clickhouse-server-enum-variants/up3.sql diff --git a/common/src/api/internal/shared.rs b/common/src/api/internal/shared.rs index e457d08fb2..3856a472ab 100644 --- a/common/src/api/internal/shared.rs +++ b/common/src/api/internal/shared.rs @@ -710,8 +710,12 @@ pub struct ResolvedVpcRouteSet { pub enum DatasetKind { Crucible, Cockroach, + /// Used for single-node clickhouse deployments Clickhouse, + /// Used for replicated clickhouse deployments ClickhouseKeeper, + /// Used for replicated clickhouse deployments + ClickhouseServer, ExternalDns, InternalDns, } @@ -724,6 +728,7 @@ impl fmt::Display for DatasetKind { Cockroach => "cockroach", Clickhouse => "clickhouse", ClickhouseKeeper => "clickhouse_keeper", + ClickhouseServer => "clickhouse_server", ExternalDns => "external_dns", InternalDns => "internal_dns", }; diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 1030e4288b..9ce4c66a80 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -1106,6 +1106,7 @@ async fn lookup_service_info( | BlueprintZoneType::InternalNtp(_) => ServiceKind::Ntp, BlueprintZoneType::Clickhouse(_) => ServiceKind::Clickhouse, BlueprintZoneType::ClickhouseKeeper(_) => ServiceKind::ClickhouseKeeper, + BlueprintZoneType::ClickhouseServer(_) => ServiceKind::ClickhouseServer, BlueprintZoneType::CockroachDb(_) => ServiceKind::Cockroach, BlueprintZoneType::Crucible(_) => ServiceKind::Crucible, BlueprintZoneType::CruciblePantry(_) => ServiceKind::CruciblePantry, diff --git a/internal-dns/src/names.rs b/internal-dns/src/names.rs index f975029d69..a9fe1a36bf 100644 --- a/internal-dns/src/names.rs +++ b/internal-dns/src/names.rs @@ -25,6 +25,7 @@ pub const DNS_ZONE_EXTERNAL_TESTING: &str = "oxide-dev.test"; pub enum ServiceName { Clickhouse, ClickhouseKeeper, + ClickhouseServer, Cockroach, InternalDns, ExternalDns, @@ -48,6 +49,7 @@ impl ServiceName { match self { ServiceName::Clickhouse => "clickhouse", ServiceName::ClickhouseKeeper => "clickhouse-keeper", + ServiceName::ClickhouseServer => "clickhouse-server", ServiceName::Cockroach => "cockroach", 
ServiceName::ExternalDns => "external-dns", ServiceName::InternalDns => "nameservice", @@ -73,6 +75,7 @@ impl ServiceName { match self { ServiceName::Clickhouse | ServiceName::ClickhouseKeeper + | ServiceName::ClickhouseServer | ServiceName::Cockroach | ServiceName::InternalDns | ServiceName::ExternalDns diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index 8a793f6150..2f1361a6f2 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -134,15 +134,26 @@ pub enum OmicronZoneType { snat_cfg: SourceNatConfig, }, + /// Type of clickhouse zone used for a single node clickhouse deployment Clickhouse { address: SocketAddrV6, dataset: OmicronZoneDataset, }, + /// A zone used to run a Clickhouse Keeper node + /// + /// Keepers are only used in replicated clickhouse setups ClickhouseKeeper { address: SocketAddrV6, dataset: OmicronZoneDataset, }, + + /// A zone used to run a Clickhouse Server in a replicated deployment + ClickhouseServer { + address: SocketAddrV6, + dataset: OmicronZoneDataset, + }, + CockroachDb { address: SocketAddrV6, dataset: OmicronZoneDataset, @@ -212,6 +223,9 @@ impl OmicronZoneType { OmicronZoneType::ClickhouseKeeper { .. } => { ZoneKind::ClickhouseKeeper } + OmicronZoneType::ClickhouseServer { .. } => { + ZoneKind::ClickhouseServer + } OmicronZoneType::CockroachDb { .. } => ZoneKind::CockroachDb, OmicronZoneType::Crucible { .. } => ZoneKind::Crucible, OmicronZoneType::CruciblePantry { .. } => ZoneKind::CruciblePantry, @@ -252,6 +266,7 @@ impl OmicronZoneType { OmicronZoneType::Clickhouse { .. } | OmicronZoneType::ClickhouseKeeper { .. } + | OmicronZoneType::ClickhouseServer { .. } | OmicronZoneType::CockroachDb { .. } | OmicronZoneType::Crucible { .. } | OmicronZoneType::CruciblePantry { .. } @@ -271,6 +286,7 @@ impl OmicronZoneType { | OmicronZoneType::InternalNtp { .. } | OmicronZoneType::Clickhouse { .. } | OmicronZoneType::ClickhouseKeeper { .. } + | OmicronZoneType::ClickhouseServer { .. } | OmicronZoneType::CockroachDb { .. } | OmicronZoneType::Crucible { .. } | OmicronZoneType::CruciblePantry { .. } @@ -289,6 +305,7 @@ impl OmicronZoneType { | OmicronZoneType::InternalNtp { .. } | OmicronZoneType::Clickhouse { .. } | OmicronZoneType::ClickhouseKeeper { .. } + | OmicronZoneType::ClickhouseServer { .. } | OmicronZoneType::CockroachDb { .. } | OmicronZoneType::CruciblePantry { .. } | OmicronZoneType::ExternalDns { .. } @@ -310,6 +327,7 @@ impl OmicronZoneType { OmicronZoneType::InternalNtp { .. } | OmicronZoneType::Clickhouse { .. } | OmicronZoneType::ClickhouseKeeper { .. } + | OmicronZoneType::ClickhouseServer { .. } | OmicronZoneType::CockroachDb { .. } | OmicronZoneType::Crucible { .. } | OmicronZoneType::CruciblePantry { .. } @@ -328,6 +346,7 @@ impl OmicronZoneType { OmicronZoneType::InternalNtp { .. } | OmicronZoneType::Clickhouse { .. } | OmicronZoneType::ClickhouseKeeper { .. } + | OmicronZoneType::ClickhouseServer { .. } | OmicronZoneType::CockroachDb { .. } | OmicronZoneType::Crucible { .. } | OmicronZoneType::CruciblePantry { .. } @@ -367,6 +386,7 @@ pub enum ZoneKind { BoundaryNtp, Clickhouse, ClickhouseKeeper, + ClickhouseServer, CockroachDb, Crucible, CruciblePantry, @@ -390,6 +410,7 @@ impl ZoneKind { ZoneKind::BoundaryNtp | ZoneKind::InternalNtp => Self::NTP_PREFIX, ZoneKind::Clickhouse => "clickhouse", ZoneKind::ClickhouseKeeper => "clickhouse_keeper", + ZoneKind::ClickhouseServer => "clickhouse_server", // Note "cockroachdb" for historical reasons. 
ZoneKind::CockroachDb => "cockroachdb", ZoneKind::Crucible => "crucible", @@ -409,6 +430,7 @@ impl ZoneKind { ZoneKind::BoundaryNtp | ZoneKind::InternalNtp => Self::NTP_PREFIX, ZoneKind::Clickhouse => "clickhouse", ZoneKind::ClickhouseKeeper => "clickhouse_keeper", + ZoneKind::ClickhouseServer => "clickhouse_server", // Note "cockroachdb" for historical reasons. ZoneKind::CockroachDb => "cockroachdb", ZoneKind::Crucible => "crucible", @@ -431,6 +453,7 @@ impl ZoneKind { ZoneKind::BoundaryNtp | ZoneKind::InternalNtp => Self::NTP_PREFIX, ZoneKind::Clickhouse => "clickhouse", ZoneKind::ClickhouseKeeper => "clickhouse-keeper", + ZoneKind::ClickhouseServer => "clickhouse_server", // Note "cockroach" for historical reasons. ZoneKind::CockroachDb => "cockroach", ZoneKind::Crucible => "crucible", @@ -451,6 +474,7 @@ impl ZoneKind { ZoneKind::BoundaryNtp => "boundary_ntp", ZoneKind::Clickhouse => "clickhouse", ZoneKind::ClickhouseKeeper => "clickhouse_keeper", + ZoneKind::ClickhouseServer => "clickhouse_server", ZoneKind::CockroachDb => "cockroach_db", ZoneKind::Crucible => "crucible", ZoneKind::CruciblePantry => "crucible_pantry", diff --git a/nexus/db-model/src/dataset_kind.rs b/nexus/db-model/src/dataset_kind.rs index 395d01353e..4a86efaca1 100644 --- a/nexus/db-model/src/dataset_kind.rs +++ b/nexus/db-model/src/dataset_kind.rs @@ -20,6 +20,7 @@ impl_enum_type!( Cockroach => b"cockroach" Clickhouse => b"clickhouse" ClickhouseKeeper => b"clickhouse_keeper" + ClickhouseServer => b"clickhouse_server" ExternalDns => b"external_dns" InternalDns => b"internal_dns" ); @@ -35,6 +36,9 @@ impl From for DatasetKind { internal::shared::DatasetKind::ClickhouseKeeper => { DatasetKind::ClickhouseKeeper } + internal::shared::DatasetKind::ClickhouseServer => { + DatasetKind::ClickhouseServer + } internal::shared::DatasetKind::ExternalDns => { DatasetKind::ExternalDns } diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 400acc68b3..87986c4f54 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -985,6 +985,7 @@ impl_enum_type!( BoundaryNtp => b"boundary_ntp" Clickhouse => b"clickhouse" ClickhouseKeeper => b"clickhouse_keeper" + ClickhouseServer => b"clickhouse_server" CockroachDb => b"cockroach_db" Crucible => b"crucible" CruciblePantry => b"crucible_pantry" @@ -1001,6 +1002,7 @@ impl From for ServiceKind { ZoneType::BoundaryNtp | ZoneType::InternalNtp => Self::Ntp, ZoneType::Clickhouse => Self::Clickhouse, ZoneType::ClickhouseKeeper => Self::ClickhouseKeeper, + ZoneType::ClickhouseServer => Self::ClickhouseServer, ZoneType::CockroachDb => Self::Cockroach, ZoneType::Crucible => Self::Crucible, ZoneType::CruciblePantry => Self::CruciblePantry, @@ -1020,6 +1022,7 @@ impl From for nexus_sled_agent_shared::inventory::ZoneKind { ZoneType::BoundaryNtp => BoundaryNtp, ZoneType::Clickhouse => Clickhouse, ZoneType::ClickhouseKeeper => ClickhouseKeeper, + ZoneType::ClickhouseServer => ClickhouseServer, ZoneType::CockroachDb => CockroachDb, ZoneType::Crucible => Crucible, ZoneType::CruciblePantry => CruciblePantry, @@ -1040,6 +1043,7 @@ impl From for ZoneType { BoundaryNtp => ZoneType::BoundaryNtp, Clickhouse => ZoneType::Clickhouse, ClickhouseKeeper => ZoneType::ClickhouseKeeper, + ClickhouseServer => ZoneType::ClickhouseServer, CockroachDb => ZoneType::CockroachDb, Crucible => ZoneType::Crucible, CruciblePantry => ZoneType::CruciblePantry, diff --git a/nexus/db-model/src/omicron_zone_config.rs b/nexus/db-model/src/omicron_zone_config.rs index 
9236fc9407..23e1ef2dd9 100644 --- a/nexus/db-model/src/omicron_zone_config.rs +++ b/nexus/db-model/src/omicron_zone_config.rs @@ -109,6 +109,9 @@ impl OmicronZone { OmicronZoneType::ClickhouseKeeper { address, dataset } => { (ZoneType::ClickhouseKeeper, address, Some(dataset)) } + OmicronZoneType::ClickhouseServer { address, dataset } => { + (ZoneType::ClickhouseServer, address, Some(dataset)) + } OmicronZoneType::CockroachDb { address, dataset } => { (ZoneType::CockroachDb, address, Some(dataset)) } @@ -258,6 +261,12 @@ impl OmicronZone { dataset: common.dataset?, }, ), + ZoneType::ClickhouseServer => BlueprintZoneType::ClickhouseServer( + blueprint_zone_type::ClickhouseServer { + address, + dataset: common.dataset?, + }, + ), ZoneType::CockroachDb => BlueprintZoneType::CockroachDb( blueprint_zone_type::CockroachDb { address, @@ -392,6 +401,10 @@ impl OmicronZone { address, dataset: common.dataset?, }, + ZoneType::ClickhouseServer => OmicronZoneType::ClickhouseServer { + address, + dataset: common.dataset?, + }, ZoneType::CockroachDb => OmicronZoneType::CockroachDb { address, dataset: common.dataset?, diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index dd9c9dc667..1e0caabb02 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(86, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(87, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(87, "add-clickhouse-server-enum-variants"), KnownVersion::new(86, "snapshot-replacement"), KnownVersion::new(85, "add-migrations-by-time-created-index"), KnownVersion::new(84, "region-read-only"), diff --git a/nexus/db-model/src/service_kind.rs b/nexus/db-model/src/service_kind.rs index 016de9c44e..04fbab20b2 100644 --- a/nexus/db-model/src/service_kind.rs +++ b/nexus/db-model/src/service_kind.rs @@ -20,6 +20,7 @@ impl_enum_type!( // Enum values Clickhouse => b"clickhouse" ClickhouseKeeper => b"clickhouse_keeper" + ClickhouseServer => b"clickhouse_server" Cockroach => b"cockroach" Crucible => b"crucible" CruciblePantry => b"crucible_pantry" diff --git a/nexus/db-queries/src/db/datastore/deployment/external_networking.rs b/nexus/db-queries/src/db/datastore/deployment/external_networking.rs index b6ced8e2c5..7ace07305d 100644 --- a/nexus/db-queries/src/db/datastore/deployment/external_networking.rs +++ b/nexus/db-queries/src/db/datastore/deployment/external_networking.rs @@ -327,6 +327,7 @@ impl DataStore { ZoneKind::Nexus => &*NEXUS_VPC_SUBNET, ZoneKind::Clickhouse | ZoneKind::ClickhouseKeeper + | ZoneKind::ClickhouseServer | ZoneKind::CockroachDb | ZoneKind::Crucible | ZoneKind::CruciblePantry diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index c9fb61b15a..c069a72955 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -595,6 +595,7 @@ impl DataStore { BlueprintZoneType::InternalNtp(_) | BlueprintZoneType::Clickhouse(_) | BlueprintZoneType::ClickhouseKeeper(_) + | BlueprintZoneType::ClickhouseServer(_) | BlueprintZoneType::CockroachDb(_) | BlueprintZoneType::Crucible(_) | BlueprintZoneType::CruciblePantry(_) diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 8bcae27bc0..885ffa67d1 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -275,6 +275,9 @@ pub fn blueprint_internal_dns_config( BlueprintZoneType::ClickhouseKeeper( blueprint_zone_type::ClickhouseKeeper { address, .. }, ) => (ServiceName::ClickhouseKeeper, address.port()), + BlueprintZoneType::ClickhouseServer( + blueprint_zone_type::ClickhouseServer { address, .. }, + ) => (ServiceName::ClickhouseServer, address.port()), BlueprintZoneType::CockroachDb( blueprint_zone_type::CockroachDb { address, .. 
}, ) => (ServiceName::Cockroach, address.port()), diff --git a/nexus/reconfigurator/execution/src/omicron_zones.rs b/nexus/reconfigurator/execution/src/omicron_zones.rs index acbb7a6b33..b40bbab982 100644 --- a/nexus/reconfigurator/execution/src/omicron_zones.rs +++ b/nexus/reconfigurator/execution/src/omicron_zones.rs @@ -138,6 +138,7 @@ pub(crate) async fn clean_up_expunged_zones( BlueprintZoneType::BoundaryNtp(_) | BlueprintZoneType::Clickhouse(_) | BlueprintZoneType::ClickhouseKeeper(_) + | BlueprintZoneType::ClickhouseServer(_) | BlueprintZoneType::Crucible(_) | BlueprintZoneType::CruciblePantry(_) | BlueprintZoneType::ExternalDns(_) diff --git a/nexus/reconfigurator/planning/src/planner/omicron_zone_placement.rs b/nexus/reconfigurator/planning/src/planner/omicron_zone_placement.rs index c08f30124c..2fb60e66f8 100644 --- a/nexus/reconfigurator/planning/src/planner/omicron_zone_placement.rs +++ b/nexus/reconfigurator/planning/src/planner/omicron_zone_placement.rs @@ -31,6 +31,7 @@ impl DiscretionaryOmicronZone { // Zones that we should place but don't yet. BlueprintZoneType::Clickhouse(_) | BlueprintZoneType::ClickhouseKeeper(_) + | BlueprintZoneType::ClickhouseServer(_) | BlueprintZoneType::CruciblePantry(_) | BlueprintZoneType::ExternalDns(_) | BlueprintZoneType::InternalDns(_) diff --git a/nexus/src/app/background/tasks/sync_service_zone_nat.rs b/nexus/src/app/background/tasks/sync_service_zone_nat.rs index eb9554cff7..4fbef3ae2e 100644 --- a/nexus/src/app/background/tasks/sync_service_zone_nat.rs +++ b/nexus/src/app/background/tasks/sync_service_zone_nat.rs @@ -239,6 +239,7 @@ impl BackgroundTask for ServiceZoneNatTracker { // well OmicronZoneType::Clickhouse {..} => continue, OmicronZoneType::ClickhouseKeeper {..} => continue, + OmicronZoneType::ClickhouseServer{..} => continue, OmicronZoneType::CockroachDb {..} => continue, OmicronZoneType::Crucible {..} => continue, OmicronZoneType::CruciblePantry {..} => continue, diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 4342adb02b..cc48f2646a 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -671,6 +671,11 @@ impl BlueprintZoneConfig { blueprint_zone_type::ClickhouseKeeper { address, dataset }, ) } + OmicronZoneType::ClickhouseServer { address, dataset } => { + BlueprintZoneType::ClickhouseServer( + blueprint_zone_type::ClickhouseServer { address, dataset }, + ) + } OmicronZoneType::CockroachDb { address, dataset } => { BlueprintZoneType::CockroachDb( blueprint_zone_type::CockroachDb { address, dataset }, diff --git a/nexus/types/src/deployment/zone_type.rs b/nexus/types/src/deployment/zone_type.rs index 789a0215b7..e4958fc3c3 100644 --- a/nexus/types/src/deployment/zone_type.rs +++ b/nexus/types/src/deployment/zone_type.rs @@ -25,6 +25,7 @@ pub enum BlueprintZoneType { BoundaryNtp(blueprint_zone_type::BoundaryNtp), Clickhouse(blueprint_zone_type::Clickhouse), ClickhouseKeeper(blueprint_zone_type::ClickhouseKeeper), + ClickhouseServer(blueprint_zone_type::ClickhouseServer), CockroachDb(blueprint_zone_type::CockroachDb), Crucible(blueprint_zone_type::Crucible), CruciblePantry(blueprint_zone_type::CruciblePantry), @@ -60,6 +61,7 @@ impl BlueprintZoneType { } BlueprintZoneType::Clickhouse(_) | BlueprintZoneType::ClickhouseKeeper(_) + | BlueprintZoneType::ClickhouseServer(_) | BlueprintZoneType::CockroachDb(_) | BlueprintZoneType::Crucible(_) | BlueprintZoneType::CruciblePantry(_) @@ -78,6 +80,7 @@ impl BlueprintZoneType { | BlueprintZoneType::ExternalDns(_) | 
BlueprintZoneType::Clickhouse(_) | BlueprintZoneType::ClickhouseKeeper(_) + | BlueprintZoneType::ClickhouseServer(_) | BlueprintZoneType::CockroachDb(_) | BlueprintZoneType::Crucible(_) | BlueprintZoneType::CruciblePantry(_) @@ -94,6 +97,7 @@ impl BlueprintZoneType { | BlueprintZoneType::ExternalDns(_) | BlueprintZoneType::Clickhouse(_) | BlueprintZoneType::ClickhouseKeeper(_) + | BlueprintZoneType::ClickhouseServer(_) | BlueprintZoneType::CockroachDb(_) | BlueprintZoneType::Crucible(_) | BlueprintZoneType::CruciblePantry(_) @@ -110,6 +114,7 @@ impl BlueprintZoneType { BlueprintZoneType::BoundaryNtp(_) | BlueprintZoneType::Clickhouse(_) | BlueprintZoneType::ClickhouseKeeper(_) + | BlueprintZoneType::ClickhouseServer(_) | BlueprintZoneType::CockroachDb(_) | BlueprintZoneType::CruciblePantry(_) | BlueprintZoneType::ExternalDns(_) @@ -129,6 +134,9 @@ impl BlueprintZoneType { BlueprintZoneType::ClickhouseKeeper( blueprint_zone_type::ClickhouseKeeper { dataset, address }, ) => (dataset, DatasetKind::ClickhouseKeeper, address), + BlueprintZoneType::ClickhouseServer( + blueprint_zone_type::ClickhouseServer { dataset, address }, + ) => (dataset, DatasetKind::ClickhouseServer, address), BlueprintZoneType::CockroachDb( blueprint_zone_type::CockroachDb { dataset, address }, ) => (dataset, DatasetKind::Cockroach, address), @@ -185,6 +193,12 @@ impl From for OmicronZoneType { dataset: zone.dataset, } } + BlueprintZoneType::ClickhouseServer(zone) => { + Self::ClickhouseServer { + address: zone.address, + dataset: zone.dataset, + } + } BlueprintZoneType::CockroachDb(zone) => Self::CockroachDb { address: zone.address, dataset: zone.dataset, @@ -235,6 +249,7 @@ impl BlueprintZoneType { Self::BoundaryNtp(_) => ZoneKind::BoundaryNtp, Self::Clickhouse(_) => ZoneKind::Clickhouse, Self::ClickhouseKeeper(_) => ZoneKind::ClickhouseKeeper, + Self::ClickhouseServer(_) => ZoneKind::ClickhouseServer, Self::CockroachDb(_) => ZoneKind::CockroachDb, Self::Crucible(_) => ZoneKind::Crucible, Self::CruciblePantry(_) => ZoneKind::CruciblePantry, @@ -273,6 +288,7 @@ pub mod blueprint_zone_type { pub external_ip: OmicronZoneExternalSnatIp, } + /// Used in single-node clickhouse setups #[derive( Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, )] @@ -289,6 +305,15 @@ pub mod blueprint_zone_type { pub dataset: OmicronZoneDataset, } + /// Used in replicated clickhouse setups + #[derive( + Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, + )] + pub struct ClickhouseServer { + pub address: SocketAddrV6, + pub dataset: OmicronZoneDataset, + } + #[derive( Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize, )] diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index a181d14540..5dd7d3dea3 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -2154,6 +2154,7 @@ ] }, { + "description": "Used in single-node clickhouse setups", "type": "object", "properties": { "address": { @@ -2197,6 +2198,29 @@ "type" ] }, + { + "description": "Used in replicated clickhouse setups", + "type": "object", + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/components/schemas/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "clickhouse_server" + ] + } + }, + "required": [ + "address", + "dataset", + "type" + ] + }, { "type": "object", "properties": { @@ -2645,14 +2669,37 @@ }, "DatasetKind": { "description": "Describes the purpose of the dataset.", - "type": "string", - "enum": [ - "crucible", - "cockroach", - 
"clickhouse", - "clickhouse_keeper", - "external_dns", - "internal_dns" + "oneOf": [ + { + "type": "string", + "enum": [ + "crucible", + "cockroach", + "external_dns", + "internal_dns" + ] + }, + { + "description": "Used for single-node clickhouse deployments", + "type": "string", + "enum": [ + "clickhouse" + ] + }, + { + "description": "Used for replicated clickhouse deployments", + "type": "string", + "enum": [ + "clickhouse_keeper" + ] + }, + { + "description": "Used for replicated clickhouse deployments", + "type": "string", + "enum": [ + "clickhouse_server" + ] + } ] }, "DatasetPutRequest": { diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index ecaff33042..21e1451689 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -3747,6 +3747,7 @@ ] }, { + "description": "Type of clickhouse zone used for a single node clickhouse deployment", "type": "object", "properties": { "address": { @@ -3769,6 +3770,7 @@ ] }, { + "description": "A zone used to run a Clickhouse Keeper node\n\nKeepers are only used in replicated clickhouse setups", "type": "object", "properties": { "address": { @@ -3790,6 +3792,29 @@ "type" ] }, + { + "description": "A zone used to run a Clickhouse Server in a replicated deployment", + "type": "object", + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/components/schemas/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "clickhouse_server" + ] + } + }, + "required": [ + "address", + "dataset", + "type" + ] + }, { "type": "object", "properties": { diff --git a/schema/all-zones-requests.json b/schema/all-zones-requests.json index 910feb8c74..4d20959ad1 100644 --- a/schema/all-zones-requests.json +++ b/schema/all-zones-requests.json @@ -353,6 +353,7 @@ } }, { + "description": "Type of clickhouse zone used for a single node clickhouse deployment", "type": "object", "required": [ "address", @@ -375,6 +376,7 @@ } }, { + "description": "A zone used to run a Clickhouse Keeper node\n\nKeepers are only used in replicated clickhouse setups", "type": "object", "required": [ "address", @@ -396,6 +398,29 @@ } } }, + { + "description": "A zone used to run a Clickhouse Server in a replicated deployment", + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "clickhouse_server" + ] + } + } + }, { "type": "object", "required": [ diff --git a/schema/crdb/add-clickhouse-server-enum-variants/up1.sql b/schema/crdb/add-clickhouse-server-enum-variants/up1.sql new file mode 100644 index 0000000000..9f1b4e419c --- /dev/null +++ b/schema/crdb/add-clickhouse-server-enum-variants/up1.sql @@ -0,0 +1 @@ +ALTER TYPE omicron.public.service_kind ADD VALUE IF NOT EXISTS 'clickhouse_server' AFTER 'clickhouse_keeper'; diff --git a/schema/crdb/add-clickhouse-server-enum-variants/up2.sql b/schema/crdb/add-clickhouse-server-enum-variants/up2.sql new file mode 100644 index 0000000000..b94a4df0cf --- /dev/null +++ b/schema/crdb/add-clickhouse-server-enum-variants/up2.sql @@ -0,0 +1 @@ +ALTER TYPE omicron.public.dataset_kind ADD VALUE IF NOT EXISTS 'clickhouse_server' AFTER 'clickhouse_keeper'; diff --git a/schema/crdb/add-clickhouse-server-enum-variants/up3.sql b/schema/crdb/add-clickhouse-server-enum-variants/up3.sql new file mode 100644 index 0000000000..874ccec8f2 --- /dev/null +++ b/schema/crdb/add-clickhouse-server-enum-variants/up3.sql 
@@ -0,0 +1 @@ +ALTER TYPE omicron.public.zone_type ADD VALUE IF NOT EXISTS 'clickhouse_server' AFTER 'clickhouse_keeper'; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 2a83f01298..ddc399d282 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -288,6 +288,7 @@ CREATE UNIQUE INDEX IF NOT EXISTS lookup_switch_by_rack ON omicron.public.switch CREATE TYPE IF NOT EXISTS omicron.public.service_kind AS ENUM ( 'clickhouse', 'clickhouse_keeper', + 'clickhouse_server', 'cockroach', 'crucible', 'crucible_pantry', @@ -506,6 +507,7 @@ CREATE TYPE IF NOT EXISTS omicron.public.dataset_kind AS ENUM ( 'cockroach', 'clickhouse', 'clickhouse_keeper', + 'clickhouse_server', 'external_dns', 'internal_dns' ); @@ -3209,6 +3211,7 @@ CREATE TYPE IF NOT EXISTS omicron.public.zone_type AS ENUM ( 'boundary_ntp', 'clickhouse', 'clickhouse_keeper', + 'clickhouse_server', 'cockroach_db', 'crucible', 'crucible_pantry', @@ -4214,7 +4217,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '86.0.0', NULL) + (TRUE, NOW(), NOW(), '87.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/rss-service-plan-v3.json b/schema/rss-service-plan-v3.json index fd4b9c7064..a003cde6f0 100644 --- a/schema/rss-service-plan-v3.json +++ b/schema/rss-service-plan-v3.json @@ -494,6 +494,7 @@ } }, { + "description": "Type of clickhouse zone used for a single node clickhouse deployment", "type": "object", "required": [ "address", @@ -516,6 +517,7 @@ } }, { + "description": "A zone used to run a Clickhouse Keeper node\n\nKeepers are only used in replicated clickhouse setups", "type": "object", "required": [ "address", @@ -537,6 +539,29 @@ } } }, + { + "description": "A zone used to run a Clickhouse Server in a replicated deployment", + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "clickhouse_server" + ] + } + } + }, { "type": "object", "required": [ diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index 4a7885279c..aa5e8fd26f 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -277,6 +277,9 @@ pub(crate) trait OmicronZoneTypeExt { OmicronZoneType::ClickhouseKeeper { dataset, address, .. } => { Some((dataset, DatasetType::ClickhouseKeeper, address)) } + OmicronZoneType::ClickhouseServer { dataset, address, .. } => { + Some((dataset, DatasetType::ClickhouseServer, address)) + } OmicronZoneType::CockroachDb { dataset, address, .. } => { Some((dataset, DatasetType::CockroachDb, address)) } diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index a79d5b68e7..e319b3fa15 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -1587,6 +1587,24 @@ impl ServiceManager { RunningZone::boot(installed_zone).await? } + ZoneArgs::Omicron(OmicronZoneConfigLocal { + zone: + OmicronZoneConfig { + zone_type: OmicronZoneType::ClickhouseServer { .. }, + underlay_address: _, + .. + }, + .. 
+ }) => { + // We aren't yet deploying this service + error!( + &self.inner.log, + "Deploying ClickhouseServer zones is not yet supported" + ); + + todo!() + } + ZoneArgs::Omicron(OmicronZoneConfigLocal { zone: OmicronZoneConfig { diff --git a/sled-storage/src/dataset.rs b/sled-storage/src/dataset.rs index 26b5085609..74f2be782f 100644 --- a/sled-storage/src/dataset.rs +++ b/sled-storage/src/dataset.rs @@ -142,6 +142,7 @@ pub enum DatasetType { Crucible, Clickhouse, ClickhouseKeeper, + ClickhouseServer, ExternalDns, InternalDns, } @@ -164,6 +165,7 @@ impl DatasetType { Self::CockroachDb => DatasetKind::Cockroach, Self::Clickhouse => DatasetKind::Clickhouse, Self::ClickhouseKeeper => DatasetKind::ClickhouseKeeper, + Self::ClickhouseServer => DatasetKind::ClickhouseServer, Self::ExternalDns => DatasetKind::ExternalDns, Self::InternalDns => DatasetKind::InternalDns, } @@ -206,6 +208,7 @@ impl std::fmt::Display for DatasetType { CockroachDb => "cockroachdb", Clickhouse => "clickhouse", ClickhouseKeeper => "clickhouse_keeper", + ClickhouseServer => "clickhouse_server", ExternalDns => "external_dns", InternalDns => "internal_dns", }; From cf0b0fda6c792bfc3d347fdefec368dd1e7b1f93 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Tue, 13 Aug 2024 10:46:18 -0700 Subject: [PATCH 11/13] Update Dendrite to use new timeseries, expunge the old (#6271) --- .../replicated/9/timeseries-to-delete.txt | 47 +++++++++++++++++++ .../single-node/9/timeseries-to-delete.txt | 47 +++++++++++++++++++ oximeter/db/src/model.rs | 2 +- package-manifest.toml | 12 ++--- tools/dendrite_openapi_version | 4 +- tools/dendrite_stub_checksums | 6 +-- 6 files changed, 106 insertions(+), 12 deletions(-) create mode 100644 oximeter/db/schema/replicated/9/timeseries-to-delete.txt create mode 100644 oximeter/db/schema/single-node/9/timeseries-to-delete.txt diff --git a/oximeter/db/schema/replicated/9/timeseries-to-delete.txt b/oximeter/db/schema/replicated/9/timeseries-to-delete.txt new file mode 100644 index 0000000000..449d2e9155 --- /dev/null +++ b/oximeter/db/schema/replicated/9/timeseries-to-delete.txt @@ -0,0 +1,47 @@ +data_link:abort +data_link:b_e_r_check_done +data_link:b_e_r_check_start +data_link:bad_sync_headers +data_link:disabled +data_link:enabled +data_link:end +data_link:errored_blocks +data_link:fec_align +data_link:fec_corr_cnt +data_link:fec_hi_ser +data_link:fec_ser_lane0 +data_link:fec_ser_lane1 +data_link:fec_ser_lane2 +data_link:fec_ser_lane3 +data_link:fec_ser_lane4 +data_link:fec_ser_lane5 +data_link:fec_ser_lane6 +data_link:fec_ser_lane7 +data_link:fec_uncorr_cnt +data_link:idle +data_link:link_down +data_link:link_up +data_link:monitor_p_r_b_s_errors +data_link:pci_hi_ber +data_link:pcs_block_lock_loss +data_link:pcs_invalid_errors +data_link:pcs_sync_loss +data_link:pcs_unknown_errors +data_link:pcs_valid_errors +data_link:remote_fault +data_link:rx_buf_full +data_link:rx_bytes +data_link:rx_crc_errs +data_link:rx_errs +data_link:rx_pkts +data_link:tofino3_states +data_link:tx_bytes +data_link:tx_errs +data_link:tx_pkts +data_link:wait_auto_neg_done +data_link:wait_auto_neg_link_training_done +data_link:wait_d_f_e_done +data_link:wait_p_l_l_ready +data_link:wait_signal_o_k +data_link:wait_test_done +sidecar:sample_time diff --git a/oximeter/db/schema/single-node/9/timeseries-to-delete.txt b/oximeter/db/schema/single-node/9/timeseries-to-delete.txt new file mode 100644 index 0000000000..449d2e9155 --- /dev/null +++ b/oximeter/db/schema/single-node/9/timeseries-to-delete.txt @@ -0,0 +1,47 @@ 
+data_link:abort +data_link:b_e_r_check_done +data_link:b_e_r_check_start +data_link:bad_sync_headers +data_link:disabled +data_link:enabled +data_link:end +data_link:errored_blocks +data_link:fec_align +data_link:fec_corr_cnt +data_link:fec_hi_ser +data_link:fec_ser_lane0 +data_link:fec_ser_lane1 +data_link:fec_ser_lane2 +data_link:fec_ser_lane3 +data_link:fec_ser_lane4 +data_link:fec_ser_lane5 +data_link:fec_ser_lane6 +data_link:fec_ser_lane7 +data_link:fec_uncorr_cnt +data_link:idle +data_link:link_down +data_link:link_up +data_link:monitor_p_r_b_s_errors +data_link:pci_hi_ber +data_link:pcs_block_lock_loss +data_link:pcs_invalid_errors +data_link:pcs_sync_loss +data_link:pcs_unknown_errors +data_link:pcs_valid_errors +data_link:remote_fault +data_link:rx_buf_full +data_link:rx_bytes +data_link:rx_crc_errs +data_link:rx_errs +data_link:rx_pkts +data_link:tofino3_states +data_link:tx_bytes +data_link:tx_errs +data_link:tx_pkts +data_link:wait_auto_neg_done +data_link:wait_auto_neg_link_training_done +data_link:wait_d_f_e_done +data_link:wait_p_l_l_ready +data_link:wait_signal_o_k +data_link:wait_test_done +sidecar:sample_time diff --git a/oximeter/db/src/model.rs b/oximeter/db/src/model.rs index 05667058b5..f27df4ed49 100644 --- a/oximeter/db/src/model.rs +++ b/oximeter/db/src/model.rs @@ -45,7 +45,7 @@ use uuid::Uuid; /// - [`crate::Client::initialize_db_with_version`] /// - [`crate::Client::ensure_schema`] /// - The `clickhouse-schema-updater` binary in this crate -pub const OXIMETER_VERSION: u64 = 8; +pub const OXIMETER_VERSION: u64 = 9; // Wrapper type to represent a boolean in the database. // diff --git a/package-manifest.toml b/package-manifest.toml index 5ee81e722b..2c68257050 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -660,8 +660,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. source.type = "prebuilt" source.repo = "dendrite" -source.commit = "9811438cc91c6ec4e8a8ca12479c920bb25fec81" -source.sha256 = "4b09ea6d89af353fd4240a3cfde8655c555f6f42e05c6fc4a4e32724f86bb749" +source.commit = "8293f28df659c070b48e13f87a51b836238b406e" +source.sha256 = "7400e4b0942b33af64a9aad1a429b0e2446e126f58a780328cf10eb46c63b7f8" output.type = "zone" output.intermediate_only = true @@ -687,8 +687,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. source.type = "prebuilt" source.repo = "dendrite" -source.commit = "9811438cc91c6ec4e8a8ca12479c920bb25fec81" -source.sha256 = "224ff076a3031d5b913e40084a48fce7bdd08e8ef1abd1ab74df0058963bb3b2" +source.commit = "8293f28df659c070b48e13f87a51b836238b406e" +source.sha256 = "68bf16452a3159529fb1bd11f43adfb002020d086e0f64f48bd766bf47843ae9" output.type = "zone" output.intermediate_only = true @@ -707,8 +707,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. 
source.type = "prebuilt" source.repo = "dendrite" -source.commit = "9811438cc91c6ec4e8a8ca12479c920bb25fec81" -source.sha256 = "66b04128c41ad9cd26ca3746d51fff5d295ca65f48e7aabee616026934cc8d5e" +source.commit = "8293f28df659c070b48e13f87a51b836238b406e" +source.sha256 = "b7d6a1a20f302ded9c6e4bbba66b9432bec5edda593edfcdbb9429a95201655a" output.type = "zone" output.intermediate_only = true diff --git a/tools/dendrite_openapi_version b/tools/dendrite_openapi_version index b7d34debd1..652ebc31eb 100755 --- a/tools/dendrite_openapi_version +++ b/tools/dendrite_openapi_version @@ -1,2 +1,2 @@ -COMMIT="9811438cc91c6ec4e8a8ca12479c920bb25fec81" -SHA2="12dc61e7c62b2e1ee1cf3c2bf7cdda6bee6ec96925d2fc1c021c6c1a8fdd56cd" +COMMIT="8293f28df659c070b48e13f87a51b836238b406e" +SHA2="3a54305ab4b1270c9a5fb0603f481fce199f3767c174a03559ff642f7f44687e" diff --git a/tools/dendrite_stub_checksums b/tools/dendrite_stub_checksums index 5d5b60ff57..cd8eb65a3e 100644 --- a/tools/dendrite_stub_checksums +++ b/tools/dendrite_stub_checksums @@ -1,3 +1,3 @@ -CIDL_SHA256_ILLUMOS="4b09ea6d89af353fd4240a3cfde8655c555f6f42e05c6fc4a4e32724f86bb749" -CIDL_SHA256_LINUX_DPD="fb597785b6fd94b0840a80ff82bc596426aa6b815dd64793075f05d2ba5db38d" -CIDL_SHA256_LINUX_SWADM="9be30b688301debe4103057730ff9a426c96b45d571a6287268f381d8a11dbc1" +CIDL_SHA256_ILLUMOS="7400e4b0942b33af64a9aad1a429b0e2446e126f58a780328cf10eb46c63b7f8" +CIDL_SHA256_LINUX_DPD="290edfc4076d31d6f70aa7cc16ce758e10d14777d8542b688fa2880fdfde398c" +CIDL_SHA256_LINUX_SWADM="e1e35784538a4fdd76dc257cc636ac3f43f7ef2842dabfe981f17f8ce6b8e1a2" From b6e9078b6e15b2bc48874312d876e3aa01b52881 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 13 Aug 2024 10:49:53 -0700 Subject: [PATCH 12/13] move relevant docs from RFDs 48, 61 into Omicron (#6206) --- README.adoc | 2 +- docs/architecture-rev-2024-08-01.svg | 1 + docs/control-plane-architecture.adoc | 254 +++++++++++++++++++++++++++ 3 files changed, 256 insertions(+), 1 deletion(-) create mode 100644 docs/architecture-rev-2024-08-01.svg create mode 100644 docs/control-plane-architecture.adoc diff --git a/README.adoc b/README.adoc index 6b24821c6e..449bd3e5ea 100644 --- a/README.adoc +++ b/README.adoc @@ -14,7 +14,7 @@ Omicron is open-source. But we're pretty focused on our own goals for the forese https://docs.oxide.computer/api[Docs are automatically generated for the public (externally-facing) API] based on the OpenAPI spec that itself is automatically generated from the server implementation. You can generate your own docs for either the public API or any of the internal APIs by feeding the corresponding OpenAPI specs (in link:./openapi[]) into an OpenAPI doc generator. -There are some internal design docs in the link:./docs[] directory. +There are some internal design docs in the link:./docs[] directory. You might start with link:./docs/control-plane-architecture.adoc[]. For more design documentation and internal Rust API docs, see the https://rust.docs.corp.oxide.computer/omicron/[generated Rust documentation]. 
You can generate this yourself with: diff --git a/docs/architecture-rev-2024-08-01.svg b/docs/architecture-rev-2024-08-01.svg new file mode 100644 index 0000000000..a952297de4 --- /dev/null +++ b/docs/architecture-rev-2024-08-01.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/control-plane-architecture.adoc b/docs/control-plane-architecture.adoc new file mode 100644 index 0000000000..931da9ce02 --- /dev/null +++ b/docs/control-plane-architecture.adoc @@ -0,0 +1,254 @@ +:showtitle: +:numbered: +:toc: left + +// +// The sources for the diagrams in this document are in the Oxide Google Drive +// folder for Control Plane Architecture: +// https://drive.google.com/open?id=1OI-QxmapK7oYGFRGp0suJdpQDft-qVAz +// + += Control plane architecture + +NOTE: Much of this material originally came from <> and <>. This is now the living documentation for all the material covered here. + +NOTE: The RFD references in this documentation may be Oxide-internal. Where possible, we're trying to move relevant documentation from those RFDs into docs here. + +== What is the control plane + +In software systems the terms **data plane** and **control plane** are often used to refer to the parts of the system that directly provide resources to users (the data plane) and the parts that support the configuration, control, monitoring, and operation of the system (the control plane). Within the Oxide system, we say that the data plane comprises those parts that provide CPU resources (including both the host CPU and hypervisor software), storage resources, and network resources. The control plane provides the APIs through which users provision, configure, and monitor these resources and the mechanisms through which these APIs are implemented. Also part of the control plane are the APIs and facilities through which operators manage the system itself, including fault management, alerting, software updates for various components of the system, and so on. + +Broadly, the control plane must provide: + +* an externally-facing API endpoint described in <> through which users can provision elastic infrastructure backed by the system. This includes APIs for compute instances, storage, networking, as well as supporting resources like organizations, users, groups, ssh keys, tags, and so on. This API may be used by developers directly as well as the developer console backend. See <>. +* an externally-facing API endpoint for all operator functions. This is a long list, including configuration and management of hardware and software components and monitoring. +* implementation of lifecycle activities, like initial system setup; adding, removing, or replacing servers or other components; and the like. +* facilities for remote support by Oxide, including secure access to crash dumps, core files, log files, and system consoles. + +== Fundamental properties + +NOTE: These are design goals. They have not all been fully implemented yet. + +**Availability.** Availability of the control plane refers to the property that requests to provision resources succeed when the underlying resources are available within the system and requests to reconfigure or monitor resources succeed as long as they are well-formed. Unavailability refers to request failure due to hardware or software failure. + +IMPORTANT: Generally, the control plane is expected to remain **available** in the face of any two hardware or software failures, including transient failures of individual compute sleds, power rectifiers, switches, or the like. 
+
+**Security.** Older versions of <<rfd6>> discussed control plane security in great detail. That content needs to be extracted from the history and probably put here.
+
+**Supportability and debuggability.** Effective customer support includes rapidly diagnosing issues and releasing fixes with low-risk updates. To achieve this, all the software in the system, including the control plane, must be built with supportability in mind: we must be able to collect enough information about failures to diagnose them from their first occurrence in the field wherever possible, and we must be able to update software with low risk to the system. Details will be covered in an RFD to-be-named-later.
+
+== Parts of the control plane
+
+=== Crash course on hardware architecture
+
+For our purposes, an Oxide rack comprises three types of boards (systems):
+
+* Up to 32 compute **sleds** (servers).
These are sometimes called **Gimlets**, though "Gimlet" technically refers to a particular hardware generation. Within the sled, the **host system** is the x86 box we generally think of as "the server".
+* 1 or 2 **switches**, each attached via PCIe to one of the 32 compute sleds. (The switches are _also_ connected to each of the 32 sleds for networking. The PCIe connection is for control of the switch itself, which is handled by just the one attached sled.) The chassis that house the switches are sometimes called **Sidecars**, though "Sidecar" technically refers to a particular hardware generation. Sleds that are attached to switches are often called **Scrimlets**. The name is a little unfortunate: it obviously comes from "Gimlet", but a Scrimlet is not necessarily a Gimlet, since "Gimlet" refers to a specific hardware generation.
+* 1-2 power shelves, each with a **Power Shelf Controller (PSC)** that provides basic monitoring and control for the rectifiers that make up the power shelf.
+
+Each type of system (Gimlet, Sidecar, and PSC) contains a **service processor** (SP) that's responsible for basic monitoring and control, typically including power control and thermal management.
+
+<<rfd210>> discusses service processors in more detail.
+
+=== Components that run alongside specific hardware
+
+.Overview of the control plane
+image::architecture-rev-2024-08-01.svg[Control Plane Architecture]
+
+At the "bottom" of the stack, we have a few basic components that reside alongside the specific pieces of hardware that they manage:
+
+* On each sled, the **sled agent** manages instances, storage, networking, and the sled's other resources. Sled agent also collects information about hardware and reports it to Nexus. Each sled also runs either a **boundary NTP** or **internal NTP** service to synchronize the sled's clock. More on boundary NTP below.
+* On the two Scrimlets, a "switch zone" provides additional functionality related to the switch:
+** **Dendrite** provides APIs for configuring the switch itself (e.g., populating various tables used for packet forwarding, NAT, etc.).
+** **Management Gateway Service (MGS)** provides APIs for communicating with all the rack's service processors (including those on the sleds, Sidecars, and PSCs). See <<rfd210>> for details.
+** **Wicket** and its associated service **wicketd** provide a text user interface (TUI) that's accessible over the rack's technician ports. Wicket is used for initial system setup (before networking has been configured) and for support.
+** **Boundary NTP** provides NTP service for all sleds in the rack based on upstream NTP servers provided by the customer.
+
+.Components deployed alongside specific hardware
+[cols="1h,2,4",stripes="none",options="header"]
+|===
+| Component
+| How it's deployed
+| Availability/scalability
+
+| Sled agent
+| One per sled, tied to that specific sled
+| N/A
+
+| Internal NTP
+| One zone per non-Scrimlet sled
+| N/A
+
+| Boundary NTP
+| One zone per Scrimlet. Both instances within a rack are fungible.
+| There are two. Short-term failure (order of hours or even days) is unlikely to affect anything since sled clocks do not drift that quickly.
+
+| Dendrite
+| Part of the switch zone (one per Scrimlet), tied to that specific switch
+| Unavailability of either instance results in loss of ability to configure and monitor the corresponding switch.
+
+| Management Gateway
+| Part of the switch zone (one per Scrimlet). Both instances within one rack are fungible.
| Only one of the two instances is generally required to maintain service.
+
+| Wicket
+| Part of the switch zone (one per Scrimlet). Both instances within one rack are fungible.
+| Wickets operate independently. Failure of one means unavailability of the TUI over that technician port.
+
+|===
+
+=== Higher-level components
+
+Most other components:
+
+* are deployed in illumos zones
+* don't care where they run and can even be deployed multiple times on the same sled
+* can be deployed multiple times for availability, horizontal scalability, or both
+
+They are:
+
+* **Nexus** provides primary control for the whole control plane. Nexus hosts all user-facing APIs (both operator and customer), the web console, and internal APIs for other control plane components to report inventory, generate alerts, and so on. Nexus is also responsible for background control plane activity, including utilization management, server failure detection and recovery, and the like. Persistent state is stored elsewhere (in CockroachDB), which allows Nexus to be scaled separately.
+* **CockroachDB** provides a replicated, strongly-consistent, horizontally scalable database that stores virtually all control plane data. See <<rfd53>> and <<rfd110>> for details.
+* **Clickhouse** provides storage and querying services for metric data collected from all components in the rack. See <<rfd125>> for more information.
+* **Oximeter** collects metric data from the other components and stores it in Clickhouse. See <<rfd162>> for more information.
+* **External DNS** operates authoritative DNS nameservers for end users and operators. These are authoritative nameservers for whatever DNS name the customer specifies. They currently just provide DNS names for the external API and web console.
+* **Internal DNS** provides DNS names for all control plane components. This is how most of the control plane discovers its dependencies. (See <<rfd206>> and <<rfd248>>.)
+
+
+.Hardware-agnostic components
+[cols="1h,2,4,4",stripes="none",options="header"]
+|===
+| Component
+| How it's deployed
+| Horizontal scalability
+| Availability
+
+| Nexus
+| Using zones, as many as needed. Instances are fungible.
+| Not architecturally limited. State provided by CockroachDB.
+| With N instances needed to handle load, and M instances deployed, can survive M - N failures.
+
+| CockroachDB
+| Using zones, as many as needed. Instances are fungible.
+| Required, provided by CockroachDB cluster expansion.
+| Required, provided by CockroachDB range replication.
+
+| Clickhouse
+| Using zones, as many as needed. Instances are fungible.
+| TBD
+| Required, provided by Clickhouse replication (see <<rfd468>>).
+
+| Oximeter
+| Using zones, as many as needed.
+| Yes. Configuration managed by Nexus, stored in CockroachDB, and cached in local storage for improved availability when other components are down.
+| TBD.
+
+| External DNS
+| Using zones, as many as needed. Instances are fungible.
+| Not architecturally limited. Generally limited by the number of external DNS server IP addresses provided by the customer, which is usually 2-5.
+| Generally, only one is needed for service.
+
+| Internal DNS
+| Using zones, as many as needed. Instances are fungible.
+| Hardcoded limit of 5.
+| With N instances needed to handle load, and M instances deployed, can survive M - N failures.
+
+|===
+
+== Design principles
+
+=== Basics
+
+As much as possible, components are deployed in illumos zones. These are lightweight containers that act as their own complete systems (e.g., with their own dedicated networking stack with its own interfaces, IPs, etc.).
+
+Oxide-produced components are written in Rust. They communicate over HTTP using APIs managed via OpenAPI, using Dropshot. HTTP may not provide the best latency, but we don't expect the throughput of API requests to be so high or the target latency so low that the overhead of HTTP internally will noticeably impact the customer experience. Using OpenAPI enables us to leverage investments in OpenAPI libraries, tooling, and documentation that we need for the external API. Rigorous use of OpenAPI, including automatically generating OpenAPI specifications from server implementations, allows us to automatically identify potentially breaking API changes. This information will eventually be included in metadata associated with each component's update images so that the upgrade software can ensure that only compatible combinations of components are deployed.
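+
+To make that concrete, here is a minimal, hypothetical Dropshot endpoint. The attribute arguments plus the handler's request and response types are what the OpenAPI description is generated from. This is a sketch, not code taken from Omicron:
+
+[source,rust]
+----
+use dropshot::{
+    endpoint, ApiDescription, HttpError, HttpResponseOk, RequestContext,
+};
+use schemars::JsonSchema;
+use serde::Serialize;
+
+#[derive(Serialize, JsonSchema)]
+struct PingResult {
+    ok: bool,
+}
+
+// Dropshot derives the OpenAPI operation for this endpoint from the
+// attribute and the handler's type signature.
+#[endpoint {
+    method = GET,
+    path = "/v1/ping",
+}]
+async fn ping(
+    _rqctx: RequestContext<()>,
+) -> Result<HttpResponseOk<PingResult>, HttpError> {
+    Ok(HttpResponseOk(PingResult { ok: true }))
+}
+
+fn main() {
+    // A real component would serve this description with an HTTP server;
+    // registering it is enough to validate the endpoint definition.
+    let mut api = ApiDescription::new();
+    api.register(ping).expect("valid endpoint");
+}
+----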
+
+Service discovery happens via DNS. See <<rfd206>> and <<rfd248>>.
+
+=== Nexus, data flow
+
+Nexus is the place where system-wide decisions get made. CockroachDB is the source of truth for all configuration.
+
+Nexus stores all of its state in CockroachDB. It's the only component that communicates directly with CockroachDB.
+
+Nexus instances operate independently, without directly coordinating with each other except through CockroachDB.
+
+Generally, when a change gets made, the process is:
+
+1. Nexus receives a request to make the change (e.g., via the external API).
+2. Nexus validates the requested change.
+3. Nexus stores the information in CockroachDB. (This is the point where the change is serialized against any concurrent changes.)
+4. Nexus propagates the change to other components that need to know about it.
+
+There are a few basic contexts in Nexus:
+
+* **API requests** from either the external or internal API. Here, Nexus is latency-sensitive. When we make database queries or other requests in this context, we usually do _not_ retry transient failures, but leave that to callers (see the https://en.wikipedia.org/wiki/End-to-end_principle["end-to-end principle"]). API request handlers may kick off sagas or activate background tasks.
+* **Distributed sagas** are a https://www.youtube.com/watch?v=0UTOLRTwOX0[design pattern] for carrying out multi-step operations in a distributed system. Saga actions generally _do_ retry transient errors indefinitely.
+* **Background tasks** are periodic or event-triggered activities that manage everything else that has to happen in the system (e.g., change propagation, CockroachDB cluster management, fault tolerance, etc.). Nexus has a framework for background tasks that's oriented around the "reconciler" pattern (see <<rfd373>>; a rough sketch appears below). In this context, we also usually don't retry individual operations -- instead, the entire activity will be retried on a periodic basis. Background tasks are structured to re-evaluate the state of the world each time they're run and then determine what to do, on the assumption that things may have changed since the last time they ran.
+
+It's essential that components provide visibility into what they're doing for debugging and support. Software should be able to exonerate itself when things are broken.
+
+* API requests are short-lived. The Nexus log is currently the only real way to see what these have done.
+* Sagas are potentially long-lived. Without needing any per-saga work, the saga log provides detailed information about which steps have run, which steps are in progress, and the results of each step that completed.
+* Background tasks are continuous processes. They can provide whatever detailed status they want to, including things like activity counters, error counters, ringbuffers of recent events, and data produced by the task. These can be viewed with `omdb`.
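+
+As a rough sketch of the reconciler pattern referenced above (simplified, with hypothetical names; this is not Nexus's actual task framework):
+
+[source,rust]
+----
+use std::{thread, time::Duration};
+
+// Hypothetical stand-ins: in Nexus, intended state lives in CockroachDB and
+// actual state is read back from the managed component.
+#[derive(Debug, PartialEq)]
+struct DnsConfig {
+    generation: u64,
+}
+
+fn read_intended_state() -> DnsConfig {
+    DnsConfig { generation: 3 }
+}
+
+fn read_deployed_state() -> DnsConfig {
+    DnsConfig { generation: 2 }
+}
+
+fn reconcile() {
+    // Re-evaluate the world from scratch on every activation rather than
+    // assuming anything about previous runs.
+    let intended = read_intended_state();
+    let deployed = read_deployed_state();
+    if intended != deployed {
+        // Propagate the change. Individual failures are not retried here;
+        // the next periodic activation simply tries again.
+        println!("propagating {:?} -> {:?}", deployed, intended);
+    }
+}
+
+fn main() {
+    // Activated periodically (and, in the real system, also explicitly,
+    // e.g., by an API request handler).
+    loop {
+        reconcile();
+        thread::sleep(Duration::from_secs(60));
+    }
+}
+----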
+
+== Cold start
+
+"Cold start" refers to starting the control plane from a rack that's completely powered off. Achieving this requires careful consideration of where configuration is stored and how configuration changes flow through the system.
+
+We'll start from the point where sleds are powered on, even though a lot happens with the rectifiers, service processors, Sidecars, etc. before that point. Once host systems are powered on:
+
+* Sled agents start up, communicate with each other, and form a trust quorum that enables each of them to decrypt their local storage. This local storage includes:
+** a **bootstore** containing basic network configuration needed to bring up the rack
+** information about what control plane services are running on this sled
+* Sled agents apply any needed network configuration and start any services they're supposed to be running:
+** On Scrimlets, the switch zone and boundary NTP are started. Boundary NTP synchronizes time from the customer-provided NTP servers.
+** On non-Scrimlets, internal NTP is started. The rest of cold boot waits until time has been synchronized from the boundary NTP instances.
+** Once time is synchronized, internal DNS services are started so that components can find each other.
+** Once internal DNS is available, all other services are started concurrently.
+*** CockroachDB nodes start up, discover the rest of the cluster via DNS, and form a cluster.
+*** Nexus starts up and waits for CockroachDB to become available.
+*** All other services start up and wait for their dependencies to become available.
+
+For this to work:
+
+* **Bootstore** must contain enough information to configure networking on the switches and each host to reach other services within the rack as well as the outside world (for NTP).
+* **Internal DNS** must be able to come up without any external dependencies, meaning it stores a complete copy of all DNS data locally.
+
+However, Nexus is the place where all _changes_ to configuration are made, and CockroachDB is the source of truth for all configuration. As a result, when changing bootstore contents or internal DNS, the change is first made at Nexus, stored into CockroachDB, and then propagated to all sleds and internal DNS instances for local persistent storage so that it's available on cold start (of the _sled_) without the rest of the control plane being up.
+
+This is a very rough approximation, but it gives an idea of the dependencies associated with cold start.
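+
+Internal DNS is also how components find each other once they're up: each service is published under a name in the internal DNS zone, and dependents look those names up. The sketch below assumes SRV-style records and illustrative names; `lookup_srv` stands in for a real DNS client:
+
+[source,rust]
+----
+// Hypothetical sketch of DNS-based service discovery; names are illustrative.
+struct SrvRecord {
+    target: String,
+    port: u16,
+}
+
+fn lookup_srv(name: &str) -> Vec<SrvRecord> {
+    // A real implementation would query the internal DNS servers.
+    println!("querying SRV records for {}", name);
+    vec![SrvRecord { target: "example-sled.local".to_string(), port: 8080 }]
+}
+
+fn main() {
+    // e.g., find every instance of some internal service by name:
+    for rec in lookup_srv("_nexus._tcp.control-plane.oxide.internal") {
+        println!("found instance at {}:{}", rec.target, rec.port);
+    }
+}
+----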
+
+[bibliography]
+== References
+
+Unfortunately, most of these RFDs are not yet public.
+
+* [[[rfd4, RFD 4]]] https://rfd.shared.oxide.computer/rfd/4/[RFD 4 User Facing API]
+* [[[rfd6, RFD 6]]] https://rfd.shared.oxide.computer/rfd/6/[RFD 6 Threat Model]. Note the reference above comes from an earlier version of RFD 6 (7e44771b239c0458aea2b6e2045294d41b79cb22 or earlier).
+* [[[rfd24, RFD 24]]] https://rfd.shared.oxide.computer/rfd/24/[RFD 24 Multi-Rack Oxide Deployments]
+* [[[rfd30, RFD 30]]] https://rfd.shared.oxide.computer/rfd/30/[RFD 30 Oxide Console Prototype]
+* [[[rfd48, RFD 48]]] https://rfd.shared.oxide.computer/rfd/48/[RFD 48 Control Plane Requirements]
+* [[[rfd53, RFD 53]]] https://rfd.shared.oxide.computer/rfd/53/[RFD 53 Control plane data storage requirements]
+* [[[rfd61, RFD 61]]] https://rfd.shared.oxide.computer/rfd/61/[RFD 61 Control Plane Architecture and Design]
+* [[[rfd110, RFD 110]]] https://rfd.shared.oxide.computer/rfd/110/[RFD 110 CockroachDB for the control plane database]
+* [[[rfd125, RFD 125]]] https://rfd.shared.oxide.computer/rfd/125/[RFD 125 Telemetry requirements and building blocks]
+* [[[rfd162, RFD 162]]] https://rfd.shared.oxide.computer/rfd/162/[RFD 162 Metrics collection architecture and design]
+* [[[rfd206, RFD 206]]] https://rfd.shared.oxide.computer/rfd/206/[RFD 206 Service Discovery]
+* [[[rfd210, RFD 210]]] https://rfd.shared.oxide.computer/rfd/210/[RFD 210 Omicron, service processors, and power shelf controllers]
+* [[[rfd248, RFD 248]]] https://rfd.shared.oxide.computer/rfd/248/[RFD 248 Omicron service discovery: server side]
+* [[[rfd373, RFD 373]]] https://rfd.shared.oxide.computer/rfd/373/[RFD 373 Reliable Persistent Workflows]
+* [[[rfd468, RFD 468]]] https://rfd.shared.oxide.computer/rfd/468/[RFD 468 Rolling out replicated ClickHouse to new and existing racks]

From e48cd90c6eaca31a7e377256d46aad7aa5167958 Mon Sep 17 00:00:00 2001
From: Rain
Date: Tue, 13 Aug 2024 12:24:30 -0700
Subject: [PATCH 13/13] [omicron-zones] ensure name_prefix for clickhouse-server is valid (#6312)

Followup from #6297 -- `name_prefix` requires dashes.

---
 Cargo.lock                               |  1 +
 nexus-sled-agent-shared/Cargo.toml       |  1 +
 nexus-sled-agent-shared/src/inventory.rs | 28 ++++++++++++++++++++++--
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 5b38a4905e..3f7b669e37 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5140,6 +5140,7 @@ dependencies = [
  "schemars",
  "serde",
  "sled-hardware-types",
+ "strum",
  "uuid",
 ]

diff --git a/nexus-sled-agent-shared/Cargo.toml b/nexus-sled-agent-shared/Cargo.toml
index 8e2358e902..544cebfbe4 100644
--- a/nexus-sled-agent-shared/Cargo.toml
+++ b/nexus-sled-agent-shared/Cargo.toml
@@ -14,4 +14,5 @@ omicron-workspace-hack.workspace = true
 schemars.workspace = true
 serde.workspace = true
 sled-hardware-types.workspace = true
+strum.workspace = true
 uuid.workspace = true

diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs
index 2f1361a6f2..2a94fc50db 100644
--- a/nexus-sled-agent-shared/src/inventory.rs
+++ b/nexus-sled-agent-shared/src/inventory.rs
@@ -20,6 +20,7 @@ use serde::{Deserialize, Serialize};
 // Export this type for convenience -- this way, dependents don't have to
 // depend on sled-hardware-types.
 pub use sled_hardware_types::Baseboard;
+use strum::EnumIter;
 use uuid::Uuid;
 
 /// Identifies information about disks which may be attached to Sleds.
@@ -381,7 +382,9 @@ impl OmicronZoneType {
 /// the four representations if at all possible. If you must add a new one,
 /// please add it here rather than doing something ad-hoc in the calling code
 /// so it's more legible.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
+#[derive(
+    Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, EnumIter,
+)]
 pub enum ZoneKind {
     BoundaryNtp,
     Clickhouse,
@@ -453,7 +456,7 @@ impl ZoneKind {
             ZoneKind::BoundaryNtp | ZoneKind::InternalNtp => Self::NTP_PREFIX,
             ZoneKind::Clickhouse => "clickhouse",
             ZoneKind::ClickhouseKeeper => "clickhouse-keeper",
-            ZoneKind::ClickhouseServer => "clickhouse_server",
+            ZoneKind::ClickhouseServer => "clickhouse-server",
             // Note "cockroach" for historical reasons.
             ZoneKind::CockroachDb => "cockroach",
             ZoneKind::Crucible => "crucible",
@@ -486,3 +489,24 @@ impl ZoneKind {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use omicron_common::api::external::Name;
+    use strum::IntoEnumIterator;
+
+    use super::*;
+
+    #[test]
+    fn test_name_prefixes() {
+        for zone_kind in ZoneKind::iter() {
+            let name_prefix = zone_kind.name_prefix();
+            name_prefix.parse::<Name>().unwrap_or_else(|e| {
+                panic!(
+                    "failed to parse name prefix {:?} for zone kind {:?}: {}",
+                    name_prefix, zone_kind, e
+                );
+            });
+        }
+    }
+}
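+
+// Illustrative companion check (an assumption about `Name`'s rules, not part
+// of the original change): `Name` rejects underscores, which is why the
+// "clickhouse_server" prefix had to become "clickhouse-server" above.
+#[cfg(test)]
+mod name_prefix_example {
+    use omicron_common::api::external::Name;
+
+    #[test]
+    fn underscores_are_rejected() {
+        assert!("clickhouse_server".parse::<Name>().is_err());
+        assert!("clickhouse-server".parse::<Name>().is_ok());
+    }
+}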