From 89b998c1c24e26b9e1161acbbd1b236f40470bdd Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Wed, 17 Jul 2024 10:32:22 -0700 Subject: [PATCH 01/21] Move Dendrite-specific timeseries definitions to TOML (#6103) --- oximeter/oximeter/schema/dendrite.toml | 58 ++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 oximeter/oximeter/schema/dendrite.toml diff --git a/oximeter/oximeter/schema/dendrite.toml b/oximeter/oximeter/schema/dendrite.toml new file mode 100644 index 0000000000..e822069a2f --- /dev/null +++ b/oximeter/oximeter/schema/dendrite.toml @@ -0,0 +1,58 @@ +format_version = 1 + +[target] +name = "dendrite" +description = "Oxide switch management daemon" +authz_scope = "fleet" +versions = [ + { version = 1, fields = [ "rack_id", "sled_model", "sled_revision", "sled_id", "sled_serial" ] }, +] + +[[metrics]] +name = "sample_collection_duration" +description = "Duration spent collecting all timeseries samples" +units = "seconds" +datum_type = "f64" +versions = [ + # Note: The sample collection time includes the time spent querying the + # switch for its statistics, which is why these fields are included. + # Dendrite may eventually report statistics about itself, or other aspects + # not related to the switch, so they belong here, not the target. + { added_in = 1, fields = [ "switch_model", "switch_revision", "switch_id", "switch_serial" ] } +] + +[fields.rack_id] +type = "uuid" +description = "ID of the rack containing the switch" + +[fields.sled_model] +type = "string" +description = "The model of the sled managing the switch" + +[fields.sled_revision] +type = "u32" +description = "Revision number of the sled managing the switch" + +[fields.sled_id] +type = "uuid" +description = "ID of the sled managing the switch" + +[fields.sled_serial] +type = "string" +description = "Serial number of the sled managing the switch" + +[fields.switch_model] +type = "string" +description = "The model of the switch being managed" + +[fields.switch_revision] +type = "u32" +description = "Revision number of the switch being managed" + +[fields.switch_id] +type = "uuid" +description = "ID of the switch being managed" + +[fields.switch_serial] +type = "string" +description = "Serial number of the switch being managed" From 30027becdc2c226a4371e4e8864456ebf072e245 Mon Sep 17 00:00:00 2001 From: David Crespo Date: Wed, 17 Jul 2024 17:00:24 -0500 Subject: [PATCH 02/21] Bump web console (clone firewall rule) (#6112) https://github.com/oxidecomputer/console/compare/4377d015...17ae890c * [17ae890c](https://github.com/oxidecomputer/console/commit/17ae890c) oxidecomputer/console#2331 * [3a25f287](https://github.com/oxidecomputer/console/commit/3a25f287) oxidecomputer/console#2330 * [d7e92f65](https://github.com/oxidecomputer/console/commit/d7e92f65) oxidecomputer/console#2250 * [d7712f1c](https://github.com/oxidecomputer/console/commit/d7712f1c) oxidecomputer/console#2313 * [81bd2abd](https://github.com/oxidecomputer/console/commit/81bd2abd) oxidecomputer/console#2302 * [de7e443f](https://github.com/oxidecomputer/console/commit/de7e443f) link to images doc from the image upload form * [c2e7ca1c](https://github.com/oxidecomputer/console/commit/c2e7ca1c) oxidecomputer/console#2323 * [4147221c](https://github.com/oxidecomputer/console/commit/4147221c) oxidecomputer/console#2319 * [636fed3e](https://github.com/oxidecomputer/console/commit/636fed3e) oxidecomputer/console#2318 * [acce4fdd](https://github.com/oxidecomputer/console/commit/acce4fdd) update silos guide title --- 
tools/console_version | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/console_version b/tools/console_version index 626464c23d..4f67064733 100644 --- a/tools/console_version +++ b/tools/console_version @@ -1,2 +1,2 @@ -COMMIT="4377d01585ef87981ed51a4cd1f07376e8502d39" -SHA2="3e0707dcd6a350ecc3bd62e8e7485a773eebf52f5ffd0db4e8cfb01251e28374" +COMMIT="17ae890c68a5277fbefe773694e790a8f1b178b4" +SHA2="273a31ba14546305bfafeb9aedb2d9a7530328a0359cda363380c9ca3240b948" From b0f5013cbc3111b42b349add4ebf9234e07dbbd5 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Wed, 17 Jul 2024 15:24:32 -0700 Subject: [PATCH 03/21] Update Rust crate tokio to 1.38.1 (#6093) --- Cargo.lock | 6 +++--- Cargo.toml | 2 +- workspace-hack/Cargo.toml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e84129087e..8d41a715ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4002,7 +4002,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.5", ] [[package]] @@ -9878,9 +9878,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.38.0" +version = "1.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" +checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df" dependencies = [ "backtrace", "bytes", diff --git a/Cargo.toml b/Cargo.toml index e46cdeb972..e74eb6ab60 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -507,7 +507,7 @@ textwrap = "0.16.1" test-strategy = "0.3.1" thiserror = "1.0" tofino = { git = "https://github.com/oxidecomputer/tofino", branch = "main" } -tokio = "1.37.0" +tokio = "1.38.1" tokio-postgres = { version = "0.7", features = [ "with-chrono-0_4", "with-uuid-1" ] } tokio-stream = "0.1.15" tokio-tungstenite = "0.20" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 9c9f47d735..7be432ebca 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -104,7 +104,7 @@ string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.71", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time = { version = "0.3.36", features = ["formatting", "local-offset", "macros", "parsing"] } -tokio = { version = "1.38.0", features = ["full", "test-util"] } +tokio = { version = "1.38.1", features = ["full", "test-util"] } tokio-postgres = { version = "0.7.10", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } tokio-stream = { version = "0.1.15", features = ["net"] } tokio-util = { version = "0.7.11", features = ["codec", "io-util"] } @@ -211,7 +211,7 @@ syn-dff4ba8e3ae991db = { package = "syn", version = "1.0.109", features = ["extr syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.71", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time = { version = "0.3.36", features = ["formatting", "local-offset", "macros", "parsing"] } time-macros = { version = "0.2.18", default-features = false, features = ["formatting", "parsing"] } -tokio = { version = "1.38.0", features = ["full", "test-util"] } +tokio = { version = "1.38.1", features = ["full", "test-util"] } tokio-postgres = { version = "0.7.10", features = ["with-chrono-0_4", "with-serde_json-1", 
"with-uuid-1"] } tokio-stream = { version = "0.1.15", features = ["net"] } tokio-util = { version = "0.7.11", features = ["codec", "io-util"] } From 5b32a8d8f7a23b87c170c0512e7c39e330bfeb91 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Thu, 18 Jul 2024 04:18:17 +0000 Subject: [PATCH 04/21] Update taiki-e/install-action digest to ea7e518 (#6114) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [taiki-e/install-action](https://togithub.com/taiki-e/install-action) | action | digest | [`3e71e71` -> `ea7e518`](https://togithub.com/taiki-e/install-action/compare/3e71e71...ea7e518) | --- ### Configuration 📅 **Schedule**: Branch creation - "after 8pm,before 6am" in timezone America/Los_Angeles, Automerge - "after 8pm,before 6am" in timezone America/Los_Angeles. 🚦 **Automerge**: Enabled. â™» **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [Renovate Bot](https://togithub.com/renovatebot/renovate). Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- .github/workflows/hakari.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index 980acc33dc..3a31a5323d 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -24,7 +24,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@3e71e7135de310b70bc22dccb4d275acde8e055a # v2 + uses: taiki-e/install-action@ea7e5189a7664872699532b4cd92a443f520624e # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date From 2ba2846933acf81e99b79b64016cdb1d0e1110c6 Mon Sep 17 00:00:00 2001 From: Nils Nieuwejaar Date: Thu, 18 Jul 2024 13:00:11 -0400 Subject: [PATCH 05/21] dbinit.sql should not be ALTER TABLE-ing after creating tables (#6116) --- schema/crdb/dbinit.sql | 53 ++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 7d93a5d5bd..a40e148683 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1430,7 +1430,8 @@ CREATE TYPE IF NOT EXISTS omicron.public.network_interface_kind AS ENUM ( 'instance', /* An interface attached to a service. */ - 'service' + 'service', + 'probe' ); CREATE TABLE IF NOT EXISTS omicron.public.network_interface ( @@ -1871,6 +1872,8 @@ CREATE TABLE IF NOT EXISTS omicron.public.external_ip ( */ state omicron.public.ip_attach_state NOT NULL, + is_probe BOOL NOT NULL DEFAULT false, + /* The name must be non-NULL iff this is a floating IP. 
*/ CONSTRAINT null_fip_name CHECK ( (kind != 'floating' AND name IS NULL) OR @@ -2618,11 +2621,32 @@ CREATE TABLE IF NOT EXISTS omicron.public.switch_port_settings_port_config ( geometry omicron.public.switch_port_geometry ); +CREATE TYPE IF NOT EXISTS omicron.public.switch_link_fec AS ENUM ( + 'Firecode', + 'None', + 'Rs' +); + +CREATE TYPE IF NOT EXISTS omicron.public.switch_link_speed AS ENUM ( + '0G', + '1G', + '10G', + '25G', + '40G', + '50G', + '100G', + '200G', + '400G' +); + CREATE TABLE IF NOT EXISTS omicron.public.switch_port_settings_link_config ( port_settings_id UUID, lldp_service_config_id UUID NOT NULL, link_name TEXT, mtu INT4, + fec omicron.public.switch_link_fec, + speed omicron.public.switch_link_speed, + autoneg BOOL NOT NULL DEFAULT false, PRIMARY KEY (port_settings_id, link_name) ); @@ -3599,27 +3623,6 @@ FROM WHERE instance.time_deleted IS NULL AND vmm.time_deleted IS NULL; -CREATE TYPE IF NOT EXISTS omicron.public.switch_link_fec AS ENUM ( - 'Firecode', - 'None', - 'Rs' -); - -CREATE TYPE IF NOT EXISTS omicron.public.switch_link_speed AS ENUM ( - '0G', - '1G', - '10G', - '25G', - '40G', - '50G', - '100G', - '200G', - '400G' -); - -ALTER TABLE omicron.public.switch_port_settings_link_config ADD COLUMN IF NOT EXISTS fec omicron.public.switch_link_fec; -ALTER TABLE omicron.public.switch_port_settings_link_config ADD COLUMN IF NOT EXISTS speed omicron.public.switch_link_speed; - CREATE SEQUENCE IF NOT EXISTS omicron.public.ipv4_nat_version START 1 INCREMENT 1; CREATE TABLE IF NOT EXISTS omicron.public.ipv4_nat_entry ( @@ -3696,8 +3699,6 @@ CREATE UNIQUE INDEX IF NOT EXISTS lookup_bfd_session ON omicron.public.bfd_sessi switch ) WHERE time_deleted IS NULL; -ALTER TABLE omicron.public.switch_port_settings_link_config ADD COLUMN IF NOT EXISTS autoneg BOOL NOT NULL DEFAULT false; - CREATE INDEX IF NOT EXISTS ipv4_nat_lookup_by_vni ON omicron.public.ipv4_nat_entry ( vni ) @@ -3790,10 +3791,6 @@ CREATE UNIQUE INDEX IF NOT EXISTS lookup_probe_by_name ON omicron.public.probe ( ) WHERE time_deleted IS NULL; -ALTER TABLE omicron.public.external_ip ADD COLUMN IF NOT EXISTS is_probe BOOL NOT NULL DEFAULT false; - -ALTER TYPE omicron.public.network_interface_kind ADD VALUE IF NOT EXISTS 'probe'; - CREATE TYPE IF NOT EXISTS omicron.public.upstairs_repair_notification_type AS ENUM ( 'started', 'succeeded', From c5ed4de5cd2b667cc4b46520e19bc036da8d63ab Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Thu, 18 Jul 2024 11:00:40 -0700 Subject: [PATCH 06/21] Updating dropshot, handling new log_headers field in config struct (#6105) --- Cargo.lock | 11 ++++++----- dns-server/src/lib.rs | 1 + dns-server/tests/basic_test.rs | 1 + gateway/src/lib.rs | 1 + installinator-api/src/lib.rs | 2 ++ internal-dns/src/resolver.rs | 1 + nexus/src/app/background/init.rs | 1 + nexus/test-utils/src/lib.rs | 1 + oximeter/producer/src/lib.rs | 1 + sled-agent/src/server.rs | 2 +- sled-agent/src/services.rs | 2 ++ sled-agent/src/sim/storage.rs | 1 + wicketd/src/lib.rs | 1 + workspace-hack/Cargo.toml | 6 ++++-- 14 files changed, 24 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8d41a715ec..867c2eec25 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2034,7 +2034,7 @@ dependencies = [ [[package]] name = "dropshot" version = "0.10.2-dev" -source = "git+https://github.com/oxidecomputer/dropshot?branch=main#9fef3961c0b89aa8ab8e186dc0c89f8f4f811eea" +source = "git+https://github.com/oxidecomputer/dropshot?branch=main#7b594d01f47ca783c5d4a25ca2b256602580fe92" dependencies = [ "async-stream", 
"async-trait", @@ -2080,7 +2080,7 @@ dependencies = [ [[package]] name = "dropshot_endpoint" version = "0.10.2-dev" -source = "git+https://github.com/oxidecomputer/dropshot?branch=main#9fef3961c0b89aa8ab8e186dc0c89f8f4f811eea" +source = "git+https://github.com/oxidecomputer/dropshot?branch=main#7b594d01f47ca783c5d4a25ca2b256602580fe92" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -3274,7 +3274,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "socket2 0.5.7", + "socket2 0.4.10", "tokio", "tower-service", "tracing", @@ -6008,6 +6008,7 @@ dependencies = [ "similar", "slog", "smallvec 1.13.2", + "socket2 0.5.7", "spin 0.9.8", "string_cache", "subtle", @@ -10743,9 +10744,9 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "uuid" -version = "1.9.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5de17fd2f7da591098415cff336e12965a28061ddace43b59cb3c430179c9439" +checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" dependencies = [ "getrandom 0.2.14", "serde", diff --git a/dns-server/src/lib.rs b/dns-server/src/lib.rs index a2b1fda0d7..424159e41d 100644 --- a/dns-server/src/lib.rs +++ b/dns-server/src/lib.rs @@ -138,6 +138,7 @@ impl TransientServer { bind_address: "[::1]:0".parse().unwrap(), request_body_max_bytes: 4 * 1024 * 1024, default_handler_task_mode: dropshot::HandlerTaskMode::Detached, + log_headers: vec![], }, ) .await?; diff --git a/dns-server/tests/basic_test.rs b/dns-server/tests/basic_test.rs index 19666e82c1..b3b7f37378 100644 --- a/dns-server/tests/basic_test.rs +++ b/dns-server/tests/basic_test.rs @@ -419,6 +419,7 @@ fn test_config( bind_address: "[::1]:0".to_string().parse().unwrap(), request_body_max_bytes: 1024, default_handler_task_mode: HandlerTaskMode::Detached, + log_headers: vec![], }; Ok((tmp_dir, config_storage, config_dropshot, logctx)) diff --git a/gateway/src/lib.rs b/gateway/src/lib.rs index 1354f30a0a..be8c84d7db 100644 --- a/gateway/src/lib.rs +++ b/gateway/src/lib.rs @@ -98,6 +98,7 @@ fn start_dropshot_server( bind_address: SocketAddr::V6(addr), request_body_max_bytes, default_handler_task_mode: HandlerTaskMode::Detached, + log_headers: vec![], }; let http_server_starter = dropshot::HttpServerStarter::new( &dropshot, diff --git a/installinator-api/src/lib.rs b/installinator-api/src/lib.rs index cd87643a66..3ff9acffd2 100644 --- a/installinator-api/src/lib.rs +++ b/installinator-api/src/lib.rs @@ -131,6 +131,7 @@ pub fn default_config(bind_address: std::net::SocketAddr) -> ConfigDropshot { // available in omicron. request_body_max_bytes: 4 * 1024 * 1024, default_handler_task_mode: HandlerTaskMode::Detached, + log_headers: vec![], } } @@ -153,6 +154,7 @@ pub fn make_server_starter( // available in omicron. 
request_body_max_bytes: 4 * 1024 * 1024, default_handler_task_mode: HandlerTaskMode::Detached, + log_headers: vec![], }; let api = crate::installinator_api::api_description::()?; diff --git a/internal-dns/src/resolver.rs b/internal-dns/src/resolver.rs index cf5def01c5..fdd5dce428 100644 --- a/internal-dns/src/resolver.rs +++ b/internal-dns/src/resolver.rs @@ -434,6 +434,7 @@ mod test { bind_address: "[::1]:0".parse().unwrap(), request_body_max_bytes: 8 * 1024, default_handler_task_mode: HandlerTaskMode::Detached, + log_headers: vec![], }, ) .await diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 3e79c42978..4a5d792c80 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -928,6 +928,7 @@ pub mod test { bind_address: "[::1]:0".parse().unwrap(), request_body_max_bytes: 8 * 1024, default_handler_task_mode: HandlerTaskMode::Detached, + log_headers: vec![], }, ) .await diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 18efe40e27..960ded50d5 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -1575,6 +1575,7 @@ pub async fn start_dns_server( bind_address: "[::1]:0".parse().unwrap(), request_body_max_bytes: 8 * 1024, default_handler_task_mode: HandlerTaskMode::Detached, + log_headers: vec![], }, ) .await diff --git a/oximeter/producer/src/lib.rs b/oximeter/producer/src/lib.rs index 36b05d7bb1..e9223b62f3 100644 --- a/oximeter/producer/src/lib.rs +++ b/oximeter/producer/src/lib.rs @@ -222,6 +222,7 @@ impl Server { bind_address: server_info.address, request_body_max_bytes, default_handler_task_mode: dropshot::HandlerTaskMode::Detached, + log_headers: vec![], }; let server = Self::build_dropshot_server(&log, ®istry, &dropshot)?; diff --git a/sled-agent/src/server.rs b/sled-agent/src/server.rs index f702e4c67d..ec86066096 100644 --- a/sled-agent/src/server.rs +++ b/sled-agent/src/server.rs @@ -68,7 +68,7 @@ impl Server { let dropshot_config = dropshot::ConfigDropshot { bind_address: SocketAddr::V6(sled_address), - ..config.dropshot + ..config.dropshot.clone() }; let dropshot_log = log.new(o!("component" => "dropshot (SledAgent)")); let http_server = dropshot::HttpServerStarter::new( diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index bbc91eee64..11a60e5d0e 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -2160,6 +2160,7 @@ impl ServiceManager { request_body_max_bytes: 8192 * 1024, default_handler_task_mode: HandlerTaskMode::Detached, + log_headers: vec![], }, }, dropshot_internal: dropshot::ConfigDropshot { @@ -2170,6 +2171,7 @@ impl ServiceManager { // rack setup. request_body_max_bytes: 10 * 1024 * 1024, default_handler_task_mode: HandlerTaskMode::Detached, + log_headers: vec![], }, internal_dns: nexus_config::InternalDns::FromSubnet { subnet: Ipv6Subnet::::new( diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index 5077120fdd..0d534b9c4e 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -975,6 +975,7 @@ impl PantryServer { // - bulk writes into disks request_body_max_bytes: 8192 * 1024, default_handler_task_mode: HandlerTaskMode::Detached, + log_headers: vec![], }, super::http_entrypoints_pantry::api(), pantry.clone(), diff --git a/wicketd/src/lib.rs b/wicketd/src/lib.rs index 66c6bd41e8..907d8754f8 100644 --- a/wicketd/src/lib.rs +++ b/wicketd/src/lib.rs @@ -135,6 +135,7 @@ impl Server { // some endpoints. 
request_body_max_bytes: 4 << 30, default_handler_task_mode: HandlerTaskMode::Detached, + log_headers: vec![], }; let mgs_manager = MgsManager::new(&log, args.mgs_address); diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 7be432ebca..14534baa6f 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -99,6 +99,7 @@ sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] } smallvec = { version = "1.13.2", default-features = false, features = ["const_new"] } +socket2 = { version = "0.5.7", default-features = false, features = ["all"] } spin = { version = "0.9.8" } string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } @@ -116,7 +117,7 @@ unicode-bidi = { version = "0.3.15" } unicode-normalization = { version = "0.1.23" } usdt = { version = "0.5.0" } usdt-impl = { version = "0.5.0", default-features = false, features = ["asm", "des"] } -uuid = { version = "1.9.1", features = ["serde", "v4"] } +uuid = { version = "1.10.0", features = ["serde", "v4"] } yasna = { version = "0.5.2", features = ["bit-vec", "num-bigint", "std", "time"] } zerocopy = { version = "0.7.34", features = ["derive", "simd"] } zeroize = { version = "1.7.0", features = ["std", "zeroize_derive"] } @@ -204,6 +205,7 @@ sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] } smallvec = { version = "1.13.2", default-features = false, features = ["const_new"] } +socket2 = { version = "0.5.7", default-features = false, features = ["all"] } spin = { version = "0.9.8" } string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } @@ -224,7 +226,7 @@ unicode-normalization = { version = "0.1.23" } unicode-xid = { version = "0.2.4" } usdt = { version = "0.5.0" } usdt-impl = { version = "0.5.0", default-features = false, features = ["asm", "des"] } -uuid = { version = "1.9.1", features = ["serde", "v4"] } +uuid = { version = "1.10.0", features = ["serde", "v4"] } yasna = { version = "0.5.2", features = ["bit-vec", "num-bigint", "std", "time"] } zerocopy = { version = "0.7.34", features = ["derive", "simd"] } zeroize = { version = "1.7.0", features = ["std", "zeroize_derive"] } From c04e1acaaf98ae7fce0d80ad33995bdabf919a44 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 18 Jul 2024 12:30:05 -0700 Subject: [PATCH 07/21] [nexus] Make 'dataset' columns for IP address optional (#6055) This is part of an effort to make datasets usable without an explicit service managing them (e.g., in the context of Support Bundles). 
Related to https://github.com/oxidecomputer/omicron/issues/6042 Fixes https://github.com/oxidecomputer/omicron/issues/2000 --- dev-tools/omdb/src/bin/omdb/db.rs | 24 +++++++---- nexus/db-model/src/dataset.rs | 18 ++++---- nexus/db-model/src/schema.rs | 4 +- nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-queries/src/db/datastore/dataset.rs | 8 ++-- nexus/db-queries/src/db/datastore/mod.rs | 12 ++++-- nexus/db-queries/src/db/datastore/region.rs | 8 +++- nexus/db-queries/src/db/datastore/volume.rs | 14 ++++--- .../reconfigurator/execution/src/datasets.rs | 2 +- .../background/tasks/lookup_region_port.rs | 41 +++++++++++-------- nexus/src/app/crucible.rs | 26 +++++++----- nexus/src/app/rack.rs | 2 +- nexus/src/app/sagas/disk_create.rs | 12 +++++- .../src/app/sagas/region_replacement_start.rs | 32 +++++++-------- nexus/src/app/sagas/snapshot_create.rs | 34 ++++++++++++--- nexus/src/app/sled.rs | 3 +- schema/crdb/dataset-address-optional/up01.sql | 1 + schema/crdb/dataset-address-optional/up02.sql | 1 + schema/crdb/dataset-address-optional/up03.sql | 4 ++ schema/crdb/dbinit.sql | 11 +++-- 20 files changed, 170 insertions(+), 90 deletions(-) create mode 100644 schema/crdb/dataset-address-optional/up01.sql create mode 100644 schema/crdb/dataset-address-optional/up02.sql create mode 100644 schema/crdb/dataset-address-optional/up03.sql diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 44b34b0220..98669ddc06 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -2879,7 +2879,12 @@ async fn cmd_db_validate_region_snapshots( use crucible_agent_client::types::State; use crucible_agent_client::Client as CrucibleAgentClient; - let url = format!("http://{}", dataset.address()); + let Some(dataset_addr) = dataset.address() else { + eprintln!("Dataset {} missing an IP address", dataset.id()); + continue; + }; + + let url = format!("http://{}", dataset_addr); let client = CrucibleAgentClient::new(&url); let actual_region_snapshots = client @@ -2940,7 +2945,7 @@ async fn cmd_db_validate_region_snapshots( dataset_id: region_snapshot.dataset_id, region_id: region_snapshot.region_id, snapshot_id: region_snapshot.snapshot_id, - dataset_addr: dataset.address(), + dataset_addr, error: String::from( "region snapshot was deleted, please remove its record", ), @@ -2955,7 +2960,7 @@ async fn cmd_db_validate_region_snapshots( dataset_id: region_snapshot.dataset_id, region_id: region_snapshot.region_id, snapshot_id: region_snapshot.snapshot_id, - dataset_addr: dataset.address(), + dataset_addr, error: String::from( "NEXUS BUG: region snapshot was deleted, but the higher level snapshot was not!", ), @@ -2984,7 +2989,7 @@ async fn cmd_db_validate_region_snapshots( dataset_id: region_snapshot.dataset_id, region_id: region_snapshot.region_id, snapshot_id: region_snapshot.snapshot_id, - dataset_addr: dataset.address(), + dataset_addr, error: format!( "AGENT BUG: region snapshot was deleted but has a running snapshot in state {:?}!", running_snapshot.state, @@ -3034,7 +3039,12 @@ async fn cmd_db_validate_region_snapshots( use crucible_agent_client::types::State; use crucible_agent_client::Client as CrucibleAgentClient; - let url = format!("http://{}", dataset.address()); + let Some(dataset_addr) = dataset.address() else { + eprintln!("Dataset {} missing an IP address", dataset.id()); + continue; + }; + + let url = format!("http://{}", dataset_addr); let client = CrucibleAgentClient::new(&url); let actual_region_snapshots = client @@ -3052,7 
+3062,7 @@ async fn cmd_db_validate_region_snapshots( dataset_id: dataset.id(), region_id: region.id(), snapshot_id, - dataset_addr: dataset.address(), + dataset_addr, error: String::from( "Nexus does not know about this snapshot!", ), @@ -3077,7 +3087,7 @@ async fn cmd_db_validate_region_snapshots( dataset_id: dataset.id(), region_id: region.id(), snapshot_id, - dataset_addr: dataset.address(), + dataset_addr, error: String::from( "Nexus does not know about this running snapshot!" ), diff --git a/nexus/db-model/src/dataset.rs b/nexus/db-model/src/dataset.rs index 65c0070509..a9dee990b9 100644 --- a/nexus/db-model/src/dataset.rs +++ b/nexus/db-model/src/dataset.rs @@ -36,8 +36,8 @@ pub struct Dataset { pub pool_id: Uuid, - ip: ipv6::Ipv6Addr, - port: SqlU16, + ip: Option, + port: Option, pub kind: DatasetKind, pub size_used: Option, @@ -47,7 +47,7 @@ impl Dataset { pub fn new( id: Uuid, pool_id: Uuid, - addr: SocketAddrV6, + addr: Option, kind: DatasetKind, ) -> Self { let size_used = match kind { @@ -59,19 +59,19 @@ impl Dataset { time_deleted: None, rcgen: Generation::new(), pool_id, - ip: addr.ip().into(), - port: addr.port().into(), + ip: addr.map(|addr| addr.ip().into()), + port: addr.map(|addr| addr.port().into()), kind, size_used, } } - pub fn address(&self) -> SocketAddrV6 { - self.address_with_port(self.port.into()) + pub fn address(&self) -> Option { + self.address_with_port(self.port?.into()) } - pub fn address_with_port(&self, port: u16) -> SocketAddrV6 { - SocketAddrV6::new(Ipv6Addr::from(self.ip), port, 0, 0) + pub fn address_with_port(&self, port: u16) -> Option { + Some(SocketAddrV6::new(Ipv6Addr::from(self.ip?), port, 0, 0)) } } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 89ae6c18c5..dc57de9263 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1022,8 +1022,8 @@ table! { pool_id -> Uuid, - ip -> Inet, - port -> Int4, + ip -> Nullable, + port -> Nullable, kind -> crate::DatasetKindEnum, size_used -> Nullable, diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 3e740590c5..cc34a3581c 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(82, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(83, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(83, "dataset-address-optional"), KnownVersion::new(82, "region-port"), KnownVersion::new(81, "add-nullable-filesystem-pool"), KnownVersion::new(80, "add-instance-id-to-migrations"), diff --git a/nexus/db-queries/src/db/datastore/dataset.rs b/nexus/db-queries/src/db/datastore/dataset.rs index 3f1df24e45..a08e346fe8 100644 --- a/nexus/db-queries/src/db/datastore/dataset.rs +++ b/nexus/db-queries/src/db/datastore/dataset.rs @@ -290,7 +290,7 @@ mod test { .dataset_insert_if_not_exists(Dataset::new( Uuid::new_v4(), zpool_id, - "[::1]:0".parse().unwrap(), + Some("[::1]:0".parse().unwrap()), DatasetKind::Crucible, )) .await @@ -323,7 +323,7 @@ mod test { .dataset_insert_if_not_exists(Dataset::new( dataset1.id(), zpool_id, - "[::1]:12345".parse().unwrap(), + Some("[::1]:12345".parse().unwrap()), DatasetKind::Cockroach, )) .await @@ -339,7 +339,7 @@ mod test { .dataset_upsert(Dataset::new( Uuid::new_v4(), zpool_id, - "[::1]:0".parse().unwrap(), + Some("[::1]:0".parse().unwrap()), DatasetKind::Cockroach, )) .await @@ -371,7 +371,7 @@ mod test { .dataset_insert_if_not_exists(Dataset::new( dataset1.id(), zpool_id, - "[::1]:12345".parse().unwrap(), + Some("[::1]:12345".parse().unwrap()), DatasetKind::Cockroach, )) .await diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 461e71d88a..07b98c0542 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -892,7 +892,8 @@ mod test { .collect() .await; - let bogus_addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 8080, 0, 0); + let bogus_addr = + Some(SocketAddrV6::new(Ipv6Addr::LOCALHOST, 8080, 0, 0)); let datasets = stream::iter(zpools) .map(|zpool| { @@ -1266,7 +1267,8 @@ mod test { .collect() .await; - let bogus_addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 8080, 0, 0); + let bogus_addr = + Some(SocketAddrV6::new(Ipv6Addr::LOCALHOST, 8080, 0, 0)); // 1 dataset per zpool stream::iter(zpool_ids.clone()) @@ -1365,7 +1367,8 @@ mod test { .collect() .await; - let bogus_addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 8080, 0, 0); + let bogus_addr = + Some(SocketAddrV6::new(Ipv6Addr::LOCALHOST, 8080, 0, 0)); // 1 dataset per zpool stream::iter(zpool_ids) @@ -1444,7 +1447,8 @@ mod test { physical_disk_id, ) .await; - let bogus_addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 8080, 0, 0); + let bogus_addr = + Some(SocketAddrV6::new(Ipv6Addr::LOCALHOST, 8080, 0, 0)); let dataset = Dataset::new( Uuid::new_v4(), zpool_id, diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index 6832665944..3b1c20c1df 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -496,7 +496,13 @@ impl DataStore { let dataset = self.dataset_get(region.dataset_id()).await?; - Ok(Some(SocketAddrV6::new(*dataset.address().ip(), port, 0, 0))) + let Some(address) = dataset.address() else { + return Err(Error::internal_error( + "Dataset for Crucible region does know IP address", + )); + }; + + Ok(Some(SocketAddrV6::new(*address.ip(), port, 0, 0))) } pub async fn regions_missing_ports( diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 84f8e211a8..b13006aa95 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -1164,12 +1164,14 @@ impl DataStore { let mut targets: Vec = vec![]; - 
find_matching_rw_regions_in_volume( - &vcr, - dataset.address().ip(), - &mut targets, - ) - .map_err(|e| Error::internal_error(&e.to_string()))?; + let Some(address) = dataset.address() else { + return Err(Error::internal_error( + "Crucible Dataset missing IP address", + )); + }; + + find_matching_rw_regions_in_volume(&vcr, address.ip(), &mut targets) + .map_err(|e| Error::internal_error(&e.to_string()))?; Ok(targets) } diff --git a/nexus/reconfigurator/execution/src/datasets.rs b/nexus/reconfigurator/execution/src/datasets.rs index 51ac45c9df..139c94c53f 100644 --- a/nexus/reconfigurator/execution/src/datasets.rs +++ b/nexus/reconfigurator/execution/src/datasets.rs @@ -66,7 +66,7 @@ pub(crate) async fn ensure_dataset_records_exist( let dataset = Dataset::new( id.into_untyped_uuid(), pool_id.into_untyped_uuid(), - address, + Some(address), kind.into(), ); let maybe_inserted = datastore diff --git a/nexus/src/app/background/tasks/lookup_region_port.rs b/nexus/src/app/background/tasks/lookup_region_port.rs index b0f13ac986..fbfc5c5af2 100644 --- a/nexus/src/app/background/tasks/lookup_region_port.rs +++ b/nexus/src/app/background/tasks/lookup_region_port.rs @@ -91,26 +91,33 @@ impl BackgroundTask for LookupRegionPort { } }; - let returned_region = match get_region_from_agent( - &dataset.address(), - region.id(), - ) - .await - { - Ok(returned_region) => returned_region, + let Some(dataset_addr) = dataset.address() else { + let s = format!( + "Missing dataset address for dataset: {dataset_id}" + ); + error!(log, "{s}"); + status.errors.push(s); + continue; + }; - Err(e) => { - let s = format!( - "could not get region {} from agent: {e}", - region.id(), - ); + let returned_region = + match get_region_from_agent(&dataset_addr, region.id()) + .await + { + Ok(returned_region) => returned_region, - error!(log, "{s}"); - status.errors.push(s); + Err(e) => { + let s = format!( + "could not get region {} from agent: {e}", + region.id(), + ); - continue; - } - }; + error!(log, "{s}"); + status.errors.push(s); + + continue; + } + }; match self .datastore diff --git a/nexus/src/app/crucible.rs b/nexus/src/app/crucible.rs index caa65255e5..72a5c80baf 100644 --- a/nexus/src/app/crucible.rs +++ b/nexus/src/app/crucible.rs @@ -69,11 +69,17 @@ impl super::Nexus { fn crucible_agent_client_for_dataset( &self, dataset: &db::model::Dataset, - ) -> CrucibleAgentClient { - CrucibleAgentClient::new_with_client( - &format!("http://{}", dataset.address()), + ) -> Result { + let Some(addr) = dataset.address() else { + return Err(Error::internal_error( + "Missing crucible dataset address", + )); + }; + + Ok(CrucibleAgentClient::new_with_client( + &format!("http://{}", addr), self.reqwest_client.clone(), - ) + )) } /// Return if the Crucible agent is expected to be there and answer Nexus: @@ -147,7 +153,7 @@ impl super::Nexus { dataset: &db::model::Dataset, region: &db::model::Region, ) -> Result { - let client = self.crucible_agent_client_for_dataset(dataset); + let client = self.crucible_agent_client_for_dataset(dataset)?; let dataset_id = dataset.id(); let Ok(extent_count) = u32::try_from(region.extent_count()) else { @@ -261,7 +267,7 @@ impl super::Nexus { dataset: &db::model::Dataset, region_id: Uuid, ) -> Result, Error> { - let client = self.crucible_agent_client_for_dataset(dataset); + let client = self.crucible_agent_client_for_dataset(dataset)?; let dataset_id = dataset.id(); let result = ProgenitorOperationRetry::new( @@ -303,7 +309,7 @@ impl super::Nexus { dataset: &db::model::Dataset, region_id: Uuid, ) -> 
Result { - let client = self.crucible_agent_client_for_dataset(dataset); + let client = self.crucible_agent_client_for_dataset(dataset)?; let dataset_id = dataset.id(); let result = ProgenitorOperationRetry::new( @@ -343,7 +349,7 @@ impl super::Nexus { dataset: &db::model::Dataset, region_id: Uuid, ) -> Result<(), Error> { - let client = self.crucible_agent_client_for_dataset(dataset); + let client = self.crucible_agent_client_for_dataset(dataset)?; let dataset_id = dataset.id(); let result = ProgenitorOperationRetry::new( @@ -386,7 +392,7 @@ impl super::Nexus { region_id: Uuid, snapshot_id: Uuid, ) -> Result<(), Error> { - let client = self.crucible_agent_client_for_dataset(dataset); + let client = self.crucible_agent_client_for_dataset(dataset)?; let dataset_id = dataset.id(); let result = ProgenitorOperationRetry::new( @@ -435,7 +441,7 @@ impl super::Nexus { region_id: Uuid, snapshot_id: Uuid, ) -> Result<(), Error> { - let client = self.crucible_agent_client_for_dataset(dataset); + let client = self.crucible_agent_client_for_dataset(dataset)?; let dataset_id = dataset.id(); let result = ProgenitorOperationRetry::new( diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 422540c0b8..13b30fd47a 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -145,7 +145,7 @@ impl super::Nexus { db::model::Dataset::new( dataset.dataset_id, dataset.zpool_id, - dataset.request.address, + Some(dataset.request.address), dataset.request.kind.into(), ) }) diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index bdccd7f79b..c350534617 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -498,9 +498,17 @@ async fn sdc_regions_ensure( .map(|(dataset, region)| { dataset .address_with_port(region.port_number) - .to_string() + .ok_or_else(|| { + ActionError::action_failed( + Error::internal_error(&format!( + "missing IP address for dataset {}", + dataset.id(), + )), + ) + }) + .map(|addr| addr.to_string()) }) - .collect(), + .collect::, ActionError>>()?, lossy: false, flush_timeout: None, diff --git a/nexus/src/app/sagas/region_replacement_start.rs b/nexus/src/app/sagas/region_replacement_start.rs index a4ba10775a..1297158b24 100644 --- a/nexus/src/app/sagas/region_replacement_start.rs +++ b/nexus/src/app/sagas/region_replacement_start.rs @@ -534,12 +534,13 @@ async fn srrs_replace_region_in_volume( "ensured_dataset_and_region", )?; - let new_region_address = SocketAddrV6::new( - *new_dataset.address().ip(), - ensured_region.port_number, - 0, - 0, - ); + let Some(new_address) = new_dataset.address() else { + return Err(ActionError::action_failed(Error::internal_error( + "Dataset missing IP address", + ))); + }; + let new_region_address = + SocketAddrV6::new(*new_address.ip(), ensured_region.port_number, 0, 0); // If this node is rerun, the forward action will have overwritten // db_region's volume id, so get the cached copy. @@ -611,12 +612,11 @@ async fn srrs_replace_region_in_volume_undo( "ensured_dataset_and_region", )?; - let new_region_address = SocketAddrV6::new( - *new_dataset.address().ip(), - ensured_region.port_number, - 0, - 0, - ); + let Some(new_address) = new_dataset.address() else { + anyhow::bail!("Dataset missing IP address"); + }; + let new_region_address = + SocketAddrV6::new(*new_address.ip(), ensured_region.port_number, 0, 0); // The forward action will have overwritten db_region's volume id, so get // the cached copy. 
@@ -894,25 +894,25 @@ pub(crate) mod test { Dataset::new( Uuid::new_v4(), Uuid::new_v4(), - "[fd00:1122:3344:101::1]:12345".parse().unwrap(), + Some("[fd00:1122:3344:101::1]:12345".parse().unwrap()), DatasetKind::Crucible, ), Dataset::new( Uuid::new_v4(), Uuid::new_v4(), - "[fd00:1122:3344:102::1]:12345".parse().unwrap(), + Some("[fd00:1122:3344:102::1]:12345".parse().unwrap()), DatasetKind::Crucible, ), Dataset::new( Uuid::new_v4(), Uuid::new_v4(), - "[fd00:1122:3344:103::1]:12345".parse().unwrap(), + Some("[fd00:1122:3344:103::1]:12345".parse().unwrap()), DatasetKind::Crucible, ), Dataset::new( Uuid::new_v4(), Uuid::new_v4(), - "[fd00:1122:3344:104::1]:12345".parse().unwrap(), + Some("[fd00:1122:3344:104::1]:12345".parse().unwrap()), DatasetKind::Crucible, ), ]; diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index 9e665a1de1..5a8313229a 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -411,9 +411,17 @@ async fn ssc_regions_ensure( .map(|(dataset, region)| { dataset .address_with_port(region.port_number) - .to_string() + .ok_or_else(|| { + ActionError::action_failed( + Error::internal_error(&format!( + "missing IP address for dataset {}", + dataset.id(), + )), + ) + }) + .map(|addr| addr.to_string()) }) - .collect(), + .collect::, ActionError>>()?, lossy: false, flush_timeout: None, @@ -1232,8 +1240,14 @@ async fn ssc_start_running_snapshot( let mut map: BTreeMap = BTreeMap::new(); for (dataset, region) in datasets_and_regions { + let Some(dataset_addr) = dataset.address() else { + return Err(ActionError::action_failed(Error::internal_error( + &format!("Missing IP address for dataset {}", dataset.id(),), + ))); + }; + // Create a Crucible agent client - let url = format!("http://{}", dataset.address()); + let url = format!("http://{}", dataset_addr); let client = CrucibleAgentClient::new(&url); info!( @@ -1299,11 +1313,21 @@ async fn ssc_start_running_snapshot( // Map from the region to the snapshot let region_addr = format!( "{}", - dataset.address_with_port(crucible_region.port_number) + SocketAddrV6::new( + *dataset_addr.ip(), + crucible_region.port_number, + 0, + 0 + ) ); let snapshot_addr = format!( "{}", - dataset.address_with_port(crucible_running_snapshot.port_number) + SocketAddrV6::new( + *dataset_addr.ip(), + crucible_running_snapshot.port_number, + 0, + 0 + ) ); info!(log, "map {} to {}", region_addr, snapshot_addr); map.insert(region_addr, snapshot_addr.clone()); diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index 0165b2d261..6e21470368 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -306,7 +306,8 @@ impl super::Nexus { "dataset_id" => id.to_string(), "address" => address.to_string() ); - let dataset = db::model::Dataset::new(id, zpool_id, address, kind); + let dataset = + db::model::Dataset::new(id, zpool_id, Some(address), kind); self.db_datastore.dataset_upsert(dataset).await?; Ok(()) } diff --git a/schema/crdb/dataset-address-optional/up01.sql b/schema/crdb/dataset-address-optional/up01.sql new file mode 100644 index 0000000000..e29215251d --- /dev/null +++ b/schema/crdb/dataset-address-optional/up01.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.dataset ALTER COLUMN ip DROP NOT NULL; diff --git a/schema/crdb/dataset-address-optional/up02.sql b/schema/crdb/dataset-address-optional/up02.sql new file mode 100644 index 0000000000..997294fa12 --- /dev/null +++ b/schema/crdb/dataset-address-optional/up02.sql @@ -0,0 +1 @@ +ALTER TABLE 
omicron.public.dataset ALTER COLUMN port DROP NOT NULL; diff --git a/schema/crdb/dataset-address-optional/up03.sql b/schema/crdb/dataset-address-optional/up03.sql new file mode 100644 index 0000000000..0af212e320 --- /dev/null +++ b/schema/crdb/dataset-address-optional/up03.sql @@ -0,0 +1,4 @@ +ALTER TABLE omicron.public.dataset ADD CONSTRAINT IF NOT EXISTS ip_and_port_set_for_crucible CHECK ( + (kind != 'crucible') OR + (kind = 'crucible' AND ip IS NOT NULL and port IS NOT NULL) +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index a40e148683..7fc83ad5d0 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -525,8 +525,8 @@ CREATE TABLE IF NOT EXISTS omicron.public.dataset ( pool_id UUID NOT NULL, /* Contact information for the dataset */ - ip INET NOT NULL, - port INT4 CHECK (port BETWEEN 0 AND 65535) NOT NULL, + ip INET, + port INT4 CHECK (port BETWEEN 0 AND 65535), kind omicron.public.dataset_kind NOT NULL, @@ -537,6 +537,11 @@ CREATE TABLE IF NOT EXISTS omicron.public.dataset ( CONSTRAINT size_used_column_set_for_crucible CHECK ( (kind != 'crucible') OR (kind = 'crucible' AND size_used IS NOT NULL) + ), + + CONSTRAINT ip_and_port_set_for_crucible CHECK ( + (kind != 'crucible') OR + (kind = 'crucible' AND ip IS NOT NULL and port IS NOT NULL) ) ); @@ -4140,7 +4145,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '82.0.0', NULL) + (TRUE, NOW(), NOW(), '83.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From f038746eab35146f8a29ad85836d7b92cfdcf298 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Thu, 18 Jul 2024 12:45:04 -0700 Subject: [PATCH 08/21] Make VPC Subnet insertion query idempotent (#6098) - Fixes #6069 - Modifies the existing query to ignore PK conflicts, making it idempotent, while still detecting overlapping IP ranges from different subnets - Adds regression test for #6069 --- nexus/db-queries/src/db/datastore/mod.rs | 10 +- nexus/db-queries/src/db/datastore/vpc.rs | 26 +- nexus/db-queries/src/db/queries/vpc_subnet.rs | 702 +++++++++--------- nexus/src/app/sagas/vpc_create.rs | 6 +- nexus/src/app/vpc_subnet.rs | 21 +- 5 files changed, 390 insertions(+), 375 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 07b98c0542..2540790477 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -394,9 +394,9 @@ mod test { BlockSize, ConsoleSession, Dataset, DatasetKind, ExternalIp, PhysicalDisk, PhysicalDiskKind, PhysicalDiskPolicy, PhysicalDiskState, Project, Rack, Region, SiloUser, SledBaseboard, SledSystemHardware, - SledUpdate, SshKey, VpcSubnet, Zpool, + SledUpdate, SshKey, Zpool, }; - use crate::db::queries::vpc_subnet::FilterConflictingVpcSubnetRangesQuery; + use crate::db::queries::vpc_subnet::InsertVpcSubnetQuery; use chrono::{Duration, Utc}; use futures::stream; use futures::StreamExt; @@ -1603,11 +1603,7 @@ mod test { "172.30.0.0/22".parse().unwrap(), "fd00::/64".parse().unwrap(), ); - let values = FilterConflictingVpcSubnetRangesQuery::new(subnet); - let query = - diesel::insert_into(db::schema::vpc_subnet::dsl::vpc_subnet) - .values(values) - .returning(VpcSubnet::as_returning()); + let query = InsertVpcSubnetQuery::new(subnet); println!("{}", diesel::debug_query(&query)); let explanation = query.explain_async(&conn).await.unwrap(); assert!( diff --git a/nexus/db-queries/src/db/datastore/vpc.rs b/nexus/db-queries/src/db/datastore/vpc.rs index fdb9c82fb5..615ecdac93 100644 --- 
a/nexus/db-queries/src/db/datastore/vpc.rs +++ b/nexus/db-queries/src/db/datastore/vpc.rs @@ -40,8 +40,8 @@ use crate::db::pagination::paginated; use crate::db::pagination::Paginator; use crate::db::queries::vpc::InsertVpcQuery; use crate::db::queries::vpc::VniSearchIter; -use crate::db::queries::vpc_subnet::FilterConflictingVpcSubnetRangesQuery; -use crate::db::queries::vpc_subnet::SubnetError; +use crate::db::queries::vpc_subnet::InsertVpcSubnetError; +use crate::db::queries::vpc_subnet::InsertVpcSubnetQuery; use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; @@ -288,7 +288,7 @@ impl DataStore { self.vpc_create_subnet(opctx, &authz_vpc, vpc_subnet.clone()) .await .map(|_| ()) - .map_err(SubnetError::into_external) + .map_err(InsertVpcSubnetError::into_external) .or_else(|e| match e { Error::ObjectAlreadyExists { .. } => Ok(()), _ => Err(e), @@ -809,17 +809,17 @@ impl DataStore { opctx: &OpContext, authz_vpc: &authz::Vpc, subnet: VpcSubnet, - ) -> Result<(authz::VpcSubnet, VpcSubnet), SubnetError> { + ) -> Result<(authz::VpcSubnet, VpcSubnet), InsertVpcSubnetError> { opctx .authorize(authz::Action::CreateChild, authz_vpc) .await - .map_err(SubnetError::External)?; + .map_err(InsertVpcSubnetError::External)?; assert_eq!(authz_vpc.id(), subnet.vpc_id); let db_subnet = self.vpc_create_subnet_raw(subnet).await?; self.vpc_system_router_ensure_subnet_routes(opctx, authz_vpc.id()) .await - .map_err(SubnetError::External)?; + .map_err(InsertVpcSubnetError::External)?; Ok(( authz::VpcSubnet::new( authz_vpc.clone(), @@ -833,20 +833,16 @@ impl DataStore { pub(crate) async fn vpc_create_subnet_raw( &self, subnet: VpcSubnet, - ) -> Result { - use db::schema::vpc_subnet::dsl; - let values = FilterConflictingVpcSubnetRangesQuery::new(subnet.clone()); + ) -> Result { let conn = self .pool_connection_unauthorized() .await - .map_err(SubnetError::External)?; - - diesel::insert_into(dsl::vpc_subnet) - .values(values) - .returning(VpcSubnet::as_returning()) + .map_err(InsertVpcSubnetError::External)?; + let query = InsertVpcSubnetQuery::new(subnet.clone()); + query .get_result_async(&*conn) .await - .map_err(|e| SubnetError::from_diesel(e, &subnet)) + .map_err(|e| InsertVpcSubnetError::from_diesel(e, &subnet)) } pub async fn vpc_delete_subnet( diff --git a/nexus/db-queries/src/db/queries/vpc_subnet.rs b/nexus/db-queries/src/db/queries/vpc_subnet.rs index 72f2771a1e..8cbf4495ca 100644 --- a/nexus/db-queries/src/db/queries/vpc_subnet.rs +++ b/nexus/db-queries/src/db/queries/vpc_subnet.rs @@ -7,407 +7,322 @@ use crate::db; use crate::db::identity::Resource; use crate::db::model::VpcSubnet; -use chrono::{DateTime, Utc}; +use crate::db::schema::vpc_subnet::dsl; +use crate::db::DbConnection; use diesel::pg::Pg; use diesel::prelude::*; use diesel::query_builder::*; use diesel::result::Error as DieselError; use diesel::sql_types; +use ipnetwork::IpNetwork; use omicron_common::api::external; use ref_cast::RefCast; use uuid::Uuid; -/// Errors related to allocating VPC Subnets. -#[derive(Debug, PartialEq)] -pub enum SubnetError { - /// An IPv4 or IPv6 subnet overlaps with an existing VPC Subnet - OverlappingIpRange(ipnetwork::IpNetwork), - /// An other error - External(external::Error), -} - -impl SubnetError { - /// Construct a `SubnetError` from a Diesel error, catching the desired - /// cases and building useful errors. 
- pub fn from_diesel(e: DieselError, subnet: &VpcSubnet) -> Self { - use crate::db::error; - use diesel::result::DatabaseErrorKind; - const IPV4_OVERLAP_ERROR_MESSAGE: &str = - r#"null value in column "ipv4_block" violates not-null constraint"#; - const IPV6_OVERLAP_ERROR_MESSAGE: &str = - r#"null value in column "ipv6_block" violates not-null constraint"#; - const NAME_CONFLICT_CONSTRAINT: &str = "vpc_subnet_vpc_id_name_key"; - match e { - // Attempt to insert overlapping IPv4 subnet - DieselError::DatabaseError( - DatabaseErrorKind::NotNullViolation, - ref info, - ) if info.message() == IPV4_OVERLAP_ERROR_MESSAGE => { - SubnetError::OverlappingIpRange(ipnetwork::IpNetwork::V4( - subnet.ipv4_block.0.into(), - )) - } - - // Attempt to insert overlapping IPv6 subnet - DieselError::DatabaseError( - DatabaseErrorKind::NotNullViolation, - ref info, - ) if info.message() == IPV6_OVERLAP_ERROR_MESSAGE => { - SubnetError::OverlappingIpRange(ipnetwork::IpNetwork::V6( - subnet.ipv6_block.0.into(), - )) - } - - // Conflicting name for the subnet within a VPC - DieselError::DatabaseError( - DatabaseErrorKind::UniqueViolation, - ref info, - ) if info.constraint_name() == Some(NAME_CONFLICT_CONSTRAINT) => { - SubnetError::External(error::public_error_from_diesel( - e, - error::ErrorHandler::Conflict( - external::ResourceType::VpcSubnet, - subnet.identity().name.as_str(), - ), - )) - } - - // Any other error at all is a bug - _ => SubnetError::External(error::public_error_from_diesel( - e, - error::ErrorHandler::Server, - )), - } - } - - /// Convert into a public error - pub fn into_external(self) -> external::Error { - match self { - SubnetError::OverlappingIpRange(ip) => { - external::Error::invalid_request( - format!("IP address range '{}' conflicts with an existing subnet", ip).as_str() - ) - }, - SubnetError::External(e) => e, - } - } -} - -/// Generate a subquery that selects any overlapping address ranges of the same -/// type as the input IP subnet. +/// Query used to insert VPC Subnets. /// -/// This generates a query that, in full, looks like: +/// This query is used to idempotently insert a VPC Subnet. The query also looks +/// for any other subnets in the same VPC whose IP address blocks overlap. All +/// Subnets are required to have non-overlapping IP blocks. /// -/// ```sql -/// SELECT -/// -/// FROM -/// vpc_subnet -/// WHERE -/// vpc_id = AND -/// time_deleted IS NULL AND -/// inet_contains_or_equals(ipv*_block, ) -/// LIMIT 1 -/// ``` -/// -/// The input may be either an IPv4 or IPv6 subnet, and the corresponding column -/// is compared against. Note that the exact input IP range is returned on -/// purpose. 
-fn push_select_overlapping_ip_range<'a>( - mut out: AstPass<'_, 'a, Pg>, - vpc_id: &'a Uuid, - ip: &'a ipnetwork::IpNetwork, -) -> diesel::QueryResult<()> { - use crate::db::schema::vpc_subnet::dsl; - out.push_sql("SELECT "); - out.push_bind_param::(ip)?; - out.push_sql(" FROM "); - VPC_SUBNET_FROM_CLAUSE.walk_ast(out.reborrow())?; - out.push_sql(" WHERE "); - out.push_identifier(dsl::vpc_id::NAME)?; - out.push_sql(" = "); - out.push_bind_param::(vpc_id)?; - out.push_sql(" AND "); - out.push_identifier(dsl::time_deleted::NAME)?; - out.push_sql(" IS NULL AND inet_contains_or_equals("); - if ip.is_ipv4() { - out.push_identifier(dsl::ipv4_block::NAME)?; - } else { - out.push_identifier(dsl::ipv6_block::NAME)?; - } - out.push_sql(", "); - out.push_bind_param::(ip)?; - out.push_sql(")"); - Ok(()) -} - -/// Generate a subquery that returns NULL if there is an overlapping IP address -/// range of any type. +/// Note that this query is idempotent. If a record with the provided primary +/// key already exists, that record is returned exactly from the DB, without any +/// other modification or alteration. If callers care, they can inspect the +/// record to make sure it's what they expected, though that's usually a fraught +/// endeavor. /// -/// This specifically generates a query that looks like: +/// Here is the entire query: /// /// ```sql -/// SELECT NULLIF( -/// , -/// push_select_overlapping_ip_range(, ) -/// ) -/// ``` -/// -/// The `NULLIF` function returns NULL if those two expressions are equal, and -/// the first expression otherwise. That is, this returns NULL if there exists -/// an overlapping IP range already in the VPC Subnet table, and the requested -/// IP range if not. -fn push_null_if_overlapping_ip_range<'a>( - mut out: AstPass<'_, 'a, Pg>, - vpc_id: &'a Uuid, - ip: &'a ipnetwork::IpNetwork, -) -> diesel::QueryResult<()> { - out.push_sql("SELECT NULLIF("); - out.push_bind_param::(ip)?; - out.push_sql(", ("); - push_select_overlapping_ip_range(out.reborrow(), vpc_id, ip)?; - out.push_sql("))"); - Ok(()) -} - -/// Generate a CTE that can be used to insert a VPC Subnet, only if the IP -/// address ranges of that subnet don't overlap with existing Subnets in the -/// same VPC. -/// -/// In particular, this generates a CTE like so: -/// -/// ```sql -/// WITH candidate( -/// id, -/// name, -/// description, -/// time_created, -/// time_modified, -/// time_deleted, -/// vpc_id, -/// rcgen -/// ) AS (VALUES ( -/// , -/// , -/// , -/// , -/// , -/// NULL::TIMESTAMPTZ, -/// , -/// 0 -/// )), -/// candidate_ipv4(ipv4_block) AS ( -/// SELECT( -/// NULLIF( -/// , -/// ( -/// SELECT -/// ipv4_block -/// FROM -/// vpc_subnet -/// WHERE -/// vpc_id = AND -/// time_deleted IS NULL AND -/// inet_contains_or_equals(, ipv4_block) -/// LIMIT 1 +/// WITH +/// -- This CTE generates a casting error if any live records, other than _this_ +/// -- record, have overlapping IP blocks of either family. +/// overlap AS MATERIALIZED ( +/// SELECT +/// -- NOTE: This cast always fails, we just use _how_ it fails to +/// -- learn which IP block overlaps. The filter `id != ` below +/// -- means we're explicitly ignoring an existing, identical record. +/// -- So this cast is only run if there is another record in the same +/// -- VPC with an overlapping subnet, which is exactly the error case +/// -- we're trying to cacth. 
+/// CAST( +/// IF( +/// inet_contains_or_equals(ipv4_block, ), +/// 'ipv4', +/// 'ipv6' /// ) -/// ) -/// ) -/// ), -/// candidate_ipv6(ipv6_block) AS ( -/// +/// AS BOOL +/// ) +/// FROM +/// vpc_subnet +/// WHERE +/// vpc_id = AND +/// time_deleted IS NULL AND +/// id != AND +/// ( +/// inet_contains_or_equals(ipv4_block, ) OR +/// inet_contains_or_equals(ipv6_block, ) +/// ) +/// ) +/// INSERT INTO +/// vpc_subnet +/// VALUES ( +/// /// ) -/// SELECT * -/// FROM candidate, candidate_ipv4, candidate_ipv6 +/// ON CONFLICT (id) +/// -- We use this "no-op" update to allow us to return the actual row from the +/// -- DB, either the existing or inserted one. +/// DO UPDATE SET id = id +/// RETURNING *; /// ``` -pub struct FilterConflictingVpcSubnetRangesQuery { - // TODO: update with random one if the insertion fails. +#[derive(Clone, Debug)] +pub struct InsertVpcSubnetQuery { + /// The subnet to insert subnet: VpcSubnet, - - // The following fields are derived from the previous field. This begs the - // question: "Why bother storing them at all?" - // - // Diesel's [`diesel::query_builder::ast_pass::AstPass:push_bind_param`] method - // requires that the provided value now live as long as the entire AstPass - // type. By storing these values in the struct, they'll live at least as - // long as the entire call to [`QueryFragment::walk_ast`]. - ipv4_block: ipnetwork::IpNetwork, - ipv6_block: ipnetwork::IpNetwork, + /// Owned values of the IP blocks to check, for inserting in internal pieces + /// of the query. + ipv4_block: IpNetwork, + ipv6_block: IpNetwork, } -impl FilterConflictingVpcSubnetRangesQuery { +impl InsertVpcSubnetQuery { + /// Construct a new query to insert the provided subnet. pub fn new(subnet: VpcSubnet) -> Self { - let ipv4_block = - ipnetwork::Ipv4Network::from(subnet.ipv4_block.0).into(); - let ipv6_block = - ipnetwork::Ipv6Network::from(subnet.ipv6_block.0).into(); + let ipv4_block = IpNetwork::V4(subnet.ipv4_block.0.into()); + let ipv6_block = IpNetwork::V6(subnet.ipv6_block.0.into()); Self { subnet, ipv4_block, ipv6_block } } } -impl QueryId for FilterConflictingVpcSubnetRangesQuery { +impl QueryId for InsertVpcSubnetQuery { type QueryId = (); const HAS_STATIC_QUERY_ID: bool = false; } -impl QueryFragment for FilterConflictingVpcSubnetRangesQuery { +impl QueryFragment for InsertVpcSubnetQuery { fn walk_ast<'a>( &'a self, mut out: AstPass<'_, 'a, Pg>, ) -> diesel::QueryResult<()> { - use db::schema::vpc_subnet::dsl; - - // Create the base `candidate` from values provided that need no - // verificiation. 
- out.push_sql("SELECT * FROM (WITH candidate("); - out.push_identifier(dsl::id::NAME)?; - out.push_sql(", "); - out.push_identifier(dsl::name::NAME)?; - out.push_sql(", "); - out.push_identifier(dsl::description::NAME)?; - out.push_sql(", "); - out.push_identifier(dsl::time_created::NAME)?; - out.push_sql(", "); - out.push_identifier(dsl::time_modified::NAME)?; + out.push_sql("WITH overlap AS MATERIALIZED (SELECT CAST(IF(inet_contains_or_equals("); + out.push_identifier(dsl::ipv4_block::NAME)?; out.push_sql(", "); - out.push_identifier(dsl::time_deleted::NAME)?; + out.push_bind_param::(&self.ipv4_block)?; + out.push_sql("), "); + out.push_bind_param::( + InsertVpcSubnetError::OVERLAPPING_IPV4_BLOCK_SENTINEL, + )?; out.push_sql(", "); + out.push_bind_param::( + InsertVpcSubnetError::OVERLAPPING_IPV6_BLOCK_SENTINEL, + )?; + out.push_sql(") AS BOOL) FROM "); + VPC_SUBNET_FROM_CLAUSE.walk_ast(out.reborrow())?; + out.push_sql(" WHERE "); out.push_identifier(dsl::vpc_id::NAME)?; - out.push_sql(","); - out.push_identifier(dsl::rcgen::NAME)?; - out.push_sql(") AS (VALUES ("); + out.push_sql(" = "); + out.push_bind_param::(&self.subnet.vpc_id)?; + out.push_sql(" AND "); + out.push_identifier(dsl::time_deleted::NAME)?; + out.push_sql(" IS NULL AND "); + out.push_identifier(dsl::id::NAME)?; + out.push_sql(" != "); out.push_bind_param::(&self.subnet.identity.id)?; + out.push_sql(" AND (inet_contains_or_equals("); + out.push_identifier(dsl::ipv4_block::NAME)?; out.push_sql(", "); - out.push_bind_param::( - db::model::Name::ref_cast(self.subnet.name()), - )?; + out.push_bind_param::(&self.ipv4_block)?; + out.push_sql(") OR inet_contains_or_equals("); + out.push_identifier(dsl::ipv6_block::NAME)?; out.push_sql(", "); - out.push_bind_param::( + out.push_bind_param::(&self.ipv6_block)?; + + out.push_sql("))) INSERT INTO "); + VPC_SUBNET_FROM_CLAUSE.walk_ast(out.reborrow())?; + out.push_sql("VALUES ("); + out.push_bind_param::(&self.subnet.identity.id)?; + out.push_sql(", "); + out.push_bind_param::(db::model::Name::ref_cast( + self.subnet.name(), + ))?; + out.push_sql(", "); + out.push_bind_param::( &self.subnet.identity.description, )?; out.push_sql(", "); - out.push_bind_param::>( + out.push_bind_param::( &self.subnet.identity.time_created, )?; out.push_sql(", "); - out.push_bind_param::>( + out.push_bind_param::( &self.subnet.identity.time_modified, )?; out.push_sql(", "); - out.push_sql("NULL::TIMESTAMPTZ, "); - out.push_bind_param::(&self.subnet.vpc_id)?; - out.push_sql(", 0)), "); - - // Push the candidate IPv4 and IPv6 selection subqueries, which return - // NULL if the corresponding address range overlaps. 
- out.push_sql("candidate_ipv4("); - out.push_identifier(dsl::ipv4_block::NAME)?; - out.push_sql(") AS ("); - push_null_if_overlapping_ip_range( - out.reborrow(), - &self.subnet.vpc_id, - &self.ipv4_block, + out.push_bind_param::, _>( + &self.subnet.identity.time_deleted, )?; - - out.push_sql("), candidate_ipv6("); - out.push_identifier(dsl::ipv6_block::NAME)?; - out.push_sql(") AS ("); - push_null_if_overlapping_ip_range( - out.reborrow(), - &self.subnet.vpc_id, - &self.ipv6_block, + out.push_sql(", "); + out.push_bind_param::(&self.subnet.vpc_id)?; + out.push_sql(", "); + out.push_bind_param::(&self.subnet.rcgen)?; + out.push_sql(", "); + out.push_bind_param::(&self.ipv4_block)?; + out.push_sql(", "); + out.push_bind_param::(&self.ipv6_block)?; + out.push_sql(", "); + out.push_bind_param::, _>( + &self.subnet.custom_router_id, )?; - out.push_sql(") "); + out.push_sql(") ON CONFLICT ("); + out.push_identifier(dsl::id::NAME)?; + out.push_sql(") DO UPDATE SET "); + out.push_identifier(dsl::id::NAME)?; + out.push_sql(" = "); + out.push_bind_param::(&self.subnet.identity.id)?; + out.push_sql(" RETURNING *"); - // Select the entire set of candidate columns. - out.push_sql( - "SELECT * FROM candidate, candidate_ipv4, candidate_ipv6)", - ); Ok(()) } } -impl Insertable - for FilterConflictingVpcSubnetRangesQuery -{ - type Values = FilterConflictingVpcSubnetRangesQueryValues; +type FromClause = + diesel::internal::table_macro::StaticQueryFragmentInstance; +type VpcSubnetFromClause = FromClause; +const VPC_SUBNET_FROM_CLAUSE: VpcSubnetFromClause = VpcSubnetFromClause::new(); - fn values(self) -> Self::Values { - FilterConflictingVpcSubnetRangesQueryValues(self) - } +impl RunQueryDsl for InsertVpcSubnetQuery {} +impl Query for InsertVpcSubnetQuery { + type SqlType = <>::SelectExpression as diesel::Expression>::SqlType; } -/// Used to allow inserting the result of the -/// `FilterConflictingVpcSubnetRangesQuery`, as in -/// `diesel::insert_into(foo).values(_). Should not be used directly. -pub struct FilterConflictingVpcSubnetRangesQueryValues( - pub FilterConflictingVpcSubnetRangesQuery, -); - -impl QueryId for FilterConflictingVpcSubnetRangesQueryValues { - type QueryId = (); - const HAS_STATIC_QUERY_ID: bool = false; +/// Errors related to inserting VPC Subnets. +#[derive(Debug, PartialEq)] +pub enum InsertVpcSubnetError { + /// The IPv4 or IPv6 subnet overlaps with an existing VPC Subnet + OverlappingIpRange(oxnet::IpNet), + /// Any other error + External(external::Error), } -impl diesel::insertable::CanInsertInSingleQuery - for FilterConflictingVpcSubnetRangesQueryValues -{ - fn rows_to_insert(&self) -> Option { - Some(1) +impl InsertVpcSubnetError { + const OVERLAPPING_IPV4_BLOCK_SENTINEL: &'static str = "ipv4"; + const OVERLAPPING_IPV4_BLOCK_ERROR_MESSAGE: &'static str = + r#"could not parse "ipv4" as type bool: invalid bool value"#; + const OVERLAPPING_IPV6_BLOCK_SENTINEL: &'static str = "ipv6"; + const OVERLAPPING_IPV6_BLOCK_ERROR_MESSAGE: &'static str = + r#"could not parse "ipv6" as type bool: invalid bool value"#; + const NAME_CONFLICT_CONSTRAINT: &'static str = "vpc_subnet_vpc_id_name_key"; + + /// Construct an `InsertError` from a Diesel error, catching the desired + /// cases and building useful errors. 
+ pub fn from_diesel(e: DieselError, subnet: &VpcSubnet) -> Self { + use crate::db::error; + use diesel::result::DatabaseErrorKind; + match e { + // Attempt to insert an overlapping IPv4 subnet + DieselError::DatabaseError( + DatabaseErrorKind::Unknown, + ref info, + ) if info.message() + == Self::OVERLAPPING_IPV4_BLOCK_ERROR_MESSAGE => + { + InsertVpcSubnetError::OverlappingIpRange( + subnet.ipv4_block.0.into(), + ) + } + + // Attempt to insert an overlapping IPv6 subnet + DieselError::DatabaseError( + DatabaseErrorKind::Unknown, + ref info, + ) if info.message() + == Self::OVERLAPPING_IPV6_BLOCK_ERROR_MESSAGE => + { + InsertVpcSubnetError::OverlappingIpRange( + subnet.ipv6_block.0.into(), + ) + } + + // Conflicting name for the subnet within a VPC + DieselError::DatabaseError( + DatabaseErrorKind::UniqueViolation, + ref info, + ) if info.constraint_name() + == Some(Self::NAME_CONFLICT_CONSTRAINT) => + { + InsertVpcSubnetError::External(error::public_error_from_diesel( + e, + error::ErrorHandler::Conflict( + external::ResourceType::VpcSubnet, + subnet.identity().name.as_str(), + ), + )) + } + + // Any other error at all is a bug + _ => InsertVpcSubnetError::External( + error::public_error_from_diesel(e, error::ErrorHandler::Server), + ), + } } -} -impl QueryFragment for FilterConflictingVpcSubnetRangesQueryValues { - fn walk_ast<'a>( - &'a self, - mut out: AstPass<'_, 'a, Pg>, - ) -> diesel::QueryResult<()> { - use db::schema::vpc_subnet::dsl; - out.push_sql("("); - out.push_identifier(dsl::id::NAME)?; - out.push_sql(", "); - out.push_identifier(dsl::name::NAME)?; - out.push_sql(", "); - out.push_identifier(dsl::description::NAME)?; - out.push_sql(", "); - out.push_identifier(dsl::time_created::NAME)?; - out.push_sql(", "); - out.push_identifier(dsl::time_modified::NAME)?; - out.push_sql(", "); - out.push_identifier(dsl::time_deleted::NAME)?; - out.push_sql(", "); - out.push_identifier(dsl::vpc_id::NAME)?; - out.push_sql(", "); - out.push_identifier(dsl::rcgen::NAME)?; - out.push_sql(", "); - out.push_identifier(dsl::ipv4_block::NAME)?; - out.push_sql(", "); - out.push_identifier(dsl::ipv6_block::NAME)?; - out.push_sql(") "); - self.0.walk_ast(out) + /// Convert into a public error + pub fn into_external(self) -> external::Error { + match self { + InsertVpcSubnetError::OverlappingIpRange(ip) => { + external::Error::invalid_request( + format!( + "IP address range '{}' \ + conflicts with an existing subnet", + ip, + ) + .as_str(), + ) + } + InsertVpcSubnetError::External(e) => e, + } } } -type FromClause = - diesel::internal::table_macro::StaticQueryFragmentInstance; -type VpcSubnetFromClause = FromClause; -const VPC_SUBNET_FROM_CLAUSE: VpcSubnetFromClause = VpcSubnetFromClause::new(); - #[cfg(test)] mod test { - use super::SubnetError; + use super::InsertVpcSubnetError; + use super::InsertVpcSubnetQuery; + use crate::db::explain::ExplainableAsync as _; use crate::db::model::VpcSubnet; - use ipnetwork::IpNetwork; use nexus_test_utils::db::test_setup_database; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::Name; use omicron_test_utils::dev; use std::convert::TryInto; use std::sync::Arc; - use uuid::Uuid; #[tokio::test] - async fn test_filter_conflicting_vpc_subnet_ranges_query() { + async fn explain_insert_query() { + let ipv4_block = "172.30.0.0/24".parse().unwrap(); + let ipv6_block = "fd12:3456:7890::/64".parse().unwrap(); + let name = "a-name".to_string().try_into().unwrap(); + let description = "some description".to_string(); + let 
identity = IdentityMetadataCreateParams { name, description }; + let vpc_id = "d402369d-c9ec-c5ad-9138-9fbee732d53e".parse().unwrap(); + let subnet_id = "093ad2db-769b-e3c2-bc1c-b46e84ce5532".parse().unwrap(); + let row = + VpcSubnet::new(subnet_id, vpc_id, identity, ipv4_block, ipv6_block); + let query = InsertVpcSubnetQuery::new(row); + let logctx = dev::test_setup_log("explain_insert_query"); + let log = logctx.log.new(o!()); + let mut db = test_setup_database(&log).await; + let cfg = crate::db::Config { url: db.pg_config().clone() }; + let pool = Arc::new(crate::db::Pool::new(&logctx.log, &cfg)); + let conn = pool.pool().get().await.unwrap(); + let explain = query.explain_async(&conn).await.unwrap(); + println!("{explain}"); + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_insert_vpc_subnet_query() { let make_id = |name: &Name, description: &str| IdentityMetadataCreateParams { name: name.clone(), @@ -427,12 +342,13 @@ mod test { let subnet_id = "093ad2db-769b-e3c2-bc1c-b46e84ce5532".parse().unwrap(); let other_subnet_id = "695debcc-e197-447d-ffb2-976150a7b7cf".parse().unwrap(); + let other_other_subnet_id = + "ddbdc2b7-d22f-40d9-98df-fef5da151e0d".parse().unwrap(); let row = VpcSubnet::new(subnet_id, vpc_id, identity, ipv4_block, ipv6_block); // Setup the test database - let logctx = - dev::test_setup_log("test_filter_conflicting_vpc_subnet_ranges"); + let logctx = dev::test_setup_log("test_insert_vpc_subnet_query"); let log = logctx.log.new(o!()); let mut db = test_setup_database(&log).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; @@ -445,7 +361,10 @@ mod test { // We should be able to insert anything into an empty table. assert!( - matches!(db_datastore.vpc_create_subnet_raw(row).await, Ok(_)), + matches!( + db_datastore.vpc_create_subnet_raw(row.clone()).await, + Ok(_) + ), "Should be able to insert VPC subnet into empty table" ); @@ -460,10 +379,13 @@ mod test { ); assert!( matches!( - db_datastore.vpc_create_subnet_raw(new_row).await, - Err(SubnetError::OverlappingIpRange(IpNetwork::V4(_))) + db_datastore.vpc_create_subnet_raw(new_row.clone()).await, + Err(InsertVpcSubnetError::OverlappingIpRange { .. }), ), - "Should not be able to insert new VPC subnet with the same IPv4 and IPv6 ranges" + "Should not be able to insert new VPC subnet with the \ + same IPv4 and IPv6 ranges,\n\ + first row: {row:?}\n\ + new row: {new_row:?}", ); // We should be able to insert data with the same ranges, if we change @@ -483,7 +405,7 @@ mod test { // We shouldn't be able to insert a subnet if we change only the // IPv4 or IPv6 block. They must _both_ be non-overlapping. 
let new_row = VpcSubnet::new( - other_subnet_id, + other_other_subnet_id, vpc_id, make_id(&other_name, &description), other_ipv4_block, @@ -495,11 +417,11 @@ mod test { .expect_err("Should not be able to insert VPC Subnet with overlapping IPv6 range"); assert_eq!( err, - SubnetError::OverlappingIpRange(ipnetwork::IpNetwork::from(oxnet::IpNet::from(ipv6_block))), - "SubnetError variant should include the exact IP range that overlaps" + InsertVpcSubnetError::OverlappingIpRange(ipv6_block.into()), + "InsertError variant should indicate an IP block overlaps" ); let new_row = VpcSubnet::new( - other_subnet_id, + other_other_subnet_id, vpc_id, make_id(&other_name, &description), ipv4_block, @@ -511,14 +433,14 @@ mod test { .expect_err("Should not be able to insert VPC Subnet with overlapping IPv4 range"); assert_eq!( err, - SubnetError::OverlappingIpRange(ipnetwork::IpNetwork::from(oxnet::IpNet::from(ipv4_block))), - "SubnetError variant should include the exact IP range that overlaps" + InsertVpcSubnetError::OverlappingIpRange(ipv4_block.into()), + "InsertError variant should indicate an IP block overlaps" ); // We should get an _external error_ if the IP address ranges are OK, // but the name conflicts. let new_row = VpcSubnet::new( - other_subnet_id, + other_other_subnet_id, vpc_id, make_id(&name, &description), other_ipv4_block, @@ -527,7 +449,7 @@ mod test { assert!( matches!( db_datastore.vpc_create_subnet_raw(new_row).await, - Err(SubnetError::External(_)) + Err(InsertVpcSubnetError::External(_)) ), "Should get an error inserting a VPC Subnet with unique IP ranges, but the same name" ); @@ -535,7 +457,7 @@ mod test { // We should be able to insert the row if _both ranges_ are different, // and the name is unique as well. let new_row = VpcSubnet::new( - Uuid::new_v4(), + other_other_subnet_id, vpc_id, make_id(&other_name, &description), other_ipv4_block, @@ -549,4 +471,104 @@ mod test { db.cleanup().await.unwrap(); logctx.cleanup_successful(); } + + // Helper to verify equality of rows, handling timestamp precision. + fn assert_rows_eq(left: &VpcSubnet, right: &VpcSubnet) { + assert_eq!( + left.identity.id, right.identity.id, + "VPC Subnet rows should be equal" + ); + assert_eq!( + left.identity.name, right.identity.name, + "VPC Subnet rows should be equal" + ); + assert_eq!( + left.identity.description, right.identity.description, + "VPC Subnet rows should be equal" + ); + // Timestamps in CRDB have microsecond precision, so ensure we're + // within 1000 nanos. + assert!( + (left.identity.time_modified - right.identity.time_modified) + .num_nanoseconds() + .unwrap() + < 1_000, + "VPC Subnet rows should be equal", + ); + assert!( + (left.identity.time_created - right.identity.time_created) + .num_nanoseconds() + .unwrap() + < 1_000, + "VPC Subnet rows should be equal", + ); + assert_eq!( + left.identity.time_deleted, right.identity.time_deleted, + "VPC Subnet rows should be equal", + ); + assert_eq!( + left.vpc_id, right.vpc_id, + "VPC Subnet rows should be equal" + ); + assert_eq!(left.rcgen, right.rcgen, "VPC Subnet rows should be equal"); + assert_eq!( + left.ipv4_block, right.ipv4_block, + "VPC Subnet rows should be equal" + ); + assert_eq!( + left.ipv6_block, right.ipv6_block, + "VPC Subnet rows should be equal" + ); + assert_eq!( + left.custom_router_id, right.custom_router_id, + "VPC Subnet rows should be equal" + ); + } + + // Regression test for https://github.com/oxidecomputer/omicron/issues/6069. 
+ #[tokio::test] + async fn test_insert_vpc_subnet_query_is_idempotent() { + let ipv4_block = "172.30.0.0/24".parse().unwrap(); + let ipv6_block = "fd12:3456:7890::/64".parse().unwrap(); + let name = "a-name".to_string().try_into().unwrap(); + let description = "some description".to_string(); + let identity = IdentityMetadataCreateParams { name, description }; + let vpc_id = "d402369d-c9ec-c5ad-9138-9fbee732d53e".parse().unwrap(); + let subnet_id = "093ad2db-769b-e3c2-bc1c-b46e84ce5532".parse().unwrap(); + let row = + VpcSubnet::new(subnet_id, vpc_id, identity, ipv4_block, ipv6_block); + + // Setup the test database + let logctx = + dev::test_setup_log("test_insert_vpc_subnet_query_is_idempotent"); + let log = logctx.log.new(o!()); + let mut db = test_setup_database(&log).await; + let cfg = crate::db::Config { url: db.pg_config().clone() }; + let pool = Arc::new(crate::db::Pool::new(&logctx.log, &cfg)); + let db_datastore = Arc::new( + crate::db::DataStore::new(&log, Arc::clone(&pool), None) + .await + .unwrap(), + ); + + // We should be able to insert anything into an empty table. + let inserted = db_datastore + .vpc_create_subnet_raw(row.clone()) + .await + .expect("Should be able to insert VPC subnet into empty table"); + assert_rows_eq(&inserted, &row); + + // We should be able to insert the exact same row again. The IP ranges + // overlap, but the ID is also identical, which should not be an error. + // This is important for saga idempotency. + let inserted = db_datastore + .vpc_create_subnet_raw(row.clone()) + .await + .expect( + "Must be able to insert the exact same VPC subnet more than once", + ); + assert_rows_eq(&inserted, &row); + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } } diff --git a/nexus/src/app/sagas/vpc_create.rs b/nexus/src/app/sagas/vpc_create.rs index a34b25ceb7..832ca64ace 100644 --- a/nexus/src/app/sagas/vpc_create.rs +++ b/nexus/src/app/sagas/vpc_create.rs @@ -8,7 +8,7 @@ use super::NexusSaga; use super::ACTION_GENERATE_ID; use crate::app::sagas::declare_saga_actions; use crate::external_api::params; -use nexus_db_queries::db::queries::vpc_subnet::SubnetError; +use nexus_db_queries::db::queries::vpc_subnet::InsertVpcSubnetError; use nexus_db_queries::{authn, authz, db}; use nexus_defaults as defaults; use omicron_common::api::external; @@ -368,7 +368,7 @@ async fn svc_create_subnet( .vpc_create_subnet(&opctx, &authz_vpc, subnet) .await .map_err(|err| match err { - SubnetError::OverlappingIpRange(ip) => { + InsertVpcSubnetError::OverlappingIpRange(ip) => { let ipv4_block = &defaults::DEFAULT_VPC_SUBNET_IPV4_BLOCK; let log = sagactx.user_data().log(); error!( @@ -388,7 +388,7 @@ async fn svc_create_subnet( found overlapping IP address ranges", ) } - SubnetError::External(e) => e, + InsertVpcSubnetError::External(e) => e, }) .map_err(ActionError::action_failed) } diff --git a/nexus/src/app/vpc_subnet.rs b/nexus/src/app/vpc_subnet.rs index ce0cd423f4..39b9844799 100644 --- a/nexus/src/app/vpc_subnet.rs +++ b/nexus/src/app/vpc_subnet.rs @@ -13,7 +13,7 @@ use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::model::VpcSubnet; -use nexus_db_queries::db::queries::vpc_subnet::SubnetError; +use nexus_db_queries::db::queries::vpc_subnet::InsertVpcSubnetError; use omicron_common::api::external; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::CreateResult; @@ -24,6 +24,7 @@ use 
omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::NameOrId; use omicron_common::api::external::UpdateResult; +use oxnet::IpNet; use uuid::Uuid; impl super::Nexus { @@ -141,9 +142,9 @@ impl super::Nexus { // Note that we only catch IPv6 overlaps. The client // always specifies the IPv4 range, so we fail the // request if that overlaps with an existing range. - Err(SubnetError::OverlappingIpRange(ip)) - if retry <= NUM_RETRIES && ip.is_ipv6() => - { + Err(InsertVpcSubnetError::OverlappingIpRange( + IpNet::V6(_), + )) if retry <= NUM_RETRIES => { debug!( self.log, "autogenerated random IPv6 range overlap"; @@ -157,9 +158,9 @@ impl super::Nexus { } }; match result { - Err(SubnetError::OverlappingIpRange(ip)) - if ip.is_ipv6() => - { + Err(InsertVpcSubnetError::OverlappingIpRange( + IpNet::V6(_), + )) => { // TODO-monitoring TODO-debugging // // We should maintain a counter for this occurrence, and @@ -181,11 +182,11 @@ impl super::Nexus { for VPC Subnet", )) } - Err(SubnetError::OverlappingIpRange(_)) => { + Err(InsertVpcSubnetError::OverlappingIpRange(_)) => { // Overlapping IPv4 ranges, which is always a client error. Err(result.unwrap_err().into_external()) } - Err(SubnetError::External(e)) => Err(e), + Err(InsertVpcSubnetError::External(e)) => Err(e), Ok((.., subnet)) => Ok(subnet), } } @@ -210,7 +211,7 @@ impl super::Nexus { .vpc_create_subnet(opctx, &authz_vpc, subnet) .await .map(|(.., subnet)| subnet) - .map_err(SubnetError::into_external) + .map_err(InsertVpcSubnetError::into_external) } }?; From 2eb63b157b20e618078883a8a2a37148dcfcc5fc Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Thu, 18 Jul 2024 13:39:59 -0700 Subject: [PATCH 09/21] Update Rust crate toml to 0.8.15 (#6104) --- Cargo.lock | 66 +++++++++++++++++++-------------------- Cargo.toml | 2 +- workspace-hack/Cargo.toml | 4 +-- 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 867c2eec25..f08c7a483d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -858,7 +858,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4895c018bb228aa6b3ba1a0285543fcb4b704734c3fb1f72afaa75aa769500c1" dependencies = [ "serde", - "toml 0.8.14", + "toml 0.8.15", ] [[package]] @@ -1945,7 +1945,7 @@ dependencies = [ "tempfile", "thiserror", "tokio", - "toml 0.8.14", + "toml 0.8.15", "trust-dns-client", "trust-dns-proto", "trust-dns-resolver", @@ -2027,7 +2027,7 @@ dependencies = [ "serde", "serde_json", "slog", - "toml 0.8.14", + "toml 0.8.15", "uuid", ] @@ -2070,7 +2070,7 @@ dependencies = [ "slog-term", "tokio", "tokio-rustls 0.25.0", - "toml 0.8.14", + "toml 0.8.15", "usdt", "uuid", "version_check", @@ -2233,7 +2233,7 @@ dependencies = [ "serde_json", "socket2 0.5.7", "tokio", - "toml 0.8.14", + "toml 0.8.15", "trust-dns-resolver", "uuid", ] @@ -3492,7 +3492,7 @@ dependencies = [ "smf", "thiserror", "tokio", - "toml 0.8.14", + "toml 0.8.15", "uuid", "whoami", "zone 0.3.0", @@ -4542,7 +4542,7 @@ dependencies = [ "serde_json", "serde_with", "tokio-postgres", - "toml 0.8.14", + "toml 0.8.15", "uuid", ] @@ -5335,7 +5335,7 @@ dependencies = [ "thiserror", "tokio", "tokio-postgres", - "toml 0.8.14", + "toml 0.8.15", "url", ] @@ -5383,7 +5383,7 @@ dependencies = [ "test-strategy", "thiserror", "tokio", - "toml 0.8.14", + "toml 0.8.15", "uuid", ] @@ -5408,7 +5408,7 @@ dependencies = [ "slog", "thiserror", "tokio", - "toml 0.8.14", + "toml 0.8.15", 
"uuid", ] @@ -5443,7 +5443,7 @@ dependencies = [ "subprocess", "tokio", "tokio-postgres", - "toml 0.8.14", + "toml 0.8.15", ] [[package]] @@ -5485,7 +5485,7 @@ dependencies = [ "tokio", "tokio-stream", "tokio-tungstenite 0.20.1", - "toml 0.8.14", + "toml 0.8.15", "uuid", ] @@ -5712,7 +5712,7 @@ dependencies = [ "tar", "thiserror", "tokio", - "toml 0.8.14", + "toml 0.8.15", "walkdir", ] @@ -5760,7 +5760,7 @@ dependencies = [ "slog-term", "tar", "tokio", - "toml 0.8.14", + "toml 0.8.15", "tufaceous-lib", ] @@ -5863,7 +5863,7 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "toml 0.8.14", + "toml 0.8.15", "usdt", "uuid", "zeroize", @@ -6023,7 +6023,7 @@ dependencies = [ "toml 0.7.8", "toml_datetime", "toml_edit 0.19.15", - "toml_edit 0.22.14", + "toml_edit 0.22.16", "tracing", "trust-dns-proto", "unicode-bidi", @@ -6309,7 +6309,7 @@ dependencies = [ "oximeter-timeseries-macro", "prettyplease", "syn 2.0.71", - "toml 0.8.14", + "toml 0.8.15", "uuid", ] @@ -6364,7 +6364,7 @@ dependencies = [ "subprocess", "thiserror", "tokio", - "toml 0.8.14", + "toml 0.8.15", "uuid", ] @@ -6443,7 +6443,7 @@ dependencies = [ "strum", "syn 2.0.71", "thiserror", - "toml 0.8.14", + "toml 0.8.15", "trybuild", "uuid", ] @@ -8183,7 +8183,7 @@ dependencies = [ "serde", "tempfile", "thiserror", - "toml 0.8.14", + "toml 0.8.15", "toolchain_find", ] @@ -9208,7 +9208,7 @@ dependencies = [ "slog-dtrace", "thiserror", "tokio", - "toml 0.8.14", + "toml 0.8.15", ] [[package]] @@ -10035,14 +10035,14 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.14" +version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f49eb2ab21d2f26bd6db7bf383edc527a7ebaee412d17af4d40fdccd442f335" +checksum = "ac2caab0bf757388c6c0ae23b3293fdb463fee59434529014f85e3263b995c28" dependencies = [ "serde", "serde_spanned", "toml_datetime", - "toml_edit 0.22.14", + "toml_edit 0.22.16", ] [[package]] @@ -10069,9 +10069,9 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.22.14" +version = "0.22.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38" +checksum = "278f3d518e152219c994ce877758516bca5e118eaed6996192a774fb9fbf0788" dependencies = [ "indexmap 2.2.6", "serde", @@ -10297,7 +10297,7 @@ dependencies = [ "serde_derive", "serde_json", "termcolor", - "toml 0.8.14", + "toml 0.8.15", ] [[package]] @@ -10357,7 +10357,7 @@ dependencies = [ "slog", "tar", "tokio", - "toml 0.8.14", + "toml 0.8.15", "tough", "url", "zip", @@ -11042,8 +11042,8 @@ dependencies = [ "textwrap", "tokio", "tokio-util", - "toml 0.8.14", - "toml_edit 0.22.14", + "toml 0.8.15", + "toml_edit 0.22.16", "tui-tree-widget", "unicode-width", "update-engine", @@ -11073,7 +11073,7 @@ dependencies = [ "slog", "thiserror", "tokio", - "toml 0.8.14", + "toml 0.8.15", "update-engine", ] @@ -11161,7 +11161,7 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "toml 0.8.14", + "toml 0.8.15", "tough", "trust-dns-resolver", "tufaceous", @@ -11493,7 +11493,7 @@ dependencies = [ "tabled", "tar", "tokio", - "toml 0.8.14", + "toml 0.8.15", "usdt", ] diff --git a/Cargo.toml b/Cargo.toml index e74eb6ab60..dc464e547f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -512,7 +512,7 @@ tokio-postgres = { version = "0.7", features = [ "with-chrono-0_4", "with-uuid-1 tokio-stream = "0.1.15" tokio-tungstenite = "0.20" tokio-util = { version = "0.7.10", features = ["io", "io-util"] } -toml = "0.8.12" +toml = "0.8.15" toml_edit = "0.22.12" tough = { 
version = "0.17.1", features = [ "http" ] } trust-dns-client = "0.22" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 14534baa6f..498f25d017 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -110,7 +110,7 @@ tokio-postgres = { version = "0.7.10", features = ["with-chrono-0_4", "with-serd tokio-stream = { version = "0.1.15", features = ["net"] } tokio-util = { version = "0.7.11", features = ["codec", "io-util"] } toml = { version = "0.7.8" } -toml_edit-3c51e837cfc5589a = { package = "toml_edit", version = "0.22.14", features = ["serde"] } +toml_edit-3c51e837cfc5589a = { package = "toml_edit", version = "0.22.16", features = ["serde"] } tracing = { version = "0.1.40", features = ["log"] } trust-dns-proto = { version = "0.22.0" } unicode-bidi = { version = "0.3.15" } @@ -218,7 +218,7 @@ tokio-postgres = { version = "0.7.10", features = ["with-chrono-0_4", "with-serd tokio-stream = { version = "0.1.15", features = ["net"] } tokio-util = { version = "0.7.11", features = ["codec", "io-util"] } toml = { version = "0.7.8" } -toml_edit-3c51e837cfc5589a = { package = "toml_edit", version = "0.22.14", features = ["serde"] } +toml_edit-3c51e837cfc5589a = { package = "toml_edit", version = "0.22.16", features = ["serde"] } tracing = { version = "0.1.40", features = ["log"] } trust-dns-proto = { version = "0.22.0" } unicode-bidi = { version = "0.3.15" } From 708c288345a4405d9573f0d8945c316057b64149 Mon Sep 17 00:00:00 2001 From: Rain Date: Thu, 18 Jul 2024 15:14:08 -0700 Subject: [PATCH 10/21] [nexus-db-queries] make saga_update more resilient, record_event idempotent (#6113) See discussion in https://github.com/oxidecomputer/omicron/issues/2416 and https://github.com/oxidecomputer/omicron/issues/6090#issuecomment-2229509411. A summary of the changes here: 1. Made `saga_create_event` idempotent. Previously, creating another event that duplicated the one which already existed would fail -- now it succeeds. These events are meant to be an append-only idempotent log, so this is okay. Also added a test for this. 2. `saga_update_state` was already idempotent -- added a test which made sure of this. Also added a comment about how idempotence may not be enough in the future. 3. Added a retry loop around saga state updates, similar to the one around recording events. --- nexus/db-queries/src/db/datastore/saga.rs | 201 ++++++++++++++++++---- nexus/db-queries/src/db/sec_store.rs | 155 ++++++++++------- 2 files changed, 268 insertions(+), 88 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/saga.rs b/nexus/db-queries/src/db/datastore/saga.rs index e632bce881..939929e665 100644 --- a/nexus/db-queries/src/db/datastore/saga.rs +++ b/nexus/db-queries/src/db/datastore/saga.rs @@ -47,6 +47,15 @@ impl DataStore { // owning this saga. diesel::insert_into(dsl::saga_node_event) .values(event.clone()) + // (saga_id, node_id, event_type) is the primary key, and this is + // expected to be idempotent. + // + // Consider the situation where a saga event gets recorded and + // committed, but there's a network reset which makes the client + // (us) believe that the event wasn't recorded. If we retry the + // event, we want to not fail with a conflict. + .on_conflict((dsl::saga_id, dsl::node_id, dsl::event_type)) + .do_nothing() .execute_async(&*self.pool_connection_unauthorized().await?) .await .map_err(|e| { @@ -58,6 +67,28 @@ impl DataStore { Ok(()) } + /// Update the state of a saga in the database. 
+ /// + /// This function is meant to be called in a loop, so that in the event of + /// network flakiness, the operation is retried until successful. + /// + /// ## About conflicts + /// + /// Currently, if the value of `saga_state` in the database is the same as + /// the value we're trying to set it to, the update will be a no-op. That + /// is okay, because at any time only one SEC will update the saga. (For + /// now, we're implementing saga adoption only in cases where the original + /// SEC/Nexus has been expunged.) + /// + /// However, in the future, it may be possible for multiple SECs to try and + /// update the same saga, and overwrite each other's state. For example, + /// one SEC might try and update the state to Running while the other one + /// updates it to Done. That case would have to be carefully considered and + /// tested here, probably using the (currently unused) + /// `current_adopt_generation` field to enable optimistic concurrency. + /// + /// To reiterate, we are *not* considering the case where several SECs try + /// to update the same saga. That will be a future enhancement. pub async fn saga_update_state( &self, saga_id: steno::SagaId, @@ -182,6 +213,7 @@ impl DataStore { mod test { use super::*; use crate::db::datastore::test_utils::datastore_test; + use nexus_db_model::{SagaNodeEvent, SecId}; use nexus_test_utils::db::test_setup_database; use omicron_test_utils::dev; use rand::seq::SliceRandom; @@ -195,20 +227,8 @@ mod test { let mut db = test_setup_database(&logctx.log).await; let (opctx, datastore) = datastore_test(&logctx, &db).await; let sec_id = db::SecId(uuid::Uuid::new_v4()); - - // Create a couple batches of sagas. - let new_running_db_saga = || { - let params = steno::SagaCreateParams { - id: steno::SagaId(Uuid::new_v4()), - name: steno::SagaName::new("test saga"), - dag: serde_json::value::Value::Null, - state: steno::SagaCachedState::Running, - }; - - db::model::saga_types::Saga::new(sec_id, params) - }; let mut inserted_sagas = (0..SQL_BATCH_SIZE.get() * 2) - .map(|_| new_running_db_saga()) + .map(|_| SagaTestContext::new(sec_id).new_running_db_saga()) .collect::>(); // Shuffle these sagas into a random order to check that the pagination @@ -263,20 +283,9 @@ mod test { let logctx = dev::test_setup_log("test_list_unfinished_nodes"); let mut db = test_setup_database(&logctx.log).await; let (opctx, datastore) = datastore_test(&logctx, &db).await; - let sec_id = db::SecId(uuid::Uuid::new_v4()); - let saga_id = steno::SagaId(Uuid::new_v4()); + let node_cx = SagaTestContext::new(SecId(Uuid::new_v4())); // Create a couple batches of saga events - let new_db_saga_nodes = - |node_id: u32, event_type: steno::SagaNodeEventType| { - let event = steno::SagaNodeEvent { - saga_id, - node_id: steno::SagaNodeId::from(node_id), - event_type, - }; - - db::model::saga_types::SagaNodeEvent::new(event, sec_id) - }; let mut inserted_nodes = (0..SQL_BATCH_SIZE.get() * 2) .flat_map(|i| { // This isn't an exhaustive list of event types, but gives us a @@ -284,9 +293,9 @@ mod test { // it's important to include a variety here. 
use steno::SagaNodeEventType::*; [ - new_db_saga_nodes(i, Started), - new_db_saga_nodes(i, UndoStarted), - new_db_saga_nodes(i, UndoFinished), + node_cx.new_db_event(i, Started), + node_cx.new_db_event(i, UndoStarted), + node_cx.new_db_event(i, UndoFinished), ] }) .collect::>(); @@ -311,7 +320,7 @@ mod test { let observed_nodes = datastore .saga_fetch_log_batched( &opctx, - nexus_db_model::saga_types::SagaId::from(saga_id), + nexus_db_model::saga_types::SagaId::from(node_cx.saga_id), ) .await .expect("Failed to list nodes of unfinished saga"); @@ -366,4 +375,138 @@ mod test { db.cleanup().await.unwrap(); logctx.cleanup_successful(); } + + #[tokio::test] + async fn test_create_event_idempotent() { + // Test setup + let logctx = dev::test_setup_log("test_create_event_idempotent"); + let mut db = test_setup_database(&logctx.log).await; + let (_, datastore) = datastore_test(&logctx, &db).await; + let node_cx = SagaTestContext::new(SecId(Uuid::new_v4())); + + // Generate a bunch of events. + let inserted_nodes = (0..2) + .flat_map(|i| { + use steno::SagaNodeEventType::*; + [ + node_cx.new_db_event(i, Started), + node_cx.new_db_event(i, UndoStarted), + node_cx.new_db_event(i, UndoFinished), + ] + }) + .collect::>(); + + // Insert the events into the database. + for node in &inserted_nodes { + datastore + .saga_create_event(node) + .await + .expect("inserting first node events"); + } + + // Insert the events again into the database and ensure that we don't + // get a conflict. + for node in &inserted_nodes { + datastore + .saga_create_event(node) + .await + .expect("inserting duplicate node events"); + } + + // Test cleanup + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_update_state_idempotent() { + // Test setup + let logctx = dev::test_setup_log("test_create_event_idempotent"); + let mut db = test_setup_database(&logctx.log).await; + let (_, datastore) = datastore_test(&logctx, &db).await; + let node_cx = SagaTestContext::new(SecId(Uuid::new_v4())); + + // Create a saga in the running state. + let params = node_cx.new_running_db_saga(); + datastore + .saga_create(¶ms) + .await + .expect("creating saga in Running state"); + + // Attempt to update its state to Running, which is a no-op -- this + // should be idempotent, so expect success. + datastore + .saga_update_state( + node_cx.saga_id, + steno::SagaCachedState::Running, + node_cx.sec_id, + db::model::Generation::new(), + ) + .await + .expect("updating state to Running again"); + + // Update the state to Done. + datastore + .saga_update_state( + node_cx.saga_id, + steno::SagaCachedState::Done, + node_cx.sec_id, + db::model::Generation::new(), + ) + .await + .expect("updating state to Done"); + + // Attempt to update its state to Done again, which is a no-op -- this + // should be idempotent, so expect success. + datastore + .saga_update_state( + node_cx.saga_id, + steno::SagaCachedState::Done, + node_cx.sec_id, + db::model::Generation::new(), + ) + .await + .expect("updating state to Done again"); + + // Test cleanup + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + /// Helpers to create sagas. 
+ struct SagaTestContext { + saga_id: steno::SagaId, + sec_id: SecId, + } + + impl SagaTestContext { + fn new(sec_id: SecId) -> Self { + Self { saga_id: steno::SagaId(Uuid::new_v4()), sec_id } + } + + fn new_running_db_saga(&self) -> db::model::saga_types::Saga { + let params = steno::SagaCreateParams { + id: self.saga_id, + name: steno::SagaName::new("test saga"), + dag: serde_json::value::Value::Null, + state: steno::SagaCachedState::Running, + }; + + db::model::saga_types::Saga::new(self.sec_id, params) + } + + fn new_db_event( + &self, + node_id: u32, + event_type: steno::SagaNodeEventType, + ) -> SagaNodeEvent { + let event = steno::SagaNodeEvent { + saga_id: self.saga_id, + node_id: steno::SagaNodeId::from(node_id), + event_type, + }; + + SagaNodeEvent::new(event, self.sec_id) + } + } } diff --git a/nexus/db-queries/src/db/sec_store.rs b/nexus/db-queries/src/db/sec_store.rs index 72de02ff54..0dcc3aa717 100644 --- a/nexus/db-queries/src/db/sec_store.rs +++ b/nexus/db-queries/src/db/sec_store.rs @@ -8,7 +8,8 @@ use crate::db::{self, model::Generation}; use anyhow::Context; use async_trait::async_trait; use dropshot::HttpError; -use futures::TryFutureExt; +use futures::{Future, TryFutureExt}; +use omicron_common::api::external; use omicron_common::backoff; use slog::Logger; use std::fmt; @@ -66,78 +67,114 @@ impl steno::SecStore for CockroachDbSecStore { debug!(&log, "recording saga event"); let our_event = db::saga_types::SagaNodeEvent::new(event, self.sec_id); - backoff::retry_notify_ext( - // This is an internal service query to CockroachDB. - backoff::retry_policy_internal_service(), + // Add retries for this operation. saga_create_event is internally + // idempotent, so we can retry indefinitely until the event has been + // durably recorded. + backoff_saga_operation( + &log, || { - // In general, there are some kinds of database errors that are - // temporary/server errors (e.g. network failures), and some - // that are permanent/client errors (e.g. conflict during - // insertion). The permanent ones would require operator - // intervention to fix. - // - // However, there is no way to bubble up errors here, and for - // good reason: it is inherent to the nature of sagas that - // progress is durably recorded. So within *this* code there is - // no option but to retry forever. (Below, however, we do mark - // errors that likely require operator intervention.) - // - // At a higher level, callers should plan for the fact that - // record_event (and, so, saga execution) could potentially loop - // indefinitely while the datastore (or other dependent - // services) are down. 
self.datastore .saga_create_event(&our_event) .map_err(backoff::BackoffError::transient) }, - move |error, call_count, total_duration| { - let http_error = HttpError::from(error.clone()); - if http_error.status_code.is_client_error() { - error!( - &log, - "client error while recording saga event (likely \ - requires operator intervention), retrying anyway"; - "error" => &error, - "call_count" => call_count, - "total_duration" => ?total_duration, - ); - } else if total_duration > Duration::from_secs(20) { - warn!( - &log, - "server error while recording saga event, retrying"; - "error" => &error, - "call_count" => call_count, - "total_duration" => ?total_duration, - ); - } else { - info!( - &log, - "server error while recording saga event, retrying"; - "error" => &error, - "call_count" => call_count, - "total_duration" => ?total_duration, - ); - } - }, + "recording saga event", ) .await - .expect("the above backoff retries forever") } async fn saga_update(&self, id: SagaId, update: steno::SagaCachedState) { // TODO-robustness We should track the current generation of the saga // and use it. We'll know this either from when it was created or when // it was recovered. - info!(&self.log, "updating state"; + + let log = self.log.new(o!( "saga_id" => id.to_string(), - "new_state" => update.to_string() - ); + "new_state" => update.to_string(), + )); - // TODO-robustness This should be wrapped with a retry loop rather than - // unwrapping the result. See omicron#2416. - self.datastore - .saga_update_state(id, update, self.sec_id, Generation::new()) - .await - .unwrap(); + info!(&log, "updating state"); + + // Add retries for this operation. saga_update_state is internally + // idempotent, so we can retry indefinitely until the event has been + // durably recorded. (But see the note in saga_update_state about how + // idempotence is enough for now, but may not be in the future.) + backoff_saga_operation( + &log, + || { + self.datastore + .saga_update_state( + id, + update, + self.sec_id, + Generation::new(), + ) + .map_err(backoff::BackoffError::transient) + }, + "updating saga state", + ) + .await } } + +/// Implements backoff retry logic for saga operations. +/// +/// In general, there are some kinds of database errors that are +/// temporary/server errors (e.g. network failures), and some that are +/// permanent/client errors (e.g. conflict during insertion). The permanent +/// ones would require operator intervention to fix. +/// +/// However, there is no way to bubble up errors from the SEC store, and for +/// good reason: it is inherent to the nature of sagas that progress is durably +/// recorded. So inside this code there is no option but to retry forever. +/// (Below, however, we do mark errors that likely require operator +/// intervention.) +/// +/// At a higher level, callers should plan for the fact saga execution could +/// potentially loop indefinitely while the datastore (or other dependent +/// services) are down. +async fn backoff_saga_operation(log: &Logger, op: F, description: &str) +where + F: Fn() -> Fut, + Fut: Future>>, +{ + backoff::retry_notify_ext( + // This is an internal service query to CockroachDB. 
+ backoff::retry_policy_internal_service(), + op, + move |error, call_count, total_duration| { + let http_error = HttpError::from(error.clone()); + if http_error.status_code.is_client_error() { + error!( + &log, + "client error while {description} (likely \ + requires operator intervention), retrying anyway"; + "error" => &error, + "call_count" => call_count, + "total_duration" => ?total_duration, + ); + } else if total_duration > WARN_DURATION { + warn!( + &log, + "server error while {description}, retrying"; + "error" => &error, + "call_count" => call_count, + "total_duration" => ?total_duration, + ); + } else { + info!( + &log, + "server error while {description}, retrying"; + "error" => &error, + "call_count" => call_count, + "total_duration" => ?total_duration, + ); + } + }, + ) + .await + .expect("the above backoff retries forever") +} + +/// Threshold at which logs about server errors during retries switch from INFO +/// to WARN. +const WARN_DURATION: Duration = Duration::from_secs(20); From d4876768b00c8c2909e9b013e81b04898421a936 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Fri, 19 Jul 2024 04:19:40 +0000 Subject: [PATCH 11/21] Update taiki-e/install-action digest to 37461a1 (#6121) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [taiki-e/install-action](https://togithub.com/taiki-e/install-action) | action | digest | [`ea7e518` -> `37461a1`](https://togithub.com/taiki-e/install-action/compare/ea7e518...37461a1) | --- ### Configuration 📅 **Schedule**: Branch creation - "after 8pm,before 6am" in timezone America/Los_Angeles, Automerge - "after 8pm,before 6am" in timezone America/Los_Angeles. 🚦 **Automerge**: Enabled. â™» **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [Renovate Bot](https://togithub.com/renovatebot/renovate). 
Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- .github/workflows/hakari.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index 3a31a5323d..ceb7b603b0 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -24,7 +24,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@ea7e5189a7664872699532b4cd92a443f520624e # v2 + uses: taiki-e/install-action@37461a1de4134bec919a737ee9ba018e72011b7c # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date From 92c39e50b07dd60a2d34c2877023f8531e14af40 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Thu, 18 Jul 2024 21:55:31 -0700 Subject: [PATCH 12/21] Update Rust crate tokio-util to 0.7.11 (#6094) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index dc464e547f..8e9663bc7b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -511,7 +511,7 @@ tokio = "1.38.1" tokio-postgres = { version = "0.7", features = [ "with-chrono-0_4", "with-uuid-1" ] } tokio-stream = "0.1.15" tokio-tungstenite = "0.20" -tokio-util = { version = "0.7.10", features = ["io", "io-util"] } +tokio-util = { version = "0.7.11", features = ["io", "io-util"] } toml = "0.8.15" toml_edit = "0.22.12" tough = { version = "0.17.1", features = [ "http" ] } From 19388772a4470176127c73a86a3b0ea334f14336 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Fri, 19 Jul 2024 13:37:35 -0700 Subject: [PATCH 13/21] Add sled-identifiers to instance vCPU stats (#6128) This adds the sled UUID and baseboard info into the existing vCPU stats. This is an incompatible change, and will require dropping old data from the previous versions of the timeseries. That's unused today, and it's not yet clear how we'll actually manage the timeseries data across a schema change, so I'd like to include this now. 
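For context on the shape of the updated target: it now carries the sled identity (ID plus baseboard model, revision, and serial) alongside the instance identity. The sketch below is a hand-written mirror of the fields declared in the TOML diff that follows; the real target type is generated by oximeter from that schema, so the struct name and example values here are illustrative assumptions only, not the generated code.

```rust
// Illustrative sketch only (not the oximeter-generated code): a hand-written
// mirror of the `virtual_machine` target fields declared in the TOML schema
// below. Requires the `uuid` crate with the "v4" feature.
use uuid::Uuid;

#[derive(Debug, Clone)]
struct VirtualMachineTarget {
    instance_id: Uuid,
    project_id: Uuid,
    silo_id: Uuid,
    // New in this change: identity of the sled hosting the instance.
    sled_id: Uuid,
    sled_model: String,
    sled_revision: u32,
    sled_serial: String,
}

fn main() {
    // Example values are placeholders, not real hardware identifiers.
    let target = VirtualMachineTarget {
        instance_id: Uuid::new_v4(),
        project_id: Uuid::new_v4(),
        silo_id: Uuid::new_v4(),
        sled_id: Uuid::new_v4(),
        sled_model: "sled-model".to_string(),
        sled_revision: 6,
        sled_serial: "sled-serial".to_string(),
    };
    println!("{target:?}");
}
```

The practical consequence is that any producer constructing this target must now be able to supply the sled's ID and baseboard information as well.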
--- oximeter/oximeter/schema/virtual-machine.toml | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/oximeter/oximeter/schema/virtual-machine.toml b/oximeter/oximeter/schema/virtual-machine.toml index 3ef0da4615..520a97bba3 100644 --- a/oximeter/oximeter/schema/virtual-machine.toml +++ b/oximeter/oximeter/schema/virtual-machine.toml @@ -5,7 +5,7 @@ name = "virtual_machine" description = "A guest virtual machine instance" authz_scope = "project" versions = [ - { version = 1, fields = [ "instance_id", "project_id", "silo_id" ] }, + { version = 1, fields = [ "instance_id", "project_id", "silo_id", "sled_id", "sled_model", "sled_revision", "sled_serial" ] }, ] [[metrics]] @@ -56,6 +56,22 @@ description = "ID of the virtual machine instance's project" type = "uuid" description = "ID of the virtual machine instance's silo" +[fields.sled_id] +type = "uuid" +description = "ID of the sled hosting the instance" + +[fields.sled_model] +type = "string" +description = "Model number of the sled hosting the instance" + +[fields.sled_revision] +type = "u32" +description = "Revision number of the sled hosting the instance" + +[fields.sled_serial] +type = "string" +description = "Serial number of the sled hosting the instance" + [fields.state] type = "string" description = "The state of the vCPU" From 39691244c01efea793ee9325c9e34c4981c3b972 Mon Sep 17 00:00:00 2001 From: Rain Date: Fri, 19 Jul 2024 13:38:24 -0700 Subject: [PATCH 14/21] [1/3 sled-agent] use omicron-uuid-kinds for rack init and reset IDs (#6089) These IDs were hand-implemented typed UUIDs, so they're good candidates for conversion to omicron-uuid-kinds. Encountered this while trying to convert the bootstrap agent API to using a trait -- didn't want to copy over these types into that trait, wanted to leave it better than I found it. 
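As background on the pattern being adopted: a typed UUID is an ordinary UUID tagged with a zero-sized "kind" marker, so IDs for different operations become distinct types that cannot be mixed up at compile time. The sketch below illustrates the idea in a self-contained way; it is a simplified stand-in, and the wrapper, marker, and constructor names are assumptions for demonstration, not the actual omicron-uuid-kinds API.

```rust
// Minimal sketch of the typed-UUID idea: a wrapper parameterized by a
// zero-sized "kind" marker. Requires the `uuid` crate with the "v4" feature.
use std::marker::PhantomData;
use uuid::Uuid;

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct TypedUuid<K> {
    id: Uuid,
    _kind: PhantomData<K>,
}

impl<K> TypedUuid<K> {
    fn new_v4() -> Self {
        Self { id: Uuid::new_v4(), _kind: PhantomData }
    }
}

// Marker kinds: they carry no data, only type identity.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum RackInitKind {}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum RackResetKind {}

type RackInitUuid = TypedUuid<RackInitKind>;
type RackResetUuid = TypedUuid<RackResetKind>;

fn reset_status(id: RackResetUuid) -> String {
    format!("resetting, operation {}", id.id)
}

fn main() {
    let init_id = RackInitUuid::new_v4();
    let reset_id = RackResetUuid::new_v4();
    // reset_status(init_id); // would not compile: wrong kind of UUID
    println!("{}", reset_status(reset_id));
    println!("init operation {}", init_id.id);
}
```

In the diff below, the generated clients map the OpenAPI schemas `TypedUuidForRackInitKind` and `TypedUuidForRackResetKind` back onto `omicron_uuid_kinds::RackInitUuid` and `omicron_uuid_kinds::RackResetUuid`, which is what carries this compile-time separation across the bootstrap-agent and wicketd API boundaries.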
--- Cargo.lock | 4 ++ clients/bootstrap-agent-client/Cargo.toml | 1 + clients/bootstrap-agent-client/src/lib.rs | 2 + clients/wicketd-client/Cargo.toml | 1 + clients/wicketd-client/src/lib.rs | 2 + openapi/bootstrap-agent.json | 36 ++++++------ openapi/wicketd.json | 40 +++++++------ sled-agent/src/bootstrap/http_entrypoints.rs | 25 ++++---- sled-agent/src/bootstrap/rack_ops.rs | 60 +++++--------------- sled-agent/src/bootstrap/server.rs | 4 +- uuid-kinds/src/lib.rs | 2 + wicket/src/ui/panes/rack_setup.rs | 16 +++--- wicketd-api/Cargo.toml | 1 + wicketd-api/src/lib.rs | 8 +-- wicketd/Cargo.toml | 1 + wicketd/src/http_entrypoints.rs | 8 +-- 16 files changed, 96 insertions(+), 115 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f08c7a483d..ce366cd7f4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -703,6 +703,7 @@ name = "bootstrap-agent-client" version = "0.1.0" dependencies = [ "omicron-common", + "omicron-uuid-kinds", "omicron-workspace-hack", "oxnet", "progenitor", @@ -11141,6 +11142,7 @@ dependencies = [ "omicron-ddm-admin-client", "omicron-passwords", "omicron-test-utils", + "omicron-uuid-kinds", "omicron-workspace-hack", "once_cell", "openapi-lint", @@ -11184,6 +11186,7 @@ dependencies = [ "gateway-client", "omicron-common", "omicron-passwords", + "omicron-uuid-kinds", "omicron-workspace-hack", "schemars", "serde", @@ -11199,6 +11202,7 @@ dependencies = [ "chrono", "installinator-common", "omicron-common", + "omicron-uuid-kinds", "omicron-workspace-hack", "progenitor", "regress", diff --git a/clients/bootstrap-agent-client/Cargo.toml b/clients/bootstrap-agent-client/Cargo.toml index 0b1d2fab4b..e152e31966 100644 --- a/clients/bootstrap-agent-client/Cargo.toml +++ b/clients/bootstrap-agent-client/Cargo.toml @@ -18,5 +18,6 @@ serde_json.workspace = true sled-hardware-types.workspace = true slog.workspace = true uuid.workspace = true +omicron-uuid-kinds.workspace = true omicron-workspace-hack.workspace = true oxnet.workspace = true diff --git a/clients/bootstrap-agent-client/src/lib.rs b/clients/bootstrap-agent-client/src/lib.rs index b29f4e69f4..c737283d84 100644 --- a/clients/bootstrap-agent-client/src/lib.rs +++ b/clients/bootstrap-agent-client/src/lib.rs @@ -24,6 +24,8 @@ progenitor::generate_api!( replace = { AllowedSourceIps = omicron_common::api::external::AllowedSourceIps, ImportExportPolicy = omicron_common::api::external::ImportExportPolicy, + TypedUuidForRackInitKind = omicron_uuid_kinds::RackInitUuid, + TypedUuidForRackResetKind = omicron_uuid_kinds::RackResetUuid, } ); diff --git a/clients/wicketd-client/Cargo.toml b/clients/wicketd-client/Cargo.toml index 8e50964e59..0e55acd8bb 100644 --- a/clients/wicketd-client/Cargo.toml +++ b/clients/wicketd-client/Cargo.toml @@ -11,6 +11,7 @@ workspace = true chrono.workspace = true installinator-common.workspace = true omicron-common.workspace = true +omicron-uuid-kinds.workspace = true omicron-workspace-hack.workspace = true progenitor.workspace = true regress.workspace = true diff --git a/clients/wicketd-client/src/lib.rs b/clients/wicketd-client/src/lib.rs index bb377de31e..7a07ecd6a5 100644 --- a/clients/wicketd-client/src/lib.rs +++ b/clients/wicketd-client/src/lib.rs @@ -81,6 +81,8 @@ progenitor::generate_api!( StepEventForInstallinatorSpec = installinator_common::StepEvent, StepEventForWicketdEngineSpec = wicket_common::update_events::StepEvent, SwitchLocation = omicron_common::api::internal::shared::SwitchLocation, + TypedUuidForRackInitKind = omicron_uuid_kinds::RackInitUuid, + TypedUuidForRackResetKind = 
omicron_uuid_kinds::RackResetUuid, UpdateSimulatedResult = wicket_common::rack_update::UpdateSimulatedResult, UpdateTestError = wicket_common::rack_update::UpdateTestError, UplinkPreflightStepId = wicket_common::preflight_check::UplinkPreflightStepId, diff --git a/openapi/bootstrap-agent.json b/openapi/bootstrap-agent.json index 879e8cdc3f..68ae76e523 100644 --- a/openapi/bootstrap-agent.json +++ b/openapi/bootstrap-agent.json @@ -105,7 +105,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RackInitId" + "$ref": "#/components/schemas/TypedUuidForRackInitKind" } } } @@ -127,7 +127,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RackResetId" + "$ref": "#/components/schemas/TypedUuidForRackResetKind" } } } @@ -838,10 +838,6 @@ "speed400_g" ] }, - "RackInitId": { - "type": "string", - "format": "uuid" - }, "RackInitializeRequest": { "description": "Configuration for the \"rack setup service\".\n\nThe Rack Setup Service should be responsible for one-time setup actions, such as CockroachDB placement and initialization. Without operator intervention, however, these actions need a way to be automated in our deployment.", "type": "object", @@ -998,7 +994,7 @@ "type": "object", "properties": { "id": { - "$ref": "#/components/schemas/RackInitId" + "$ref": "#/components/schemas/TypedUuidForRackInitKind" }, "status": { "type": "string", @@ -1020,7 +1016,7 @@ "nullable": true, "allOf": [ { - "$ref": "#/components/schemas/RackInitId" + "$ref": "#/components/schemas/TypedUuidForRackInitKind" } ] }, @@ -1039,7 +1035,7 @@ "type": "object", "properties": { "id": { - "$ref": "#/components/schemas/RackInitId" + "$ref": "#/components/schemas/TypedUuidForRackInitKind" }, "message": { "type": "string" @@ -1061,7 +1057,7 @@ "type": "object", "properties": { "id": { - "$ref": "#/components/schemas/RackInitId" + "$ref": "#/components/schemas/TypedUuidForRackInitKind" }, "status": { "type": "string", @@ -1079,7 +1075,7 @@ "type": "object", "properties": { "id": { - "$ref": "#/components/schemas/RackResetId" + "$ref": "#/components/schemas/TypedUuidForRackResetKind" }, "status": { "type": "string", @@ -1101,7 +1097,7 @@ "nullable": true, "allOf": [ { - "$ref": "#/components/schemas/RackResetId" + "$ref": "#/components/schemas/TypedUuidForRackResetKind" } ] }, @@ -1120,7 +1116,7 @@ "type": "object", "properties": { "id": { - "$ref": "#/components/schemas/RackResetId" + "$ref": "#/components/schemas/TypedUuidForRackResetKind" }, "message": { "type": "string" @@ -1142,7 +1138,7 @@ "type": "object", "properties": { "id": { - "$ref": "#/components/schemas/RackResetId" + "$ref": "#/components/schemas/TypedUuidForRackResetKind" }, "status": { "type": "string", @@ -1158,10 +1154,6 @@ } ] }, - "RackResetId": { - "type": "string", - "format": "uuid" - }, "RecoverySiloConfig": { "description": "RecoverySiloConfig\n\n
JSON schema\n\n```json { \"type\": \"object\", \"required\": [ \"silo_name\", \"user_name\", \"user_password_hash\" ], \"properties\": { \"silo_name\": { \"$ref\": \"#/components/schemas/Name\" }, \"user_name\": { \"$ref\": \"#/components/schemas/UserId\" }, \"user_password_hash\": { \"$ref\": \"#/components/schemas/NewPasswordHash\" } } } ```
", "type": "object", @@ -1235,6 +1227,14 @@ } ] }, + "TypedUuidForRackInitKind": { + "type": "string", + "format": "uuid" + }, + "TypedUuidForRackResetKind": { + "type": "string", + "format": "uuid" + }, "UplinkAddressConfig": { "type": "object", "properties": { diff --git a/openapi/wicketd.json b/openapi/wicketd.json index 34e7eadb54..48e5d290a3 100644 --- a/openapi/wicketd.json +++ b/openapi/wicketd.json @@ -355,7 +355,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RackInitId" + "$ref": "#/components/schemas/TypedUuidForRackInitKind" } } } @@ -377,7 +377,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RackResetId" + "$ref": "#/components/schemas/TypedUuidForRackResetKind" } } } @@ -2618,19 +2618,14 @@ "rack_network_config" ] }, - "RackInitId": { - "description": "RackInitId\n\n
JSON schema\n\n```json { \"type\": \"string\", \"format\": \"uuid\" } ```
", - "type": "string", - "format": "uuid" - }, "RackOperationStatus": { - "description": "Current status of any rack-level operation being performed by this bootstrap agent.\n\n
JSON schema\n\n```json { \"description\": \"Current status of any rack-level operation being performed by this bootstrap agent.\", \"oneOf\": [ { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackInitId\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initializing\" ] } } }, { \"description\": \"`id` will be none if the rack was already initialized on startup.\", \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"id\": { \"allOf\": [ { \"$ref\": \"#/components/schemas/RackInitId\" } ] }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialized\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"message\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackInitId\" }, \"message\": { \"type\": \"string\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialization_failed\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackInitId\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialization_panicked\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackResetId\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"resetting\" ] } } }, { \"description\": \"`reset_id` will be None if the rack is in an uninitialized-on-startup, or Some if it is in an uninitialized state due to a reset operation completing.\", \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"reset_id\": { \"allOf\": [ { \"$ref\": \"#/components/schemas/RackResetId\" } ] }, \"status\": { \"type\": \"string\", \"enum\": [ \"uninitialized\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"message\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackResetId\" }, \"message\": { \"type\": \"string\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"reset_failed\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/RackResetId\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"reset_panicked\" ] } } } ] } ```
", + "description": "Current status of any rack-level operation being performed by this bootstrap agent.\n\n
JSON schema\n\n```json { \"description\": \"Current status of any rack-level operation being performed by this bootstrap agent.\", \"oneOf\": [ { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackInitKind\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initializing\" ] } } }, { \"description\": \"`id` will be none if the rack was already initialized on startup.\", \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"id\": { \"allOf\": [ { \"$ref\": \"#/components/schemas/TypedUuidForRackInitKind\" } ] }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialized\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"message\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackInitKind\" }, \"message\": { \"type\": \"string\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialization_failed\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackInitKind\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialization_panicked\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackResetKind\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"resetting\" ] } } }, { \"description\": \"`reset_id` will be None if the rack is in an uninitialized-on-startup, or Some if it is in an uninitialized state due to a reset operation completing.\", \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"reset_id\": { \"allOf\": [ { \"$ref\": \"#/components/schemas/TypedUuidForRackResetKind\" } ] }, \"status\": { \"type\": \"string\", \"enum\": [ \"uninitialized\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"message\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackResetKind\" }, \"message\": { \"type\": \"string\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"reset_failed\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackResetKind\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"reset_panicked\" ] } } } ] } ```
", "oneOf": [ { "type": "object", "properties": { "id": { - "$ref": "#/components/schemas/RackInitId" + "$ref": "#/components/schemas/TypedUuidForRackInitKind" }, "status": { "type": "string", @@ -2652,7 +2647,7 @@ "nullable": true, "allOf": [ { - "$ref": "#/components/schemas/RackInitId" + "$ref": "#/components/schemas/TypedUuidForRackInitKind" } ] }, @@ -2671,7 +2666,7 @@ "type": "object", "properties": { "id": { - "$ref": "#/components/schemas/RackInitId" + "$ref": "#/components/schemas/TypedUuidForRackInitKind" }, "message": { "type": "string" @@ -2693,7 +2688,7 @@ "type": "object", "properties": { "id": { - "$ref": "#/components/schemas/RackInitId" + "$ref": "#/components/schemas/TypedUuidForRackInitKind" }, "status": { "type": "string", @@ -2711,7 +2706,7 @@ "type": "object", "properties": { "id": { - "$ref": "#/components/schemas/RackResetId" + "$ref": "#/components/schemas/TypedUuidForRackResetKind" }, "status": { "type": "string", @@ -2733,7 +2728,7 @@ "nullable": true, "allOf": [ { - "$ref": "#/components/schemas/RackResetId" + "$ref": "#/components/schemas/TypedUuidForRackResetKind" } ] }, @@ -2752,7 +2747,7 @@ "type": "object", "properties": { "id": { - "$ref": "#/components/schemas/RackResetId" + "$ref": "#/components/schemas/TypedUuidForRackResetKind" }, "message": { "type": "string" @@ -2774,7 +2769,7 @@ "type": "object", "properties": { "id": { - "$ref": "#/components/schemas/RackResetId" + "$ref": "#/components/schemas/TypedUuidForRackResetKind" }, "status": { "type": "string", @@ -2790,11 +2785,6 @@ } ] }, - "RackResetId": { - "description": "RackResetId\n\n
JSON schema\n\n```json { \"type\": \"string\", \"format\": \"uuid\" } ```
", - "type": "string", - "format": "uuid" - }, "RackV1Inventory": { "description": "The current state of the v1 Rack as known to wicketd", "type": "object", @@ -5682,6 +5672,14 @@ } ] }, + "TypedUuidForRackInitKind": { + "type": "string", + "format": "uuid" + }, + "TypedUuidForRackResetKind": { + "type": "string", + "format": "uuid" + }, "UpdateComponent": { "oneOf": [ { diff --git a/sled-agent/src/bootstrap/http_entrypoints.rs b/sled-agent/src/bootstrap/http_entrypoints.rs index 2fa0b83f1d..d3207f05a8 100644 --- a/sled-agent/src/bootstrap/http_entrypoints.rs +++ b/sled-agent/src/bootstrap/http_entrypoints.rs @@ -11,7 +11,6 @@ use super::rack_ops::RssAccess; use super::BootstrapError; use super::RssAccessError; use crate::bootstrap::params::RackInitializeRequest; -use crate::bootstrap::rack_ops::{RackInitId, RackResetId}; use crate::updates::ConfigUpdates; use crate::updates::{Component, UpdateManager}; use bootstore::schemes::v0 as bootstore; @@ -22,6 +21,8 @@ use dropshot::{ }; use http::StatusCode; use omicron_common::api::external::Error; +use omicron_uuid_kinds::RackInitUuid; +use omicron_uuid_kinds::RackResetUuid; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use sled_hardware_types::Baseboard; @@ -47,7 +48,7 @@ impl BootstrapServerContext { pub(super) fn start_rack_initialize( &self, request: RackInitializeRequest, - ) -> Result { + ) -> Result { self.rss_access.start_initializing( &self.base_log, self.global_zone_bootstrap_ip, @@ -89,34 +90,34 @@ pub(crate) fn api() -> BootstrapApiDescription { #[serde(tag = "status", rename_all = "snake_case")] pub enum RackOperationStatus { Initializing { - id: RackInitId, + id: RackInitUuid, }, /// `id` will be none if the rack was already initialized on startup. Initialized { - id: Option, + id: Option, }, InitializationFailed { - id: RackInitId, + id: RackInitUuid, message: String, }, InitializationPanicked { - id: RackInitId, + id: RackInitUuid, }, Resetting { - id: RackResetId, + id: RackResetUuid, }, /// `reset_id` will be None if the rack is in an uninitialized-on-startup, /// or Some if it is in an uninitialized state due to a reset operation /// completing. 
Uninitialized { - reset_id: Option, + reset_id: Option, }, ResetFailed { - id: RackResetId, + id: RackResetUuid, message: String, }, ResetPanicked { - id: RackResetId, + id: RackResetUuid, }, } @@ -173,7 +174,7 @@ async fn rack_initialization_status( async fn rack_initialize( rqctx: RequestContext, body: TypedBody, -) -> Result, HttpError> { +) -> Result, HttpError> { let ctx = rqctx.context(); let request = body.into_inner(); let id = ctx @@ -189,7 +190,7 @@ async fn rack_initialize( }] async fn rack_reset( rqctx: RequestContext, -) -> Result, HttpError> { +) -> Result, HttpError> { let ctx = rqctx.context(); let id = ctx .rss_access diff --git a/sled-agent/src/bootstrap/rack_ops.rs b/sled-agent/src/bootstrap/rack_ops.rs index 5cfd0b074a..4da5f0ab28 100644 --- a/sled-agent/src/bootstrap/rack_ops.rs +++ b/sled-agent/src/bootstrap/rack_ops.rs @@ -9,9 +9,8 @@ use crate::bootstrap::params::RackInitializeRequest; use crate::bootstrap::rss_handle::RssHandle; use crate::rack_setup::service::SetupServiceError; use bootstore::schemes::v0 as bootstore; -use schemars::JsonSchema; -use serde::Deserialize; -use serde::Serialize; +use omicron_uuid_kinds::RackInitUuid; +use omicron_uuid_kinds::RackResetUuid; use sled_storage::manager::StorageHandle; use slog::Logger; use std::mem; @@ -20,37 +19,6 @@ use std::sync::Arc; use std::sync::Mutex; use tokio::sync::oneshot; use tokio::sync::oneshot::error::TryRecvError; -use uuid::Uuid; - -#[derive( - Debug, - Clone, - Copy, - PartialEq, - Eq, - Hash, - PartialOrd, - Ord, - Serialize, - Deserialize, - JsonSchema, -)] -pub struct RackInitId(pub Uuid); - -#[derive( - Debug, - Clone, - Copy, - PartialEq, - Eq, - Hash, - PartialOrd, - Ord, - Serialize, - Deserialize, - JsonSchema, -)] -pub struct RackResetId(pub Uuid); #[derive(Debug, Clone, thiserror::Error)] pub enum RssAccessError { @@ -174,7 +142,7 @@ impl RssAccess { storage_manager: &StorageHandle, bootstore_node_handle: &bootstore::NodeHandle, request: RackInitializeRequest, - ) -> Result { + ) -> Result { let mut status = self.status.lock().unwrap(); match &*status { @@ -202,7 +170,7 @@ impl RssAccess { } RssStatus::Uninitialized { .. } => { let (completion_tx, completion) = oneshot::channel(); - let id = RackInitId(Uuid::new_v4()); + let id = RackInitUuid::new_v4(); *status = RssStatus::Initializing { id, completion }; mem::drop(status); @@ -240,7 +208,7 @@ impl RssAccess { &self, parent_log: &Logger, global_zone_bootstrap_ip: Ipv6Addr, - ) -> Result { + ) -> Result { let mut status = self.status.lock().unwrap(); match &*status { @@ -267,7 +235,7 @@ impl RssAccess { } RssStatus::Initialized { .. } => { let (completion_tx, completion) = oneshot::channel(); - let id = RackResetId(Uuid::new_v4()); + let id = RackResetUuid::new_v4(); *status = RssStatus::Resetting { id, completion }; mem::drop(status); @@ -302,40 +270,40 @@ enum RssStatus { // We can either be uninitialized on startup (in which case `reset_id` // is None) or because a reset has completed (in which case `reset_id` // is Some). - reset_id: Option, + reset_id: Option, }, Initialized { // We can either be initialized on startup (in which case `id` // is None) or because initialization has completed (in which case `id` // is Some). - id: Option, + id: Option, }, // Tranistory states (which we may be in for a long time, even on human time // scales, but should eventually leave). 
Initializing { - id: RackInitId, + id: RackInitUuid, completion: oneshot::Receiver<()>, }, Resetting { - id: RackResetId, + id: RackResetUuid, completion: oneshot::Receiver<()>, }, // Terminal failure states; these require support intervention. InitializationFailed { - id: RackInitId, + id: RackInitUuid, err: SetupServiceError, }, InitializationPanicked { - id: RackInitId, + id: RackInitUuid, }, ResetFailed { - id: RackResetId, + id: RackResetUuid, err: SetupServiceError, }, ResetPanicked { - id: RackResetId, + id: RackResetUuid, }, } diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 369437d3aa..69a6f455cc 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -8,7 +8,6 @@ use super::config::BOOTSTRAP_AGENT_HTTP_PORT; use super::http_entrypoints; use super::params::RackInitializeRequest; use super::params::StartSledAgentRequest; -use super::rack_ops::RackInitId; use super::views::SledAgentResponse; use super::BootstrapError; use super::RssAccessError; @@ -42,6 +41,7 @@ use omicron_common::ledger; use omicron_common::ledger::Ledger; use omicron_ddm_admin_client::Client as DdmAdminClient; use omicron_ddm_admin_client::DdmError; +use omicron_uuid_kinds::RackInitUuid; use sled_hardware::underlay; use sled_storage::dataset::CONFIG_DATASET; use sled_storage::manager::StorageHandle; @@ -290,7 +290,7 @@ impl Server { pub fn start_rack_initialize( &self, request: RackInitializeRequest, - ) -> Result { + ) -> Result { self.bootstrap_http_server.app_private().start_rack_initialize(request) } diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 53acc9c1ed..fb8a6f6fa9 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -58,6 +58,8 @@ impl_typed_uuid_kind! { OmicronZone => "service", PhysicalDisk => "physical_disk", Propolis => "propolis", + RackInit => "rack_init", + RackReset => "rack_reset", Sled => "sled", TufRepo => "tuf_repo", Upstairs => "upstairs", diff --git a/wicket/src/ui/panes/rack_setup.rs b/wicket/src/ui/panes/rack_setup.rs index b4fa9de6f0..7bb63b6b1b 100644 --- a/wicket/src/ui/panes/rack_setup.rs +++ b/wicket/src/ui/panes/rack_setup.rs @@ -352,7 +352,7 @@ fn draw_rack_status_details_popup( ])); if let Some(id) = reset_id { body.lines.push(Line::from(vec![Span::styled( - format!("Last reset operation ID: {}", id.0), + format!("Last reset operation ID: {}", id), style::plain_text(), )])); } @@ -364,7 +364,7 @@ fn draw_rack_status_details_popup( ])); if let Some(id) = id { body.lines.push(Line::from(vec![Span::styled( - format!("Last initialization operation ID: {}", id.0), + format!("Last initialization operation ID: {}", id), style::plain_text(), )])); } @@ -375,7 +375,7 @@ fn draw_rack_status_details_popup( Span::styled("Initialization Failed", style::plain_text()), ])); body.lines.push(Line::from(vec![Span::styled( - format!("Last initialization operation ID: {}", id.0), + format!("Last initialization operation ID: {}", id), style::plain_text(), )])); push_text_lines(message, prefix, &mut body.lines); @@ -386,7 +386,7 @@ fn draw_rack_status_details_popup( Span::styled("Initialization Panicked", style::plain_text()), ])); body.lines.push(Line::from(vec![Span::styled( - format!("Last initialization operation ID: {}", id.0), + format!("Last initialization operation ID: {}", id), style::plain_text(), )])); } @@ -396,7 +396,7 @@ fn draw_rack_status_details_popup( Span::styled("Reset Failed", style::plain_text()), ])); body.lines.push(Line::from(vec![Span::styled( - format!("Last reset 
operation ID: {}", id.0), + format!("Last reset operation ID: {}", id), style::plain_text(), )])); push_text_lines(message, prefix, &mut body.lines); @@ -407,7 +407,7 @@ fn draw_rack_status_details_popup( Span::styled("Reset Panicked", style::plain_text()), ])); body.lines.push(Line::from(vec![Span::styled( - format!("Last reset operation ID: {}", id.0), + format!("Last reset operation ID: {}", id), style::plain_text(), )])); } @@ -417,7 +417,7 @@ fn draw_rack_status_details_popup( Span::styled("Initializing", style::plain_text()), ])); body.lines.push(Line::from(vec![Span::styled( - format!("Current operation ID: {}", id.0), + format!("Current operation ID: {}", id), style::plain_text(), )])); } @@ -427,7 +427,7 @@ fn draw_rack_status_details_popup( Span::styled("Resetting", style::plain_text()), ])); body.lines.push(Line::from(vec![Span::styled( - format!("Current operation ID: {}", id.0), + format!("Current operation ID: {}", id), style::plain_text(), )])); } diff --git a/wicketd-api/Cargo.toml b/wicketd-api/Cargo.toml index ba1d862a40..75c3a53461 100644 --- a/wicketd-api/Cargo.toml +++ b/wicketd-api/Cargo.toml @@ -12,6 +12,7 @@ dropshot.workspace = true gateway-client.workspace = true omicron-common.workspace = true omicron-passwords.workspace = true +omicron-uuid-kinds.workspace = true omicron-workspace-hack.workspace = true schemars.workspace = true serde.workspace = true diff --git a/wicketd-api/src/lib.rs b/wicketd-api/src/lib.rs index 9192578305..2af264eb94 100644 --- a/wicketd-api/src/lib.rs +++ b/wicketd-api/src/lib.rs @@ -2,9 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use bootstrap_agent_client::types::RackInitId; use bootstrap_agent_client::types::RackOperationStatus; -use bootstrap_agent_client::types::RackResetId; use dropshot::HttpError; use dropshot::HttpResponseOk; use dropshot::HttpResponseUpdatedNoContent; @@ -16,6 +14,8 @@ use gateway_client::types::IgnitionCommand; use omicron_common::api::external::SemverVersion; use omicron_common::update::ArtifactHashId; use omicron_common::update::ArtifactId; +use omicron_uuid_kinds::RackInitUuid; +use omicron_uuid_kinds::RackResetUuid; use schemars::JsonSchema; use serde::Deserialize; use serde::Serialize; @@ -170,7 +170,7 @@ pub trait WicketdApi { }] async fn post_run_rack_setup( rqctx: RequestContext, - ) -> Result, HttpError>; + ) -> Result, HttpError>; /// Run rack reset. #[endpoint { @@ -179,7 +179,7 @@ pub trait WicketdApi { }] async fn post_run_rack_reset( rqctx: RequestContext, - ) -> Result, HttpError>; + ) -> Result, HttpError>; /// A status endpoint used to report high level information known to /// wicketd. 
diff --git a/wicketd/Cargo.toml b/wicketd/Cargo.toml index d2e870226b..324ae01b42 100644 --- a/wicketd/Cargo.toml +++ b/wicketd/Cargo.toml @@ -57,6 +57,7 @@ installinator-common.workspace = true omicron-certificates.workspace = true omicron-common.workspace = true omicron-passwords.workspace = true +omicron-uuid-kinds.workspace = true sled-hardware-types.workspace = true tufaceous-lib.workspace = true update-common.workspace = true diff --git a/wicketd/src/http_entrypoints.rs b/wicketd/src/http_entrypoints.rs index 5661843c23..55b4d61c9a 100644 --- a/wicketd/src/http_entrypoints.rs +++ b/wicketd/src/http_entrypoints.rs @@ -10,9 +10,7 @@ use crate::mgs::GetInventoryError; use crate::mgs::MgsHandle; use crate::mgs::ShutdownInProgress; use crate::SmfConfigValues; -use bootstrap_agent_client::types::RackInitId; use bootstrap_agent_client::types::RackOperationStatus; -use bootstrap_agent_client::types::RackResetId; use dropshot::ApiDescription; use dropshot::HttpError; use dropshot::HttpResponseOk; @@ -24,6 +22,8 @@ use dropshot::TypedBody; use http::StatusCode; use internal_dns::resolver::Resolver; use omicron_common::api::internal::shared::SwitchLocation; +use omicron_uuid_kinds::RackInitUuid; +use omicron_uuid_kinds::RackResetUuid; use sled_hardware_types::Baseboard; use slog::o; use std::collections::BTreeMap; @@ -237,7 +237,7 @@ impl WicketdApi for WicketdApiImpl { async fn post_run_rack_setup( rqctx: RequestContext, - ) -> Result, HttpError> { + ) -> Result, HttpError> { let ctx = rqctx.context(); let log = &rqctx.log; @@ -291,7 +291,7 @@ impl WicketdApi for WicketdApiImpl { async fn post_run_rack_reset( rqctx: RequestContext, - ) -> Result, HttpError> { + ) -> Result, HttpError> { let ctx = rqctx.context(); let sled_agent_addr = ctx.bootstrap_agent_addr().map_err(|err| { From 3817920e7fd1286c18ba7faddfede398db96b1bc Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Fri, 19 Jul 2024 13:51:15 -0700 Subject: [PATCH 15/21] Update Rust crate trybuild to 1.0.97 (#6125) --- Cargo.lock | 7 +++---- Cargo.toml | 2 +- workspace-hack/Cargo.toml | 2 -- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ce366cd7f4..dd35052592 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3275,7 +3275,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "socket2 0.4.10", + "socket2 0.5.7", "tokio", "tower-service", "tracing", @@ -6009,7 +6009,6 @@ dependencies = [ "similar", "slog", "smallvec 1.13.2", - "socket2 0.5.7", "spin 0.9.8", "string_cache", "subtle", @@ -10289,9 +10288,9 @@ checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "trybuild" -version = "1.0.96" +version = "1.0.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33a5f13f11071020bb12de7a16b925d2d58636175c20c11dc5f96cb64bb6c9b3" +checksum = "5b1e5645f2ee8025c2f1d75e1138f2dd034d74e6ba54620f3c569ba2a2a1ea06" dependencies = [ "glob", "serde", diff --git a/Cargo.toml b/Cargo.toml index 8e9663bc7b..8f2eef3efb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -519,7 +519,7 @@ trust-dns-client = "0.22" trust-dns-proto = "0.22" trust-dns-resolver = "0.22" trust-dns-server = "0.22" -trybuild = "1.0.91" +trybuild = "1.0.97" tufaceous = { path = "tufaceous" } tufaceous-lib = { path = "tufaceous-lib" } tui-tree-widget = "0.21.0" diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 498f25d017..e1f2a986bd 100644 --- a/workspace-hack/Cargo.toml +++ 
b/workspace-hack/Cargo.toml @@ -99,7 +99,6 @@ sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] } smallvec = { version = "1.13.2", default-features = false, features = ["const_new"] } -socket2 = { version = "0.5.7", default-features = false, features = ["all"] } spin = { version = "0.9.8" } string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } @@ -205,7 +204,6 @@ sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] } smallvec = { version = "1.13.2", default-features = false, features = ["const_new"] } -socket2 = { version = "0.5.7", default-features = false, features = ["all"] } spin = { version = "0.9.8" } string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } From dea7eb6f4626c55defe629a4a70a05f7def36182 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Fri, 19 Jul 2024 13:52:16 -0700 Subject: [PATCH 16/21] Update Rust crate toml_edit to 0.22.16 (#6123) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 8f2eef3efb..a9832f636f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -513,7 +513,7 @@ tokio-stream = "0.1.15" tokio-tungstenite = "0.20" tokio-util = { version = "0.7.11", features = ["io", "io-util"] } toml = "0.8.15" -toml_edit = "0.22.12" +toml_edit = "0.22.16" tough = { version = "0.17.1", features = [ "http" ] } trust-dns-client = "0.22" trust-dns-proto = "0.22" From 204ea7d1e2a2fe3d429de069cc7804a898bdd78c Mon Sep 17 00:00:00 2001 From: Rain Date: Fri, 19 Jul 2024 16:11:28 -0700 Subject: [PATCH 17/21] [2/3 sled-agent] move some common types into their own crate (#6122) These are core API types used by the bootstrap agent. This is almost entirely pure code movement with no functional changes. These types will be exposed via the upcoming bootstrap-agent-api crate, and I don't want to clutter the API crate too much. In the future we could also batch-replace these with the new progenitor stuff if desired. Depends on #6089. 
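For illustration only (not part of the patch): a minimal sketch of how a downstream crate might consume the relocated types once this lands, assuming the same `toml` and `anyhow` dependencies the end-to-end tests already use; the helper name `load_rss_config` is invented here, and the field layout of `RackInitializeRequest` is whatever `sled-agent/types/src/rack_init.rs` defines.

```rust
// Sketch: parse an RSS config via the relocated type. This only demonstrates
// the new import path (sled_agent_types::rack_init) and the toml-based
// deserialization the end-to-end tests perform; load_rss_config is a
// hypothetical helper, not something added by this patch.
use anyhow::Context as _;
use sled_agent_types::rack_init::RackInitializeRequest;

fn load_rss_config(path: &str) -> anyhow::Result<RackInitializeRequest> {
    let content = std::fs::read_to_string(path)
        .with_context(|| format!("reading {path}"))?;
    toml::from_str(&content).context("parsing config-rss as TOML")
}
```

This mirrors the `end-to-end-tests/src/helpers/ctx.rs` hunk further down, where the import switches from `omicron_sled_agent::rack_setup::config::SetupServiceConfig` to `sled_agent_types::rack_init::RackInitializeRequest` while the TOML parsing itself is unchanged.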
--- Cargo.lock | 26 + Cargo.toml | 3 + end-to-end-tests/Cargo.toml | 1 + end-to-end-tests/src/helpers/ctx.rs | 10 +- openapi/sled-agent.json | 2 +- sled-agent/Cargo.toml | 1 + sled-agent/src/bin/sled-agent.rs | 4 +- sled-agent/src/bootstrap/early_networking.rs | 583 +------------- sled-agent/src/bootstrap/http_entrypoints.rs | 44 +- sled-agent/src/bootstrap/params.rs | 464 +---------- sled-agent/src/bootstrap/rack_ops.rs | 4 +- sled-agent/src/bootstrap/rss_handle.rs | 4 +- sled-agent/src/bootstrap/server.rs | 2 +- sled-agent/src/http_entrypoints.rs | 2 +- sled-agent/src/rack_setup/config.rs | 249 ------ sled-agent/src/rack_setup/mod.rs | 2 - sled-agent/src/rack_setup/plan/service.rs | 6 +- sled-agent/src/rack_setup/plan/sled.rs | 4 +- sled-agent/src/rack_setup/service.rs | 15 +- sled-agent/src/sim/http_entrypoints.rs | 2 +- sled-agent/src/sim/sled_agent.rs | 6 +- sled-agent/src/sled_agent.rs | 5 +- .../tests/integration_tests/early_network.rs | 4 +- sled-agent/types/Cargo.toml | 30 + sled-agent/types/src/early_networking.rs | 606 +++++++++++++++ sled-agent/types/src/lib.rs | 9 + sled-agent/types/src/rack_init.rs | 732 ++++++++++++++++++ sled-agent/types/src/rack_ops.rs | 46 ++ 28 files changed, 1496 insertions(+), 1370 deletions(-) delete mode 100644 sled-agent/src/rack_setup/config.rs create mode 100644 sled-agent/types/Cargo.toml create mode 100644 sled-agent/types/src/early_networking.rs create mode 100644 sled-agent/types/src/lib.rs create mode 100644 sled-agent/types/src/rack_init.rs create mode 100644 sled-agent/types/src/rack_ops.rs diff --git a/Cargo.lock b/Cargo.lock index dd35052592..67345447c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2232,6 +2232,7 @@ dependencies = [ "russh-keys", "serde", "serde_json", + "sled-agent-types", "socket2 0.5.7", "tokio", "toml 0.8.15", @@ -5845,6 +5846,7 @@ dependencies = [ "serde_json", "sha3", "sled-agent-client", + "sled-agent-types", "sled-hardware", "sled-hardware-types", "sled-storage", @@ -8890,6 +8892,30 @@ dependencies = [ "uuid", ] +[[package]] +name = "sled-agent-types" +version = "0.1.0" +dependencies = [ + "anyhow", + "bootstore", + "camino", + "camino-tempfile", + "nexus-client", + "omicron-common", + "omicron-test-utils", + "omicron-uuid-kinds", + "omicron-workspace-hack", + "oxnet", + "rcgen", + "schemars", + "serde", + "serde_json", + "sled-hardware-types", + "slog", + "thiserror", + "toml 0.8.15", +] + [[package]] name = "sled-hardware" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index a9832f636f..ba68cc9cac 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -73,6 +73,7 @@ members = [ "passwords", "rpaths", "sled-agent", + "sled-agent/types", "sled-hardware", "sled-hardware/types", "sled-storage", @@ -170,6 +171,7 @@ default-members = [ "passwords", "rpaths", "sled-agent", + "sled-agent/types", "sled-hardware", "sled-hardware/types", "sled-storage", @@ -472,6 +474,7 @@ similar-asserts = "1.5.0" # server zones. 
sled = "=0.34.7" sled-agent-client = { path = "clients/sled-agent-client" } +sled-agent-types = { path = "sled-agent/types" } sled-hardware = { path = "sled-hardware" } sled-hardware-types = { path = "sled-hardware/types" } sled-storage = { path = "sled-storage" } diff --git a/end-to-end-tests/Cargo.toml b/end-to-end-tests/Cargo.toml index 1102094b61..157317cdad 100644 --- a/end-to-end-tests/Cargo.toml +++ b/end-to-end-tests/Cargo.toml @@ -23,6 +23,7 @@ russh = "0.43.0" russh-keys = "0.43.0" serde.workspace = true serde_json.workspace = true +sled-agent-types.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } toml.workspace = true trust-dns-resolver.workspace = true diff --git a/end-to-end-tests/src/helpers/ctx.rs b/end-to-end-tests/src/helpers/ctx.rs index e4bf61356c..76b759608c 100644 --- a/end-to-end-tests/src/helpers/ctx.rs +++ b/end-to-end-tests/src/helpers/ctx.rs @@ -1,7 +1,6 @@ use crate::helpers::generate_name; use anyhow::{anyhow, Context as _, Result}; use chrono::Utc; -use omicron_sled_agent::rack_setup::config::SetupServiceConfig; use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; use oxide_client::types::{Name, ProjectCreate}; use oxide_client::CustomDnsResolver; @@ -9,6 +8,7 @@ use oxide_client::{Client, ClientImagesExt, ClientProjectsExt, ClientVpcsExt}; use reqwest::dns::Resolve; use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::Url; +use sled_agent_types::rack_init::RackInitializeRequest; use std::net::IpAddr; use std::net::SocketAddr; use std::sync::Arc; @@ -73,7 +73,7 @@ impl Context { } } -fn rss_config() -> Result { +fn rss_config() -> Result { let path = "/opt/oxide/sled-agent/pkg/config-rss.toml"; let content = std::fs::read_to_string(&path).unwrap_or(RSS_CONFIG_STR.to_string()); @@ -81,7 +81,7 @@ fn rss_config() -> Result { .with_context(|| "parsing config-rss as TOML".to_string()) } -fn nexus_external_dns_name(config: &SetupServiceConfig) -> String { +fn nexus_external_dns_name(config: &RackInitializeRequest) -> String { format!( "{}.sys.{}", config.recovery_silo.silo_name.as_str(), @@ -89,7 +89,7 @@ fn nexus_external_dns_name(config: &SetupServiceConfig) -> String { ) } -fn external_dns_addr(config: &SetupServiceConfig) -> Result { +fn external_dns_addr(config: &RackInitializeRequest) -> Result { // From the RSS config, grab the first address from the configured services // IP pool as the DNS server's IP address. let dns_ip = config @@ -138,7 +138,7 @@ pub async fn nexus_addr() -> Result { } pub struct ClientParams { - rss_config: SetupServiceConfig, + rss_config: RackInitializeRequest, nexus_dns_name: String, resolver: Arc, proto: &'static str, diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 27cfe576b7..1323769da2 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -2658,7 +2658,7 @@ ] }, "EarlyNetworkConfig": { - "description": "Network configuration required to bring up the control plane\n\nThe fields in this structure are those from [`super::params::RackInitializeRequest`] necessary for use beyond RSS. This is just for the initial rack configuration and cold boot purposes. Updates come from Nexus.", + "description": "Network configuration required to bring up the control plane\n\nThe fields in this structure are those from [`crate::rack_init::RackInitializeRequest`] necessary for use beyond RSS. This is just for the initial rack configuration and cold boot purposes. 
Updates come from Nexus.", "type": "object", "properties": { "body": { diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index b798ba783d..a85884587f 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -68,6 +68,7 @@ serde_human_bytes.workspace = true serde_json = { workspace = true, features = ["raw_value"] } sha3.workspace = true sled-agent-client.workspace = true +sled-agent-types.workspace = true sled-hardware.workspace = true sled-hardware-types.workspace = true sled-storage.workspace = true diff --git a/sled-agent/src/bin/sled-agent.rs b/sled-agent/src/bin/sled-agent.rs index b8b5abf07f..1bd83653ad 100644 --- a/sled-agent/src/bin/sled-agent.rs +++ b/sled-agent/src/bin/sled-agent.rs @@ -11,8 +11,8 @@ use omicron_common::cmd::fatal; use omicron_common::cmd::CmdError; use omicron_sled_agent::bootstrap::server as bootstrap_server; use omicron_sled_agent::bootstrap::RssAccessError; -use omicron_sled_agent::rack_setup::config::SetupServiceConfig as RssConfig; use omicron_sled_agent::{config::Config as SledConfig, server as sled_server}; +use sled_agent_types::rack_init::RackInitializeRequest; #[derive(Subcommand, Debug)] enum OpenapiFlavor { @@ -81,7 +81,7 @@ async fn do_run() -> Result<(), CmdError> { }; let rss_config = if rss_config_path.exists() { Some( - RssConfig::from_file(rss_config_path) + RackInitializeRequest::from_file(rss_config_path) .map_err(|e| CmdError::Failure(anyhow!(e)))?, ) } else { diff --git a/sled-agent/src/bootstrap/early_networking.rs b/sled-agent/src/bootstrap/early_networking.rs index 664e3242ab..742cff4e61 100644 --- a/sled-agent/src/bootstrap/early_networking.rs +++ b/sled-agent/src/bootstrap/early_networking.rs @@ -5,7 +5,6 @@ //! Network setup required to bring up the control plane use anyhow::{anyhow, Context}; -use bootstore::schemes::v0 as bootstore; use dpd_client::types::{ LinkCreate, LinkId, LinkSettings, PortId, PortSettings, }; @@ -26,9 +25,8 @@ use omicron_common::address::DENDRITE_PORT; use omicron_common::address::{MGD_PORT, MGS_PORT}; use omicron_common::api::external::{BfdMode, ImportExportPolicy}; use omicron_common::api::internal::shared::{ - BfdPeerConfig, BgpConfig, BgpPeerConfig, PortConfig, PortConfigV2, PortFec, - PortSpeed, RackNetworkConfig, RackNetworkConfigV2, RouteConfig, - SwitchLocation, UplinkAddressConfig, + BgpConfig, PortConfig, PortFec, PortSpeed, RackNetworkConfig, + SwitchLocation, }; use omicron_common::backoff::{ retry_notify, retry_policy_local, BackoffError, ExponentialBackoff, @@ -36,13 +34,10 @@ use omicron_common::backoff::{ }; use omicron_common::OMICRON_DPD_TAG; use omicron_ddm_admin_client::DdmError; -use oxnet::{IpNet, Ipv4Net, Ipv6Net}; -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; +use oxnet::IpNet; use slog::Logger; use std::collections::{HashMap, HashSet}; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddrV6}; -use std::str::FromStr; use std::time::{Duration, Instant}; use thiserror::Error; @@ -728,418 +723,6 @@ fn retry_policy_switch_mapping() -> ExponentialBackoff { .build() } -/// Network configuration required to bring up the control plane -/// -/// The fields in this structure are those from -/// [`super::params::RackInitializeRequest`] necessary for use beyond RSS. This -/// is just for the initial rack configuration and cold boot purposes. Updates -/// come from Nexus. -#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] -pub struct EarlyNetworkConfig { - // The current generation number of data as stored in CRDB. 
- // The initial generation is set during RSS time and then only mutated - // by Nexus. - pub generation: u64, - - // Which version of the data structure do we have. This is to help with - // deserialization and conversion in future updates. - pub schema_version: u32, - - // The actual configuration details - pub body: EarlyNetworkConfigBody, -} - -impl FromStr for EarlyNetworkConfig { - type Err = String; - - fn from_str(value: &str) -> Result { - #[derive(Deserialize)] - struct ShadowConfig { - generation: u64, - schema_version: u32, - body: EarlyNetworkConfigBody, - } - - let v2_err = match serde_json::from_str::(&value) { - Ok(cfg) => { - return Ok(EarlyNetworkConfig { - generation: cfg.generation, - schema_version: cfg.schema_version, - body: cfg.body, - }) - } - Err(e) => format!("unable to parse EarlyNetworkConfig: {e:?}"), - }; - // If we fail to parse the config as any known version, we return the - // error corresponding to the parse failure of the newest schema. - serde_json::from_str::(&value) - .map(|v1| EarlyNetworkConfig { - generation: v1.generation, - schema_version: Self::schema_version(), - body: v1.body.into(), - }) - .map_err(|_| v2_err) - } -} - -impl EarlyNetworkConfig { - pub fn schema_version() -> u32 { - 2 - } - - // Note: This currently only converts between v0 and v1 or deserializes v1 of - // `EarlyNetworkConfig`. - pub fn deserialize_bootstore_config( - log: &Logger, - config: &bootstore::NetworkConfig, - ) -> Result { - // Try to deserialize the latest version of the data structure (v2). If - // that succeeds we are done. - let v2_error = - match serde_json::from_slice::(&config.blob) { - Ok(val) => return Ok(val), - Err(error) => { - // Log this error and continue trying to deserialize older - // versions. - warn!( - log, - "Failed to deserialize EarlyNetworkConfig \ - as v2, trying next as v1: {}", - error, - ); - error - } - }; - - match serde_json::from_slice::( - &config.blob, - ) { - Ok(v1) => { - // Convert from v1 to v2 - return Ok(EarlyNetworkConfig { - generation: v1.generation, - schema_version: EarlyNetworkConfig::schema_version(), - body: v1.body.into(), - }); - } - Err(error) => { - // Log this error. - warn!( - log, - "Failed to deserialize EarlyNetworkConfig \ - as v1, trying next as v0: {}", - error - ); - } - }; - - match serde_json::from_slice::( - &config.blob, - ) { - Ok(val) => { - // Convert from v0 to v2 - return Ok(EarlyNetworkConfig { - generation: val.generation, - schema_version: 2, - body: EarlyNetworkConfigBody { - ntp_servers: val.ntp_servers, - rack_network_config: val.rack_network_config.map( - |v0_config| { - back_compat::RackNetworkConfigV0::to_v2( - val.rack_subnet, - v0_config, - ) - }, - ), - }, - }); - } - Err(error) => { - // Log this error. - warn!( - log, - "Failed to deserialize EarlyNetworkConfig as v0: {}", error, - ); - } - }; - - // If we fail to parse the config as any known version, we return the - // error corresponding to the parse failure of the newest schema. - Err(v2_error) - } -} - -/// This is the actual configuration of EarlyNetworking. -/// -/// We nest it below the "header" of `generation` and `schema_version` so that -/// we can perform partial deserialization of `EarlyNetworkConfig` to only read -/// the header and defer deserialization of the body once we know the schema -/// version. This is possible via the use of [`serde_json::value::RawValue`] in -/// future (post-v1) deserialization paths. 
-#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] -pub struct EarlyNetworkConfigBody { - /// The external NTP server addresses. - pub ntp_servers: Vec, - - // Rack network configuration as delivered from RSS or Nexus - pub rack_network_config: Option, -} - -impl From for bootstore::NetworkConfig { - fn from(value: EarlyNetworkConfig) -> Self { - // Can this ever actually fail? - // We literally just deserialized the same data in RSS - let blob = serde_json::to_vec(&value).unwrap(); - - // Yes this is duplicated, but that seems fine. - let generation = value.generation; - - bootstore::NetworkConfig { generation, blob } - } -} - -/// Structures and routines used to maintain backwards compatibility. The -/// contents of this module should only be used to convert older data into the -/// current format, and not for any ongoing run-time operations. -pub mod back_compat { - use super::*; - - #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] - pub struct EarlyNetworkConfigBodyV1 { - /// The external NTP server addresses. - pub ntp_servers: Vec, - - // Rack network configuration as delivered from RSS or Nexus - pub rack_network_config: Option, - } - - impl From for EarlyNetworkConfigBody { - fn from(v1: EarlyNetworkConfigBodyV1) -> Self { - EarlyNetworkConfigBody { - ntp_servers: v1.ntp_servers, - rack_network_config: v1 - .rack_network_config - .map(|v1_config| v1_config.into()), - } - } - } - - /// Deprecated, use `RackNetworkConfig` instead. Cannot actually deprecate due to - /// - /// - /// Our first version of `RackNetworkConfig`. If this exists in the bootstore, we - /// upgrade out of it into `RackNetworkConfigV1` or later versions if possible. - #[derive(Clone, Debug, Deserialize, Serialize, PartialEq, JsonSchema)] - pub(crate) struct RackNetworkConfigV0 { - // TODO: #3591 Consider making infra-ip ranges implicit for uplinks - /// First ip address to be used for configuring network infrastructure - pub infra_ip_first: Ipv4Addr, - /// Last ip address to be used for configuring network infrastructure - pub infra_ip_last: Ipv4Addr, - /// Uplinks for connecting the rack to external networks - pub uplinks: Vec, - } - - impl RackNetworkConfigV0 { - /// Convert from `RackNetworkConfigV0` to `RackNetworkConfigV1` - /// - /// We cannot use `From for `RackNetworkConfigV2` - /// because the `rack_subnet` field does not exist in `RackNetworkConfigV0` - /// and must be passed in from the `EarlyNetworkConfigV0` struct which - /// contains the `RackNetworkConfigV0` struct. - pub fn to_v2( - rack_subnet: Ipv6Addr, - v0: RackNetworkConfigV0, - ) -> RackNetworkConfigV2 { - RackNetworkConfigV2 { - rack_subnet: Ipv6Net::new(rack_subnet, 56).unwrap(), - infra_ip_first: v0.infra_ip_first, - infra_ip_last: v0.infra_ip_last, - ports: v0 - .uplinks - .into_iter() - .map(|uplink| PortConfigV2::from(uplink)) - .collect(), - bgp: vec![], - bfd: vec![], - } - } - } - - /// Deprecated, use PortConfigV2 instead. Cannot actually deprecate due to - /// - #[derive(Clone, Debug, PartialEq, Deserialize, Serialize, JsonSchema)] - pub struct PortConfigV1 { - /// The set of routes associated with this port. - pub routes: Vec, - /// This port's addresses and optional vlan IDs - pub addresses: Vec, - /// Switch the port belongs to. - pub switch: SwitchLocation, - /// Nmae of the port this config applies to. - pub port: String, - /// Port speed. - pub uplink_port_speed: PortSpeed, - /// Port forward error correction type. 
- pub uplink_port_fec: PortFec, - /// BGP peers on this port - pub bgp_peers: Vec, - /// Whether or not to set autonegotiation - #[serde(default)] - pub autoneg: bool, - } - - impl From for PortConfigV2 { - fn from(v1: PortConfigV1) -> Self { - PortConfigV2 { - routes: v1.routes.clone(), - addresses: v1 - .addresses - .iter() - .map(|a| UplinkAddressConfig { address: *a, vlan_id: None }) - .collect(), - switch: v1.switch, - port: v1.port, - uplink_port_speed: v1.uplink_port_speed, - uplink_port_fec: v1.uplink_port_fec, - bgp_peers: v1.bgp_peers.clone(), - autoneg: v1.autoneg, - } - } - } - - /// Deprecated, use PortConfigV2 instead. Cannot actually deprecate due to - /// - #[derive(Clone, Debug, Deserialize, Serialize, PartialEq, JsonSchema)] - pub(crate) struct UplinkConfig { - /// Gateway address - pub gateway_ip: Ipv4Addr, - /// Switch to use for uplink - pub switch: SwitchLocation, - /// Switchport to use for external connectivity - pub uplink_port: String, - /// Speed for the Switchport - pub uplink_port_speed: PortSpeed, - /// Forward Error Correction setting for the uplink port - pub uplink_port_fec: PortFec, - /// IP Address and prefix (e.g., `192.168.0.1/16`) to apply to switchport - /// (must be in infra_ip pool) - pub uplink_cidr: Ipv4Net, - /// VLAN id to use for uplink - pub uplink_vid: Option, - } - - impl From for PortConfigV2 { - fn from(value: UplinkConfig) -> Self { - PortConfigV2 { - routes: vec![RouteConfig { - destination: "0.0.0.0/0".parse().unwrap(), - nexthop: value.gateway_ip.into(), - vlan_id: value.uplink_vid, - }], - addresses: vec![UplinkAddressConfig { - address: value.uplink_cidr.into(), - vlan_id: value.uplink_vid, - }], - switch: value.switch, - port: value.uplink_port, - uplink_port_speed: value.uplink_port_speed, - uplink_port_fec: value.uplink_port_fec, - bgp_peers: vec![], - autoneg: false, - } - } - } - - /// Deprecated, use `RackNetworkConfig` instead. Cannot actually deprecate due to - /// - /// - /// Our second version of `RackNetworkConfig`. If this exists in the bootstore, - /// we upgrade out of it into `RackNetworkConfigV1` or later versions if - /// possible. - #[derive(Clone, Debug, PartialEq, Deserialize, Serialize, JsonSchema)] - pub struct RackNetworkConfigV1 { - pub rack_subnet: Ipv6Net, - // TODO: #3591 Consider making infra-ip ranges implicit for uplinks - /// First ip address to be used for configuring network infrastructure - pub infra_ip_first: Ipv4Addr, - /// Last ip address to be used for configuring network infrastructure - pub infra_ip_last: Ipv4Addr, - /// Uplinks for connecting the rack to external networks - pub ports: Vec, - /// BGP configurations for connecting the rack to external networks - pub bgp: Vec, - /// BFD configuration for connecting the rack to external networks - #[serde(default)] - pub bfd: Vec, - } - - impl From for RackNetworkConfigV2 { - fn from(v1: RackNetworkConfigV1) -> Self { - RackNetworkConfigV2 { - rack_subnet: v1.rack_subnet, - infra_ip_first: v1.infra_ip_first, - infra_ip_last: v1.infra_ip_last, - ports: v1 - .ports - .into_iter() - .map(|ports| PortConfigV2::from(ports)) - .collect(), - bgp: v1.bgp.clone(), - bfd: v1.bfd.clone(), - } - } - } - - // The second production version of the `EarlyNetworkConfig`. - // - // If this version is in the bootstore than we need to convert it to - // `EarlyNetworkConfigV2`. - // - // Once we do this for all customers that have initialized racks with the - // old version we can go ahead and remove this type and its conversion code - // altogether. 
- #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] - pub struct EarlyNetworkConfigV1 { - // The current generation number of data as stored in CRDB. - // The initial generation is set during RSS time and then only mutated - // by Nexus. - pub generation: u64, - - // Which version of the data structure do we have. This is to help with - // deserialization and conversion in future updates. - pub schema_version: u32, - - // The actual configuration details - pub body: EarlyNetworkConfigBodyV1, - } - - // The first production version of the `EarlyNetworkConfig`. - // - // If this version is in the bootstore than we need to convert it to - // `EarlyNetworkConfigV2`. - // - // Once we do this for all customers that have initialized racks with the - // old version we can go ahead and remove this type and its conversion code - // altogether. - #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] - pub(crate) struct EarlyNetworkConfigV0 { - // The current generation number of data as stored in CRDB. - // The initial generation is set during RSS time and then only mutated - // by Nexus. - pub generation: u64, - - pub rack_subnet: Ipv6Addr, - - /// The external NTP server addresses. - pub ntp_servers: Vec, - - // Rack network configuration as delivered from RSS and only existing at - // generation 1 - pub rack_network_config: Option, - } -} - // The following two conversion functions translate the speed and fec types used // in the internal API to the types used in the dpd-client API. The conversion // is done here, rather than with "impl From" at the definition, to avoid a @@ -1165,163 +748,3 @@ fn convert_fec(fec: &PortFec) -> dpd_client::types::PortFec { PortFec::Rs => dpd_client::types::PortFec::Rs, } } - -#[cfg(test)] -mod tests { - use super::*; - use omicron_common::api::internal::shared::RouteConfig; - use omicron_common::api::internal::shared::UplinkAddressConfig; - use omicron_test_utils::dev::test_setup_log; - - #[test] - fn serialized_early_network_config_v0_to_v2_conversion() { - let logctx = test_setup_log( - "serialized_early_network_config_v0_to_v2_conversion", - ); - let v0 = back_compat::EarlyNetworkConfigV0 { - generation: 1, - rack_subnet: Ipv6Addr::UNSPECIFIED, - ntp_servers: Vec::new(), - rack_network_config: Some(back_compat::RackNetworkConfigV0 { - infra_ip_first: Ipv4Addr::UNSPECIFIED, - infra_ip_last: Ipv4Addr::UNSPECIFIED, - uplinks: vec![back_compat::UplinkConfig { - gateway_ip: Ipv4Addr::UNSPECIFIED, - switch: SwitchLocation::Switch0, - uplink_port: "Port0".to_string(), - uplink_port_speed: PortSpeed::Speed100G, - uplink_port_fec: PortFec::None, - uplink_cidr: "192.168.0.1/16".parse().unwrap(), - uplink_vid: None, - }], - }), - }; - - let v0_serialized = serde_json::to_vec(&v0).unwrap(); - let bootstore_conf = - bootstore::NetworkConfig { generation: 1, blob: v0_serialized }; - - let v2 = EarlyNetworkConfig::deserialize_bootstore_config( - &logctx.log, - &bootstore_conf, - ) - .unwrap(); - let v0_rack_network_config = v0.rack_network_config.unwrap(); - let uplink = v0_rack_network_config.uplinks[0].clone(); - let expected = EarlyNetworkConfig { - generation: 1, - schema_version: EarlyNetworkConfig::schema_version(), - body: EarlyNetworkConfigBody { - ntp_servers: v0.ntp_servers.clone(), - rack_network_config: Some(RackNetworkConfigV2 { - rack_subnet: Ipv6Net::new(v0.rack_subnet, 56).unwrap(), - infra_ip_first: v0_rack_network_config.infra_ip_first, - infra_ip_last: v0_rack_network_config.infra_ip_last, - ports: vec![PortConfigV2 { - routes: 
vec![RouteConfig { - destination: "0.0.0.0/0".parse().unwrap(), - nexthop: uplink.gateway_ip.into(), - vlan_id: None, - }], - addresses: vec![UplinkAddressConfig { - address: uplink.uplink_cidr.into(), - vlan_id: None, - }], - switch: uplink.switch, - port: uplink.uplink_port, - uplink_port_speed: uplink.uplink_port_speed, - uplink_port_fec: uplink.uplink_port_fec, - autoneg: false, - bgp_peers: vec![], - }], - bgp: vec![], - bfd: vec![], - }), - }, - }; - - assert_eq!(expected, v2); - - logctx.cleanup_successful(); - } - - #[test] - fn serialized_early_network_config_v1_to_v2_conversion() { - let logctx = test_setup_log( - "serialized_early_network_config_v1_to_v2_conversion", - ); - - let v1 = back_compat::EarlyNetworkConfigV1 { - generation: 1, - schema_version: 1, - body: back_compat::EarlyNetworkConfigBodyV1 { - ntp_servers: Vec::new(), - rack_network_config: Some(back_compat::RackNetworkConfigV1 { - rack_subnet: Ipv6Net::new(Ipv6Addr::UNSPECIFIED, 56) - .unwrap(), - infra_ip_first: Ipv4Addr::UNSPECIFIED, - infra_ip_last: Ipv4Addr::UNSPECIFIED, - ports: vec![back_compat::PortConfigV1 { - routes: vec![RouteConfig { - destination: "0.0.0.0/0".parse().unwrap(), - nexthop: "192.168.0.2".parse().unwrap(), - vlan_id: None, - }], - addresses: vec!["192.168.0.1/16".parse().unwrap()], - switch: SwitchLocation::Switch0, - port: "Port0".to_string(), - uplink_port_speed: PortSpeed::Speed100G, - uplink_port_fec: PortFec::None, - bgp_peers: Vec::new(), - autoneg: false, - }], - bgp: Vec::new(), - bfd: Vec::new(), - }), - }, - }; - - let v1_serialized = serde_json::to_vec(&v1).unwrap(); - let bootstore_conf = - bootstore::NetworkConfig { generation: 1, blob: v1_serialized }; - - let v2 = EarlyNetworkConfig::deserialize_bootstore_config( - &logctx.log, - &bootstore_conf, - ) - .unwrap(); - let v1_rack_network_config = v1.body.rack_network_config.unwrap(); - let port = v1_rack_network_config.ports[0].clone(); - let expected = EarlyNetworkConfig { - generation: 1, - schema_version: EarlyNetworkConfig::schema_version(), - body: EarlyNetworkConfigBody { - ntp_servers: v1.body.ntp_servers.clone(), - rack_network_config: Some(RackNetworkConfigV2 { - rack_subnet: v1_rack_network_config.rack_subnet, - infra_ip_first: v1_rack_network_config.infra_ip_first, - infra_ip_last: v1_rack_network_config.infra_ip_last, - ports: vec![PortConfigV2 { - routes: port.routes.clone(), - addresses: vec![UplinkAddressConfig { - address: port.addresses[0], - vlan_id: None, - }], - switch: port.switch, - port: port.port, - uplink_port_speed: port.uplink_port_speed, - uplink_port_fec: port.uplink_port_fec, - autoneg: false, - bgp_peers: vec![], - }], - bgp: vec![], - bfd: vec![], - }), - }, - }; - - assert_eq!(expected, v2); - - logctx.cleanup_successful(); - } -} diff --git a/sled-agent/src/bootstrap/http_entrypoints.rs b/sled-agent/src/bootstrap/http_entrypoints.rs index d3207f05a8..824bb5fd25 100644 --- a/sled-agent/src/bootstrap/http_entrypoints.rs +++ b/sled-agent/src/bootstrap/http_entrypoints.rs @@ -10,7 +10,6 @@ use super::rack_ops::RssAccess; use super::BootstrapError; use super::RssAccessError; -use crate::bootstrap::params::RackInitializeRequest; use crate::updates::ConfigUpdates; use crate::updates::{Component, UpdateManager}; use bootstore::schemes::v0 as bootstore; @@ -23,8 +22,8 @@ use http::StatusCode; use omicron_common::api::external::Error; use omicron_uuid_kinds::RackInitUuid; use omicron_uuid_kinds::RackResetUuid; -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; +use 
sled_agent_types::rack_init::RackInitializeRequest; +use sled_agent_types::rack_ops::RackOperationStatus; use sled_hardware_types::Baseboard; use sled_storage::manager::StorageHandle; use slog::Logger; @@ -82,45 +81,6 @@ pub(crate) fn api() -> BootstrapApiDescription { api } -/// Current status of any rack-level operation being performed by this bootstrap -/// agent. -#[derive( - Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema, -)] -#[serde(tag = "status", rename_all = "snake_case")] -pub enum RackOperationStatus { - Initializing { - id: RackInitUuid, - }, - /// `id` will be none if the rack was already initialized on startup. - Initialized { - id: Option, - }, - InitializationFailed { - id: RackInitUuid, - message: String, - }, - InitializationPanicked { - id: RackInitUuid, - }, - Resetting { - id: RackResetUuid, - }, - /// `reset_id` will be None if the rack is in an uninitialized-on-startup, - /// or Some if it is in an uninitialized state due to a reset operation - /// completing. - Uninitialized { - reset_id: Option, - }, - ResetFailed { - id: RackResetUuid, - message: String, - }, - ResetPanicked { - id: RackResetUuid, - }, -} - /// Return the baseboard identity of this sled. #[endpoint { method = GET, diff --git a/sled-agent/src/bootstrap/params.rs b/sled-agent/src/bootstrap/params.rs index 4a5b443dc3..9fe399419f 100644 --- a/sled-agent/src/bootstrap/params.rs +++ b/sled-agent/src/bootstrap/params.rs @@ -4,301 +4,17 @@ //! Request types for the bootstrap agent -use crate::bootstrap::early_networking::back_compat::RackNetworkConfigV1; -use anyhow::{bail, Result}; +use anyhow::Result; use async_trait::async_trait; use omicron_common::address::{self, Ipv6Subnet, SLED_PREFIX}; -use omicron_common::api::external::AllowedSourceIps; -use omicron_common::api::internal::shared::RackNetworkConfig; use omicron_common::ledger::Ledgerable; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use sha3::{Digest, Sha3_256}; -use sled_hardware_types::Baseboard; use std::borrow::Cow; -use std::collections::BTreeSet; use std::net::{IpAddr, Ipv6Addr, SocketAddrV6}; use uuid::Uuid; -#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, JsonSchema)] -#[serde(rename_all = "snake_case", tag = "type")] -pub enum BootstrapAddressDiscovery { - /// Ignore all bootstrap addresses except our own. - OnlyOurs, - /// Ignore all bootstrap addresses except the following. - OnlyThese { addrs: BTreeSet }, -} - -/// Structures and routines used to maintain backwards compatibility. The -/// contents of this module should only be used to convert older data into the -/// current format, and not for any ongoing run-time operations. -pub mod back_compat { - use super::*; - - #[derive(Clone, Deserialize)] - struct UnvalidatedRackInitializeRequestV1 { - trust_quorum_peers: Option>, - bootstrap_discovery: BootstrapAddressDiscovery, - ntp_servers: Vec, - dns_servers: Vec, - internal_services_ip_pool_ranges: Vec, - external_dns_ips: Vec, - external_dns_zone_name: String, - external_certificates: Vec, - recovery_silo: RecoverySiloConfig, - rack_network_config: RackNetworkConfigV1, - #[serde(default = "default_allowed_source_ips")] - allowed_source_ips: AllowedSourceIps, - } - - /// This is a deprecated format, maintained to allow importing from older - /// versions. 
- #[derive(Clone, Debug, PartialEq, Deserialize, Serialize, JsonSchema)] - #[serde(try_from = "UnvalidatedRackInitializeRequestV1")] - pub struct RackInitializeRequestV1 { - pub trust_quorum_peers: Option>, - pub bootstrap_discovery: BootstrapAddressDiscovery, - pub ntp_servers: Vec, - pub dns_servers: Vec, - pub internal_services_ip_pool_ranges: Vec, - pub external_dns_ips: Vec, - pub external_dns_zone_name: String, - pub external_certificates: Vec, - pub recovery_silo: RecoverySiloConfig, - pub rack_network_config: RackNetworkConfigV1, - #[serde(default = "default_allowed_source_ips")] - pub allowed_source_ips: AllowedSourceIps, - } - - impl TryFrom for RackInitializeRequestV1 { - type Error = anyhow::Error; - - fn try_from(value: UnvalidatedRackInitializeRequestV1) -> Result { - validate_external_dns( - &value.external_dns_ips, - &value.internal_services_ip_pool_ranges, - )?; - - Ok(RackInitializeRequestV1 { - trust_quorum_peers: value.trust_quorum_peers, - bootstrap_discovery: value.bootstrap_discovery, - ntp_servers: value.ntp_servers, - dns_servers: value.dns_servers, - internal_services_ip_pool_ranges: value - .internal_services_ip_pool_ranges, - external_dns_ips: value.external_dns_ips, - external_dns_zone_name: value.external_dns_zone_name, - external_certificates: value.external_certificates, - recovery_silo: value.recovery_silo, - rack_network_config: value.rack_network_config, - allowed_source_ips: value.allowed_source_ips, - }) - } - } - impl From for RackInitializeRequest { - fn from(v1: RackInitializeRequestV1) -> Self { - RackInitializeRequest { - trust_quorum_peers: v1.trust_quorum_peers, - bootstrap_discovery: v1.bootstrap_discovery, - ntp_servers: v1.ntp_servers, - dns_servers: v1.dns_servers, - internal_services_ip_pool_ranges: v1 - .internal_services_ip_pool_ranges, - external_dns_ips: v1.external_dns_ips, - external_dns_zone_name: v1.external_dns_zone_name, - external_certificates: v1.external_certificates, - recovery_silo: v1.recovery_silo, - rack_network_config: v1.rack_network_config.into(), - allowed_source_ips: v1.allowed_source_ips, - } - } - } -} - -// "Shadow" copy of `RackInitializeRequest` that does no validation on its -// fields. -#[derive(Clone, Deserialize)] -struct UnvalidatedRackInitializeRequest { - trust_quorum_peers: Option>, - bootstrap_discovery: BootstrapAddressDiscovery, - ntp_servers: Vec, - dns_servers: Vec, - internal_services_ip_pool_ranges: Vec, - external_dns_ips: Vec, - external_dns_zone_name: String, - external_certificates: Vec, - recovery_silo: RecoverySiloConfig, - rack_network_config: RackNetworkConfig, - #[serde(default = "default_allowed_source_ips")] - allowed_source_ips: AllowedSourceIps, -} - -/// Configuration for the "rack setup service". -/// -/// The Rack Setup Service should be responsible for one-time setup actions, -/// such as CockroachDB placement and initialization. Without operator -/// intervention, however, these actions need a way to be automated in our -/// deployment. -#[derive(Clone, Deserialize, Serialize, PartialEq, JsonSchema)] -#[serde(try_from = "UnvalidatedRackInitializeRequest")] -pub struct RackInitializeRequest { - /// The set of peer_ids required to initialize trust quorum - /// - /// The value is `None` if we are not using trust quorum - pub trust_quorum_peers: Option>, - - /// Describes how bootstrap addresses should be collected during RSS. - pub bootstrap_discovery: BootstrapAddressDiscovery, - - /// The external NTP server addresses. 
- pub ntp_servers: Vec, - - /// The external DNS server addresses. - pub dns_servers: Vec, - - /// Ranges of the service IP pool which may be used for internal services. - // TODO(https://github.com/oxidecomputer/omicron/issues/1530): Eventually, - // we want to configure multiple pools. - pub internal_services_ip_pool_ranges: Vec, - - /// Service IP addresses on which we run external DNS servers. - /// - /// Each address must be present in `internal_services_ip_pool_ranges`. - pub external_dns_ips: Vec, - - /// DNS name for the DNS zone delegated to the rack for external DNS - pub external_dns_zone_name: String, - - /// initial TLS certificates for the external API - pub external_certificates: Vec, - - /// Configuration of the Recovery Silo (the initial Silo) - pub recovery_silo: RecoverySiloConfig, - - /// Initial rack network configuration - pub rack_network_config: RackNetworkConfig, - - /// IPs or subnets allowed to make requests to user-facing services - #[serde(default = "default_allowed_source_ips")] - pub allowed_source_ips: AllowedSourceIps, -} - -impl RackInitializeRequest { - pub fn from_toml_with_fallback( - data: &str, - ) -> Result { - let v2_err = match toml::from_str::(&data) { - Ok(req) => return Ok(req), - Err(e) => e, - }; - if let Ok(v1) = - toml::from_str::(&data) - { - return Ok(v1.into()); - } - - // If we fail to parse the request as any known version, we return the - // error corresponding to the parse failure of the newest schema. - Err(v2_err.into()) - } -} - -/// This field was added after several racks were already deployed. RSS plans -/// for those racks should default to allowing any source IP, since that is -/// effectively what they did. -const fn default_allowed_source_ips() -> AllowedSourceIps { - AllowedSourceIps::Any -} - -// This custom debug implementation hides the private keys. -impl std::fmt::Debug for RackInitializeRequest { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // If you find a compiler error here, and you just added a field to this - // struct, be sure to add it to the Debug impl below! - let RackInitializeRequest { - trust_quorum_peers: trust_qurorum_peers, - bootstrap_discovery, - ntp_servers, - dns_servers, - internal_services_ip_pool_ranges, - external_dns_ips, - external_dns_zone_name, - external_certificates: _, - recovery_silo, - rack_network_config, - allowed_source_ips, - } = &self; - - f.debug_struct("RackInitializeRequest") - .field("trust_quorum_peers", trust_qurorum_peers) - .field("bootstrap_discovery", bootstrap_discovery) - .field("ntp_servers", ntp_servers) - .field("dns_servers", dns_servers) - .field( - "internal_services_ip_pool_ranges", - internal_services_ip_pool_ranges, - ) - .field("external_dns_ips", external_dns_ips) - .field("external_dns_zone_name", external_dns_zone_name) - .field("external_certificates", &"") - .field("recovery_silo", recovery_silo) - .field("rack_network_config", rack_network_config) - .field("allowed_source_ips", allowed_source_ips) - .finish() - } -} - -fn validate_external_dns( - dns_ips: &Vec, - internal_ranges: &Vec, -) -> Result<()> { - if dns_ips.is_empty() { - bail!("At least one external DNS IP is required"); - } - - // Every external DNS IP should also be present in one of the internal - // services IP pool ranges. This check is O(N*M), but we expect both N - // and M to be small (~5 DNS servers, and a small number of pools). 
- for &dns_ip in dns_ips { - if !internal_ranges.iter().any(|range| range.contains(dns_ip)) { - bail!( - "External DNS IP {dns_ip} is not contained in \ - `internal_services_ip_pool_ranges`" - ); - } - } - Ok(()) -} - -impl TryFrom for RackInitializeRequest { - type Error = anyhow::Error; - - fn try_from(value: UnvalidatedRackInitializeRequest) -> Result { - validate_external_dns( - &value.external_dns_ips, - &value.internal_services_ip_pool_ranges, - )?; - - Ok(RackInitializeRequest { - trust_quorum_peers: value.trust_quorum_peers, - bootstrap_discovery: value.bootstrap_discovery, - ntp_servers: value.ntp_servers, - dns_servers: value.dns_servers, - internal_services_ip_pool_ranges: value - .internal_services_ip_pool_ranges, - external_dns_ips: value.external_dns_ips, - external_dns_zone_name: value.external_dns_zone_name, - external_certificates: value.external_certificates, - recovery_silo: value.recovery_silo, - rack_network_config: value.rack_network_config, - allowed_source_ips: value.allowed_source_ips, - }) - } -} - -pub type Certificate = nexus_client::types::Certificate; -pub type RecoverySiloConfig = nexus_client::types::RecoverySiloConfig; - /// A representation of a Baseboard ID as used in the inventory subsystem /// This type is essentially the same as a `Baseboard` except it doesn't have a /// revision or HW type (Gimlet, PC, Unknown). @@ -480,70 +196,11 @@ pub(super) mod version { pub(crate) const V1: u32 = 1; } -#[cfg(test)] -pub fn test_config() -> RackInitializeRequest { - let manifest = std::env::var("CARGO_MANIFEST_DIR") - .expect("Cannot access manifest directory"); - let manifest = camino::Utf8PathBuf::from(manifest); - let path = manifest.join("../smf/sled-agent/non-gimlet/config-rss.toml"); - let contents = std::fs::read_to_string(&path).unwrap(); - toml::from_str(&contents) - .unwrap_or_else(|e| panic!("failed to parse {:?}: {}", &path, e)) -} - #[cfg(test)] mod tests { - use std::net::Ipv4Addr; use std::net::Ipv6Addr; use super::*; - use camino::Utf8PathBuf; - use oxnet::Ipv6Net; - - #[test] - fn parse_rack_initialization() { - let manifest = std::env::var("CARGO_MANIFEST_DIR") - .expect("Cannot access manifest directory"); - let manifest = Utf8PathBuf::from(manifest); - - let path = - manifest.join("../smf/sled-agent/non-gimlet/config-rss.toml"); - let contents = std::fs::read_to_string(&path).unwrap(); - let _: RackInitializeRequest = toml::from_str(&contents) - .unwrap_or_else(|e| panic!("failed to parse {:?}: {}", &path, e)); - - let path = manifest - .join("../smf/sled-agent/gimlet-standalone/config-rss.toml"); - let contents = std::fs::read_to_string(&path).unwrap(); - let _: RackInitializeRequest = toml::from_str(&contents) - .unwrap_or_else(|e| panic!("failed to parse {:?}: {}", &path, e)); - } - - #[test] - fn parse_rack_initialization_weak_hash() { - let config = r#" - bootstrap_discovery.type = "only_ours" - ntp_servers = [ "ntp.eng.oxide.computer" ] - dns_servers = [ "1.1.1.1", "9.9.9.9" ] - external_dns_zone_name = "oxide.test" - - [[internal_services_ip_pool_ranges]] - first = "192.168.1.20" - last = "192.168.1.22" - - [recovery_silo] - silo_name = "recovery" - user_name = "recovery" - user_password_hash = "$argon2i$v=19$m=16,t=2,p=1$NVR0a2QxVXNiQjlObFJXbA$iGFJWOlUqN20B8KR4Fsmrg" - "#; - - let error = toml::from_str::(config) - .expect_err("unexpectedly parsed with bad password hash"); - println!("found error: {}", error); - assert!(error.to_string().contains( - "password hash: algorithm: expected argon2id, found argon2i" - )); - } #[test] fn 
json_serialization_round_trips() { @@ -600,123 +257,4 @@ mod tests { Ledgerable::deserialize(&serialized).unwrap(); assert_eq!(expected, actual); } - - #[test] - fn validate_external_dns_ips_must_be_in_internal_services_ip_pools() { - // Conjure up a config; we'll tweak the internal services pools and - // external DNS IPs, but no other fields matter. - let mut config = UnvalidatedRackInitializeRequest { - trust_quorum_peers: None, - bootstrap_discovery: BootstrapAddressDiscovery::OnlyOurs, - ntp_servers: Vec::new(), - dns_servers: Vec::new(), - internal_services_ip_pool_ranges: Vec::new(), - external_dns_ips: Vec::new(), - external_dns_zone_name: "".to_string(), - external_certificates: Vec::new(), - recovery_silo: RecoverySiloConfig { - silo_name: "recovery".parse().unwrap(), - user_name: "recovery".parse().unwrap(), - user_password_hash: "$argon2id$v=19$m=98304,t=13,p=1$RUlWc0ZxaHo0WFdrN0N6ZQ$S8p52j85GPvMhR/ek3GL0el/oProgTwWpHJZ8lsQQoY".parse().unwrap(), - }, - rack_network_config: RackNetworkConfig { - rack_subnet: Ipv6Net::host_net(Ipv6Addr::LOCALHOST), - infra_ip_first: Ipv4Addr::LOCALHOST, - infra_ip_last: Ipv4Addr::LOCALHOST, - ports: Vec::new(), - bgp: Vec::new(), - bfd: Vec::new(), - }, - allowed_source_ips: AllowedSourceIps::Any, - }; - - // Valid configs: all external DNS IPs are contained in the IP pool - // ranges. - for (ip_pool_ranges, dns_ips) in [ - ( - &[("fd00::1", "fd00::10")] as &[(&str, &str)], - &["fd00::1", "fd00::5", "fd00::10"] as &[&str], - ), - ( - &[("192.168.1.10", "192.168.1.20")], - &["192.168.1.10", "192.168.1.15", "192.168.1.20"], - ), - ( - &[("fd00::1", "fd00::10"), ("192.168.1.10", "192.168.1.20")], - &[ - "fd00::1", - "fd00::5", - "fd00::10", - "192.168.1.10", - "192.168.1.15", - "192.168.1.20", - ], - ), - ] { - config.internal_services_ip_pool_ranges = ip_pool_ranges - .iter() - .map(|(a, b)| { - address::IpRange::try_from(( - a.parse::().unwrap(), - b.parse::().unwrap(), - )) - .unwrap() - }) - .collect(); - config.external_dns_ips = - dns_ips.iter().map(|ip| ip.parse().unwrap()).collect(); - - match RackInitializeRequest::try_from(config.clone()) { - Ok(_) => (), - Err(err) => panic!( - "failure on {ip_pool_ranges:?} with DNS IPs {dns_ips:?}: \ - {err}" - ), - } - } - - // Invalid configs: either no DNS IPs, or one or more DNS IPs are not - // contained in the ip pool ranges. - for (ip_pool_ranges, dns_ips) in [ - (&[("fd00::1", "fd00::10")] as &[(&str, &str)], &[] as &[&str]), - (&[("fd00::1", "fd00::10")], &["fd00::1", "fd00::5", "fd00::11"]), - ( - &[("192.168.1.10", "192.168.1.20")], - &["192.168.1.9", "192.168.1.15", "192.168.1.20"], - ), - ( - &[("fd00::1", "fd00::10"), ("192.168.1.10", "192.168.1.20")], - &[ - "fd00::1", - "fd00::5", - "fd00::10", - "192.168.1.10", - "192.168.1.15", - "192.168.1.20", - "192.168.1.21", - ], - ), - ] { - config.internal_services_ip_pool_ranges = ip_pool_ranges - .iter() - .map(|(a, b)| { - address::IpRange::try_from(( - a.parse::().unwrap(), - b.parse::().unwrap(), - )) - .unwrap() - }) - .collect(); - config.external_dns_ips = - dns_ips.iter().map(|ip| ip.parse().unwrap()).collect(); - - match RackInitializeRequest::try_from(config.clone()) { - Ok(_) => panic!( - "unexpected success on {ip_pool_ranges:?} with \ - DNS IPs {dns_ips:?}" - ), - Err(_) => (), - } - } - } } diff --git a/sled-agent/src/bootstrap/rack_ops.rs b/sled-agent/src/bootstrap/rack_ops.rs index 4da5f0ab28..3eb00b419a 100644 --- a/sled-agent/src/bootstrap/rack_ops.rs +++ b/sled-agent/src/bootstrap/rack_ops.rs @@ -4,13 +4,13 @@ //! 
Internal API for rack-level bootstrap agent operations. -use crate::bootstrap::http_entrypoints::RackOperationStatus; -use crate::bootstrap::params::RackInitializeRequest; use crate::bootstrap::rss_handle::RssHandle; use crate::rack_setup::service::SetupServiceError; use bootstore::schemes::v0 as bootstore; use omicron_uuid_kinds::RackInitUuid; use omicron_uuid_kinds::RackResetUuid; +use sled_agent_types::rack_init::RackInitializeRequest; +use sled_agent_types::rack_ops::RackOperationStatus; use sled_storage::manager::StorageHandle; use slog::Logger; use std::mem; diff --git a/sled-agent/src/bootstrap/rss_handle.rs b/sled-agent/src/bootstrap/rss_handle.rs index 9baf0e7ef3..73f7537853 100644 --- a/sled-agent/src/bootstrap/rss_handle.rs +++ b/sled-agent/src/bootstrap/rss_handle.rs @@ -6,7 +6,6 @@ use super::client as bootstrap_agent_client; use super::params::StartSledAgentRequest; -use crate::rack_setup::config::SetupServiceConfig; use crate::rack_setup::service::RackSetupService; use crate::rack_setup::service::SetupServiceError; use ::bootstrap_agent_client::Client as BootstrapAgentClient; @@ -16,6 +15,7 @@ use futures::StreamExt; use omicron_common::backoff::retry_notify; use omicron_common::backoff::retry_policy_local; use omicron_common::backoff::BackoffError; +use sled_agent_types::rack_init::RackInitializeRequest; use sled_storage::manager::StorageHandle; use slog::Logger; use std::net::Ipv6Addr; @@ -44,7 +44,7 @@ impl RssHandle { /// Executes the rack setup service until it has completed pub(super) async fn run_rss( log: &Logger, - config: SetupServiceConfig, + config: RackInitializeRequest, our_bootstrap_address: Ipv6Addr, storage_manager: StorageHandle, bootstore: bootstore::NodeHandle, diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 69a6f455cc..656be1a394 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -6,7 +6,6 @@ use super::config::BOOTSTRAP_AGENT_HTTP_PORT; use super::http_entrypoints; -use super::params::RackInitializeRequest; use super::params::StartSledAgentRequest; use super::views::SledAgentResponse; use super::BootstrapError; @@ -42,6 +41,7 @@ use omicron_common::ledger::Ledger; use omicron_ddm_admin_client::Client as DdmAdminClient; use omicron_ddm_admin_client::DdmError; use omicron_uuid_kinds::RackInitUuid; +use sled_agent_types::rack_init::RackInitializeRequest; use sled_hardware::underlay; use sled_storage::dataset::CONFIG_DATASET; use sled_storage::manager::StorageHandle; diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 2612e504f5..1ecda51657 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -5,7 +5,6 @@ //! 
HTTP entrypoint functions for the sled agent's exposed API use super::sled_agent::SledAgent; -use crate::bootstrap::early_networking::EarlyNetworkConfig; use crate::bootstrap::params::AddSledRequest; use crate::params::{ BootstoreStatus, CleanupContextUpdate, DiskEnsureBody, InstanceEnsureBody, @@ -37,6 +36,7 @@ use omicron_common::api::internal::shared::{ use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use sled_agent_types::early_networking::EarlyNetworkConfig; use sled_hardware::DiskVariant; use sled_storage::resources::DisksManagementResult; use std::collections::BTreeMap; diff --git a/sled-agent/src/rack_setup/config.rs b/sled-agent/src/rack_setup/config.rs deleted file mode 100644 index 43664cfd04..0000000000 --- a/sled-agent/src/rack_setup/config.rs +++ /dev/null @@ -1,249 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Interfaces for working with RSS config. - -use crate::config::ConfigError; -use camino::Utf8Path; -use omicron_common::address::{ - get_64_subnet, Ipv6Subnet, AZ_PREFIX, RACK_PREFIX, SLED_PREFIX, -}; - -pub use crate::bootstrap::params::back_compat::RackInitializeRequestV1 as SetupServiceConfigV1; -use crate::bootstrap::params::Certificate; -pub use crate::bootstrap::params::RackInitializeRequest as SetupServiceConfig; - -impl SetupServiceConfig { - pub fn from_file>(path: P) -> Result { - let path = path.as_ref(); - let contents = std::fs::read_to_string(&path) - .map_err(|err| ConfigError::Io { path: path.into(), err })?; - let mut raw_config = - SetupServiceConfig::from_toml_with_fallback(&contents) - .map_err(|err| ConfigError::Parse { path: path.into(), err })?; - - // In the same way that sled-agent itself (our caller) discovers the - // optional config-rss.toml in a well-known path relative to its config - // file, we look for a pair of well-known paths adjacent to - // config-rss.toml that specify an extra TLS certificate and private - // key. This is used by the end-to-end tests. Any developer can also - // use this to inject a TLS certificate into their setup. - // (config-rss.toml is only used for dev/test, not production - // deployments, which will always get their RSS configuration from - // Wicket.) - if let Some(parent) = path.parent() { - let cert_path = parent.join("initial-tls-cert.pem"); - let key_path = parent.join("initial-tls-key.pem"); - let cert_bytes = std::fs::read_to_string(&cert_path); - let key_bytes = std::fs::read_to_string(&key_path); - match (cert_bytes, key_bytes) { - (Ok(cert), Ok(key)) => { - raw_config - .external_certificates - .push(Certificate { key, cert }); - } - (Err(cert_error), Err(key_error)) - if cert_error.kind() == std::io::ErrorKind::NotFound - && key_error.kind() == std::io::ErrorKind::NotFound => - { - // Fine. No extra cert was provided. 
- } - (Err(cert_error), _) => { - return Err(ConfigError::Certificate( - anyhow::Error::new(cert_error).context(format!( - "loading certificate from {:?}", - cert_path - )), - )); - } - (_, Err(key_error)) => { - return Err(ConfigError::Certificate( - anyhow::Error::new(key_error).context(format!( - "loading private key from {:?}", - key_path - )), - )); - } - }; - } - - Ok(raw_config) - } - - pub fn az_subnet(&self) -> Ipv6Subnet { - Ipv6Subnet::::new( - self.rack_network_config.rack_subnet.addr(), - ) - } - - /// Returns the subnet for our rack. - pub fn rack_subnet(&self) -> Ipv6Subnet { - Ipv6Subnet::::new( - self.rack_network_config.rack_subnet.addr(), - ) - } - - /// Returns the subnet for the `index`-th sled in the rack. - pub fn sled_subnet(&self, index: u8) -> Ipv6Subnet { - get_64_subnet(self.rack_subnet(), index) - } -} - -#[cfg(test)] -mod test { - use super::*; - use crate::bootstrap::params::BootstrapAddressDiscovery; - use crate::bootstrap::params::RecoverySiloConfig; - use anyhow::Context; - use camino::Utf8PathBuf; - use omicron_common::address::IpRange; - use omicron_common::api::internal::shared::AllowedSourceIps; - use omicron_common::api::internal::shared::RackNetworkConfig; - use oxnet::Ipv6Net; - use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; - - #[test] - fn test_subnets() { - let cfg = SetupServiceConfig { - trust_quorum_peers: None, - bootstrap_discovery: BootstrapAddressDiscovery::OnlyOurs, - ntp_servers: vec![String::from("test.pool.example.com")], - dns_servers: vec!["1.1.1.1".parse().unwrap()], - external_dns_zone_name: String::from("oxide.test"), - internal_services_ip_pool_ranges: vec![IpRange::from(IpAddr::V4( - Ipv4Addr::new(129, 168, 1, 20), - ))], - external_dns_ips: vec![], - external_certificates: vec![], - recovery_silo: RecoverySiloConfig { - silo_name: "test-silo".parse().unwrap(), - user_name: "dummy".parse().unwrap(), - // This is a hash for the password "oxide". It doesn't matter, - // though; it's not used. - user_password_hash: "$argon2id$v=19$m=98304,t=13,p=1$\ - RUlWc0ZxaHo0WFdrN0N6ZQ$S8p52j85GPvMhR/\ - ek3GL0el/oProgTwWpHJZ8lsQQoY" - .parse() - .unwrap(), - }, - rack_network_config: RackNetworkConfig { - rack_subnet: Ipv6Net::new( - "fd00:1122:3344:0100::".parse().unwrap(), - RACK_PREFIX, - ) - .unwrap(), - infra_ip_first: Ipv4Addr::LOCALHOST, - infra_ip_last: Ipv4Addr::LOCALHOST, - ports: Vec::new(), - bgp: Vec::new(), - bfd: Vec::new(), - }, - allowed_source_ips: AllowedSourceIps::Any, - }; - - assert_eq!( - Ipv6Subnet::::new( - // Masked out in AZ Subnet - // vv - "fd00:1122:3344:0000::".parse::().unwrap(), - ), - cfg.az_subnet() - ); - assert_eq!( - Ipv6Subnet::::new( - // Shows up from Rack Subnet - // vv - "fd00:1122:3344:0100::".parse::().unwrap(), - ), - cfg.rack_subnet() - ); - assert_eq!( - Ipv6Subnet::::new( - // 0th Sled Subnet - // vv - "fd00:1122:3344:0100::".parse::().unwrap(), - ), - cfg.sled_subnet(0) - ); - assert_eq!( - Ipv6Subnet::::new( - // 1st Sled Subnet - // vv - "fd00:1122:3344:0101::".parse::().unwrap(), - ), - cfg.sled_subnet(1) - ); - assert_eq!( - Ipv6Subnet::::new( - // Last Sled Subnet - // vv - "fd00:1122:3344:01ff::".parse::().unwrap(), - ), - cfg.sled_subnet(255) - ); - } - - #[test] - fn test_extra_certs() { - // The stock non-Gimlet config has no TLS certificates. 
- let path = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join("../smf/sled-agent/non-gimlet/config-rss.toml"); - let cfg = SetupServiceConfig::from_file(&path) - .unwrap_or_else(|e| panic!("failed to parse {:?}: {}", &path, e)); - assert!(cfg.external_certificates.is_empty()); - - // Now let's create a configuration that does have an adjacent - // certificate and key. - let tempdir = - camino_tempfile::tempdir().expect("creating temporary directory"); - println!("using temp path: {:?}", tempdir); - - // Generate the certificate. - let domain = format!( - "{}.sys.{}", - cfg.external_dns_zone_name, - cfg.recovery_silo.silo_name.as_str(), - ); - let cert = rcgen::generate_simple_self_signed(vec![domain.clone()]) - .unwrap_or_else(|error| { - panic!( - "generating certificate for domain {:?}: {}", - domain, error - ) - }); - - // Write the configuration file. - let cfg_path = tempdir.path().join("config-rss.toml"); - let _ = std::fs::copy(&path, &cfg_path) - .with_context(|| { - format!("failed to copy file {:?} to {:?}", &path, &cfg_path) - }) - .unwrap(); - - // Write the certificate. - let cert_bytes = cert - .serialize_pem() - .expect("serializing generated certificate") - .into_bytes(); - let cert_path = tempdir.path().join("initial-tls-cert.pem"); - std::fs::write(&cert_path, &cert_bytes) - .with_context(|| format!("failed to write to {:?}", &cert_path)) - .unwrap(); - - // Write the private key. - let key_path = tempdir.path().join("initial-tls-key.pem"); - let key_bytes = cert.serialize_private_key_pem().into_bytes(); - std::fs::write(&key_path, &key_bytes) - .with_context(|| format!("failed to write to {:?}", &key_path)) - .unwrap(); - - // Now try to load it all. - let read_cfg = SetupServiceConfig::from_file(&cfg_path) - .expect("failed to read generated config with certificate"); - assert_eq!(read_cfg.external_certificates.len(), 1); - let cert = read_cfg.external_certificates.first().unwrap(); - let _ = rcgen::KeyPair::from_pem(&cert.key) - .expect("generated PEM did not parse as KeyPair"); - } -} diff --git a/sled-agent/src/rack_setup/mod.rs b/sled-agent/src/rack_setup/mod.rs index 0ad8e0ce71..0ec14138fc 100644 --- a/sled-agent/src/rack_setup/mod.rs +++ b/sled-agent/src/rack_setup/mod.rs @@ -4,8 +4,6 @@ //! Rack Setup Service -/// Configuration files which automate input to RSS. -pub mod config; mod plan; /// The main implementation of the RSS service. 
pub mod service; diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 9493361d19..d23c6715c6 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -9,7 +9,6 @@ use crate::params::{ OmicronPhysicalDiskConfig, OmicronPhysicalDisksConfig, OmicronZoneConfig, OmicronZoneDataset, OmicronZoneType, }; -use crate::rack_setup::config::SetupServiceConfig as Config; use camino::Utf8PathBuf; use dns_service_client::types::DnsConfigParams; use illumos_utils::zpool::ZpoolName; @@ -37,6 +36,7 @@ use serde::{Deserialize, Serialize}; use sled_agent_client::{ types as SledAgentTypes, Client as SledAgentClient, Error as SledAgentError, }; +use sled_agent_types::rack_init::RackInitializeRequest as Config; use sled_storage::dataset::{DatasetKind, DatasetName, CONFIG_DATASET}; use sled_storage::manager::StorageHandle; use slog::Logger; @@ -1180,12 +1180,12 @@ impl ServicePortBuilder { #[cfg(test)] mod tests { use super::*; - use crate::bootstrap::params::BootstrapAddressDiscovery; - use crate::bootstrap::params::RecoverySiloConfig; use omicron_common::address::IpRange; use omicron_common::api::internal::shared::AllowedSourceIps; use omicron_common::api::internal::shared::RackNetworkConfig; use oxnet::Ipv6Net; + use sled_agent_types::rack_init::BootstrapAddressDiscovery; + use sled_agent_types::rack_init::RecoverySiloConfig; const EXPECTED_RESERVED_ADDRESSES: u16 = 2; const EXPECTED_USABLE_ADDRESSES: u16 = diff --git a/sled-agent/src/rack_setup/plan/sled.rs b/sled-agent/src/rack_setup/plan/sled.rs index c6d2e73ccd..3d5b90a22d 100644 --- a/sled-agent/src/rack_setup/plan/sled.rs +++ b/sled-agent/src/rack_setup/plan/sled.rs @@ -8,12 +8,12 @@ use crate::bootstrap::params::StartSledAgentRequestBody; use crate::bootstrap::{ config::BOOTSTRAP_AGENT_RACK_INIT_PORT, params::StartSledAgentRequest, }; -use crate::rack_setup::config::SetupServiceConfig as Config; -use crate::rack_setup::config::SetupServiceConfigV1 as ConfigV1; use camino::Utf8PathBuf; use omicron_common::ledger::{self, Ledger, Ledgerable}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use sled_agent_types::rack_init::back_compat::RackInitializeRequestV1 as ConfigV1; +use sled_agent_types::rack_init::RackInitializeRequest as Config; use sled_storage::dataset::CONFIG_DATASET; use sled_storage::manager::StorageHandle; use slog::Logger; diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 2d7a355440..c8e56ae9f4 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -64,14 +64,11 @@ //! completing execution, and unconditionally calls the "handoff to Nexus" API //! thereafter. 
-use super::config::SetupServiceConfig as Config; use super::plan::service::SledConfig; use crate::bootstrap::config::BOOTSTRAP_AGENT_HTTP_PORT; use crate::bootstrap::early_networking::{ - EarlyNetworkConfig, EarlyNetworkConfigBody, EarlyNetworkSetup, - EarlyNetworkSetupError, + EarlyNetworkSetup, EarlyNetworkSetupError, }; -use crate::bootstrap::params::BootstrapAddressDiscovery; use crate::bootstrap::params::StartSledAgentRequest; use crate::bootstrap::rss_handle::BootstrapAgentHandle; use crate::nexus::{d2n_params, ConvertInto}; @@ -111,6 +108,12 @@ use serde::{Deserialize, Serialize}; use sled_agent_client::{ types as SledAgentTypes, Client as SledAgentClient, Error as SledAgentError, }; +use sled_agent_types::early_networking::{ + EarlyNetworkConfig, EarlyNetworkConfigBody, +}; +use sled_agent_types::rack_init::{ + BootstrapAddressDiscovery, RackInitializeRequest as Config, +}; use sled_hardware_types::underlay::BootstrapInterface; use sled_storage::dataset::CONFIG_DATASET; use sled_storage::manager::StorageHandle; @@ -1545,7 +1548,7 @@ impl<'a> OmicronZonesConfigGenerator<'a> { #[cfg(test)] mod test { - use super::OmicronZonesConfigGenerator; + use super::{Config, OmicronZonesConfigGenerator}; use crate::{ params::OmicronZoneType, rack_setup::plan::service::{Plan as ServicePlan, SledInfo}, @@ -1594,7 +1597,7 @@ mod test { } fn make_test_service_plan() -> ServicePlan { - let rss_config = crate::bootstrap::params::test_config(); + let rss_config = Config::test_config(); let fake_sleds = vec![ make_sled_info( SledUuid::new_v4(), diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 78d48be0ff..399ec334f4 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -4,7 +4,6 @@ //! 
HTTP entrypoint functions for the sled agent's exposed API -use crate::bootstrap::early_networking::EarlyNetworkConfig; use crate::bootstrap::params::AddSledRequest; use crate::params::{ DiskEnsureBody, InstanceEnsureBody, InstanceExternalIpBody, @@ -30,6 +29,7 @@ use omicron_common::api::internal::shared::{ use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use sled_agent_types::early_networking::EarlyNetworkConfig; use sled_storage::resources::DisksManagementResult; use std::sync::Arc; use uuid::Uuid; diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 9cb146531b..f23b14c377 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -10,9 +10,6 @@ use super::disk::SimDisk; use super::instance::SimInstance; use super::storage::CrucibleData; use super::storage::Storage; -use crate::bootstrap::early_networking::{ - EarlyNetworkConfig, EarlyNetworkConfigBody, -}; use crate::nexus::NexusClient; use crate::params::{ DiskStateRequested, InstanceExternalIpBody, InstanceHardware, @@ -47,6 +44,9 @@ use propolis_client::{ types::VolumeConstructionRequest, Client as PropolisClient, }; use propolis_mock_server::Context as PropolisContext; +use sled_agent_types::early_networking::{ + EarlyNetworkConfig, EarlyNetworkConfigBody, +}; use sled_storage::resources::DisksManagementResult; use slog::Logger; use std::collections::{HashMap, HashSet, VecDeque}; diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index dc946c1bfa..4bf7117bc9 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -6,9 +6,7 @@ use crate::boot_disk_os_writer::BootDiskOsWriter; use crate::bootstrap::config::BOOTSTRAP_AGENT_RACK_INIT_PORT; -use crate::bootstrap::early_networking::{ - EarlyNetworkConfig, EarlyNetworkSetupError, -}; +use crate::bootstrap::early_networking::EarlyNetworkSetupError; use crate::bootstrap::params::{BaseboardId, StartSledAgentRequest}; use crate::config::Config; use crate::instance_manager::InstanceManager; @@ -63,6 +61,7 @@ use omicron_common::backoff::{ use omicron_ddm_admin_client::Client as DdmAdminClient; use omicron_uuid_kinds::{InstanceUuid, PropolisUuid}; use oximeter::types::ProducerRegistry; +use sled_agent_types::early_networking::EarlyNetworkConfig; use sled_hardware::{underlay, HardwareManager}; use sled_hardware_types::underlay::BootstrapInterface; use sled_hardware_types::Baseboard; diff --git a/sled-agent/tests/integration_tests/early_network.rs b/sled-agent/tests/integration_tests/early_network.rs index 28fc0fd010..6fa91e0e4a 100644 --- a/sled-agent/tests/integration_tests/early_network.rs +++ b/sled-agent/tests/integration_tests/early_network.rs @@ -15,10 +15,10 @@ use omicron_common::api::{ RackNetworkConfig, RouteConfig, }, }; -use omicron_sled_agent::bootstrap::early_networking::{ +use omicron_test_utils::dev::test_setup_log; +use sled_agent_types::early_networking::{ EarlyNetworkConfig, EarlyNetworkConfigBody, }; -use omicron_test_utils::dev::test_setup_log; const BLOB_PATH: &str = "tests/data/early_network_blobs.txt"; diff --git a/sled-agent/types/Cargo.toml b/sled-agent/types/Cargo.toml new file mode 100644 index 0000000000..57881a37d1 --- /dev/null +++ b/sled-agent/types/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "sled-agent-types" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +anyhow.workspace = true +bootstore.workspace = true +camino.workspace = 
true +nexus-client.workspace = true +omicron-common.workspace = true +omicron-uuid-kinds.workspace = true +omicron-workspace-hack.workspace = true +oxnet.workspace = true +schemars.workspace = true +serde.workspace = true +serde_json.workspace = true +sled-hardware-types.workspace = true +slog.workspace = true +thiserror.workspace = true +toml.workspace = true + +[dev-dependencies] +camino-tempfile.workspace = true +omicron-test-utils.workspace = true +rcgen.workspace = true diff --git a/sled-agent/types/src/early_networking.rs b/sled-agent/types/src/early_networking.rs new file mode 100644 index 0000000000..dc93aa1300 --- /dev/null +++ b/sled-agent/types/src/early_networking.rs @@ -0,0 +1,606 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Types for network setup required to bring up the control plane. + +use std::str::FromStr; + +use bootstore::schemes::v0 as bootstore; +use omicron_common::api::internal::shared::RackNetworkConfig; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use slog::{warn, Logger}; + +/// Network configuration required to bring up the control plane +/// +/// The fields in this structure are those from +/// [`crate::rack_init::RackInitializeRequest`] necessary for use beyond RSS. +/// This is just for the initial rack configuration and cold boot purposes. +/// Updates come from Nexus. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] +pub struct EarlyNetworkConfig { + // The current generation number of data as stored in CRDB. + // The initial generation is set during RSS time and then only mutated + // by Nexus. + pub generation: u64, + + // Which version of the data structure do we have. This is to help with + // deserialization and conversion in future updates. + pub schema_version: u32, + + // The actual configuration details + pub body: EarlyNetworkConfigBody, +} + +impl FromStr for EarlyNetworkConfig { + type Err = String; + + fn from_str(value: &str) -> Result { + #[derive(Deserialize)] + struct ShadowConfig { + generation: u64, + schema_version: u32, + body: EarlyNetworkConfigBody, + } + + let v2_err = match serde_json::from_str::(&value) { + Ok(cfg) => { + return Ok(EarlyNetworkConfig { + generation: cfg.generation, + schema_version: cfg.schema_version, + body: cfg.body, + }) + } + Err(e) => format!("unable to parse EarlyNetworkConfig: {e:?}"), + }; + // If we fail to parse the config as any known version, we return the + // error corresponding to the parse failure of the newest schema. + serde_json::from_str::(&value) + .map(|v1| EarlyNetworkConfig { + generation: v1.generation, + schema_version: Self::schema_version(), + body: v1.body.into(), + }) + .map_err(|_| v2_err) + } +} + +impl EarlyNetworkConfig { + pub fn schema_version() -> u32 { + 2 + } + + // Note: This currently only converts between v0 and v1 or deserializes v1 of + // `EarlyNetworkConfig`. + pub fn deserialize_bootstore_config( + log: &Logger, + config: &bootstore::NetworkConfig, + ) -> Result { + // Try to deserialize the latest version of the data structure (v2). If + // that succeeds we are done. + let v2_error = + match serde_json::from_slice::(&config.blob) { + Ok(val) => return Ok(val), + Err(error) => { + // Log this error and continue trying to deserialize older + // versions. 
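The v2 -> v1 -> v0 chain used here always surfaces the error from the newest schema when every older fallback also fails to parse. As a minimal, self-contained sketch of the same pattern, using toy `ConfigV2`/`ConfigV1` types and `serde_json` rather than the real `EarlyNetworkConfig` versions:

use serde::Deserialize;

#[derive(Deserialize)]
struct ConfigV2 {
    schema_version: u32,
    name: String,
}

#[derive(Deserialize)]
struct ConfigV1 {
    name: String,
}

impl From<ConfigV1> for ConfigV2 {
    fn from(v1: ConfigV1) -> Self {
        ConfigV2 { schema_version: 2, name: v1.name }
    }
}

/// Try the newest schema first; if the older schema also fails, surface
/// the newest schema's error.
fn parse_with_fallback(blob: &[u8]) -> Result<ConfigV2, serde_json::Error> {
    let v2_err = match serde_json::from_slice::<ConfigV2>(blob) {
        Ok(v2) => return Ok(v2),
        Err(e) => e,
    };
    serde_json::from_slice::<ConfigV1>(blob)
        .map(ConfigV2::from)
        .map_err(|_| v2_err)
}

Keeping the newest error matches the behavior of `deserialize_bootstore_config` below: failures against older versions are only logged, while the error returned to the caller describes the current schema.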
+ warn!( + log, + "Failed to deserialize EarlyNetworkConfig \ + as v2, trying next as v1: {}", + error, + ); + error + } + }; + + match serde_json::from_slice::( + &config.blob, + ) { + Ok(v1) => { + // Convert from v1 to v2 + return Ok(EarlyNetworkConfig { + generation: v1.generation, + schema_version: EarlyNetworkConfig::schema_version(), + body: v1.body.into(), + }); + } + Err(error) => { + // Log this error. + warn!( + log, + "Failed to deserialize EarlyNetworkConfig \ + as v1, trying next as v0: {}", + error + ); + } + }; + + match serde_json::from_slice::( + &config.blob, + ) { + Ok(val) => { + // Convert from v0 to v2 + return Ok(EarlyNetworkConfig { + generation: val.generation, + schema_version: 2, + body: EarlyNetworkConfigBody { + ntp_servers: val.ntp_servers, + rack_network_config: val.rack_network_config.map( + |v0_config| { + back_compat::RackNetworkConfigV0::to_v2( + val.rack_subnet, + v0_config, + ) + }, + ), + }, + }); + } + Err(error) => { + // Log this error. + warn!( + log, + "Failed to deserialize EarlyNetworkConfig as v0: {}", error, + ); + } + }; + + // If we fail to parse the config as any known version, we return the + // error corresponding to the parse failure of the newest schema. + Err(v2_error) + } +} + +/// This is the actual configuration of EarlyNetworking. +/// +/// We nest it below the "header" of `generation` and `schema_version` so that +/// we can perform partial deserialization of `EarlyNetworkConfig` to only read +/// the header and defer deserialization of the body once we know the schema +/// version. This is possible via the use of [`serde_json::value::RawValue`] in +/// future (post-v1) deserialization paths. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] +pub struct EarlyNetworkConfigBody { + /// The external NTP server addresses. + pub ntp_servers: Vec, + + // Rack network configuration as delivered from RSS or Nexus + pub rack_network_config: Option, +} + +impl From for bootstore::NetworkConfig { + fn from(value: EarlyNetworkConfig) -> Self { + // Can this ever actually fail? + // We literally just deserialized the same data in RSS + let blob = serde_json::to_vec(&value).unwrap(); + + // Yes this is duplicated, but that seems fine. + let generation = value.generation; + + bootstore::NetworkConfig { generation, blob } + } +} + +/// Structures and routines used to maintain backwards compatibility. The +/// contents of this module should only be used to convert older data into the +/// current format, and not for any ongoing run-time operations. +pub mod back_compat { + use std::net::{Ipv4Addr, Ipv6Addr}; + + use omicron_common::api::{ + external::SwitchLocation, + internal::shared::{ + BfdPeerConfig, BgpConfig, BgpPeerConfig, PortConfigV2, PortFec, + PortSpeed, RackNetworkConfigV2, RouteConfig, UplinkAddressConfig, + }, + }; + use oxnet::{IpNet, Ipv4Net, Ipv6Net}; + + use super::*; + + #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] + pub struct EarlyNetworkConfigBodyV1 { + /// The external NTP server addresses. + pub ntp_servers: Vec, + + // Rack network configuration as delivered from RSS or Nexus + pub rack_network_config: Option, + } + + impl From for EarlyNetworkConfigBody { + fn from(v1: EarlyNetworkConfigBodyV1) -> Self { + EarlyNetworkConfigBody { + ntp_servers: v1.ntp_servers, + rack_network_config: v1 + .rack_network_config + .map(|v1_config| v1_config.into()), + } + } + } + + /// Deprecated, use `RackNetworkConfig` instead. 
Cannot actually deprecate due to + /// + /// + /// Our first version of `RackNetworkConfig`. If this exists in the bootstore, we + /// upgrade out of it into `RackNetworkConfigV1` or later versions if possible. + #[derive(Clone, Debug, Deserialize, Serialize, PartialEq, JsonSchema)] + pub(crate) struct RackNetworkConfigV0 { + // TODO: #3591 Consider making infra-ip ranges implicit for uplinks + /// First ip address to be used for configuring network infrastructure + pub infra_ip_first: Ipv4Addr, + /// Last ip address to be used for configuring network infrastructure + pub infra_ip_last: Ipv4Addr, + /// Uplinks for connecting the rack to external networks + pub uplinks: Vec, + } + + impl RackNetworkConfigV0 { + /// Convert from `RackNetworkConfigV0` to `RackNetworkConfigV1` + /// + /// We cannot use `From for `RackNetworkConfigV2` + /// because the `rack_subnet` field does not exist in `RackNetworkConfigV0` + /// and must be passed in from the `EarlyNetworkConfigV0` struct which + /// contains the `RackNetworkConfigV0` struct. + pub fn to_v2( + rack_subnet: Ipv6Addr, + v0: RackNetworkConfigV0, + ) -> RackNetworkConfigV2 { + RackNetworkConfigV2 { + rack_subnet: Ipv6Net::new(rack_subnet, 56).unwrap(), + infra_ip_first: v0.infra_ip_first, + infra_ip_last: v0.infra_ip_last, + ports: v0 + .uplinks + .into_iter() + .map(|uplink| PortConfigV2::from(uplink)) + .collect(), + bgp: vec![], + bfd: vec![], + } + } + } + + /// Deprecated, use PortConfigV2 instead. Cannot actually deprecate due to + /// + #[derive(Clone, Debug, PartialEq, Deserialize, Serialize, JsonSchema)] + pub struct PortConfigV1 { + /// The set of routes associated with this port. + pub routes: Vec, + /// This port's addresses and optional vlan IDs + pub addresses: Vec, + /// Switch the port belongs to. + pub switch: SwitchLocation, + /// Nmae of the port this config applies to. + pub port: String, + /// Port speed. + pub uplink_port_speed: PortSpeed, + /// Port forward error correction type. + pub uplink_port_fec: PortFec, + /// BGP peers on this port + pub bgp_peers: Vec, + /// Whether or not to set autonegotiation + #[serde(default)] + pub autoneg: bool, + } + + impl From for PortConfigV2 { + fn from(v1: PortConfigV1) -> Self { + PortConfigV2 { + routes: v1.routes.clone(), + addresses: v1 + .addresses + .iter() + .map(|a| UplinkAddressConfig { address: *a, vlan_id: None }) + .collect(), + switch: v1.switch, + port: v1.port, + uplink_port_speed: v1.uplink_port_speed, + uplink_port_fec: v1.uplink_port_fec, + bgp_peers: v1.bgp_peers.clone(), + autoneg: v1.autoneg, + } + } + } + + /// Deprecated, use PortConfigV2 instead. 
Cannot actually deprecate due to + /// + #[derive(Clone, Debug, Deserialize, Serialize, PartialEq, JsonSchema)] + pub(crate) struct UplinkConfig { + /// Gateway address + pub gateway_ip: Ipv4Addr, + /// Switch to use for uplink + pub switch: SwitchLocation, + /// Switchport to use for external connectivity + pub uplink_port: String, + /// Speed for the Switchport + pub uplink_port_speed: PortSpeed, + /// Forward Error Correction setting for the uplink port + pub uplink_port_fec: PortFec, + /// IP Address and prefix (e.g., `192.168.0.1/16`) to apply to switchport + /// (must be in infra_ip pool) + pub uplink_cidr: Ipv4Net, + /// VLAN id to use for uplink + pub uplink_vid: Option, + } + + impl From for PortConfigV2 { + fn from(value: UplinkConfig) -> Self { + PortConfigV2 { + routes: vec![RouteConfig { + destination: "0.0.0.0/0".parse().unwrap(), + nexthop: value.gateway_ip.into(), + vlan_id: value.uplink_vid, + }], + addresses: vec![UplinkAddressConfig { + address: value.uplink_cidr.into(), + vlan_id: value.uplink_vid, + }], + switch: value.switch, + port: value.uplink_port, + uplink_port_speed: value.uplink_port_speed, + uplink_port_fec: value.uplink_port_fec, + bgp_peers: vec![], + autoneg: false, + } + } + } + + /// Deprecated, use `RackNetworkConfig` instead. Cannot actually deprecate due to + /// + /// + /// Our second version of `RackNetworkConfig`. If this exists in the bootstore, + /// we upgrade out of it into `RackNetworkConfigV1` or later versions if + /// possible. + #[derive(Clone, Debug, PartialEq, Deserialize, Serialize, JsonSchema)] + pub struct RackNetworkConfigV1 { + pub rack_subnet: Ipv6Net, + // TODO: #3591 Consider making infra-ip ranges implicit for uplinks + /// First ip address to be used for configuring network infrastructure + pub infra_ip_first: Ipv4Addr, + /// Last ip address to be used for configuring network infrastructure + pub infra_ip_last: Ipv4Addr, + /// Uplinks for connecting the rack to external networks + pub ports: Vec, + /// BGP configurations for connecting the rack to external networks + pub bgp: Vec, + /// BFD configuration for connecting the rack to external networks + #[serde(default)] + pub bfd: Vec, + } + + impl From for RackNetworkConfigV2 { + fn from(v1: RackNetworkConfigV1) -> Self { + RackNetworkConfigV2 { + rack_subnet: v1.rack_subnet, + infra_ip_first: v1.infra_ip_first, + infra_ip_last: v1.infra_ip_last, + ports: v1 + .ports + .into_iter() + .map(|ports| PortConfigV2::from(ports)) + .collect(), + bgp: v1.bgp.clone(), + bfd: v1.bfd.clone(), + } + } + } + + // The second production version of the `EarlyNetworkConfig`. + // + // If this version is in the bootstore than we need to convert it to + // `EarlyNetworkConfigV2`. + // + // Once we do this for all customers that have initialized racks with the + // old version we can go ahead and remove this type and its conversion code + // altogether. + #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] + pub struct EarlyNetworkConfigV1 { + // The current generation number of data as stored in CRDB. + // The initial generation is set during RSS time and then only mutated + // by Nexus. + pub generation: u64, + + // Which version of the data structure do we have. This is to help with + // deserialization and conversion in future updates. + pub schema_version: u32, + + // The actual configuration details + pub body: EarlyNetworkConfigBodyV1, + } + + // The first production version of the `EarlyNetworkConfig`. 
+ // + // If this version is in the bootstore than we need to convert it to + // `EarlyNetworkConfigV2`. + // + // Once we do this for all customers that have initialized racks with the + // old version we can go ahead and remove this type and its conversion code + // altogether. + #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] + pub(crate) struct EarlyNetworkConfigV0 { + // The current generation number of data as stored in CRDB. + // The initial generation is set during RSS time and then only mutated + // by Nexus. + pub generation: u64, + + pub rack_subnet: Ipv6Addr, + + /// The external NTP server addresses. + pub ntp_servers: Vec, + + // Rack network configuration as delivered from RSS and only existing at + // generation 1 + pub rack_network_config: Option, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::net::Ipv4Addr; + use std::net::Ipv6Addr; + + use omicron_common::api::external::SwitchLocation; + use omicron_common::api::internal::shared::PortConfigV2; + use omicron_common::api::internal::shared::PortFec; + use omicron_common::api::internal::shared::PortSpeed; + use omicron_common::api::internal::shared::RackNetworkConfigV2; + use omicron_common::api::internal::shared::RouteConfig; + use omicron_common::api::internal::shared::UplinkAddressConfig; + use omicron_test_utils::dev::test_setup_log; + use oxnet::Ipv6Net; + + #[test] + fn serialized_early_network_config_v0_to_v2_conversion() { + let logctx = test_setup_log( + "serialized_early_network_config_v0_to_v2_conversion", + ); + let v0 = back_compat::EarlyNetworkConfigV0 { + generation: 1, + rack_subnet: Ipv6Addr::UNSPECIFIED, + ntp_servers: Vec::new(), + rack_network_config: Some(back_compat::RackNetworkConfigV0 { + infra_ip_first: Ipv4Addr::UNSPECIFIED, + infra_ip_last: Ipv4Addr::UNSPECIFIED, + uplinks: vec![back_compat::UplinkConfig { + gateway_ip: Ipv4Addr::UNSPECIFIED, + switch: SwitchLocation::Switch0, + uplink_port: "Port0".to_string(), + uplink_port_speed: PortSpeed::Speed100G, + uplink_port_fec: PortFec::None, + uplink_cidr: "192.168.0.1/16".parse().unwrap(), + uplink_vid: None, + }], + }), + }; + + let v0_serialized = serde_json::to_vec(&v0).unwrap(); + let bootstore_conf = + bootstore::NetworkConfig { generation: 1, blob: v0_serialized }; + + let v2 = EarlyNetworkConfig::deserialize_bootstore_config( + &logctx.log, + &bootstore_conf, + ) + .unwrap(); + let v0_rack_network_config = v0.rack_network_config.unwrap(); + let uplink = v0_rack_network_config.uplinks[0].clone(); + let expected = EarlyNetworkConfig { + generation: 1, + schema_version: EarlyNetworkConfig::schema_version(), + body: EarlyNetworkConfigBody { + ntp_servers: v0.ntp_servers.clone(), + rack_network_config: Some(RackNetworkConfigV2 { + rack_subnet: Ipv6Net::new(v0.rack_subnet, 56).unwrap(), + infra_ip_first: v0_rack_network_config.infra_ip_first, + infra_ip_last: v0_rack_network_config.infra_ip_last, + ports: vec![PortConfigV2 { + routes: vec![RouteConfig { + destination: "0.0.0.0/0".parse().unwrap(), + nexthop: uplink.gateway_ip.into(), + vlan_id: None, + }], + addresses: vec![UplinkAddressConfig { + address: uplink.uplink_cidr.into(), + vlan_id: None, + }], + switch: uplink.switch, + port: uplink.uplink_port, + uplink_port_speed: uplink.uplink_port_speed, + uplink_port_fec: uplink.uplink_port_fec, + autoneg: false, + bgp_peers: vec![], + }], + bgp: vec![], + bfd: vec![], + }), + }, + }; + + assert_eq!(expected, v2); + + logctx.cleanup_successful(); + } + + #[test] + fn serialized_early_network_config_v1_to_v2_conversion() 
{ + let logctx = test_setup_log( + "serialized_early_network_config_v1_to_v2_conversion", + ); + + let v1 = back_compat::EarlyNetworkConfigV1 { + generation: 1, + schema_version: 1, + body: back_compat::EarlyNetworkConfigBodyV1 { + ntp_servers: Vec::new(), + rack_network_config: Some(back_compat::RackNetworkConfigV1 { + rack_subnet: Ipv6Net::new(Ipv6Addr::UNSPECIFIED, 56) + .unwrap(), + infra_ip_first: Ipv4Addr::UNSPECIFIED, + infra_ip_last: Ipv4Addr::UNSPECIFIED, + ports: vec![back_compat::PortConfigV1 { + routes: vec![RouteConfig { + destination: "0.0.0.0/0".parse().unwrap(), + nexthop: "192.168.0.2".parse().unwrap(), + vlan_id: None, + }], + addresses: vec!["192.168.0.1/16".parse().unwrap()], + switch: SwitchLocation::Switch0, + port: "Port0".to_string(), + uplink_port_speed: PortSpeed::Speed100G, + uplink_port_fec: PortFec::None, + bgp_peers: Vec::new(), + autoneg: false, + }], + bgp: Vec::new(), + bfd: Vec::new(), + }), + }, + }; + + let v1_serialized = serde_json::to_vec(&v1).unwrap(); + let bootstore_conf = + bootstore::NetworkConfig { generation: 1, blob: v1_serialized }; + + let v2 = EarlyNetworkConfig::deserialize_bootstore_config( + &logctx.log, + &bootstore_conf, + ) + .unwrap(); + let v1_rack_network_config = v1.body.rack_network_config.unwrap(); + let port = v1_rack_network_config.ports[0].clone(); + let expected = EarlyNetworkConfig { + generation: 1, + schema_version: EarlyNetworkConfig::schema_version(), + body: EarlyNetworkConfigBody { + ntp_servers: v1.body.ntp_servers.clone(), + rack_network_config: Some(RackNetworkConfigV2 { + rack_subnet: v1_rack_network_config.rack_subnet, + infra_ip_first: v1_rack_network_config.infra_ip_first, + infra_ip_last: v1_rack_network_config.infra_ip_last, + ports: vec![PortConfigV2 { + routes: port.routes.clone(), + addresses: vec![UplinkAddressConfig { + address: port.addresses[0], + vlan_id: None, + }], + switch: port.switch, + port: port.port, + uplink_port_speed: port.uplink_port_speed, + uplink_port_fec: port.uplink_port_fec, + autoneg: false, + bgp_peers: vec![], + }], + bgp: vec![], + bfd: vec![], + }), + }, + }; + + assert_eq!(expected, v2); + + logctx.cleanup_successful(); + } +} diff --git a/sled-agent/types/src/lib.rs b/sled-agent/types/src/lib.rs new file mode 100644 index 0000000000..12e8f049f9 --- /dev/null +++ b/sled-agent/types/src/lib.rs @@ -0,0 +1,9 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Common types for sled-agent. + +pub mod early_networking; +pub mod rack_init; +pub mod rack_ops; diff --git a/sled-agent/types/src/rack_init.rs b/sled-agent/types/src/rack_init.rs new file mode 100644 index 0000000000..8fcf3c93fd --- /dev/null +++ b/sled-agent/types/src/rack_init.rs @@ -0,0 +1,732 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Rack initialization types. 
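The module below validates requests at the deserialization boundary: each public type is paired with an unvalidated "shadow" struct and routed through `#[serde(try_from = "...")]`. A minimal sketch of that pattern with a toy `Example` type (not the real request):

use anyhow::bail;
use serde::Deserialize;
use std::net::IpAddr;

// Unvalidated "shadow" copy: same fields, no checks.
#[derive(Clone, Deserialize)]
struct UnvalidatedExample {
    dns_ips: Vec<IpAddr>,
}

// Public type: serde deserializes the shadow copy first and then runs
// `TryFrom`, so invalid input never produces an `Example`.
#[derive(Clone, Debug, Deserialize)]
#[serde(try_from = "UnvalidatedExample")]
struct Example {
    dns_ips: Vec<IpAddr>,
}

impl TryFrom<UnvalidatedExample> for Example {
    type Error = anyhow::Error;

    fn try_from(value: UnvalidatedExample) -> anyhow::Result<Self> {
        if value.dns_ips.is_empty() {
            bail!("at least one DNS IP is required");
        }
        Ok(Example { dns_ips: value.dns_ips })
    }
}

With this shape, `toml::from_str::<Example>("dns_ips = []")` returns the `TryFrom` error instead of yielding an unvalidated value.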
+ +use std::{ + collections::BTreeSet, + net::{IpAddr, Ipv6Addr}, +}; + +use anyhow::{bail, Result}; +use camino::{Utf8Path, Utf8PathBuf}; +use omicron_common::{ + address::{ + get_64_subnet, IpRange, Ipv6Subnet, AZ_PREFIX, RACK_PREFIX, SLED_PREFIX, + }, + api::{external::AllowedSourceIps, internal::shared::RackNetworkConfig}, +}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use sled_hardware_types::Baseboard; + +pub type Certificate = nexus_client::types::Certificate; +pub type RecoverySiloConfig = nexus_client::types::RecoverySiloConfig; + +/// Structures and routines used to maintain backwards compatibility. The +/// contents of this module should only be used to convert older data into the +/// current format, and not for any ongoing run-time operations. +pub mod back_compat { + use crate::early_networking::back_compat::RackNetworkConfigV1; + + use super::*; + + #[derive(Clone, Deserialize)] + struct UnvalidatedRackInitializeRequestV1 { + trust_quorum_peers: Option>, + bootstrap_discovery: BootstrapAddressDiscovery, + ntp_servers: Vec, + dns_servers: Vec, + internal_services_ip_pool_ranges: Vec, + external_dns_ips: Vec, + external_dns_zone_name: String, + external_certificates: Vec, + recovery_silo: RecoverySiloConfig, + rack_network_config: RackNetworkConfigV1, + #[serde(default = "default_allowed_source_ips")] + allowed_source_ips: AllowedSourceIps, + } + + /// This is a deprecated format, maintained to allow importing from older + /// versions. + #[derive(Clone, Debug, PartialEq, Deserialize, Serialize, JsonSchema)] + #[serde(try_from = "UnvalidatedRackInitializeRequestV1")] + pub struct RackInitializeRequestV1 { + pub trust_quorum_peers: Option>, + pub bootstrap_discovery: BootstrapAddressDiscovery, + pub ntp_servers: Vec, + pub dns_servers: Vec, + pub internal_services_ip_pool_ranges: Vec, + pub external_dns_ips: Vec, + pub external_dns_zone_name: String, + pub external_certificates: Vec, + pub recovery_silo: RecoverySiloConfig, + pub rack_network_config: RackNetworkConfigV1, + #[serde(default = "default_allowed_source_ips")] + pub allowed_source_ips: AllowedSourceIps, + } + + impl TryFrom for RackInitializeRequestV1 { + type Error = anyhow::Error; + + fn try_from(value: UnvalidatedRackInitializeRequestV1) -> Result { + validate_external_dns( + &value.external_dns_ips, + &value.internal_services_ip_pool_ranges, + )?; + + Ok(RackInitializeRequestV1 { + trust_quorum_peers: value.trust_quorum_peers, + bootstrap_discovery: value.bootstrap_discovery, + ntp_servers: value.ntp_servers, + dns_servers: value.dns_servers, + internal_services_ip_pool_ranges: value + .internal_services_ip_pool_ranges, + external_dns_ips: value.external_dns_ips, + external_dns_zone_name: value.external_dns_zone_name, + external_certificates: value.external_certificates, + recovery_silo: value.recovery_silo, + rack_network_config: value.rack_network_config, + allowed_source_ips: value.allowed_source_ips, + }) + } + } + impl From for RackInitializeRequest { + fn from(v1: RackInitializeRequestV1) -> Self { + RackInitializeRequest { + trust_quorum_peers: v1.trust_quorum_peers, + bootstrap_discovery: v1.bootstrap_discovery, + ntp_servers: v1.ntp_servers, + dns_servers: v1.dns_servers, + internal_services_ip_pool_ranges: v1 + .internal_services_ip_pool_ranges, + external_dns_ips: v1.external_dns_ips, + external_dns_zone_name: v1.external_dns_zone_name, + external_certificates: v1.external_certificates, + recovery_silo: v1.recovery_silo, + rack_network_config: v1.rack_network_config.into(), + 
allowed_source_ips: v1.allowed_source_ips, + } + } + } +} + +// "Shadow" copy of `RackInitializeRequest` that does no validation on its +// fields. +#[derive(Clone, Deserialize)] +struct UnvalidatedRackInitializeRequest { + trust_quorum_peers: Option>, + bootstrap_discovery: BootstrapAddressDiscovery, + ntp_servers: Vec, + dns_servers: Vec, + internal_services_ip_pool_ranges: Vec, + external_dns_ips: Vec, + external_dns_zone_name: String, + external_certificates: Vec, + recovery_silo: RecoverySiloConfig, + rack_network_config: RackNetworkConfig, + #[serde(default = "default_allowed_source_ips")] + allowed_source_ips: AllowedSourceIps, +} + +fn validate_external_dns( + dns_ips: &Vec, + internal_ranges: &Vec, +) -> Result<()> { + if dns_ips.is_empty() { + bail!("At least one external DNS IP is required"); + } + + // Every external DNS IP should also be present in one of the internal + // services IP pool ranges. This check is O(N*M), but we expect both N + // and M to be small (~5 DNS servers, and a small number of pools). + for &dns_ip in dns_ips { + if !internal_ranges.iter().any(|range| range.contains(dns_ip)) { + bail!( + "External DNS IP {dns_ip} is not contained in \ + `internal_services_ip_pool_ranges`" + ); + } + } + Ok(()) +} + +impl TryFrom for RackInitializeRequest { + type Error = anyhow::Error; + + fn try_from(value: UnvalidatedRackInitializeRequest) -> Result { + validate_external_dns( + &value.external_dns_ips, + &value.internal_services_ip_pool_ranges, + )?; + + Ok(RackInitializeRequest { + trust_quorum_peers: value.trust_quorum_peers, + bootstrap_discovery: value.bootstrap_discovery, + ntp_servers: value.ntp_servers, + dns_servers: value.dns_servers, + internal_services_ip_pool_ranges: value + .internal_services_ip_pool_ranges, + external_dns_ips: value.external_dns_ips, + external_dns_zone_name: value.external_dns_zone_name, + external_certificates: value.external_certificates, + recovery_silo: value.recovery_silo, + rack_network_config: value.rack_network_config, + allowed_source_ips: value.allowed_source_ips, + }) + } +} + +/// Configuration for the "rack setup service". +/// +/// The Rack Setup Service should be responsible for one-time setup actions, +/// such as CockroachDB placement and initialization. Without operator +/// intervention, however, these actions need a way to be automated in our +/// deployment. +#[derive(Clone, Deserialize, Serialize, PartialEq, JsonSchema)] +#[serde(try_from = "UnvalidatedRackInitializeRequest")] +pub struct RackInitializeRequest { + /// The set of peer_ids required to initialize trust quorum + /// + /// The value is `None` if we are not using trust quorum + pub trust_quorum_peers: Option>, + + /// Describes how bootstrap addresses should be collected during RSS. + pub bootstrap_discovery: BootstrapAddressDiscovery, + + /// The external NTP server addresses. + pub ntp_servers: Vec, + + /// The external DNS server addresses. + pub dns_servers: Vec, + + /// Ranges of the service IP pool which may be used for internal services. + // TODO(https://github.com/oxidecomputer/omicron/issues/1530): Eventually, + // we want to configure multiple pools. + pub internal_services_ip_pool_ranges: Vec, + + /// Service IP addresses on which we run external DNS servers. + /// + /// Each address must be present in `internal_services_ip_pool_ranges`. 
+ pub external_dns_ips: Vec, + + /// DNS name for the DNS zone delegated to the rack for external DNS + pub external_dns_zone_name: String, + + /// initial TLS certificates for the external API + pub external_certificates: Vec, + + /// Configuration of the Recovery Silo (the initial Silo) + pub recovery_silo: RecoverySiloConfig, + + /// Initial rack network configuration + pub rack_network_config: RackNetworkConfig, + + /// IPs or subnets allowed to make requests to user-facing services + #[serde(default = "default_allowed_source_ips")] + pub allowed_source_ips: AllowedSourceIps, +} + +impl RackInitializeRequest { + pub fn from_file>( + path: P, + ) -> Result { + let path = path.as_ref(); + let contents = std::fs::read_to_string(&path).map_err(|err| { + RackInitializeRequestParseError::Io { path: path.into(), err } + })?; + let mut raw_config = + Self::from_toml_with_fallback(&contents).map_err(|err| { + RackInitializeRequestParseError::Deserialize { + path: path.into(), + err, + } + })?; + + // In the same way that sled-agent itself (our caller) discovers the + // optional config-rss.toml in a well-known path relative to its config + // file, we look for a pair of well-known paths adjacent to + // config-rss.toml that specify an extra TLS certificate and private + // key. This is used by the end-to-end tests. Any developer can also + // use this to inject a TLS certificate into their setup. + // (config-rss.toml is only used for dev/test, not production + // deployments, which will always get their RSS configuration from + // Wicket.) + if let Some(parent) = path.parent() { + let cert_path = parent.join("initial-tls-cert.pem"); + let key_path = parent.join("initial-tls-key.pem"); + let cert_bytes = std::fs::read_to_string(&cert_path); + let key_bytes = std::fs::read_to_string(&key_path); + match (cert_bytes, key_bytes) { + (Ok(cert), Ok(key)) => { + raw_config + .external_certificates + .push(Certificate { key, cert }); + } + (Err(cert_error), Err(key_error)) + if cert_error.kind() == std::io::ErrorKind::NotFound + && key_error.kind() == std::io::ErrorKind::NotFound => + { + // Fine. No extra cert was provided. + } + (Err(cert_error), _) => { + return Err(RackInitializeRequestParseError::Certificate( + anyhow::Error::new(cert_error).context(format!( + "loading certificate from {:?}", + cert_path + )), + )); + } + (_, Err(key_error)) => { + return Err(RackInitializeRequestParseError::Certificate( + anyhow::Error::new(key_error).context(format!( + "loading private key from {:?}", + key_path + )), + )); + } + }; + } + + Ok(raw_config) + } + + pub fn from_toml_with_fallback( + data: &str, + ) -> Result { + let v2_err = match toml::from_str::(&data) { + Ok(req) => return Ok(req), + Err(e) => e, + }; + if let Ok(v1) = + toml::from_str::(&data) + { + return Ok(v1.into()); + } + + // If we fail to parse the request as any known version, we return the + // error corresponding to the parse failure of the newest schema. + Err(v2_err.into()) + } + + /// Return a configuration suitable for testing. + pub fn test_config() -> Self { + // Use env! rather than std::env::var because this might be called from + // a dependent crate. 
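For dev/test setups, the adjacent-certificate convention handled by `from_file` above can be exercised by generating a self-signed pair next to `config-rss.toml`. A sketch along the lines of the existing `test_extra_certs` test, using the same `rcgen` calls that appear in this patch; the helper name `write_dev_tls_pair` is illustrative only:

use anyhow::Context;
use camino::Utf8Path;

/// Write `initial-tls-cert.pem` and `initial-tls-key.pem` next to an
/// existing `config-rss.toml` so that `from_file` picks them up.
fn write_dev_tls_pair(rss_dir: &Utf8Path, domain: &str) -> anyhow::Result<()> {
    let cert = rcgen::generate_simple_self_signed(vec![domain.to_string()])
        .context("generating self-signed certificate")?;
    std::fs::write(
        rss_dir.join("initial-tls-cert.pem"),
        cert.serialize_pem().context("serializing certificate")?,
    )?;
    std::fs::write(
        rss_dir.join("initial-tls-key.pem"),
        cert.serialize_private_key_pem(),
    )?;
    Ok(())
}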
+ let manifest_dir = Utf8Path::new(env!("CARGO_MANIFEST_DIR")); + let path = manifest_dir + .join("../../smf/sled-agent/non-gimlet/config-rss.toml"); + let contents = std::fs::read_to_string(&path).unwrap(); + toml::from_str(&contents) + .unwrap_or_else(|e| panic!("failed to parse {:?}: {}", &path, e)) + } + + pub fn az_subnet(&self) -> Ipv6Subnet { + Ipv6Subnet::::new( + self.rack_network_config.rack_subnet.addr(), + ) + } + + /// Returns the subnet for our rack. + pub fn rack_subnet(&self) -> Ipv6Subnet { + Ipv6Subnet::::new( + self.rack_network_config.rack_subnet.addr(), + ) + } + + /// Returns the subnet for the `index`-th sled in the rack. + pub fn sled_subnet(&self, index: u8) -> Ipv6Subnet { + get_64_subnet(self.rack_subnet(), index) + } +} + +#[derive(Debug, thiserror::Error)] +pub enum RackInitializeRequestParseError { + #[error("Failed to read config from {path}: {err}")] + Io { + path: Utf8PathBuf, + #[source] + err: std::io::Error, + }, + #[error("Failed to deserialize config from {path}: {err}")] + Deserialize { + path: Utf8PathBuf, + #[source] + err: anyhow::Error, + }, + #[error("Loading certificate: {0}")] + Certificate(#[source] anyhow::Error), +} + +/// This field was added after several racks were already deployed. RSS plans +/// for those racks should default to allowing any source IP, since that is +/// effectively what they did. +const fn default_allowed_source_ips() -> AllowedSourceIps { + AllowedSourceIps::Any +} + +// This custom debug implementation hides the private keys. +impl std::fmt::Debug for RackInitializeRequest { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // If you find a compiler error here, and you just added a field to this + // struct, be sure to add it to the Debug impl below! + let RackInitializeRequest { + trust_quorum_peers: trust_qurorum_peers, + bootstrap_discovery, + ntp_servers, + dns_servers, + internal_services_ip_pool_ranges, + external_dns_ips, + external_dns_zone_name, + external_certificates: _, + recovery_silo, + rack_network_config, + allowed_source_ips, + } = &self; + + f.debug_struct("RackInitializeRequest") + .field("trust_quorum_peers", trust_qurorum_peers) + .field("bootstrap_discovery", bootstrap_discovery) + .field("ntp_servers", ntp_servers) + .field("dns_servers", dns_servers) + .field( + "internal_services_ip_pool_ranges", + internal_services_ip_pool_ranges, + ) + .field("external_dns_ips", external_dns_ips) + .field("external_dns_zone_name", external_dns_zone_name) + .field("external_certificates", &"") + .field("recovery_silo", recovery_silo) + .field("rack_network_config", rack_network_config) + .field("allowed_source_ips", allowed_source_ips) + .finish() + } +} + +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, JsonSchema)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum BootstrapAddressDiscovery { + /// Ignore all bootstrap addresses except our own. + OnlyOurs, + /// Ignore all bootstrap addresses except the following. 
+ OnlyThese { addrs: BTreeSet }, +} + +#[cfg(test)] +mod tests { + use std::net::Ipv4Addr; + use std::net::Ipv6Addr; + + use super::*; + use anyhow::Context; + use oxnet::Ipv6Net; + + #[test] + fn parse_rack_initialization() { + let manifest = std::env::var("CARGO_MANIFEST_DIR") + .expect("Cannot access manifest directory"); + let manifest = Utf8PathBuf::from(manifest); + + let path = + manifest.join("../../smf/sled-agent/non-gimlet/config-rss.toml"); + let contents = std::fs::read_to_string(&path).unwrap(); + let _: RackInitializeRequest = toml::from_str(&contents) + .unwrap_or_else(|e| panic!("failed to parse {:?}: {}", &path, e)); + + let path = manifest + .join("../../smf/sled-agent/gimlet-standalone/config-rss.toml"); + let contents = std::fs::read_to_string(&path).unwrap(); + let _: RackInitializeRequest = toml::from_str(&contents) + .unwrap_or_else(|e| panic!("failed to parse {:?}: {}", &path, e)); + } + + #[test] + fn parse_rack_initialization_weak_hash() { + let config = r#" + bootstrap_discovery.type = "only_ours" + ntp_servers = [ "ntp.eng.oxide.computer" ] + dns_servers = [ "1.1.1.1", "9.9.9.9" ] + external_dns_zone_name = "oxide.test" + + [[internal_services_ip_pool_ranges]] + first = "192.168.1.20" + last = "192.168.1.22" + + [recovery_silo] + silo_name = "recovery" + user_name = "recovery" + user_password_hash = "$argon2i$v=19$m=16,t=2,p=1$NVR0a2QxVXNiQjlObFJXbA$iGFJWOlUqN20B8KR4Fsmrg" + "#; + + let error = toml::from_str::(config) + .expect_err("unexpectedly parsed with bad password hash"); + println!("found error: {}", error); + assert!(error.to_string().contains( + "password hash: algorithm: expected argon2id, found argon2i" + )); + } + + #[test] + fn validate_external_dns_ips_must_be_in_internal_services_ip_pools() { + // Conjure up a config; we'll tweak the internal services pools and + // external DNS IPs, but no other fields matter. + let mut config = UnvalidatedRackInitializeRequest { + trust_quorum_peers: None, + bootstrap_discovery: BootstrapAddressDiscovery::OnlyOurs, + ntp_servers: Vec::new(), + dns_servers: Vec::new(), + internal_services_ip_pool_ranges: Vec::new(), + external_dns_ips: Vec::new(), + external_dns_zone_name: "".to_string(), + external_certificates: Vec::new(), + recovery_silo: RecoverySiloConfig { + silo_name: "recovery".parse().unwrap(), + user_name: "recovery".parse().unwrap(), + user_password_hash: "$argon2id$v=19$m=98304,t=13,p=1$RUlWc0ZxaHo0WFdrN0N6ZQ$S8p52j85GPvMhR/ek3GL0el/oProgTwWpHJZ8lsQQoY".parse().unwrap(), + }, + rack_network_config: RackNetworkConfig { + rack_subnet: Ipv6Net::host_net(Ipv6Addr::LOCALHOST), + infra_ip_first: Ipv4Addr::LOCALHOST, + infra_ip_last: Ipv4Addr::LOCALHOST, + ports: Vec::new(), + bgp: Vec::new(), + bfd: Vec::new(), + }, + allowed_source_ips: AllowedSourceIps::Any, + }; + + // Valid configs: all external DNS IPs are contained in the IP pool + // ranges. 
+ for (ip_pool_ranges, dns_ips) in [ + ( + &[("fd00::1", "fd00::10")] as &[(&str, &str)], + &["fd00::1", "fd00::5", "fd00::10"] as &[&str], + ), + ( + &[("192.168.1.10", "192.168.1.20")], + &["192.168.1.10", "192.168.1.15", "192.168.1.20"], + ), + ( + &[("fd00::1", "fd00::10"), ("192.168.1.10", "192.168.1.20")], + &[ + "fd00::1", + "fd00::5", + "fd00::10", + "192.168.1.10", + "192.168.1.15", + "192.168.1.20", + ], + ), + ] { + config.internal_services_ip_pool_ranges = ip_pool_ranges + .iter() + .map(|(a, b)| { + IpRange::try_from(( + a.parse::().unwrap(), + b.parse::().unwrap(), + )) + .unwrap() + }) + .collect(); + config.external_dns_ips = + dns_ips.iter().map(|ip| ip.parse().unwrap()).collect(); + + match RackInitializeRequest::try_from(config.clone()) { + Ok(_) => (), + Err(err) => panic!( + "failure on {ip_pool_ranges:?} with DNS IPs {dns_ips:?}: \ + {err}" + ), + } + } + + // Invalid configs: either no DNS IPs, or one or more DNS IPs are not + // contained in the ip pool ranges. + for (ip_pool_ranges, dns_ips) in [ + (&[("fd00::1", "fd00::10")] as &[(&str, &str)], &[] as &[&str]), + (&[("fd00::1", "fd00::10")], &["fd00::1", "fd00::5", "fd00::11"]), + ( + &[("192.168.1.10", "192.168.1.20")], + &["192.168.1.9", "192.168.1.15", "192.168.1.20"], + ), + ( + &[("fd00::1", "fd00::10"), ("192.168.1.10", "192.168.1.20")], + &[ + "fd00::1", + "fd00::5", + "fd00::10", + "192.168.1.10", + "192.168.1.15", + "192.168.1.20", + "192.168.1.21", + ], + ), + ] { + config.internal_services_ip_pool_ranges = ip_pool_ranges + .iter() + .map(|(a, b)| { + IpRange::try_from(( + a.parse::().unwrap(), + b.parse::().unwrap(), + )) + .unwrap() + }) + .collect(); + config.external_dns_ips = + dns_ips.iter().map(|ip| ip.parse().unwrap()).collect(); + + match RackInitializeRequest::try_from(config.clone()) { + Ok(_) => panic!( + "unexpected success on {ip_pool_ranges:?} with \ + DNS IPs {dns_ips:?}" + ), + Err(_) => (), + } + } + } + + #[test] + fn test_subnets() { + let cfg = RackInitializeRequest { + trust_quorum_peers: None, + bootstrap_discovery: BootstrapAddressDiscovery::OnlyOurs, + ntp_servers: vec![String::from("test.pool.example.com")], + dns_servers: vec!["1.1.1.1".parse().unwrap()], + external_dns_zone_name: String::from("oxide.test"), + internal_services_ip_pool_ranges: vec![IpRange::from(IpAddr::V4( + Ipv4Addr::new(129, 168, 1, 20), + ))], + external_dns_ips: vec![], + external_certificates: vec![], + recovery_silo: RecoverySiloConfig { + silo_name: "test-silo".parse().unwrap(), + user_name: "dummy".parse().unwrap(), + // This is a hash for the password "oxide". It doesn't matter, + // though; it's not used. 
+ user_password_hash: "$argon2id$v=19$m=98304,t=13,p=1$\ + RUlWc0ZxaHo0WFdrN0N6ZQ$S8p52j85GPvMhR/\ + ek3GL0el/oProgTwWpHJZ8lsQQoY" + .parse() + .unwrap(), + }, + rack_network_config: RackNetworkConfig { + rack_subnet: Ipv6Net::new( + "fd00:1122:3344:0100::".parse().unwrap(), + RACK_PREFIX, + ) + .unwrap(), + infra_ip_first: Ipv4Addr::LOCALHOST, + infra_ip_last: Ipv4Addr::LOCALHOST, + ports: Vec::new(), + bgp: Vec::new(), + bfd: Vec::new(), + }, + allowed_source_ips: AllowedSourceIps::Any, + }; + + assert_eq!( + Ipv6Subnet::::new( + // Masked out in AZ Subnet + // vv + "fd00:1122:3344:0000::".parse::().unwrap(), + ), + cfg.az_subnet() + ); + assert_eq!( + Ipv6Subnet::::new( + // Shows up from Rack Subnet + // vv + "fd00:1122:3344:0100::".parse::().unwrap(), + ), + cfg.rack_subnet() + ); + assert_eq!( + Ipv6Subnet::::new( + // 0th Sled Subnet + // vv + "fd00:1122:3344:0100::".parse::().unwrap(), + ), + cfg.sled_subnet(0) + ); + assert_eq!( + Ipv6Subnet::::new( + // 1st Sled Subnet + // vv + "fd00:1122:3344:0101::".parse::().unwrap(), + ), + cfg.sled_subnet(1) + ); + assert_eq!( + Ipv6Subnet::::new( + // Last Sled Subnet + // vv + "fd00:1122:3344:01ff::".parse::().unwrap(), + ), + cfg.sled_subnet(255) + ); + } + + #[test] + fn test_extra_certs() { + // The stock non-Gimlet config has no TLS certificates. + let path = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("../../smf/sled-agent/non-gimlet/config-rss.toml"); + let cfg = RackInitializeRequest::from_file(&path) + .unwrap_or_else(|e| panic!("failed to parse {:?}: {}", &path, e)); + assert!(cfg.external_certificates.is_empty()); + + // Now let's create a configuration that does have an adjacent + // certificate and key. + let tempdir = + camino_tempfile::tempdir().expect("creating temporary directory"); + println!("using temp path: {:?}", tempdir); + + // Generate the certificate. + let domain = format!( + "{}.sys.{}", + cfg.external_dns_zone_name, + cfg.recovery_silo.silo_name.as_str(), + ); + let cert = rcgen::generate_simple_self_signed(vec![domain.clone()]) + .unwrap_or_else(|error| { + panic!( + "generating certificate for domain {:?}: {}", + domain, error + ) + }); + + // Write the configuration file. + let cfg_path = tempdir.path().join("config-rss.toml"); + let _ = std::fs::copy(&path, &cfg_path) + .with_context(|| { + format!("failed to copy file {:?} to {:?}", &path, &cfg_path) + }) + .unwrap(); + + // Write the certificate. + let cert_bytes = cert + .serialize_pem() + .expect("serializing generated certificate") + .into_bytes(); + let cert_path = tempdir.path().join("initial-tls-cert.pem"); + std::fs::write(&cert_path, &cert_bytes) + .with_context(|| format!("failed to write to {:?}", &cert_path)) + .unwrap(); + + // Write the private key. + let key_path = tempdir.path().join("initial-tls-key.pem"); + let key_bytes = cert.serialize_private_key_pem().into_bytes(); + std::fs::write(&key_path, &key_bytes) + .with_context(|| format!("failed to write to {:?}", &key_path)) + .unwrap(); + + // Now try to load it all. 
+ let read_cfg = RackInitializeRequest::from_file(&cfg_path)
+ .expect("failed to read generated config with certificate");
+ assert_eq!(read_cfg.external_certificates.len(), 1);
+ let cert = read_cfg.external_certificates.first().unwrap();
+ let _ = rcgen::KeyPair::from_pem(&cert.key)
+ .expect("generated PEM did not parse as KeyPair");
+ }
+}
diff --git a/sled-agent/types/src/rack_ops.rs b/sled-agent/types/src/rack_ops.rs
new file mode 100644
index 0000000000..d8c0fa1c88
--- /dev/null
+++ b/sled-agent/types/src/rack_ops.rs
@@ -0,0 +1,46 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+use omicron_uuid_kinds::{RackInitUuid, RackResetUuid};
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+
+/// Current status of any rack-level operation being performed by this bootstrap
+/// agent.
+#[derive(
+ Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema,
+)]
+#[serde(tag = "status", rename_all = "snake_case")]
+pub enum RackOperationStatus {
+ Initializing {
+ id: RackInitUuid,
+ },
+ /// `id` will be None if the rack was already initialized on startup.
+ Initialized {
+ id: Option<RackInitUuid>,
+ },
+ InitializationFailed {
+ id: RackInitUuid,
+ message: String,
+ },
+ InitializationPanicked {
+ id: RackInitUuid,
+ },
+ Resetting {
+ id: RackResetUuid,
+ },
+ /// `reset_id` will be None if the rack is in an uninitialized-on-startup
+ /// state, or Some if it is in an uninitialized state due to a reset
+ /// operation completing.
+ Uninitialized {
+ reset_id: Option<RackResetUuid>,
+ },
+ ResetFailed {
+ id: RackResetUuid,
+ message: String,
+ },
+ ResetPanicked {
+ id: RackResetUuid,
+ },
+}
From 46b1d3731d73500a50b73a611192609d487745eb Mon Sep 17 00:00:00 2001
From: Rain
Date: Fri, 19 Jul 2024 21:42:54 -0700
Subject: [PATCH 18/21] [cockroach-admin] turn API into a trait (#6129)

Move the more complex types into a shared cockroach-admin-types crate,
and the simple wrappers into the cockroach-admin-api crate.
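For readers following the conversion, the dropshot trait-based pattern this
patch adopts looks roughly like the sketch below. The `PingApi` trait, `/ping`
endpoint, `ping_api_mod` module name, and `PingImpl` type are hypothetical
stand-ins for illustration only; the real trait and its implementation are in
cockroach-admin/api/src/lib.rs and cockroach-admin/src/http_entrypoints.rs
further down.

    use dropshot::{ApiDescription, HttpError, HttpResponseOk, RequestContext};

    // The API shape lives in its own crate as a trait; the attribute macro
    // generates a `ping_api_mod` module with `api_description` and
    // `stub_api_description` constructors.
    #[dropshot::api_description {
        module = "ping_api_mod",
    }]
    pub trait PingApi {
        /// Shared server state handed to every endpoint.
        type Context;

        /// Report liveness.
        #[endpoint {
            method = GET,
            path = "/ping",
        }]
        async fn ping(
            rqctx: RequestContext<Self::Context>,
        ) -> Result<HttpResponseOk<String>, HttpError>;
    }

    // The server crate supplies the implementation as an uninhabited marker
    // type; it is only used to parameterize `api_description`.
    enum PingImpl {}

    impl PingApi for PingImpl {
        type Context = ();

        async fn ping(
            _rqctx: RequestContext<Self::Context>,
        ) -> Result<HttpResponseOk<String>, HttpError> {
            Ok(HttpResponseOk("pong".to_string()))
        }
    }

    // The running server builds its ApiDescription from the implementation...
    pub fn api() -> ApiDescription<()> {
        ping_api_mod::api_description::<PingImpl>()
            .expect("registered entrypoints")
    }

    // ...while openapi-manager can generate the OpenAPI document from
    // `ping_api_mod::stub_api_description()` with no implementation at all.

That split is what allows the rest of this patch to delete the binary's
`openapi` subcommand and its expectorate test and to register the spec with
openapi-manager instead.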
--- Cargo.lock | 30 ++ Cargo.toml | 6 + cockroach-admin/Cargo.toml | 2 + cockroach-admin/api/Cargo.toml | 17 + cockroach-admin/api/src/lib.rs | 76 +++ cockroach-admin/src/bin/cockroach-admin.rs | 5 - cockroach-admin/src/cockroach_cli.rs | 473 +---------------- cockroach-admin/src/http_entrypoints.rs | 125 ++--- cockroach-admin/src/lib.rs | 15 - .../tests/integration_tests/commands.rs | 43 -- .../tests/integration_tests/mod.rs | 5 - cockroach-admin/tests/mod.rs | 5 - .../output/cmd-cockroach-admin-openapi-stderr | 0 cockroach-admin/types/Cargo.toml | 20 + cockroach-admin/types/src/lib.rs | 477 ++++++++++++++++++ dev-tools/openapi-manager/Cargo.toml | 1 + dev-tools/openapi-manager/src/spec.rs | 11 + openapi/cockroach-admin.json | 8 +- 18 files changed, 682 insertions(+), 637 deletions(-) create mode 100644 cockroach-admin/api/Cargo.toml create mode 100644 cockroach-admin/api/src/lib.rs delete mode 100644 cockroach-admin/tests/integration_tests/commands.rs delete mode 100644 cockroach-admin/tests/integration_tests/mod.rs delete mode 100644 cockroach-admin/tests/mod.rs delete mode 100644 cockroach-admin/tests/output/cmd-cockroach-admin-openapi-stderr create mode 100644 cockroach-admin/types/Cargo.toml create mode 100644 cockroach-admin/types/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 67345447c7..79dac1c05e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1073,6 +1073,19 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67ba02a97a2bd10f4b59b25c7973101c79642302776489e030cd13cdab09ed15" +[[package]] +name = "cockroach-admin-api" +version = "0.1.0" +dependencies = [ + "cockroach-admin-types", + "dropshot", + "omicron-common", + "omicron-uuid-kinds", + "omicron-workspace-hack", + "schemars", + "serde", +] + [[package]] name = "cockroach-admin-client" version = "0.1.0" @@ -1087,6 +1100,20 @@ dependencies = [ "slog", ] +[[package]] +name = "cockroach-admin-types" +version = "0.1.0" +dependencies = [ + "chrono", + "csv", + "omicron-common", + "omicron-workspace-hack", + "proptest", + "schemars", + "serde", + "test-strategy", +] + [[package]] name = "colorchoice" version = "1.0.1" @@ -5309,6 +5336,8 @@ dependencies = [ "camino", "chrono", "clap", + "cockroach-admin-api", + "cockroach-admin-types", "csv", "dropshot", "expectorate", @@ -6109,6 +6138,7 @@ dependencies = [ "atomicwrites", "camino", "clap", + "cockroach-admin-api", "dns-server-api", "dropshot", "fs-err", diff --git a/Cargo.toml b/Cargo.toml index ba68cc9cac..9ad1d585d3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,8 @@ members = [ "clients/sled-agent-client", "clients/wicketd-client", "cockroach-admin", + "cockroach-admin/api", + "cockroach-admin/types", "common", "dev-tools/crdb-seed", "dev-tools/omdb", @@ -111,6 +113,8 @@ default-members = [ "clients/sled-agent-client", "clients/wicketd-client", "cockroach-admin", + "cockroach-admin/api", + "cockroach-admin/types", "common", "dev-tools/crdb-seed", "dev-tools/omdb", @@ -265,7 +269,9 @@ ciborium = "0.2.2" cfg-if = "1.0" chrono = { version = "0.4", features = [ "serde" ] } clap = { version = "4.5", features = ["cargo", "derive", "env", "wrap_help"] } +cockroach-admin-api = { path = "cockroach-admin/api" } cockroach-admin-client = { path = "clients/cockroach-admin-client" } +cockroach-admin-types = { path = "cockroach-admin/types" } colored = "2.1" const_format = "0.2.32" cookie = "0.18" diff --git a/cockroach-admin/Cargo.toml b/cockroach-admin/Cargo.toml index 07f9807463..1738fd98e5 100644 --- a/cockroach-admin/Cargo.toml +++ 
b/cockroach-admin/Cargo.toml @@ -12,6 +12,8 @@ anyhow.workspace = true camino.workspace = true chrono.workspace = true clap.workspace = true +cockroach-admin-api.workspace = true +cockroach-admin-types.workspace = true csv.workspace = true dropshot.workspace = true http.workspace = true diff --git a/cockroach-admin/api/Cargo.toml b/cockroach-admin/api/Cargo.toml new file mode 100644 index 0000000000..f0434856d2 --- /dev/null +++ b/cockroach-admin/api/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "cockroach-admin-api" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +cockroach-admin-types.workspace = true +dropshot.workspace = true +omicron-common.workspace = true +omicron-uuid-kinds.workspace = true +omicron-workspace-hack.workspace = true +schemars.workspace = true +serde.workspace = true diff --git a/cockroach-admin/api/src/lib.rs b/cockroach-admin/api/src/lib.rs new file mode 100644 index 0000000000..192ff56f04 --- /dev/null +++ b/cockroach-admin/api/src/lib.rs @@ -0,0 +1,76 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use cockroach_admin_types::{NodeDecommission, NodeStatus}; +use dropshot::{HttpError, HttpResponseOk, RequestContext, TypedBody}; +use omicron_uuid_kinds::OmicronZoneUuid; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +#[dropshot::api_description { + module = "cockroach_admin_api_mod", +}] +pub trait CockroachAdminApi { + type Context; + + /// Get the status of all nodes in the CRDB cluster. + #[endpoint { + method = GET, + path = "/node/status", + }] + async fn node_status( + rqctx: RequestContext, + ) -> Result, HttpError>; + + /// Get the CockroachDB node ID of the local cockroach instance. + #[endpoint { + method = GET, + path = "/node/id", + }] + async fn local_node_id( + rqctx: RequestContext, + ) -> Result, HttpError>; + + /// Decommission a node from the CRDB cluster. + #[endpoint { + method = POST, + path = "/node/decommission", + }] + async fn node_decommission( + rqctx: RequestContext, + body: TypedBody, + ) -> Result, HttpError>; +} + +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +pub struct ClusterNodeStatus { + pub all_nodes: Vec, +} + +/// CockroachDB Node ID +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +pub struct LocalNodeId { + /// The ID of this Omicron zone. + /// + /// This is included to ensure correctness even if a socket address on a + /// sled is reused for a different zone; if our caller is trying to + /// determine the node ID for a particular Omicron CockroachDB zone, they'll + /// contact us by socket address. We include our zone ID in the response for + /// their confirmation that we are the zone they intended to contact. + pub zone_id: OmicronZoneUuid, + // CockroachDB node IDs are integers, in practice, but our use of them is as + // input and output to the `cockroach` CLI. We use a string which is a bit + // more natural (no need to parse CLI output or stringify an ID to send it + // as input) and leaves open the door for the format to change in the + // future. 
+ pub node_id: String, +} + +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +pub struct NodeId { + pub node_id: String, +} diff --git a/cockroach-admin/src/bin/cockroach-admin.rs b/cockroach-admin/src/bin/cockroach-admin.rs index 0399c8bbb0..ee6d8f4aa9 100644 --- a/cockroach-admin/src/bin/cockroach-admin.rs +++ b/cockroach-admin/src/bin/cockroach-admin.rs @@ -19,9 +19,6 @@ use std::net::SocketAddrV6; #[derive(Debug, Parser)] #[clap(name = "cockroach-admin", about = "Omicron CRDB cluster admin server")] enum Args { - /// Print the OpenAPI Spec document and exit - Openapi, - /// Start the CRDB admin server Run { /// Path to the `cockroach` CLI @@ -57,8 +54,6 @@ async fn main_impl() -> Result<(), CmdError> { let args = Args::parse(); match args { - Args::Openapi => omicron_cockroach_admin::run_openapi() - .map_err(|e| CmdError::Failure(anyhow!(e))), Args::Run { path_to_cockroach_binary, cockroach_address, diff --git a/cockroach-admin/src/cockroach_cli.rs b/cockroach-admin/src/cockroach_cli.rs index 1951866ce7..b812cf9749 100644 --- a/cockroach-admin/src/cockroach_cli.rs +++ b/cockroach-admin/src/cockroach_cli.rs @@ -3,20 +3,14 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use camino::Utf8PathBuf; -use chrono::DateTime; -use chrono::NaiveDateTime; -use chrono::Utc; +use cockroach_admin_types::NodeDecommission; +use cockroach_admin_types::NodeStatus; use dropshot::HttpError; use illumos_utils::output_to_exec_error; use illumos_utils::ExecutionError; -use schemars::JsonSchema; -use serde::de; -use serde::Deserialize; -use serde::Serialize; use slog_error_chain::InlineErrorChain; use slog_error_chain::SlogInlineError; use std::io; -use std::net::SocketAddr; use std::net::SocketAddrV6; use tokio::process::Command; @@ -139,463 +133,16 @@ impl CockroachCli { } } -#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] -#[serde(rename_all = "snake_case")] -pub struct NodeStatus { - pub node_id: String, - pub address: SocketAddr, - pub sql_address: SocketAddr, - pub build: String, - pub started_at: DateTime, - pub updated_at: DateTime, - pub locality: String, - pub is_available: bool, - pub is_live: bool, -} - -// Slightly different `NodeStatus` that matches what we get from `cockroach`: -// timestamps are a fixed format with no timezone (but are actually UTC), so we -// have a custom deserializer, and the ID column is `id` instead of `node_id`. 
-#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] -struct CliNodeStatus { - id: String, - address: SocketAddr, - sql_address: SocketAddr, - build: String, - #[serde(deserialize_with = "parse_cockroach_cli_timestamp")] - started_at: DateTime, - #[serde(deserialize_with = "parse_cockroach_cli_timestamp")] - updated_at: DateTime, - locality: String, - is_available: bool, - is_live: bool, -} - -impl From for NodeStatus { - fn from(cli: CliNodeStatus) -> Self { - Self { - node_id: cli.id, - address: cli.address, - sql_address: cli.sql_address, - build: cli.build, - started_at: cli.started_at, - updated_at: cli.updated_at, - locality: cli.locality, - is_available: cli.is_available, - is_live: cli.is_live, - } - } -} - -fn parse_cockroach_cli_timestamp<'de, D>( - d: D, -) -> Result, D::Error> -where - D: serde::Deserializer<'de>, -{ - struct CockroachTimestampVisitor; - impl<'de> de::Visitor<'de> for CockroachTimestampVisitor { - type Value = DateTime; - - fn expecting( - &self, - formatter: &mut std::fmt::Formatter, - ) -> std::fmt::Result { - formatter.write_str("a Cockroach CLI timestamp") - } - - fn visit_str(self, v: &str) -> Result - where - E: de::Error, - { - let dt = NaiveDateTime::parse_from_str(v, "%Y-%m-%d %H:%M:%S%.f") - .map_err(E::custom)?; - Ok(DateTime::from_naive_utc_and_offset(dt, Utc)) - } - } - - d.deserialize_str(CockroachTimestampVisitor) -} - -impl NodeStatus { - pub fn parse_from_csv(data: &[u8]) -> Result, csv::Error> { - let mut statuses = Vec::new(); - let mut reader = csv::Reader::from_reader(io::Cursor::new(data)); - for result in reader.deserialize() { - let record: CliNodeStatus = result?; - statuses.push(record.into()); - } - Ok(statuses) - } -} - -// The cockroach CLI and `crdb_internal.gossip_liveness` table use a string for -// node membership, but there are only three meaningful values per -// https://github.com/cockroachdb/cockroach/blob/0c92c710d2baadfdc5475be8d2238cf26cb152ca/pkg/kv/kvserver/liveness/livenesspb/liveness.go#L96, -// so we'll convert into a Rust enum and leave the "unknown" case for future -// changes that expand or reword these values. -#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] -#[serde(tag = "state", rename_all = "lowercase")] -pub enum NodeMembership { - Active, - Decommissioning, - Decommissioned, - Unknown { value: String }, -} - -#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] -#[serde(rename_all = "snake_case")] -pub struct NodeDecommission { - pub node_id: String, - pub is_live: bool, - pub replicas: i64, - pub is_decommissioning: bool, - pub membership: NodeMembership, - pub is_draining: bool, - pub notes: Vec, -} - -// Slightly different `NodeDecommission` that matches what we get from -// `cockroach`: this omites `notes`, which isn't really a CSV field at all, but -// is instead where we collect the non-CSV string output from the CLI, uses -// a custom deserializer for `membership` to handle unknown variants, and the ID -// column is `id` instead of `node_id`. 
-#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] -struct CliNodeDecommission { - pub id: String, - pub is_live: bool, - pub replicas: i64, - pub is_decommissioning: bool, - #[serde(deserialize_with = "parse_node_membership")] - pub membership: NodeMembership, - pub is_draining: bool, -} - -impl From<(CliNodeDecommission, Vec)> for NodeDecommission { - fn from((cli, notes): (CliNodeDecommission, Vec)) -> Self { - Self { - node_id: cli.id, - is_live: cli.is_live, - replicas: cli.replicas, - is_decommissioning: cli.is_decommissioning, - membership: cli.membership, - is_draining: cli.is_draining, - notes, - } - } -} - -fn parse_node_membership<'de, D>(d: D) -> Result -where - D: serde::Deserializer<'de>, -{ - struct CockroachNodeMembershipVisitor; - - impl<'de> de::Visitor<'de> for CockroachNodeMembershipVisitor { - type Value = NodeMembership; - - fn expecting( - &self, - formatter: &mut std::fmt::Formatter, - ) -> std::fmt::Result { - formatter.write_str("a Cockroach node membership string") - } - - fn visit_str(self, v: &str) -> Result - where - E: de::Error, - { - let membership = match v { - "active" => NodeMembership::Active, - "decommissioning" => NodeMembership::Decommissioning, - "decommissioned" => NodeMembership::Decommissioned, - _ => NodeMembership::Unknown { value: v.to_string() }, - }; - Ok(membership) - } - } - - d.deserialize_str(CockroachNodeMembershipVisitor) -} - -impl NodeDecommission { - pub fn parse_from_csv(data: &[u8]) -> Result { - // Reading the node decommission output is awkward because it isn't - // fully CSV. We expect a CSV header, then a row for each node being - // decommissioned, then (maybe) a blank line followed by a note that is - // just a string, not related to the initial CSV data. Even though the - // CLI supports decommissioning more than one node in one invocation, we - // only provide an API to decommission a single node, so we expect: - // - // 1. The CSV header line - // 2. The one row of CSV data - // 3. Trailing notes - // - // We'll collect the notes as a separate field and return them to our - // caller. - - // First we'll run the data through a csv::Reader; this will pull out - // the header row and the one row of data. - let mut reader = csv::Reader::from_reader(io::Cursor::new(data)); - let record: CliNodeDecommission = - reader.deserialize().next().ok_or_else(|| { - io::Error::other("fewer than two lines of output") - })??; - - // Get the position where the reader ended after that one row; we'll - // collect any remaining nonempty lines as `notes`. 
- let extra_data = &data[reader.position().byte() as usize..]; - let mut notes = Vec::new(); - for line in String::from_utf8_lossy(extra_data).lines() { - let line = line.trim(); - if !line.is_empty() { - notes.push(line.to_string()); - } - } - - Ok(Self::from((record, notes))) - } -} - #[cfg(test)] mod tests { + use std::net::SocketAddr; + use super::*; - use chrono::NaiveDate; + use cockroach_admin_types::NodeMembership; use nexus_test_utils::db::test_setup_database; use omicron_test_utils::dev; - use test_strategy::proptest; use url::Url; - #[test] - fn test_node_status_parse_single_line_from_csv() { - let input = br#"id,address,sql_address,build,started_at,updated_at,locality,is_available,is_live -1,[::1]:42021,[::1]:42021,v22.1.9,2024-05-21 15:19:50.523796,2024-05-21 16:31:28.050069,,true,true"#; - let expected = NodeStatus { - node_id: "1".to_string(), - address: "[::1]:42021".parse().unwrap(), - sql_address: "[::1]:42021".parse().unwrap(), - build: "v22.1.9".to_string(), - started_at: DateTime::from_naive_utc_and_offset( - NaiveDate::from_ymd_opt(2024, 5, 21) - .unwrap() - .and_hms_micro_opt(15, 19, 50, 523796) - .unwrap(), - Utc, - ), - updated_at: DateTime::from_naive_utc_and_offset( - NaiveDate::from_ymd_opt(2024, 5, 21) - .unwrap() - .and_hms_micro_opt(16, 31, 28, 50069) - .unwrap(), - Utc, - ), - locality: String::new(), - is_available: true, - is_live: true, - }; - - let statuses = NodeStatus::parse_from_csv(input).expect("parsed input"); - assert_eq!(statuses, vec![expected]); - } - - #[test] - fn test_node_status_parse_multiple_lines_from_csv() { - let input = br#"id,address,sql_address,build,started_at,updated_at,locality,is_available,is_live -1,[fd00:1122:3344:109::3]:32221,[fd00:1122:3344:109::3]:32221,v22.1.9-dirty,2024-05-18 19:18:00.597145,2024-05-21 15:22:34.290434,,true,true -2,[fd00:1122:3344:105::3]:32221,[fd00:1122:3344:105::3]:32221,v22.1.9-dirty,2024-05-18 19:17:01.796714,2024-05-21 15:22:34.901268,,true,true -3,[fd00:1122:3344:10b::3]:32221,[fd00:1122:3344:10b::3]:32221,v22.1.9-dirty,2024-05-18 19:18:52.37564,2024-05-21 15:22:36.341146,,true,true -4,[fd00:1122:3344:107::3]:32221,[fd00:1122:3344:107::3]:32221,v22.1.9-dirty,2024-05-18 19:16:22.788276,2024-05-21 15:22:34.897047,,true,true -5,[fd00:1122:3344:108::3]:32221,[fd00:1122:3344:108::3]:32221,v22.1.9-dirty,2024-05-18 19:18:09.196634,2024-05-21 15:22:35.168738,,true,true"#; - let expected = vec![ - NodeStatus { - node_id: "1".to_string(), - address: "[fd00:1122:3344:109::3]:32221".parse().unwrap(), - sql_address: "[fd00:1122:3344:109::3]:32221".parse().unwrap(), - build: "v22.1.9-dirty".to_string(), - started_at: DateTime::from_naive_utc_and_offset( - NaiveDate::from_ymd_opt(2024, 5, 18) - .unwrap() - .and_hms_micro_opt(19, 18, 0, 597145) - .unwrap(), - Utc, - ), - updated_at: DateTime::from_naive_utc_and_offset( - NaiveDate::from_ymd_opt(2024, 5, 21) - .unwrap() - .and_hms_micro_opt(15, 22, 34, 290434) - .unwrap(), - Utc, - ), - locality: String::new(), - is_available: true, - is_live: true, - }, - NodeStatus { - node_id: "2".to_string(), - address: "[fd00:1122:3344:105::3]:32221".parse().unwrap(), - sql_address: "[fd00:1122:3344:105::3]:32221".parse().unwrap(), - build: "v22.1.9-dirty".to_string(), - started_at: DateTime::from_naive_utc_and_offset( - NaiveDate::from_ymd_opt(2024, 5, 18) - .unwrap() - .and_hms_micro_opt(19, 17, 1, 796714) - .unwrap(), - Utc, - ), - updated_at: DateTime::from_naive_utc_and_offset( - NaiveDate::from_ymd_opt(2024, 5, 21) - .unwrap() - .and_hms_micro_opt(15, 22, 34, 901268) - 
.unwrap(), - Utc, - ), - locality: String::new(), - is_available: true, - is_live: true, - }, - NodeStatus { - node_id: "3".to_string(), - address: "[fd00:1122:3344:10b::3]:32221".parse().unwrap(), - sql_address: "[fd00:1122:3344:10b::3]:32221".parse().unwrap(), - build: "v22.1.9-dirty".to_string(), - started_at: DateTime::from_naive_utc_and_offset( - NaiveDate::from_ymd_opt(2024, 5, 18) - .unwrap() - .and_hms_micro_opt(19, 18, 52, 375640) - .unwrap(), - Utc, - ), - updated_at: DateTime::from_naive_utc_and_offset( - NaiveDate::from_ymd_opt(2024, 5, 21) - .unwrap() - .and_hms_micro_opt(15, 22, 36, 341146) - .unwrap(), - Utc, - ), - locality: String::new(), - is_available: true, - is_live: true, - }, - NodeStatus { - node_id: "4".to_string(), - address: "[fd00:1122:3344:107::3]:32221".parse().unwrap(), - sql_address: "[fd00:1122:3344:107::3]:32221".parse().unwrap(), - build: "v22.1.9-dirty".to_string(), - started_at: DateTime::from_naive_utc_and_offset( - NaiveDate::from_ymd_opt(2024, 5, 18) - .unwrap() - .and_hms_micro_opt(19, 16, 22, 788276) - .unwrap(), - Utc, - ), - updated_at: DateTime::from_naive_utc_and_offset( - NaiveDate::from_ymd_opt(2024, 5, 21) - .unwrap() - .and_hms_micro_opt(15, 22, 34, 897047) - .unwrap(), - Utc, - ), - locality: String::new(), - is_available: true, - is_live: true, - }, - NodeStatus { - node_id: "5".to_string(), - address: "[fd00:1122:3344:108::3]:32221".parse().unwrap(), - sql_address: "[fd00:1122:3344:108::3]:32221".parse().unwrap(), - build: "v22.1.9-dirty".to_string(), - started_at: DateTime::from_naive_utc_and_offset( - NaiveDate::from_ymd_opt(2024, 5, 18) - .unwrap() - .and_hms_micro_opt(19, 18, 9, 196634) - .unwrap(), - Utc, - ), - updated_at: DateTime::from_naive_utc_and_offset( - NaiveDate::from_ymd_opt(2024, 5, 21) - .unwrap() - .and_hms_micro_opt(15, 22, 35, 168738) - .unwrap(), - Utc, - ), - locality: String::new(), - is_available: true, - is_live: true, - }, - ]; - - let statuses = NodeStatus::parse_from_csv(input).expect("parsed input"); - assert_eq!(statuses.len(), expected.len()); - for (status, expected) in statuses.iter().zip(&expected) { - assert_eq!(status, expected); - } - } - - #[test] - fn test_node_decommission_parse_with_no_trailing_notes() { - let input = - br#"id,is_live,replicas,is_decommissioning,membership,is_draining -6,true,24,true,decommissioning,false"#; - let expected = NodeDecommission { - node_id: "6".to_string(), - is_live: true, - replicas: 24, - is_decommissioning: true, - membership: NodeMembership::Decommissioning, - is_draining: false, - notes: vec![], - }; - - let statuses = - NodeDecommission::parse_from_csv(input).expect("parsed input"); - assert_eq!(statuses, expected); - } - - #[test] - fn test_node_decommission_parse_with_trailing_notes() { - let input = - br#"id,is_live,replicas,is_decommissioning,membership,is_draining -6,false,0,true,decommissioned,false - -No more data reported on target nodes. Please verify cluster health before removing the nodes. -"#; - let expected = NodeDecommission { - node_id: "6".to_string(), - is_live: false, - replicas: 0, - is_decommissioning: true, - membership: NodeMembership::Decommissioned, - is_draining: false, - notes: vec!["No more data reported on target nodes. \ - Please verify cluster health before removing the nodes." 
- .to_string()], - }; - - let statuses = - NodeDecommission::parse_from_csv(input).expect("parsed input"); - assert_eq!(statuses, expected); - } - - #[test] - fn test_node_decommission_parse_with_unexpected_membership_value() { - let input = - br#"id,is_live,replicas,is_decommissioning,membership,is_draining -6,false,0,true,foobar,false"#; - let expected = NodeDecommission { - node_id: "6".to_string(), - is_live: false, - replicas: 0, - is_decommissioning: true, - membership: NodeMembership::Unknown { value: "foobar".to_string() }, - is_draining: false, - notes: vec![], - }; - - let statuses = - NodeDecommission::parse_from_csv(input).expect("parsed input"); - assert_eq!(statuses, expected); - } - // Ensure that if `cockroach node status` changes in a future CRDB version // bump, we have a test that will fail to force us to check whether our // current parsing is still valid. @@ -721,14 +268,4 @@ No more data reported on target nodes. Please verify cluster health before remov db.cleanup().await.unwrap(); logctx.cleanup_successful(); } - - #[proptest] - fn node_status_parse_doesnt_panic_on_arbitrary_input(input: Vec) { - _ = NodeStatus::parse_from_csv(&input); - } - - #[proptest] - fn node_decommission_parse_doesnt_panic_on_arbitrary_input(input: Vec) { - _ = NodeDecommission::parse_from_csv(&input); - } } diff --git a/cockroach-admin/src/http_entrypoints.rs b/cockroach-admin/src/http_entrypoints.rs index 45957df0df..77eaf7e02b 100644 --- a/cockroach-admin/src/http_entrypoints.rs +++ b/cockroach-admin/src/http_entrypoints.rs @@ -2,112 +2,53 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use crate::cockroach_cli::NodeDecommission; -use crate::cockroach_cli::NodeStatus; use crate::context::ServerContext; -use dropshot::endpoint; -use dropshot::ApiDescriptionRegisterError; +use cockroach_admin_api::*; +use cockroach_admin_types::NodeDecommission; use dropshot::HttpError; use dropshot::HttpResponseOk; use dropshot::RequestContext; use dropshot::TypedBody; -use omicron_uuid_kinds::OmicronZoneUuid; -use schemars::JsonSchema; -use serde::Deserialize; -use serde::Serialize; use std::sync::Arc; type CrdbApiDescription = dropshot::ApiDescription>; pub fn api() -> CrdbApiDescription { - fn register_endpoints( - api: &mut CrdbApiDescription, - ) -> Result<(), ApiDescriptionRegisterError> { - api.register(local_node_id)?; - api.register(node_status)?; - api.register(node_decommission)?; - Ok(()) - } - - let mut api = CrdbApiDescription::new(); - if let Err(err) = register_endpoints(&mut api) { - panic!("failed to register entrypoints: {}", err); - } - api + cockroach_admin_api_mod::api_description::() + .expect("registered entrypoints") } -#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] -#[serde(rename_all = "snake_case")] -pub struct ClusterNodeStatus { - pub all_nodes: Vec, -} - -/// Get the status of all nodes in the CRDB cluster -#[endpoint { - method = GET, - path = "/node/status", -}] -async fn node_status( - rqctx: RequestContext>, -) -> Result, HttpError> { - let ctx = rqctx.context(); - let all_nodes = - ctx.cockroach_cli().node_status().await.map_err(HttpError::from)?; - Ok(HttpResponseOk(ClusterNodeStatus { all_nodes })) -} +enum CockroachAdminImpl {} -/// CockroachDB Node ID -#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] -#[serde(rename_all = "snake_case")] -pub struct LocalNodeId { - /// The ID of this Omicron zone. 
- /// - /// This is included to ensure correctness even if a socket address on a - /// sled is reused for a different zone; if our caller is trying to - /// determine the node ID for a particular Omicron CockroachDB zone, they'll - /// contact us by socket address. We include our zone ID in the response for - /// their confirmation that we are the zone they intended to contact. - pub zone_id: OmicronZoneUuid, - // CockroachDB node IDs are integers, in practice, but our use of them is as - // input and output to the `cockroach` CLI. We use a string which is a bit - // more natural (no need to parse CLI output or stringify an ID to send it - // as input) and leaves open the door for the format to change in the - // future. - pub node_id: String, -} +impl CockroachAdminApi for CockroachAdminImpl { + type Context = Arc; -/// Get the CockroachDB node ID of the local cockroach instance. -#[endpoint { - method = GET, - path = "/node/id", -}] -async fn local_node_id( - rqctx: RequestContext>, -) -> Result, HttpError> { - let ctx = rqctx.context(); - let node_id = ctx.node_id().await?.to_string(); - let zone_id = ctx.zone_id(); - Ok(HttpResponseOk(LocalNodeId { zone_id, node_id })) -} + async fn node_status( + rqctx: RequestContext, + ) -> Result, HttpError> { + let ctx = rqctx.context(); + let all_nodes = + ctx.cockroach_cli().node_status().await.map_err(HttpError::from)?; + Ok(HttpResponseOk(ClusterNodeStatus { all_nodes })) + } -#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] -#[serde(rename_all = "snake_case")] -pub struct NodeId { - pub node_id: String, -} + async fn local_node_id( + rqctx: RequestContext, + ) -> Result, HttpError> { + let ctx = rqctx.context(); + let node_id = ctx.node_id().await?.to_string(); + let zone_id = ctx.zone_id(); + Ok(HttpResponseOk(LocalNodeId { zone_id, node_id })) + } -/// Decommission a node from the CRDB cluster -#[endpoint { - method = POST, - path = "/node/decommission", -}] -async fn node_decommission( - rqctx: RequestContext>, - body: TypedBody, -) -> Result, HttpError> { - let ctx = rqctx.context(); - let NodeId { node_id } = body.into_inner(); - let decommission_status = - ctx.cockroach_cli().node_decommission(&node_id).await?; - Ok(HttpResponseOk(decommission_status)) + async fn node_decommission( + rqctx: RequestContext, + body: TypedBody, + ) -> Result, HttpError> { + let ctx = rqctx.context(); + let NodeId { node_id } = body.into_inner(); + let decommission_status = + ctx.cockroach_cli().node_decommission(&node_id).await?; + Ok(HttpResponseOk(decommission_status)) + } } diff --git a/cockroach-admin/src/lib.rs b/cockroach-admin/src/lib.rs index f4a32cb6c0..1057344297 100644 --- a/cockroach-admin/src/lib.rs +++ b/cockroach-admin/src/lib.rs @@ -23,21 +23,6 @@ pub use cockroach_cli::CockroachCli; pub use cockroach_cli::CockroachCliError; pub use config::Config; -/// Run the OpenAPI generator for the API; this emits the OpenAPI spec to -/// stdout. 
-pub fn run_openapi() -> Result<(), String> { - http_entrypoints::api() - .openapi("Oxide CockroachDb Cluster Admin API", "0.0.1") - .description( - "API for interacting with the Oxide \ - control plane's CockroachDb cluster", - ) - .contact_url("https://oxide.computer") - .contact_email("api@oxide.computer") - .write(&mut std::io::stdout()) - .map_err(|e| e.to_string()) -} - #[derive(Debug, thiserror::Error, SlogInlineError)] pub enum StartError { #[error("failed to initialize logger")] diff --git a/cockroach-admin/tests/integration_tests/commands.rs b/cockroach-admin/tests/integration_tests/commands.rs deleted file mode 100644 index 875427d948..0000000000 --- a/cockroach-admin/tests/integration_tests/commands.rs +++ /dev/null @@ -1,43 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Tests for the executable commands in this repo. - -use expectorate::assert_contents; -use omicron_test_utils::dev::test_cmds::{ - assert_exit_code, path_to_executable, run_command, EXIT_SUCCESS, -}; -use openapiv3::OpenAPI; -use std::path::PathBuf; -use subprocess::Exec; - -// path to executable -const CMD_COCKROACH_ADMIN: &str = env!("CARGO_BIN_EXE_cockroach-admin"); - -fn path_to_cockroach_admin() -> PathBuf { - path_to_executable(CMD_COCKROACH_ADMIN) -} - -#[test] -fn test_cockroach_admin_openapi() { - let exec = Exec::cmd(path_to_cockroach_admin()).arg("openapi"); - let (exit_status, stdout_text, stderr_text) = run_command(exec); - assert_exit_code(exit_status, EXIT_SUCCESS, &stderr_text); - assert_contents( - "tests/output/cmd-cockroach-admin-openapi-stderr", - &stderr_text, - ); - - let spec: OpenAPI = serde_json::from_str(&stdout_text) - .expect("stdout was not valid OpenAPI"); - - // Check for lint errors. - let errors = openapi_lint::validate(&spec); - assert!(errors.is_empty(), "{}", errors.join("\n\n")); - - // Confirm that the output hasn't changed. It's expected that we'll change - // this file as the API evolves, but pay attention to the diffs to ensure - // that the changes match your expectations. - assert_contents("../openapi/cockroach-admin.json", &stdout_text); -} diff --git a/cockroach-admin/tests/integration_tests/mod.rs b/cockroach-admin/tests/integration_tests/mod.rs deleted file mode 100644 index 1bf43dc00c..0000000000 --- a/cockroach-admin/tests/integration_tests/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -mod commands; diff --git a/cockroach-admin/tests/mod.rs b/cockroach-admin/tests/mod.rs deleted file mode 100644 index 99aeeb8299..0000000000 --- a/cockroach-admin/tests/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
- -mod integration_tests; diff --git a/cockroach-admin/tests/output/cmd-cockroach-admin-openapi-stderr b/cockroach-admin/tests/output/cmd-cockroach-admin-openapi-stderr deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/cockroach-admin/types/Cargo.toml b/cockroach-admin/types/Cargo.toml new file mode 100644 index 0000000000..870d1c55c2 --- /dev/null +++ b/cockroach-admin/types/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "cockroach-admin-types" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +chrono.workspace = true +csv.workspace = true +omicron-common.workspace = true +omicron-workspace-hack.workspace = true +schemars.workspace = true +serde.workspace = true + +[dev-dependencies] +proptest.workspace = true +test-strategy.workspace = true diff --git a/cockroach-admin/types/src/lib.rs b/cockroach-admin/types/src/lib.rs new file mode 100644 index 0000000000..3653cc616b --- /dev/null +++ b/cockroach-admin/types/src/lib.rs @@ -0,0 +1,477 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use std::{io, net::SocketAddr}; + +use chrono::{DateTime, NaiveDateTime, Utc}; +use schemars::JsonSchema; +use serde::{de, Deserialize, Serialize}; + +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +pub struct NodeStatus { + pub node_id: String, + pub address: SocketAddr, + pub sql_address: SocketAddr, + pub build: String, + pub started_at: DateTime, + pub updated_at: DateTime, + pub locality: String, + pub is_available: bool, + pub is_live: bool, +} + +impl NodeStatus { + pub fn parse_from_csv(data: &[u8]) -> Result, csv::Error> { + let mut statuses = Vec::new(); + let mut reader = csv::Reader::from_reader(io::Cursor::new(data)); + for result in reader.deserialize() { + let record: CliNodeStatus = result?; + statuses.push(record.into()); + } + Ok(statuses) + } +} + +// Slightly different `NodeStatus` that matches what we get from `cockroach`: +// timestamps are a fixed format with no timezone (but are actually UTC), so we +// have a custom deserializer, and the ID column is `id` instead of `node_id`. 
+#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] +struct CliNodeStatus { + id: String, + address: SocketAddr, + sql_address: SocketAddr, + build: String, + #[serde(deserialize_with = "parse_cockroach_cli_timestamp")] + started_at: DateTime, + #[serde(deserialize_with = "parse_cockroach_cli_timestamp")] + updated_at: DateTime, + locality: String, + is_available: bool, + is_live: bool, +} + +impl From for NodeStatus { + fn from(cli: CliNodeStatus) -> Self { + Self { + node_id: cli.id, + address: cli.address, + sql_address: cli.sql_address, + build: cli.build, + started_at: cli.started_at, + updated_at: cli.updated_at, + locality: cli.locality, + is_available: cli.is_available, + is_live: cli.is_live, + } + } +} + +fn parse_cockroach_cli_timestamp<'de, D>( + d: D, +) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + struct CockroachTimestampVisitor; + impl<'de> de::Visitor<'de> for CockroachTimestampVisitor { + type Value = DateTime; + + fn expecting( + &self, + formatter: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + formatter.write_str("a Cockroach CLI timestamp") + } + + fn visit_str(self, v: &str) -> Result + where + E: de::Error, + { + let dt = NaiveDateTime::parse_from_str(v, "%Y-%m-%d %H:%M:%S%.f") + .map_err(E::custom)?; + Ok(DateTime::from_naive_utc_and_offset(dt, Utc)) + } + } + + d.deserialize_str(CockroachTimestampVisitor) +} + +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +pub struct NodeDecommission { + pub node_id: String, + pub is_live: bool, + pub replicas: i64, + pub is_decommissioning: bool, + pub membership: NodeMembership, + pub is_draining: bool, + pub notes: Vec, +} + +impl NodeDecommission { + pub fn parse_from_csv(data: &[u8]) -> Result { + // Reading the node decommission output is awkward because it isn't + // fully CSV. We expect a CSV header, then a row for each node being + // decommissioned, then (maybe) a blank line followed by a note that is + // just a string, not related to the initial CSV data. Even though the + // CLI supports decommissioning more than one node in one invocation, we + // only provide an API to decommission a single node, so we expect: + // + // 1. The CSV header line + // 2. The one row of CSV data + // 3. Trailing notes + // + // We'll collect the notes as a separate field and return them to our + // caller. + + // First we'll run the data through a csv::Reader; this will pull out + // the header row and the one row of data. + let mut reader = csv::Reader::from_reader(io::Cursor::new(data)); + let record: CliNodeDecommission = + reader.deserialize().next().ok_or_else(|| { + io::Error::other("fewer than two lines of output") + })??; + + // Get the position where the reader ended after that one row; we'll + // collect any remaining nonempty lines as `notes`. + let extra_data = &data[reader.position().byte() as usize..]; + let mut notes = Vec::new(); + for line in String::from_utf8_lossy(extra_data).lines() { + let line = line.trim(); + if !line.is_empty() { + notes.push(line.to_string()); + } + } + + Ok(Self::from((record, notes))) + } +} + +// Slightly different `NodeDecommission` that matches what we get from +// `cockroach`: this omites `notes`, which isn't really a CSV field at all, but +// is instead where we collect the non-CSV string output from the CLI, uses +// a custom deserializer for `membership` to handle unknown variants, and the ID +// column is `id` instead of `node_id`. 
+#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] +struct CliNodeDecommission { + pub id: String, + pub is_live: bool, + pub replicas: i64, + pub is_decommissioning: bool, + #[serde(deserialize_with = "parse_node_membership")] + pub membership: NodeMembership, + pub is_draining: bool, +} + +impl From<(CliNodeDecommission, Vec)> for NodeDecommission { + fn from((cli, notes): (CliNodeDecommission, Vec)) -> Self { + Self { + node_id: cli.id, + is_live: cli.is_live, + replicas: cli.replicas, + is_decommissioning: cli.is_decommissioning, + membership: cli.membership, + is_draining: cli.is_draining, + notes, + } + } +} + +fn parse_node_membership<'de, D>(d: D) -> Result +where + D: serde::Deserializer<'de>, +{ + struct CockroachNodeMembershipVisitor; + + impl<'de> de::Visitor<'de> for CockroachNodeMembershipVisitor { + type Value = NodeMembership; + + fn expecting( + &self, + formatter: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + formatter.write_str("a Cockroach node membership string") + } + + fn visit_str(self, v: &str) -> Result + where + E: de::Error, + { + let membership = match v { + "active" => NodeMembership::Active, + "decommissioning" => NodeMembership::Decommissioning, + "decommissioned" => NodeMembership::Decommissioned, + _ => NodeMembership::Unknown { value: v.to_string() }, + }; + Ok(membership) + } + } + + d.deserialize_str(CockroachNodeMembershipVisitor) +} + +// The cockroach CLI and `crdb_internal.gossip_liveness` table use a string for +// node membership, but there are only three meaningful values per +// https://github.com/cockroachdb/cockroach/blob/0c92c710d2baadfdc5475be8d2238cf26cb152ca/pkg/kv/kvserver/liveness/livenesspb/liveness.go#L96, +// so we'll convert into a Rust enum and leave the "unknown" case for future +// changes that expand or reword these values. 
+#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] +#[serde(tag = "state", rename_all = "lowercase")] +pub enum NodeMembership { + Active, + Decommissioning, + Decommissioned, + Unknown { value: String }, +} + +#[cfg(test)] +mod tests { + use super::*; + use chrono::NaiveDate; + use test_strategy::proptest; + + #[test] + fn test_node_status_parse_single_line_from_csv() { + let input = br#"id,address,sql_address,build,started_at,updated_at,locality,is_available,is_live +1,[::1]:42021,[::1]:42021,v22.1.9,2024-05-21 15:19:50.523796,2024-05-21 16:31:28.050069,,true,true"#; + let expected = NodeStatus { + node_id: "1".to_string(), + address: "[::1]:42021".parse().unwrap(), + sql_address: "[::1]:42021".parse().unwrap(), + build: "v22.1.9".to_string(), + started_at: DateTime::from_naive_utc_and_offset( + NaiveDate::from_ymd_opt(2024, 5, 21) + .unwrap() + .and_hms_micro_opt(15, 19, 50, 523796) + .unwrap(), + Utc, + ), + updated_at: DateTime::from_naive_utc_and_offset( + NaiveDate::from_ymd_opt(2024, 5, 21) + .unwrap() + .and_hms_micro_opt(16, 31, 28, 50069) + .unwrap(), + Utc, + ), + locality: String::new(), + is_available: true, + is_live: true, + }; + + let statuses = NodeStatus::parse_from_csv(input).expect("parsed input"); + assert_eq!(statuses, vec![expected]); + } + + #[test] + fn test_node_status_parse_multiple_lines_from_csv() { + let input = br#"id,address,sql_address,build,started_at,updated_at,locality,is_available,is_live +1,[fd00:1122:3344:109::3]:32221,[fd00:1122:3344:109::3]:32221,v22.1.9-dirty,2024-05-18 19:18:00.597145,2024-05-21 15:22:34.290434,,true,true +2,[fd00:1122:3344:105::3]:32221,[fd00:1122:3344:105::3]:32221,v22.1.9-dirty,2024-05-18 19:17:01.796714,2024-05-21 15:22:34.901268,,true,true +3,[fd00:1122:3344:10b::3]:32221,[fd00:1122:3344:10b::3]:32221,v22.1.9-dirty,2024-05-18 19:18:52.37564,2024-05-21 15:22:36.341146,,true,true +4,[fd00:1122:3344:107::3]:32221,[fd00:1122:3344:107::3]:32221,v22.1.9-dirty,2024-05-18 19:16:22.788276,2024-05-21 15:22:34.897047,,true,true +5,[fd00:1122:3344:108::3]:32221,[fd00:1122:3344:108::3]:32221,v22.1.9-dirty,2024-05-18 19:18:09.196634,2024-05-21 15:22:35.168738,,true,true"#; + let expected = vec![ + NodeStatus { + node_id: "1".to_string(), + address: "[fd00:1122:3344:109::3]:32221".parse().unwrap(), + sql_address: "[fd00:1122:3344:109::3]:32221".parse().unwrap(), + build: "v22.1.9-dirty".to_string(), + started_at: DateTime::from_naive_utc_and_offset( + NaiveDate::from_ymd_opt(2024, 5, 18) + .unwrap() + .and_hms_micro_opt(19, 18, 0, 597145) + .unwrap(), + Utc, + ), + updated_at: DateTime::from_naive_utc_and_offset( + NaiveDate::from_ymd_opt(2024, 5, 21) + .unwrap() + .and_hms_micro_opt(15, 22, 34, 290434) + .unwrap(), + Utc, + ), + locality: String::new(), + is_available: true, + is_live: true, + }, + NodeStatus { + node_id: "2".to_string(), + address: "[fd00:1122:3344:105::3]:32221".parse().unwrap(), + sql_address: "[fd00:1122:3344:105::3]:32221".parse().unwrap(), + build: "v22.1.9-dirty".to_string(), + started_at: DateTime::from_naive_utc_and_offset( + NaiveDate::from_ymd_opt(2024, 5, 18) + .unwrap() + .and_hms_micro_opt(19, 17, 1, 796714) + .unwrap(), + Utc, + ), + updated_at: DateTime::from_naive_utc_and_offset( + NaiveDate::from_ymd_opt(2024, 5, 21) + .unwrap() + .and_hms_micro_opt(15, 22, 34, 901268) + .unwrap(), + Utc, + ), + locality: String::new(), + is_available: true, + is_live: true, + }, + NodeStatus { + node_id: "3".to_string(), + address: "[fd00:1122:3344:10b::3]:32221".parse().unwrap(), + sql_address: 
"[fd00:1122:3344:10b::3]:32221".parse().unwrap(), + build: "v22.1.9-dirty".to_string(), + started_at: DateTime::from_naive_utc_and_offset( + NaiveDate::from_ymd_opt(2024, 5, 18) + .unwrap() + .and_hms_micro_opt(19, 18, 52, 375640) + .unwrap(), + Utc, + ), + updated_at: DateTime::from_naive_utc_and_offset( + NaiveDate::from_ymd_opt(2024, 5, 21) + .unwrap() + .and_hms_micro_opt(15, 22, 36, 341146) + .unwrap(), + Utc, + ), + locality: String::new(), + is_available: true, + is_live: true, + }, + NodeStatus { + node_id: "4".to_string(), + address: "[fd00:1122:3344:107::3]:32221".parse().unwrap(), + sql_address: "[fd00:1122:3344:107::3]:32221".parse().unwrap(), + build: "v22.1.9-dirty".to_string(), + started_at: DateTime::from_naive_utc_and_offset( + NaiveDate::from_ymd_opt(2024, 5, 18) + .unwrap() + .and_hms_micro_opt(19, 16, 22, 788276) + .unwrap(), + Utc, + ), + updated_at: DateTime::from_naive_utc_and_offset( + NaiveDate::from_ymd_opt(2024, 5, 21) + .unwrap() + .and_hms_micro_opt(15, 22, 34, 897047) + .unwrap(), + Utc, + ), + locality: String::new(), + is_available: true, + is_live: true, + }, + NodeStatus { + node_id: "5".to_string(), + address: "[fd00:1122:3344:108::3]:32221".parse().unwrap(), + sql_address: "[fd00:1122:3344:108::3]:32221".parse().unwrap(), + build: "v22.1.9-dirty".to_string(), + started_at: DateTime::from_naive_utc_and_offset( + NaiveDate::from_ymd_opt(2024, 5, 18) + .unwrap() + .and_hms_micro_opt(19, 18, 9, 196634) + .unwrap(), + Utc, + ), + updated_at: DateTime::from_naive_utc_and_offset( + NaiveDate::from_ymd_opt(2024, 5, 21) + .unwrap() + .and_hms_micro_opt(15, 22, 35, 168738) + .unwrap(), + Utc, + ), + locality: String::new(), + is_available: true, + is_live: true, + }, + ]; + + let statuses = NodeStatus::parse_from_csv(input).expect("parsed input"); + assert_eq!(statuses.len(), expected.len()); + for (status, expected) in statuses.iter().zip(&expected) { + assert_eq!(status, expected); + } + } + + #[test] + fn test_node_decommission_parse_with_no_trailing_notes() { + let input = + br#"id,is_live,replicas,is_decommissioning,membership,is_draining +6,true,24,true,decommissioning,false"#; + let expected = NodeDecommission { + node_id: "6".to_string(), + is_live: true, + replicas: 24, + is_decommissioning: true, + membership: NodeMembership::Decommissioning, + is_draining: false, + notes: vec![], + }; + + let statuses = + NodeDecommission::parse_from_csv(input).expect("parsed input"); + assert_eq!(statuses, expected); + } + + #[test] + fn test_node_decommission_parse_with_trailing_notes() { + let input = + br#"id,is_live,replicas,is_decommissioning,membership,is_draining +6,false,0,true,decommissioned,false + +No more data reported on target nodes. Please verify cluster health before removing the nodes. +"#; + let expected = NodeDecommission { + node_id: "6".to_string(), + is_live: false, + replicas: 0, + is_decommissioning: true, + membership: NodeMembership::Decommissioned, + is_draining: false, + notes: vec!["No more data reported on target nodes. \ + Please verify cluster health before removing the nodes." 
+ .to_string()], + }; + + let statuses = + NodeDecommission::parse_from_csv(input).expect("parsed input"); + assert_eq!(statuses, expected); + } + + #[test] + fn test_node_decommission_parse_with_unexpected_membership_value() { + let input = + br#"id,is_live,replicas,is_decommissioning,membership,is_draining +6,false,0,true,foobar,false"#; + let expected = NodeDecommission { + node_id: "6".to_string(), + is_live: false, + replicas: 0, + is_decommissioning: true, + membership: NodeMembership::Unknown { value: "foobar".to_string() }, + is_draining: false, + notes: vec![], + }; + + let statuses = + NodeDecommission::parse_from_csv(input).expect("parsed input"); + assert_eq!(statuses, expected); + } + + // TODO: the proptests below should probably be fuzz targets instead to + // allow for guided fuzzing. + + #[proptest] + fn node_status_parse_doesnt_panic_on_arbitrary_input(input: Vec) { + _ = NodeStatus::parse_from_csv(&input); + } + + #[proptest] + fn node_decommission_parse_doesnt_panic_on_arbitrary_input(input: Vec) { + _ = NodeDecommission::parse_from_csv(&input); + } +} diff --git a/dev-tools/openapi-manager/Cargo.toml b/dev-tools/openapi-manager/Cargo.toml index aa0cfacfd5..b7e74f6515 100644 --- a/dev-tools/openapi-manager/Cargo.toml +++ b/dev-tools/openapi-manager/Cargo.toml @@ -11,6 +11,7 @@ workspace = true anyhow.workspace = true atomicwrites.workspace = true camino.workspace = true +cockroach-admin-api.workspace = true clap.workspace = true dns-server-api.workspace = true dropshot.workspace = true diff --git a/dev-tools/openapi-manager/src/spec.rs b/dev-tools/openapi-manager/src/spec.rs index 83f0f4dd57..6a3431b1f5 100644 --- a/dev-tools/openapi-manager/src/spec.rs +++ b/dev-tools/openapi-manager/src/spec.rs @@ -14,6 +14,17 @@ use openapiv3::OpenAPI; /// All APIs managed by openapi-manager. 
pub fn all_apis() -> Vec { vec![ + ApiSpec { + title: "CockroachDB Cluster Admin API", + version: "0.0.1", + description: "API for interacting with the Oxide control plane's \ + CockroachDB cluster", + boundary: ApiBoundary::Internal, + api_description: + cockroach_admin_api::cockroach_admin_api_mod::stub_api_description, + filename: "cockroach-admin.json", + extra_validation: None, + }, ApiSpec { title: "Internal DNS", version: "0.0.1", diff --git a/openapi/cockroach-admin.json b/openapi/cockroach-admin.json index 3b03475ec5..76c0bea09b 100644 --- a/openapi/cockroach-admin.json +++ b/openapi/cockroach-admin.json @@ -1,8 +1,8 @@ { "openapi": "3.0.3", "info": { - "title": "Oxide CockroachDb Cluster Admin API", - "description": "API for interacting with the Oxide control plane's CockroachDb cluster", + "title": "CockroachDB Cluster Admin API", + "description": "API for interacting with the Oxide control plane's CockroachDB cluster", "contact": { "url": "https://oxide.computer", "email": "api@oxide.computer" @@ -12,7 +12,7 @@ "paths": { "/node/decommission": { "post": { - "summary": "Decommission a node from the CRDB cluster", + "summary": "Decommission a node from the CRDB cluster.", "operationId": "node_decommission", "requestBody": { "content": { @@ -70,7 +70,7 @@ }, "/node/status": { "get": { - "summary": "Get the status of all nodes in the CRDB cluster", + "summary": "Get the status of all nodes in the CRDB cluster.", "operationId": "node_status", "responses": { "200": { From 301cf0ad1a385f65a6e03bba18b9b28028e500ce Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Fri, 19 Jul 2024 21:43:24 -0700 Subject: [PATCH 19/21] Update Rust crate cargo_toml to v0.20.4 (#6132) --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 79dac1c05e..da165707e8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -854,9 +854,9 @@ dependencies = [ [[package]] name = "cargo_toml" -version = "0.20.3" +version = "0.20.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4895c018bb228aa6b3ba1a0285543fcb4b704734c3fb1f72afaa75aa769500c1" +checksum = "ad639525b1c67b6a298f378417b060fbc04618bea559482a8484381cce27d965" dependencies = [ "serde", "toml 0.8.15", From cd1203c2f9d155449c8f7305a52655795db43a02 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Fri, 19 Jul 2024 21:43:30 -0700 Subject: [PATCH 20/21] Update Rust crate unicode-width to 0.1.13 (#6133) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 9ad1d585d3..ab0efb20c3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -533,7 +533,7 @@ tufaceous = { path = "tufaceous" } tufaceous-lib = { path = "tufaceous-lib" } tui-tree-widget = "0.21.0" typed-rng = { path = "typed-rng" } -unicode-width = "0.1.11" +unicode-width = "0.1.13" update-common = { path = "update-common" } update-engine = { path = "update-engine" } url = "2.5.0" From b83eb7c0cbbbf2febf97e54dcb87ca6258144435 Mon Sep 17 00:00:00 2001 From: Rain Date: Fri, 19 Jul 2024 23:02:05 -0700 Subject: [PATCH 21/21] [3/3 sled-agent] convert bootstrap agent API into a trait (#6124) Make a new bootstrap-agent-api crate which has the bootstrap agent trait in it, and implement it within the sled agent. Changed some of the comments slightly to align their style generally with omicron ("Resets" -> "Reset"). 
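For readers unfamiliar with the pattern, here is a condensed sketch of the layout this patch introduces, abbreviated to a single endpoint and split by crate in comments. The full trait and implementation appear in the diff below, and the imports (RequestContext, HttpResponseOk, HttpError from dropshot; Baseboard from sled-hardware-types) are as listed there.

// In the new bootstrap-agent-api crate: endpoints are declared once as a
// Dropshot trait; the macro also generates a stub description usable for
// OpenAPI generation without a running server.
#[dropshot::api_description {
    module = "bootstrap_agent_api_mod",
}]
pub trait BootstrapAgentApi {
    type Context;

    /// Return the baseboard identity of this sled.
    #[endpoint {
        method = GET,
        path = "/baseboard",
    }]
    async fn baseboard_get(
        rqctx: RequestContext<Self::Context>,
    ) -> Result<HttpResponseOk<Baseboard>, HttpError>;
}

// In sled-agent: an uninhabited type implements the trait against the real
// server context, and the HTTP server is built from
// bootstrap_agent_api_mod::api_description::<BootstrapAgentImpl>().
enum BootstrapAgentImpl {}

impl BootstrapAgentApi for BootstrapAgentImpl {
    type Context = BootstrapServerContext;

    async fn baseboard_get(
        rqctx: RequestContext<Self::Context>,
    ) -> Result<HttpResponseOk<Baseboard>, HttpError> {
        Ok(HttpResponseOk(rqctx.context().baseboard.clone()))
    }
}

// The openapi-manager, by contrast, consumes
// bootstrap_agent_api_mod::stub_api_description, so the spec can be emitted
// without ever constructing a BootstrapServerContext.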
--- Cargo.lock | 17 ++ Cargo.toml | 3 + dev-tools/openapi-manager/Cargo.toml | 1 + dev-tools/openapi-manager/src/spec.rs | 10 + openapi/bootstrap-agent.json | 12 +- sled-agent/Cargo.toml | 1 + sled-agent/bootstrap-agent-api/Cargo.toml | 19 ++ sled-agent/bootstrap-agent-api/src/lib.rs | 92 ++++++++ sled-agent/src/bin/sled-agent.rs | 4 - sled-agent/src/bootstrap/http_entrypoints.rs | 218 +++++++----------- sled-agent/src/bootstrap/server.rs | 12 - sled-agent/src/updates.rs | 11 +- .../tests/integration_tests/commands.rs | 23 -- .../cmd-bootstrap-agent-openapi-sled-stderr | 0 14 files changed, 239 insertions(+), 184 deletions(-) create mode 100644 sled-agent/bootstrap-agent-api/Cargo.toml create mode 100644 sled-agent/bootstrap-agent-api/src/lib.rs delete mode 100644 sled-agent/tests/output/cmd-bootstrap-agent-openapi-sled-stderr diff --git a/Cargo.lock b/Cargo.lock index da165707e8..5373c93796 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -698,6 +698,21 @@ dependencies = [ "zeroize", ] +[[package]] +name = "bootstrap-agent-api" +version = "0.1.0" +dependencies = [ + "dropshot", + "nexus-client", + "omicron-common", + "omicron-uuid-kinds", + "omicron-workspace-hack", + "schemars", + "serde", + "sled-agent-types", + "sled-hardware-types", +] + [[package]] name = "bootstrap-agent-client" version = "0.1.0" @@ -5811,6 +5826,7 @@ dependencies = [ "async-trait", "base64 0.22.1", "bootstore", + "bootstrap-agent-api", "bootstrap-agent-client", "bytes", "camino", @@ -6136,6 +6152,7 @@ version = "0.1.0" dependencies = [ "anyhow", "atomicwrites", + "bootstrap-agent-api", "camino", "clap", "cockroach-admin-api", diff --git a/Cargo.toml b/Cargo.toml index ab0efb20c3..e2ba5fb5b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,6 +75,7 @@ members = [ "passwords", "rpaths", "sled-agent", + "sled-agent/bootstrap-agent-api", "sled-agent/types", "sled-hardware", "sled-hardware/types", @@ -175,6 +176,7 @@ default-members = [ "passwords", "rpaths", "sled-agent", + "sled-agent/bootstrap-agent-api", "sled-agent/types", "sled-hardware", "sled-hardware/types", @@ -256,6 +258,7 @@ bb8 = "0.8.5" bcs = "0.1.6" bincode = "1.3.3" bootstore = { path = "bootstore" } +bootstrap-agent-api = { path = "sled-agent/bootstrap-agent-api" } bootstrap-agent-client = { path = "clients/bootstrap-agent-client" } buf-list = { version = "1.0.3", features = ["tokio1"] } byteorder = "1.5.0" diff --git a/dev-tools/openapi-manager/Cargo.toml b/dev-tools/openapi-manager/Cargo.toml index b7e74f6515..dc07dae0e2 100644 --- a/dev-tools/openapi-manager/Cargo.toml +++ b/dev-tools/openapi-manager/Cargo.toml @@ -10,6 +10,7 @@ workspace = true [dependencies] anyhow.workspace = true atomicwrites.workspace = true +bootstrap-agent-api.workspace = true camino.workspace = true cockroach-admin-api.workspace = true clap.workspace = true diff --git a/dev-tools/openapi-manager/src/spec.rs b/dev-tools/openapi-manager/src/spec.rs index 6a3431b1f5..e8ebc23550 100644 --- a/dev-tools/openapi-manager/src/spec.rs +++ b/dev-tools/openapi-manager/src/spec.rs @@ -25,6 +25,16 @@ pub fn all_apis() -> Vec { filename: "cockroach-admin.json", extra_validation: None, }, + ApiSpec { + title: "Bootstrap Agent API", + version: "0.0.1", + description: "Per-sled API for setup and teardown", + boundary: ApiBoundary::Internal, + api_description: + bootstrap_agent_api::bootstrap_agent_api_mod::stub_api_description, + filename: "bootstrap-agent.json", + extra_validation: None, + }, ApiSpec { title: "Internal DNS", version: "0.0.1", diff --git a/openapi/bootstrap-agent.json 
b/openapi/bootstrap-agent.json index 68ae76e523..f268662ca9 100644 --- a/openapi/bootstrap-agent.json +++ b/openapi/bootstrap-agent.json @@ -1,8 +1,8 @@ { "openapi": "3.0.3", "info": { - "title": "Oxide Bootstrap Agent API", - "description": "API for interacting with individual sleds", + "title": "Bootstrap Agent API", + "description": "Per-sled API for setup and teardown", "contact": { "url": "https://oxide.computer", "email": "api@oxide.computer" @@ -36,7 +36,7 @@ }, "/components": { "get": { - "summary": "Provides a list of components known to the bootstrap agent.", + "summary": "Provide a list of components known to the bootstrap agent.", "description": "This API is intended to allow early boot services (such as Wicket) to query the underlying component versions installed on a sled.", "operationId": "components_get", "responses": { @@ -87,7 +87,7 @@ } }, "post": { - "summary": "Initializes the rack with the provided configuration.", + "summary": "Initialize the rack with the provided configuration.", "operationId": "rack_initialize", "requestBody": { "content": { @@ -119,7 +119,7 @@ } }, "delete": { - "summary": "Resets the rack to an unconfigured state.", + "summary": "Reset the rack to an unconfigured state.", "operationId": "rack_reset", "responses": { "200": { @@ -143,7 +143,7 @@ }, "/sled-initialize": { "delete": { - "summary": "Resets this particular sled to an unconfigured state.", + "summary": "Reset this particular sled to an unconfigured state.", "operationId": "sled_reset", "responses": { "204": { diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index a85884587f..7747bb768e 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -13,6 +13,7 @@ anyhow.workspace = true async-trait.workspace = true base64.workspace = true bootstore.workspace = true +bootstrap-agent-api.workspace = true bootstrap-agent-client.workspace = true bytes.workspace = true camino.workspace = true diff --git a/sled-agent/bootstrap-agent-api/Cargo.toml b/sled-agent/bootstrap-agent-api/Cargo.toml new file mode 100644 index 0000000000..368c5afe93 --- /dev/null +++ b/sled-agent/bootstrap-agent-api/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "bootstrap-agent-api" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +dropshot.workspace = true +nexus-client.workspace = true +omicron-common.workspace = true +omicron-uuid-kinds.workspace = true +omicron-workspace-hack.workspace = true +schemars.workspace = true +serde.workspace = true +sled-agent-types.workspace = true +sled-hardware-types.workspace = true diff --git a/sled-agent/bootstrap-agent-api/src/lib.rs b/sled-agent/bootstrap-agent-api/src/lib.rs new file mode 100644 index 0000000000..b1b8865351 --- /dev/null +++ b/sled-agent/bootstrap-agent-api/src/lib.rs @@ -0,0 +1,92 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! The bootstrap agent's API. +//! +//! Note that the bootstrap agent also communicates over Sprockets, +//! and has a separate interface for establishing the trust quorum. 
+ +use dropshot::{ + HttpError, HttpResponseOk, HttpResponseUpdatedNoContent, RequestContext, + TypedBody, +}; +use omicron_common::api::external::SemverVersion; +use omicron_uuid_kinds::{RackInitUuid, RackResetUuid}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use sled_agent_types::{ + rack_init::RackInitializeRequest, rack_ops::RackOperationStatus, +}; +use sled_hardware_types::Baseboard; + +#[dropshot::api_description { + module = "bootstrap_agent_api_mod", +}] +pub trait BootstrapAgentApi { + type Context; + + /// Return the baseboard identity of this sled. + #[endpoint { + method = GET, + path = "/baseboard", + }] + async fn baseboard_get( + rqctx: RequestContext, + ) -> Result, HttpError>; + + /// Provide a list of components known to the bootstrap agent. + /// + /// This API is intended to allow early boot services (such as Wicket) + /// to query the underlying component versions installed on a sled. + #[endpoint { + method = GET, + path = "/components", + }] + async fn components_get( + rqctx: RequestContext, + ) -> Result>, HttpError>; + + /// Get the current status of rack initialization or reset. + #[endpoint { + method = GET, + path = "/rack-initialize", + }] + async fn rack_initialization_status( + rqctx: RequestContext, + ) -> Result, HttpError>; + + /// Initialize the rack with the provided configuration. + #[endpoint { + method = POST, + path = "/rack-initialize", + }] + async fn rack_initialize( + rqctx: RequestContext, + body: TypedBody, + ) -> Result, HttpError>; + + /// Reset the rack to an unconfigured state. + #[endpoint { + method = DELETE, + path = "/rack-initialize", + }] + async fn rack_reset( + rqctx: RequestContext, + ) -> Result, HttpError>; + + /// Reset this particular sled to an unconfigured state. 
+ #[endpoint { + method = DELETE, + path = "/sled-initialize", + }] + async fn sled_reset( + rqctx: RequestContext, + ) -> Result; +} + +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] +pub struct Component { + pub name: String, + pub version: SemverVersion, +} diff --git a/sled-agent/src/bin/sled-agent.rs b/sled-agent/src/bin/sled-agent.rs index 1bd83653ad..6feeffd302 100644 --- a/sled-agent/src/bin/sled-agent.rs +++ b/sled-agent/src/bin/sled-agent.rs @@ -16,8 +16,6 @@ use sled_agent_types::rack_init::RackInitializeRequest; #[derive(Subcommand, Debug)] enum OpenapiFlavor { - /// Generates bootstrap agent openapi spec - Bootstrap, /// Generates sled agent openapi spec Sled, } @@ -54,8 +52,6 @@ async fn do_run() -> Result<(), CmdError> { Args::Openapi(flavor) => match flavor { OpenapiFlavor::Sled => sled_server::run_openapi() .map_err(|err| CmdError::Failure(anyhow!(err))), - OpenapiFlavor::Bootstrap => bootstrap_server::run_openapi() - .map_err(|err| CmdError::Failure(anyhow!(err))), }, Args::Run { config_path } => { let config = SledConfig::from_file(&config_path) diff --git a/sled-agent/src/bootstrap/http_entrypoints.rs b/sled-agent/src/bootstrap/http_entrypoints.rs index 824bb5fd25..2bd1745f77 100644 --- a/sled-agent/src/bootstrap/http_entrypoints.rs +++ b/sled-agent/src/bootstrap/http_entrypoints.rs @@ -11,12 +11,14 @@ use super::rack_ops::RssAccess; use super::BootstrapError; use super::RssAccessError; use crate::updates::ConfigUpdates; -use crate::updates::{Component, UpdateManager}; +use crate::updates::UpdateManager; use bootstore::schemes::v0 as bootstore; -use dropshot::ApiDescriptionRegisterError; +use bootstrap_agent_api::bootstrap_agent_api_mod; +use bootstrap_agent_api::BootstrapAgentApi; +use bootstrap_agent_api::Component; use dropshot::{ - endpoint, ApiDescription, HttpError, HttpResponseOk, - HttpResponseUpdatedNoContent, RequestContext, TypedBody, + ApiDescription, HttpError, HttpResponseOk, HttpResponseUpdatedNoContent, + RequestContext, TypedBody, }; use http::StatusCode; use omicron_common::api::external::Error; @@ -58,142 +60,98 @@ impl BootstrapServerContext { } } -type BootstrapApiDescription = ApiDescription; - /// Returns a description of the bootstrap agent API -pub(crate) fn api() -> BootstrapApiDescription { - fn register_endpoints( - api: &mut BootstrapApiDescription, - ) -> Result<(), ApiDescriptionRegisterError> { - api.register(baseboard_get)?; - api.register(components_get)?; - api.register(rack_initialization_status)?; - api.register(rack_initialize)?; - api.register(rack_reset)?; - api.register(sled_reset)?; - Ok(()) - } - - let mut api = BootstrapApiDescription::new(); - if let Err(err) = register_endpoints(&mut api) { - panic!("failed to register entrypoints: {}", err); - } - api +pub(crate) fn api() -> ApiDescription { + bootstrap_agent_api_mod::api_description::() + .expect("registered entrypoints successfully") } -/// Return the baseboard identity of this sled. -#[endpoint { - method = GET, - path = "/baseboard", -}] -async fn baseboard_get( - rqctx: RequestContext, -) -> Result, HttpError> { - let ctx = rqctx.context(); - Ok(HttpResponseOk(ctx.baseboard.clone())) -} +enum BootstrapAgentImpl {} -/// Provides a list of components known to the bootstrap agent. -/// -/// This API is intended to allow early boot services (such as Wicket) -/// to query the underlying component versions installed on a sled. 
-#[endpoint { - method = GET, - path = "/components", -}] -async fn components_get( - rqctx: RequestContext, -) -> Result>, HttpError> { - let ctx = rqctx.context(); - let updates = UpdateManager::new(ctx.updates.clone()); - let components = updates - .components_get() - .await - .map_err(|err| HttpError::for_internal_error(err.to_string()))?; - Ok(HttpResponseOk(components)) -} +impl BootstrapAgentApi for BootstrapAgentImpl { + type Context = BootstrapServerContext; -/// Get the current status of rack initialization or reset. -#[endpoint { - method = GET, - path = "/rack-initialize", -}] -async fn rack_initialization_status( - rqctx: RequestContext, -) -> Result, HttpError> { - let ctx = rqctx.context(); - let status = ctx.rss_access.operation_status(); - Ok(HttpResponseOk(status)) -} + async fn baseboard_get( + rqctx: RequestContext, + ) -> Result, HttpError> { + let ctx = rqctx.context(); + Ok(HttpResponseOk(ctx.baseboard.clone())) + } -/// Initializes the rack with the provided configuration. -#[endpoint { - method = POST, - path = "/rack-initialize", -}] -async fn rack_initialize( - rqctx: RequestContext, - body: TypedBody, -) -> Result, HttpError> { - let ctx = rqctx.context(); - let request = body.into_inner(); - let id = ctx - .start_rack_initialize(request) - .map_err(|err| HttpError::for_bad_request(None, err.to_string()))?; - Ok(HttpResponseOk(id)) -} + async fn components_get( + rqctx: RequestContext, + ) -> Result>, HttpError> { + let ctx = rqctx.context(); + let updates = UpdateManager::new(ctx.updates.clone()); + let components = updates + .components_get() + .await + .map_err(|err| HttpError::for_internal_error(err.to_string()))?; + Ok(HttpResponseOk(components)) + } -/// Resets the rack to an unconfigured state. -#[endpoint { - method = DELETE, - path = "/rack-initialize", -}] -async fn rack_reset( - rqctx: RequestContext, -) -> Result, HttpError> { - let ctx = rqctx.context(); - let id = ctx - .rss_access - .start_reset(&ctx.base_log, ctx.global_zone_bootstrap_ip) - .map_err(|err| HttpError::for_bad_request(None, err.to_string()))?; - Ok(HttpResponseOk(id)) -} + async fn rack_initialization_status( + rqctx: RequestContext, + ) -> Result, HttpError> { + let ctx = rqctx.context(); + let status = ctx.rss_access.operation_status(); + Ok(HttpResponseOk(status)) + } -/// Resets this particular sled to an unconfigured state. 
-#[endpoint { - method = DELETE, - path = "/sled-initialize", -}] -async fn sled_reset( - rqctx: RequestContext, -) -> Result { - let ctx = rqctx.context(); - let (response_tx, response_rx) = oneshot::channel(); - - let make_channel_closed_err = || { - Err(HttpError::for_internal_error( - "sled_reset channel closed: task panic?".to_string(), - )) - }; - - match ctx.sled_reset_tx.try_send(response_tx) { - Ok(()) => (), - Err(TrySendError::Full(_)) => { - return Err(HttpError::for_status( - Some("ResetPending".to_string()), - StatusCode::TOO_MANY_REQUESTS, - )); - } - Err(TrySendError::Closed(_)) => { - return make_channel_closed_err(); - } + async fn rack_initialize( + rqctx: RequestContext, + body: TypedBody, + ) -> Result, HttpError> { + let ctx = rqctx.context(); + let request = body.into_inner(); + let id = ctx + .start_rack_initialize(request) + .map_err(|err| HttpError::for_bad_request(None, err.to_string()))?; + Ok(HttpResponseOk(id)) } - match response_rx.await { - Ok(result) => { - () = result.map_err(Error::from)?; - Ok(HttpResponseUpdatedNoContent()) + async fn rack_reset( + rqctx: RequestContext, + ) -> Result, HttpError> { + let ctx = rqctx.context(); + let id = ctx + .rss_access + .start_reset(&ctx.base_log, ctx.global_zone_bootstrap_ip) + .map_err(|err| HttpError::for_bad_request(None, err.to_string()))?; + Ok(HttpResponseOk(id)) + } + + async fn sled_reset( + rqctx: RequestContext, + ) -> Result { + let ctx = rqctx.context(); + let (response_tx, response_rx) = oneshot::channel(); + + let make_channel_closed_err = || { + Err(HttpError::for_internal_error( + "sled_reset channel closed: task panic?".to_string(), + )) + }; + + match ctx.sled_reset_tx.try_send(response_tx) { + Ok(()) => (), + Err(TrySendError::Full(_)) => { + return Err(HttpError::for_status( + Some("ResetPending".to_string()), + StatusCode::TOO_MANY_REQUESTS, + )); + } + Err(TrySendError::Closed(_)) => { + return make_channel_closed_err(); + } + } + + match response_rx.await { + Ok(result) => { + () = result.map_err(Error::from)?; + Ok(HttpResponseUpdatedNoContent()) + } + Err(_) => make_channel_closed_err(), } - Err(_) => make_channel_closed_err(), } } diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 656be1a394..fa1d781a96 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -11,7 +11,6 @@ use super::views::SledAgentResponse; use super::BootstrapError; use super::RssAccessError; use crate::bootstrap::config::BOOTSTRAP_AGENT_RACK_INIT_PORT; -use crate::bootstrap::http_entrypoints::api as http_api; use crate::bootstrap::http_entrypoints::BootstrapServerContext; use crate::bootstrap::maghemite; use crate::bootstrap::pre_server::BootstrapAgentStartup; @@ -501,17 +500,6 @@ async fn sled_config_paths( Ok(paths) } -/// Runs the OpenAPI generator, emitting the spec to stdout. -pub fn run_openapi() -> Result<(), String> { - http_api() - .openapi("Oxide Bootstrap Agent API", "0.0.1") - .description("API for interacting with individual sleds") - .contact_url("https://oxide.computer") - .contact_email("api@oxide.computer") - .write(&mut std::io::stdout()) - .map_err(|e| e.to_string()) -} - struct Inner { config: SledConfig, state: SledAgentState, diff --git a/sled-agent/src/updates.rs b/sled-agent/src/updates.rs index 13a1ec7623..9193a855b0 100644 --- a/sled-agent/src/updates.rs +++ b/sled-agent/src/updates.rs @@ -5,14 +5,13 @@ //! 
Management of per-sled updates use crate::nexus::NexusClient; +use bootstrap_agent_api::Component; use camino::{Utf8Path, Utf8PathBuf}; use camino_tempfile::NamedUtf8TempFile; use futures::{TryFutureExt, TryStreamExt}; -use omicron_common::api::external::SemverVersion; use omicron_common::api::internal::nexus::{ KnownArtifactKind, UpdateArtifactId, }; -use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use std::io::Read; use tokio::io::AsyncWriteExt; @@ -68,12 +67,6 @@ impl Default for ConfigUpdates { } } -#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] -pub struct Component { - pub name: String, - pub version: SemverVersion, -} - // Helper functions for returning errors fn version_malformed_err(path: &Utf8Path, key: &str) -> Error { Error::VersionMalformed { @@ -261,7 +254,7 @@ mod test { use crate::fakes::nexus::FakeNexusServer; use flate2::write::GzEncoder; use nexus_client::Client as NexusClient; - use omicron_common::api::external::Error; + use omicron_common::api::external::{Error, SemverVersion}; use omicron_common::api::internal::nexus::UpdateArtifactId; use omicron_test_utils::dev::test_setup_log; use std::io::Write; diff --git a/sled-agent/tests/integration_tests/commands.rs b/sled-agent/tests/integration_tests/commands.rs index 132c3d78e4..26c82e488e 100644 --- a/sled-agent/tests/integration_tests/commands.rs +++ b/sled-agent/tests/integration_tests/commands.rs @@ -79,26 +79,3 @@ fn test_sled_agent_openapi_sled() { // that the changes match your expectations. assert_contents("../openapi/sled-agent.json", &stdout_text); } - -#[test] -fn test_bootstrap_agent_openapi_sled() { - let exec = Exec::cmd(path_to_sled_agent()).arg("openapi").arg("bootstrap"); - let (exit_status, stdout_text, stderr_text) = run_command(exec); - assert_exit_code(exit_status, EXIT_SUCCESS, &stderr_text); - assert_contents( - "tests/output/cmd-bootstrap-agent-openapi-sled-stderr", - &stderr_text, - ); - - let spec: OpenAPI = serde_json::from_str(&stdout_text) - .expect("stdout was not valid OpenAPI"); - - // Check for lint errors. - let errors = openapi_lint::validate(&spec); - assert!(errors.is_empty(), "{}", errors.join("\n\n")); - - // Confirm that the output hasn't changed. It's expected that we'll change - // this file as the API evolves, but pay attention to the diffs to ensure - // that the changes match your expectations. - assert_contents("../openapi/bootstrap-agent.json", &stdout_text); -} diff --git a/sled-agent/tests/output/cmd-bootstrap-agent-openapi-sled-stderr b/sled-agent/tests/output/cmd-bootstrap-agent-openapi-sled-stderr deleted file mode 100644 index e69de29bb2..0000000000
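With the spec now owned by openapi-manager, the bootstrap agent's OpenAPI document is produced from the generated stub description instead of the deleted `openapi bootstrap` subcommand and its golden-file test. As a rough sketch only (the manager's real plumbing goes through ApiSpec and all_apis(), shown earlier in this series), emitting the document directly might look like the following, assuming the stub description builds the same way as the typed one in http_entrypoints.rs:

use bootstrap_agent_api::bootstrap_agent_api_mod;

fn emit_bootstrap_agent_openapi() -> Result<(), String> {
    // Build the context-free stub description and write the OpenAPI
    // document to stdout, mirroring the deleted run_openapi() above.
    let api = bootstrap_agent_api_mod::stub_api_description()
        .map_err(|e| format!("{e:?}"))?;
    api.openapi("Bootstrap Agent API", "0.0.1")
        .description("Per-sled API for setup and teardown")
        .contact_url("https://oxide.computer")
        .contact_email("api@oxide.computer")
        .write(&mut std::io::stdout())
        .map_err(|e| e.to_string())
}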