From 7b9fee2f65f6e76fa37de4125c720631de5fc236 Mon Sep 17 00:00:00 2001 From: David Crespo Date: Thu, 22 Aug 2024 20:26:41 -0500 Subject: [PATCH 01/10] Bump web console (custom routers on subnets) (#6418) https://github.com/oxidecomputer/console/compare/8dcddcef...9ff6ac6c * [9ff6ac6c](https://github.com/oxidecomputer/console/commit/9ff6ac6c) oxidecomputer/console#2394 * [8028f9a5](https://github.com/oxidecomputer/console/commit/8028f9a5) oxidecomputer/console#2393 * [1bb92706](https://github.com/oxidecomputer/console/commit/1bb92706) oxidecomputer/console#2339 --- tools/console_version | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/console_version b/tools/console_version index ef59f6e40c..6ed1b23c0b 100644 --- a/tools/console_version +++ b/tools/console_version @@ -1,2 +1,2 @@ -COMMIT="8dcddcef62b8d10dfcd3adb470439212b23b3d5e" -SHA2="30a5ecc4d7b82dfc8bbd5ea59d5d92b8414d0362425c1ce1011da8c722a8ec4c" +COMMIT="9ff6ac6cc709b9081347f2718b99a9a799a41610" +SHA2="abeddddefcf70f1cea74178b6b7463eb834215a4168f16631ccae74e9d95a8e1" From 02303a6f03b19b8476fc4bc4a65d6b4e29585c6c Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Fri, 23 Aug 2024 09:30:53 -0700 Subject: [PATCH 02/10] Check histogram bin overflow in the support type, not the power type (#6409) Fixes #6408 --- oximeter/types/src/histogram.rs | 38 +++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/oximeter/types/src/histogram.rs b/oximeter/types/src/histogram.rs index 0b85727ee0..2a4feab382 100644 --- a/oximeter/types/src/histogram.rs +++ b/oximeter/types/src/histogram.rs @@ -1029,8 +1029,13 @@ where return Err(QuantizationError::InvalidSteps); } - // The highest power must be representable in the target type. - if self.checked_pow(hi.into()).is_none() { + // The highest power must be representable in the target type. Note that + // we have to convert to that target type _before_ doing this check. + let base = >::from(*self); + let Some(highest) = base.checked_pow(hi.into()) else { + return Err(QuantizationError::Overflow); + }; + if ::from(highest).is_none() { return Err(QuantizationError::Overflow); } @@ -1039,7 +1044,6 @@ where // // Note that we unwrap in a few places below, where we're sure the // narrowing conversion cannot fail, such as to a u32. - let base = >::from(*self); let lo = >::from(lo); let hi = >::from(hi); let count = ::from(count.get()) @@ -1057,7 +1061,6 @@ where let lo = base.pow(lo as _); let hi = base.pow(hi as _); let distance = hi - lo; - dbg!(distance, count); distance.is_multiple_of(&count) }) } @@ -1767,4 +1770,31 @@ mod tests { HistogramError::EmptyBins )); } + + #[test] + fn test_log_linear_bins_does_not_overflow_wide_bin_type() { + let start: u16 = 3; + // 10u16 ** 10u16 overflows, but what we should be computing is 10u64 ** + // 10u16, which would not overflow. We need to compute whether it + // overflows in the _support_ type. + let stop = 10; + Histogram::::span_decades(start, stop).expect( + "expected not to overflow, since support type is wide enough", + ); + } + + #[test] + fn test_log_linear_bins_does_overflow_narrow_bin_type() { + // In this case, the start / stop powers _and_ their resulting bins are + // both representable as u16s and also u64s. But we're generating bins + // that are u8s, which _the powers do_ overflow. 
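+ // Concretely: with stop = 4, the largest bin edge is 10^4 = 10_000, which + // fits in a u16 or u32 but exceeds u8::MAX == 255, so the u8 case errors.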
+ let start: u16 = 1; + let stop: u16 = 4; + Histogram::::span_decades(start, stop).expect( + "expected not to overflow a u32, since support type is wide enough", + ); + Histogram::::span_decades(start, stop).expect_err( + "expected to overflow a u8, since support type is not wide enough", + ); + } } From f2463f4305320997c8e4d62a18162ef817722674 Mon Sep 17 00:00:00 2001 From: David Crespo Date: Fri, 23 Aug 2024 13:20:01 -0500 Subject: [PATCH 03/10] Bump web console (instance list polling, status -> state) (#6419) https://github.com/oxidecomputer/console/compare/9ff6ac6c...77127657 * [77127657](https://github.com/oxidecomputer/console/commit/77127657) oxidecomputer/console#2395 * [342aa049](https://github.com/oxidecomputer/console/commit/342aa049) oxidecomputer/console#2391 --- tools/console_version | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/console_version b/tools/console_version index 6ed1b23c0b..b2fc99daf3 100644 --- a/tools/console_version +++ b/tools/console_version @@ -1,2 +1,2 @@ -COMMIT="9ff6ac6cc709b9081347f2718b99a9a799a41610" -SHA2="abeddddefcf70f1cea74178b6b7463eb834215a4168f16631ccae74e9d95a8e1" +COMMIT="771276573549dd255c6749980636aa7140e8bab8" +SHA2="4d441de0784bb0d775e0a7f4067758fd6c37fbf050ed76b744cd37d6e81af3d3" From d96ea7ca8945b8ad78a53fd083850ea39789e5f0 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 23 Aug 2024 21:00:03 +0100 Subject: [PATCH 04/10] [chore] Bump OPTE to v0.33.293 (#6400) * Move underlay NICs back into H/W Classification (oxidecomputer/opte#504) My disposition is to wait til R11 before we merge this -- I've done lengthy testing on `glasgow`, but I would like plenty of soak time on dogfood before this sees a release. --- Cargo.lock | 12 ++++++------ Cargo.toml | 4 ++-- tools/opte_version | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 837015f3bc..4edebcc911 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3810,7 +3810,7 @@ dependencies = [ [[package]] name = "illumos-sys-hdrs" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d#3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d" +source = "git+https://github.com/oxidecomputer/opte?rev=76878de67229ea113d70503c441eab47ac5dc653#76878de67229ea113d70503c441eab47ac5dc653" [[package]] name = "illumos-utils" @@ -4246,7 +4246,7 @@ dependencies = [ [[package]] name = "kstat-macro" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d#3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d" +source = "git+https://github.com/oxidecomputer/opte?rev=76878de67229ea113d70503c441eab47ac5dc653#76878de67229ea113d70503c441eab47ac5dc653" dependencies = [ "quote", "syn 2.0.74", @@ -6717,7 +6717,7 @@ dependencies = [ [[package]] name = "opte" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d#3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d" +source = "git+https://github.com/oxidecomputer/opte?rev=76878de67229ea113d70503c441eab47ac5dc653#76878de67229ea113d70503c441eab47ac5dc653" dependencies = [ "cfg-if", "dyn-clone", @@ -6734,7 +6734,7 @@ dependencies = [ [[package]] name = "opte-api" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d#3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d" +source = "git+https://github.com/oxidecomputer/opte?rev=76878de67229ea113d70503c441eab47ac5dc653#76878de67229ea113d70503c441eab47ac5dc653" 
dependencies = [ "illumos-sys-hdrs", "ipnetwork", @@ -6746,7 +6746,7 @@ dependencies = [ [[package]] name = "opte-ioctl" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d#3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d" +source = "git+https://github.com/oxidecomputer/opte?rev=76878de67229ea113d70503c441eab47ac5dc653#76878de67229ea113d70503c441eab47ac5dc653" dependencies = [ "libc", "libnet 0.1.0 (git+https://github.com/oxidecomputer/netadm-sys)", @@ -6820,7 +6820,7 @@ dependencies = [ [[package]] name = "oxide-vpc" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d#3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d" +source = "git+https://github.com/oxidecomputer/opte?rev=76878de67229ea113d70503c441eab47ac5dc653#76878de67229ea113d70503c441eab47ac5dc653" dependencies = [ "cfg-if", "illumos-sys-hdrs", diff --git a/Cargo.toml b/Cargo.toml index cfb097ef3c..cbb0216d5f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -450,7 +450,7 @@ omicron-test-utils = { path = "test-utils" } omicron-workspace-hack = "0.1.0" omicron-zone-package = "0.11.0" oxide-client = { path = "clients/oxide-client" } -oxide-vpc = { git = "https://github.com/oxidecomputer/opte", rev = "3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d", features = [ "api", "std" ] } +oxide-vpc = { git = "https://github.com/oxidecomputer/opte", rev = "76878de67229ea113d70503c441eab47ac5dc653", features = [ "api", "std" ] } oxlog = { path = "dev-tools/oxlog" } oxnet = { git = "https://github.com/oxidecomputer/oxnet" } once_cell = "1.19.0" @@ -460,7 +460,7 @@ openapiv3 = "2.0.0" # must match samael's crate! openssl = "0.10" openssl-sys = "0.9" -opte-ioctl = { git = "https://github.com/oxidecomputer/opte", rev = "3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d" } +opte-ioctl = { git = "https://github.com/oxidecomputer/opte", rev = "76878de67229ea113d70503c441eab47ac5dc653" } oso = "0.27" owo-colors = "4.0.0" oximeter = { path = "oximeter/oximeter" } diff --git a/tools/opte_version b/tools/opte_version index dfbb589f24..0e2023666f 100644 --- a/tools/opte_version +++ b/tools/opte_version @@ -1 +1 @@ -0.33.277 +0.33.293 From 9ac07441afc15cabc7dfeab59915b50da69f44be Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Fri, 23 Aug 2024 14:53:50 -0700 Subject: [PATCH 05/10] Expunge old switch-table timeseries schema (#6423) Fixes #6422 --- .../db/schema/replicated/11/timeseries-to-delete.txt | 9 +++++++++ .../db/schema/single-node/11/timeseries-to-delete.txt | 9 +++++++++ oximeter/db/src/model.rs | 2 +- 3 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 oximeter/db/schema/replicated/11/timeseries-to-delete.txt create mode 100644 oximeter/db/schema/single-node/11/timeseries-to-delete.txt diff --git a/oximeter/db/schema/replicated/11/timeseries-to-delete.txt b/oximeter/db/schema/replicated/11/timeseries-to-delete.txt new file mode 100644 index 0000000000..4f0301a6b5 --- /dev/null +++ b/oximeter/db/schema/replicated/11/timeseries-to-delete.txt @@ -0,0 +1,9 @@ +switch_table:capacity +switch_table:collisions +switch_table:delete_misses +switch_table:deletes +switch_table:exhaustion +switch_table:inserts +switch_table:occupancy +switch_table:update_misses +switch_table:updates diff --git a/oximeter/db/schema/single-node/11/timeseries-to-delete.txt b/oximeter/db/schema/single-node/11/timeseries-to-delete.txt new file mode 100644 index 0000000000..4f0301a6b5 --- /dev/null +++ b/oximeter/db/schema/single-node/11/timeseries-to-delete.txt @@ -0,0 +1,9 
@@ +switch_table:capacity +switch_table:collisions +switch_table:delete_misses +switch_table:deletes +switch_table:exhaustion +switch_table:inserts +switch_table:occupancy +switch_table:update_misses +switch_table:updates diff --git a/oximeter/db/src/model.rs b/oximeter/db/src/model.rs index 7608f81e45..a3e9d109ff 100644 --- a/oximeter/db/src/model.rs +++ b/oximeter/db/src/model.rs @@ -45,7 +45,7 @@ use uuid::Uuid; /// - [`crate::Client::initialize_db_with_version`] /// - [`crate::Client::ensure_schema`] /// - The `clickhouse-schema-updater` binary in this crate -pub const OXIMETER_VERSION: u64 = 10; +pub const OXIMETER_VERSION: u64 = 11; // Wrapper type to represent a boolean in the database. // From 41d36d75ac0e442212366f3d0567e33ecb47c067 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 23 Aug 2024 15:40:07 -0700 Subject: [PATCH 06/10] [oximeter] spell "Celsius" correctly (#6426) Thanks to @elaine-oxide for catching this --- I had misspelt "Celsius" as "Celcius" and it had made it all the way into the CLI thanks to its dependency on the Nexus API. This commit corrects the misspelling. --- openapi/nexus.json | 2 +- oximeter/schema/src/codegen.rs | 4 ++-- oximeter/types/src/schema.rs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/openapi/nexus.json b/openapi/nexus.json index 285dcd82bb..2a8c227c64 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -19934,7 +19934,7 @@ "nanoseconds", "volts", "amps", - "degrees_celcius" + "degrees_celsius" ] }, { diff --git a/oximeter/schema/src/codegen.rs b/oximeter/schema/src/codegen.rs index 0429cf0534..c46c25c97d 100644 --- a/oximeter/schema/src/codegen.rs +++ b/oximeter/schema/src/codegen.rs @@ -512,8 +512,8 @@ fn quote_units(units: Units) -> TokenStream { } Units::Amps => quote! { ::oximeter::schema::Units::Amps }, Units::Volts => quote! { ::oximeter::schema::Units::Volts }, - Units::DegreesCelcius => { - quote! { ::oximeter::schema::Units::DegreesCelcius } + Units::DegreesCelsius => { + quote! { ::oximeter::schema::Units::DegreesCelsius } } Units::Rpm => quote! { ::oximeter::schema::Units::Rpm }, } diff --git a/oximeter/types/src/schema.rs b/oximeter/types/src/schema.rs index 80aaa6f101..e06e6e2b57 100644 --- a/oximeter/types/src/schema.rs +++ b/oximeter/types/src/schema.rs @@ -189,7 +189,7 @@ pub enum Units { Nanoseconds, Volts, Amps, - DegreesCelcius, + DegreesCelsius, /// Rotations per minute. Rpm, } From 876ae85fc86e21ce81f1c49783fb86907d99fe8e Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Fri, 23 Aug 2024 19:14:54 -0400 Subject: [PATCH 07/10] Use `BlueprintZoneConfig` in RSS service plan (#6410) In anticipation of adding more `BlueprintZoneConfig` variants with more auxiliary information, we stop converting from `OmicronZoneConfig` to `BlueprintZoneConfig` which is not going to be feasible for much longer. Instead we change the one production code place we do this, RSS, to directly construct `BlueprintZoneConfig` structs rather than do the conversion. This has some ripple effects, and results in a new persistent v4 sled service plan. There is one test that still does this conversion, but the function that does it is now moved into that test module and commented heavily. We hope to remove it shortly. 
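To make the retained direction concrete, here is a minimal sketch (not part of this patch) of the one conversion production code keeps, relying on the `impl From<BlueprintZoneConfig> for OmicronZoneConfig` visible in the `nexus/types/src/deployment.rs` hunk below:

```rust
use nexus_sled_agent_shared::inventory::OmicronZoneConfig;
use nexus_types::deployment::BlueprintZoneConfig;

// Narrowing is the lossy, infallible direction: a `BlueprintZoneConfig`
// carries auxiliary data (disposition, external IP IDs) that
// `OmicronZoneConfig` has no fields for, so that data is simply dropped.
fn narrow(zone: BlueprintZoneConfig) -> OmicronZoneConfig {
    zone.into()
}
```

The reverse direction has to invent the missing external IP IDs, which is exactly why it is now confined to a heavily commented test-only helper.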
--- Cargo.lock | 1 + nexus/reconfigurator/execution/Cargo.toml | 1 + nexus/reconfigurator/execution/src/dns.rs | 214 ++++- nexus/types/src/deployment.rs | 172 ---- schema/rss-service-plan-v4.json | 999 ++++++++++++++++++++++ sled-agent/src/rack_setup/mod.rs | 5 + sled-agent/src/rack_setup/plan/service.rs | 362 +++++--- sled-agent/src/rack_setup/service.rs | 93 +- sled-agent/src/sim/server.rs | 105 ++- 9 files changed, 1548 insertions(+), 404 deletions(-) create mode 100644 schema/rss-service-plan-v4.json diff --git a/Cargo.lock b/Cargo.lock index 4edebcc911..2630aa2a25 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5245,6 +5245,7 @@ dependencies = [ "httptest", "internal-dns", "ipnet", + "newtype-uuid", "nexus-config", "nexus-db-model", "nexus-db-queries", diff --git a/nexus/reconfigurator/execution/Cargo.toml b/nexus/reconfigurator/execution/Cargo.toml index a531b66df4..1c62e553a8 100644 --- a/nexus/reconfigurator/execution/Cargo.toml +++ b/nexus/reconfigurator/execution/Cargo.toml @@ -16,6 +16,7 @@ dns-service-client.workspace = true chrono.workspace = true futures.workspace = true internal-dns.workspace = true +newtype-uuid.workspace = true nexus-config.workspace = true nexus-db-model.workspace = true nexus-db-queries.workspace = true diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 9ca14f8e24..1c878a9ada 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -467,6 +467,7 @@ mod test { use internal_dns::resolver::Resolver; use internal_dns::ServiceName; use internal_dns::DNS_ZONE; + use newtype_uuid::GenericUuid; use nexus_db_model::DnsGroup; use nexus_db_model::Silo; use nexus_db_queries::authn; @@ -478,6 +479,8 @@ mod test { use nexus_reconfigurator_planning::blueprint_builder::EnsureMultiple; use nexus_reconfigurator_planning::example::example; use nexus_reconfigurator_preparation::PlanningInputFromDb; + use nexus_sled_agent_shared::inventory::OmicronZoneConfig; + use nexus_sled_agent_shared::inventory::OmicronZoneType; use nexus_sled_agent_shared::inventory::ZoneKind; use nexus_test_utils::resource_helpers::create_silo; use nexus_test_utils::resource_helpers::DiskTestBuilder; @@ -490,6 +493,9 @@ mod test { use nexus_types::deployment::CockroachDbClusterVersion; use nexus_types::deployment::CockroachDbPreserveDowngrade; use nexus_types::deployment::CockroachDbSettings; + pub use nexus_types::deployment::OmicronZoneExternalFloatingAddr; + pub use nexus_types::deployment::OmicronZoneExternalFloatingIp; + pub use nexus_types::deployment::OmicronZoneExternalSnatIp; use nexus_types::deployment::SledFilter; use nexus_types::external_api::params; use nexus_types::external_api::shared; @@ -539,6 +545,212 @@ mod test { } } + /// ********************************************************************** + /// DEPRECATION WARNING: + /// + /// Remove when `deprecated_omicron_zone_config_to_blueprint_zone_config` + /// is deleted. + /// ********************************************************************** + /// + /// Errors from converting an [`OmicronZoneType`] into a [`BlueprintZoneType`]. + #[derive(Debug, Clone)] + pub enum InvalidOmicronZoneType { + #[allow(unused)] + ExternalIpIdRequired { kind: ZoneKind }, + } + + /// ********************************************************************** + /// DEPRECATION WARNING: Do not call this function in new code !!! 
+ /// ********************************************************************** + /// + /// Convert an [`OmicronZoneConfig`] to a [`BlueprintZoneConfig`]. + /// + /// A `BlueprintZoneConfig` is a superset of `OmicronZoneConfig` and + /// contains auxiliary information not present in an `OmicronZoneConfig`. + /// Therefore, the only valid direction for a real system to take is a + /// lossy conversion from `BlueprintZoneConfig` to `OmicronZoneConfig`. + /// This function, however, does the opposite. We therefore have to inject + /// fake information to fill in the unknown fields in the generated + /// `OmicronZoneConfig`. + /// + /// This is bad, and we should generally feel bad for doing it :). At + /// the time this was done we were backporting the blueprint system into + /// RSS while trying not to change too much code. This was a judicious + /// shortcut used right before a release for stability reasons. As the + /// number of zones managed by the reconfigurator has grown, the use of + /// this function has become more egregious, and so it was removed from + /// the production code path and moved into this test module. This move + /// itself is a judicious shortcut: the test + /// `test_blueprint_internal_dns_basic` is the last remaining caller of + /// this function, which is why it now lives here. + /// + /// Ideally, we would get rid of this function altogether and use another + /// method for generating `BlueprintZoneConfig` structures. Unfortunately, + /// there are still a few remaining zones that need to be implemented in the + /// `BlueprintBuilder`, and some of them require custom code. Until that is + /// done, we don't have a good way of generating a test representation of + /// the real system that would properly serve this test. We could generate + /// a `BlueprintZoneConfig` by hand for each zone type in this test, on + /// top of the more modern `SystemDescription` setup, but that isn't much + /// different from what we do in this test. We'd also eventually remove it + /// in favor of better test setup once our `BlueprintBuilder` is capable of + /// properly constructing all zone types. Instead, we do the simple thing + /// and reuse what we already have. + /// + /// # Errors + /// + /// If `config.zone_type` is a zone that has an external IP address (Nexus, + /// boundary NTP, external DNS), `external_ip_id` must be `Some(_)` or this + /// method will return an error. 
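+ /// + /// For example, converting a Nexus, boundary NTP, or external DNS zone + /// with `external_ip_id: None` returns + /// `InvalidOmicronZoneType::ExternalIpIdRequired`, while any freshly + /// minted ID (e.g. `Some(ExternalIpUuid::new_v4())`) succeeds.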
+ pub fn deprecated_omicron_zone_config_to_blueprint_zone_config( + config: OmicronZoneConfig, + disposition: BlueprintZoneDisposition, + external_ip_id: Option, + ) -> Result { + let kind = config.zone_type.kind(); + let zone_type = match config.zone_type { + OmicronZoneType::BoundaryNtp { + address, + dns_servers, + domain, + nic, + ntp_servers, + snat_cfg, + } => { + let external_ip_id = external_ip_id.ok_or( + InvalidOmicronZoneType::ExternalIpIdRequired { kind }, + )?; + BlueprintZoneType::BoundaryNtp( + blueprint_zone_type::BoundaryNtp { + address, + ntp_servers, + dns_servers, + domain, + nic, + external_ip: OmicronZoneExternalSnatIp { + id: external_ip_id, + snat_cfg, + }, + }, + ) + } + OmicronZoneType::Clickhouse { address, dataset } => { + BlueprintZoneType::Clickhouse(blueprint_zone_type::Clickhouse { + address, + dataset, + }) + } + OmicronZoneType::ClickhouseKeeper { address, dataset } => { + BlueprintZoneType::ClickhouseKeeper( + blueprint_zone_type::ClickhouseKeeper { address, dataset }, + ) + } + OmicronZoneType::ClickhouseServer { address, dataset } => { + BlueprintZoneType::ClickhouseServer( + blueprint_zone_type::ClickhouseServer { address, dataset }, + ) + } + OmicronZoneType::CockroachDb { address, dataset } => { + BlueprintZoneType::CockroachDb( + blueprint_zone_type::CockroachDb { address, dataset }, + ) + } + OmicronZoneType::Crucible { address, dataset } => { + BlueprintZoneType::Crucible(blueprint_zone_type::Crucible { + address, + dataset, + }) + } + OmicronZoneType::CruciblePantry { address } => { + BlueprintZoneType::CruciblePantry( + blueprint_zone_type::CruciblePantry { address }, + ) + } + OmicronZoneType::ExternalDns { + dataset, + dns_address, + http_address, + nic, + } => { + let external_ip_id = external_ip_id.ok_or( + InvalidOmicronZoneType::ExternalIpIdRequired { kind }, + )?; + BlueprintZoneType::ExternalDns( + blueprint_zone_type::ExternalDns { + dataset, + http_address, + dns_address: OmicronZoneExternalFloatingAddr { + id: external_ip_id, + addr: dns_address, + }, + nic, + }, + ) + } + OmicronZoneType::InternalDns { + dataset, + dns_address, + gz_address, + gz_address_index, + http_address, + } => BlueprintZoneType::InternalDns( + blueprint_zone_type::InternalDns { + dataset, + http_address, + dns_address, + gz_address, + gz_address_index, + }, + ), + OmicronZoneType::InternalNtp { + address, + dns_servers, + domain, + ntp_servers, + } => BlueprintZoneType::InternalNtp( + blueprint_zone_type::InternalNtp { + address, + ntp_servers, + dns_servers, + domain, + }, + ), + OmicronZoneType::Nexus { + external_dns_servers, + external_ip, + external_tls, + internal_address, + nic, + } => { + let external_ip_id = external_ip_id.ok_or( + InvalidOmicronZoneType::ExternalIpIdRequired { kind }, + )?; + BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { + internal_address, + external_ip: OmicronZoneExternalFloatingIp { + id: external_ip_id, + ip: external_ip, + }, + nic, + external_tls, + external_dns_servers, + }) + } + OmicronZoneType::Oximeter { address } => { + BlueprintZoneType::Oximeter(blueprint_zone_type::Oximeter { + address, + }) + } + }; + Ok(BlueprintZoneConfig { + disposition, + id: OmicronZoneUuid::from_untyped_uuid(config.id), + underlay_address: config.underlay_address, + filesystem_pool: config.filesystem_pool, + zone_type, + }) + } + /// test blueprint_internal_dns_config(): trivial case of an empty blueprint #[test] fn test_blueprint_internal_dns_empty() { @@ -589,7 +801,7 @@ mod test { .zones .into_iter() .map(|config| -> 
BlueprintZoneConfig { - BlueprintZoneConfig::from_omicron_zone_config( + deprecated_omicron_zone_config_to_blueprint_zone_config( config, BlueprintZoneDisposition::InService, // We don't get external IP IDs in inventory diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index cc48f2646a..96de893fa3 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -27,20 +27,17 @@ use omicron_common::api::external::Generation; use omicron_common::disk::DiskIdentity; use omicron_common::disk::OmicronPhysicalDisksConfig; use omicron_uuid_kinds::CollectionUuid; -use omicron_uuid_kinds::ExternalIpUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SledUuid; use schemars::JsonSchema; use serde::Deserialize; use serde::Serialize; -use slog_error_chain::SlogInlineError; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::fmt; use std::net::Ipv6Addr; use strum::EnumIter; use strum::IntoEnumIterator; -use thiserror::Error; use uuid::Uuid; mod blueprint_diff; @@ -595,13 +592,6 @@ fn zone_sort_key(z: &T) -> impl Ord { (z.kind(), z.id()) } -/// Errors from converting an [`OmicronZoneType`] into a [`BlueprintZoneType`]. -#[derive(Debug, Clone, Error, SlogInlineError)] -pub enum InvalidOmicronZoneType { - #[error("Omicron zone {} requires an external IP ID", kind.report_str())] - ExternalIpIdRequired { kind: ZoneKind }, -} - /// Describes one Omicron-managed zone in a blueprint. /// /// Part of [`BlueprintZonesConfig`]. @@ -616,168 +606,6 @@ pub struct BlueprintZoneConfig { pub zone_type: BlueprintZoneType, } -impl BlueprintZoneConfig { - /// Convert from an [`OmicronZoneConfig`]. - /// - /// This method is annoying to call correctly and will become more so over - /// time. Ideally we'd remove all callers and then remove this method, but - /// for now we keep it. - /// - /// # Errors - /// - /// If `config.zone_type` is a zone that has an external IP address (Nexus, - /// boundary NTP, external DNS), `external_ip_id` must be `Some(_)` or this - /// method will return an error. 
- pub fn from_omicron_zone_config( - config: OmicronZoneConfig, - disposition: BlueprintZoneDisposition, - external_ip_id: Option, - ) -> Result { - let kind = config.zone_type.kind(); - let zone_type = match config.zone_type { - OmicronZoneType::BoundaryNtp { - address, - dns_servers, - domain, - nic, - ntp_servers, - snat_cfg, - } => { - let external_ip_id = external_ip_id.ok_or( - InvalidOmicronZoneType::ExternalIpIdRequired { kind }, - )?; - BlueprintZoneType::BoundaryNtp( - blueprint_zone_type::BoundaryNtp { - address, - ntp_servers, - dns_servers, - domain, - nic, - external_ip: OmicronZoneExternalSnatIp { - id: external_ip_id, - snat_cfg, - }, - }, - ) - } - OmicronZoneType::Clickhouse { address, dataset } => { - BlueprintZoneType::Clickhouse(blueprint_zone_type::Clickhouse { - address, - dataset, - }) - } - OmicronZoneType::ClickhouseKeeper { address, dataset } => { - BlueprintZoneType::ClickhouseKeeper( - blueprint_zone_type::ClickhouseKeeper { address, dataset }, - ) - } - OmicronZoneType::ClickhouseServer { address, dataset } => { - BlueprintZoneType::ClickhouseServer( - blueprint_zone_type::ClickhouseServer { address, dataset }, - ) - } - OmicronZoneType::CockroachDb { address, dataset } => { - BlueprintZoneType::CockroachDb( - blueprint_zone_type::CockroachDb { address, dataset }, - ) - } - OmicronZoneType::Crucible { address, dataset } => { - BlueprintZoneType::Crucible(blueprint_zone_type::Crucible { - address, - dataset, - }) - } - OmicronZoneType::CruciblePantry { address } => { - BlueprintZoneType::CruciblePantry( - blueprint_zone_type::CruciblePantry { address }, - ) - } - OmicronZoneType::ExternalDns { - dataset, - dns_address, - http_address, - nic, - } => { - let external_ip_id = external_ip_id.ok_or( - InvalidOmicronZoneType::ExternalIpIdRequired { kind }, - )?; - BlueprintZoneType::ExternalDns( - blueprint_zone_type::ExternalDns { - dataset, - http_address, - dns_address: OmicronZoneExternalFloatingAddr { - id: external_ip_id, - addr: dns_address, - }, - nic, - }, - ) - } - OmicronZoneType::InternalDns { - dataset, - dns_address, - gz_address, - gz_address_index, - http_address, - } => BlueprintZoneType::InternalDns( - blueprint_zone_type::InternalDns { - dataset, - http_address, - dns_address, - gz_address, - gz_address_index, - }, - ), - OmicronZoneType::InternalNtp { - address, - dns_servers, - domain, - ntp_servers, - } => BlueprintZoneType::InternalNtp( - blueprint_zone_type::InternalNtp { - address, - ntp_servers, - dns_servers, - domain, - }, - ), - OmicronZoneType::Nexus { - external_dns_servers, - external_ip, - external_tls, - internal_address, - nic, - } => { - let external_ip_id = external_ip_id.ok_or( - InvalidOmicronZoneType::ExternalIpIdRequired { kind }, - )?; - BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { - internal_address, - external_ip: OmicronZoneExternalFloatingIp { - id: external_ip_id, - ip: external_ip, - }, - nic, - external_tls, - external_dns_servers, - }) - } - OmicronZoneType::Oximeter { address } => { - BlueprintZoneType::Oximeter(blueprint_zone_type::Oximeter { - address, - }) - } - }; - Ok(Self { - disposition, - id: OmicronZoneUuid::from_untyped_uuid(config.id), - underlay_address: config.underlay_address, - filesystem_pool: config.filesystem_pool, - zone_type, - }) - } -} - impl From for OmicronZoneConfig { fn from(z: BlueprintZoneConfig) -> Self { Self { diff --git a/schema/rss-service-plan-v4.json b/schema/rss-service-plan-v4.json new file mode 100644 index 0000000000..badfaf4589 --- /dev/null +++ 
b/schema/rss-service-plan-v4.json @@ -0,0 +1,999 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Plan", + "type": "object", + "required": [ + "dns_config", + "services" + ], + "properties": { + "dns_config": { + "$ref": "#/definitions/DnsConfigParams" + }, + "services": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/SledConfig" + } + } + }, + "definitions": { + "BlueprintZoneConfig": { + "description": "Describes one Omicron-managed zone in a blueprint.\n\nPart of [`BlueprintZonesConfig`].", + "type": "object", + "required": [ + "disposition", + "id", + "underlay_address", + "zone_type" + ], + "properties": { + "disposition": { + "description": "The disposition (desired state) of this zone recorded in the blueprint.", + "allOf": [ + { + "$ref": "#/definitions/BlueprintZoneDisposition" + } + ] + }, + "filesystem_pool": { + "anyOf": [ + { + "$ref": "#/definitions/ZpoolName" + }, + { + "type": "null" + } + ] + }, + "id": { + "$ref": "#/definitions/TypedUuidForOmicronZoneKind" + }, + "underlay_address": { + "type": "string", + "format": "ipv6" + }, + "zone_type": { + "$ref": "#/definitions/BlueprintZoneType" + } + } + }, + "BlueprintZoneDisposition": { + "description": "The desired state of an Omicron-managed zone in a blueprint.\n\nPart of [`BlueprintZoneConfig`].", + "oneOf": [ + { + "description": "The zone is in-service.", + "type": "string", + "enum": [ + "in_service" + ] + }, + { + "description": "The zone is not in service.", + "type": "string", + "enum": [ + "quiesced" + ] + }, + { + "description": "The zone is permanently gone.", + "type": "string", + "enum": [ + "expunged" + ] + } + ] + }, + "BlueprintZoneType": { + "oneOf": [ + { + "type": "object", + "required": [ + "address", + "dns_servers", + "external_ip", + "nic", + "ntp_servers", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dns_servers": { + "type": "array", + "items": { + "type": "string", + "format": "ip" + } + }, + "domain": { + "type": [ + "string", + "null" + ] + }, + "external_ip": { + "$ref": "#/definitions/OmicronZoneExternalSnatIp" + }, + "nic": { + "description": "The service vNIC providing outbound connectivity using OPTE.", + "allOf": [ + { + "$ref": "#/definitions/NetworkInterface" + } + ] + }, + "ntp_servers": { + "type": "array", + "items": { + "type": "string" + } + }, + "type": { + "type": "string", + "enum": [ + "boundary_ntp" + ] + } + } + }, + { + "description": "Used in single-node clickhouse setups", + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "clickhouse" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "clickhouse_keeper" + ] + } + } + }, + { + "description": "Used in replicated clickhouse setups", + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "clickhouse_server" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + 
"dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "cockroach_db" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "crucible" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "crucible_pantry" + ] + } + } + }, + { + "type": "object", + "required": [ + "dataset", + "dns_address", + "http_address", + "nic", + "type" + ], + "properties": { + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "dns_address": { + "description": "The address at which the external DNS server is reachable.", + "allOf": [ + { + "$ref": "#/definitions/OmicronZoneExternalFloatingAddr" + } + ] + }, + "http_address": { + "description": "The address at which the external DNS server API is reachable.", + "type": "string" + }, + "nic": { + "description": "The service vNIC providing external connectivity using OPTE.", + "allOf": [ + { + "$ref": "#/definitions/NetworkInterface" + } + ] + }, + "type": { + "type": "string", + "enum": [ + "external_dns" + ] + } + } + }, + { + "type": "object", + "required": [ + "dataset", + "dns_address", + "gz_address", + "gz_address_index", + "http_address", + "type" + ], + "properties": { + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "dns_address": { + "type": "string" + }, + "gz_address": { + "description": "The addresses in the global zone which should be created\n\nFor the DNS service, which exists outside the sleds's typical subnet - adding an address in the GZ is necessary to allow inter-zone traffic routing.", + "type": "string", + "format": "ipv6" + }, + "gz_address_index": { + "description": "The address is also identified with an auxiliary bit of information to ensure that the created global zone address can have a unique name.", + "type": "integer", + "format": "uint32", + "minimum": 0.0 + }, + "http_address": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "internal_dns" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "dns_servers", + "ntp_servers", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dns_servers": { + "type": "array", + "items": { + "type": "string", + "format": "ip" + } + }, + "domain": { + "type": [ + "string", + "null" + ] + }, + "ntp_servers": { + "type": "array", + "items": { + "type": "string" + } + }, + "type": { + "type": "string", + "enum": [ + "internal_ntp" + ] + } + } + }, + { + "type": "object", + "required": [ + "external_dns_servers", + "external_ip", + "external_tls", + "internal_address", + "nic", + "type" + ], + "properties": { + "external_dns_servers": { + "description": "External DNS servers Nexus can use to resolve external hosts.", + "type": "array", + "items": { + "type": "string", + "format": "ip" + } + }, + "external_ip": { + "description": "The address at which the external nexus server is reachable.", + "allOf": [ + { + "$ref": "#/definitions/OmicronZoneExternalFloatingIp" + } + ] + }, + "external_tls": { + "description": "Whether Nexus's external endpoint should use TLS", + "type": "boolean" + }, + "internal_address": { + "description": "The address at which the internal nexus server is reachable.", + "type": "string" + }, + 
"nic": { + "description": "The service vNIC providing external connectivity using OPTE.", + "allOf": [ + { + "$ref": "#/definitions/NetworkInterface" + } + ] + }, + "type": { + "type": "string", + "enum": [ + "nexus" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "oximeter" + ] + } + } + } + ] + }, + "DiskIdentity": { + "description": "Uniquely identifies a disk.", + "type": "object", + "required": [ + "model", + "serial", + "vendor" + ], + "properties": { + "model": { + "type": "string" + }, + "serial": { + "type": "string" + }, + "vendor": { + "type": "string" + } + } + }, + "DnsConfigParams": { + "description": "DnsConfigParams\n\n
JSON schema\n\n```json { \"type\": \"object\", \"required\": [ \"generation\", \"time_created\", \"zones\" ], \"properties\": { \"generation\": { \"type\": \"integer\", \"format\": \"uint64\", \"minimum\": 0.0 }, \"time_created\": { \"type\": \"string\", \"format\": \"date-time\" }, \"zones\": { \"type\": \"array\", \"items\": { \"$ref\": \"#/components/schemas/DnsConfigZone\" } } } } ```
", + "type": "object", + "required": [ + "generation", + "time_created", + "zones" + ], + "properties": { + "generation": { + "type": "integer", + "format": "uint64", + "minimum": 0.0 + }, + "time_created": { + "type": "string", + "format": "date-time" + }, + "zones": { + "type": "array", + "items": { + "$ref": "#/definitions/DnsConfigZone" + } + } + } + }, + "DnsConfigZone": { + "description": "DnsConfigZone\n\n
JSON schema\n\n```json { \"type\": \"object\", \"required\": [ \"records\", \"zone_name\" ], \"properties\": { \"records\": { \"type\": \"object\", \"additionalProperties\": { \"type\": \"array\", \"items\": { \"$ref\": \"#/components/schemas/DnsRecord\" } } }, \"zone_name\": { \"type\": \"string\" } } } ```
", + "type": "object", + "required": [ + "records", + "zone_name" + ], + "properties": { + "records": { + "type": "object", + "additionalProperties": { + "type": "array", + "items": { + "$ref": "#/definitions/DnsRecord" + } + } + }, + "zone_name": { + "type": "string" + } + } + }, + "DnsRecord": { + "description": "DnsRecord\n\n
JSON schema\n\n```json { \"oneOf\": [ { \"type\": \"object\", \"required\": [ \"data\", \"type\" ], \"properties\": { \"data\": { \"type\": \"string\", \"format\": \"ipv4\" }, \"type\": { \"type\": \"string\", \"enum\": [ \"A\" ] } } }, { \"type\": \"object\", \"required\": [ \"data\", \"type\" ], \"properties\": { \"data\": { \"type\": \"string\", \"format\": \"ipv6\" }, \"type\": { \"type\": \"string\", \"enum\": [ \"AAAA\" ] } } }, { \"type\": \"object\", \"required\": [ \"data\", \"type\" ], \"properties\": { \"data\": { \"$ref\": \"#/components/schemas/Srv\" }, \"type\": { \"type\": \"string\", \"enum\": [ \"SRV\" ] } } } ] } ```
", + "oneOf": [ + { + "type": "object", + "required": [ + "data", + "type" + ], + "properties": { + "data": { + "type": "string", + "format": "ipv4" + }, + "type": { + "type": "string", + "enum": [ + "A" + ] + } + } + }, + { + "type": "object", + "required": [ + "data", + "type" + ], + "properties": { + "data": { + "type": "string", + "format": "ipv6" + }, + "type": { + "type": "string", + "enum": [ + "AAAA" + ] + } + } + }, + { + "type": "object", + "required": [ + "data", + "type" + ], + "properties": { + "data": { + "$ref": "#/definitions/Srv" + }, + "type": { + "type": "string", + "enum": [ + "SRV" + ] + } + } + } + ] + }, + "Generation": { + "description": "Generation numbers stored in the database, used for optimistic concurrency control", + "type": "integer", + "format": "uint64", + "minimum": 0.0 + }, + "IpNet": { + "oneOf": [ + { + "title": "v4", + "allOf": [ + { + "$ref": "#/definitions/Ipv4Net" + } + ] + }, + { + "title": "v6", + "allOf": [ + { + "$ref": "#/definitions/Ipv6Net" + } + ] + } + ], + "x-rust-type": { + "crate": "oxnet", + "path": "oxnet::IpNet", + "version": "0.1.0" + } + }, + "Ipv4Net": { + "title": "An IPv4 subnet", + "description": "An IPv4 subnet, including prefix and prefix length", + "examples": [ + "192.168.1.0/24" + ], + "type": "string", + "pattern": "^(([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\\.){3}([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])/([0-9]|1[0-9]|2[0-9]|3[0-2])$", + "x-rust-type": { + "crate": "oxnet", + "path": "oxnet::Ipv4Net", + "version": "0.1.0" + } + }, + "Ipv6Net": { + "title": "An IPv6 subnet", + "description": "An IPv6 subnet, including prefix and subnet mask", + "examples": [ + "fd12:3456::/64" + ], + "type": "string", + "pattern": "^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))\\/([0-9]|[1-9][0-9]|1[0-1][0-9]|12[0-8])$", + "x-rust-type": { + "crate": "oxnet", + "path": "oxnet::Ipv6Net", + "version": "0.1.0" + } + }, + "MacAddr": { + "title": "A MAC address", + "description": "A Media Access Control address, in EUI-48 format", + "examples": [ + "ff:ff:ff:ff:ff:ff" + ], + "type": "string", + "maxLength": 17, + "minLength": 5, + "pattern": "^([0-9a-fA-F]{0,2}:){5}[0-9a-fA-F]{0,2}$" + }, + "Name": { + "title": "A name unique within the parent collection", + "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. Names cannot be a UUID, but they may contain a UUID. 
They can be at most 63 characters long.", + "type": "string", + "maxLength": 63, + "minLength": 1, + "pattern": "^(?![0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$)^[a-z]([a-zA-Z0-9-]*[a-zA-Z0-9]+)?$" + }, + "NetworkInterface": { + "description": "Information required to construct a virtual network interface", + "type": "object", + "required": [ + "id", + "ip", + "kind", + "mac", + "name", + "primary", + "slot", + "subnet", + "vni" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "ip": { + "type": "string", + "format": "ip" + }, + "kind": { + "$ref": "#/definitions/NetworkInterfaceKind" + }, + "mac": { + "$ref": "#/definitions/MacAddr" + }, + "name": { + "$ref": "#/definitions/Name" + }, + "primary": { + "type": "boolean" + }, + "slot": { + "type": "integer", + "format": "uint8", + "minimum": 0.0 + }, + "subnet": { + "$ref": "#/definitions/IpNet" + }, + "transit_ips": { + "default": [], + "type": "array", + "items": { + "$ref": "#/definitions/IpNet" + } + }, + "vni": { + "$ref": "#/definitions/Vni" + } + } + }, + "NetworkInterfaceKind": { + "description": "The type of network interface", + "oneOf": [ + { + "description": "A vNIC attached to a guest instance", + "type": "object", + "required": [ + "id", + "type" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "type": { + "type": "string", + "enum": [ + "instance" + ] + } + } + }, + { + "description": "A vNIC associated with an internal service", + "type": "object", + "required": [ + "id", + "type" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "type": { + "type": "string", + "enum": [ + "service" + ] + } + } + }, + { + "description": "A vNIC associated with a probe", + "type": "object", + "required": [ + "id", + "type" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "type": { + "type": "string", + "enum": [ + "probe" + ] + } + } + } + ] + }, + "OmicronPhysicalDiskConfig": { + "type": "object", + "required": [ + "id", + "identity", + "pool_id" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "identity": { + "$ref": "#/definitions/DiskIdentity" + }, + "pool_id": { + "$ref": "#/definitions/TypedUuidForZpoolKind" + } + } + }, + "OmicronPhysicalDisksConfig": { + "type": "object", + "required": [ + "disks", + "generation" + ], + "properties": { + "disks": { + "type": "array", + "items": { + "$ref": "#/definitions/OmicronPhysicalDiskConfig" + } + }, + "generation": { + "description": "generation number of this configuration\n\nThis generation number is owned by the control plane (i.e., RSS or Nexus, depending on whether RSS-to-Nexus handoff has happened). 
It should not be bumped within Sled Agent.\n\nSled Agent rejects attempts to set the configuration to a generation older than the one it's currently running.", + "allOf": [ + { + "$ref": "#/definitions/Generation" + } + ] + } + } + }, + "OmicronZoneDataset": { + "description": "Describes a persistent ZFS dataset associated with an Omicron zone", + "type": "object", + "required": [ + "pool_name" + ], + "properties": { + "pool_name": { + "$ref": "#/definitions/ZpoolName" + } + } + }, + "OmicronZoneExternalFloatingAddr": { + "description": "Floating external address with port allocated to an Omicron-managed zone.", + "type": "object", + "required": [ + "addr", + "id" + ], + "properties": { + "addr": { + "type": "string" + }, + "id": { + "$ref": "#/definitions/TypedUuidForExternalIpKind" + } + } + }, + "OmicronZoneExternalFloatingIp": { + "description": "Floating external IP allocated to an Omicron-managed zone.\n\nThis is a slimmer `nexus_db_model::ExternalIp` that only stores the fields necessary for blueprint planning, and requires that the zone have a single IP.", + "type": "object", + "required": [ + "id", + "ip" + ], + "properties": { + "id": { + "$ref": "#/definitions/TypedUuidForExternalIpKind" + }, + "ip": { + "type": "string", + "format": "ip" + } + } + }, + "OmicronZoneExternalSnatIp": { + "description": "SNAT (outbound) external IP allocated to an Omicron-managed zone.\n\nThis is a slimmer `nexus_db_model::ExternalIp` that only stores the fields necessary for blueprint planning, and requires that the zone have a single IP.", + "type": "object", + "required": [ + "id", + "snat_cfg" + ], + "properties": { + "id": { + "$ref": "#/definitions/TypedUuidForExternalIpKind" + }, + "snat_cfg": { + "$ref": "#/definitions/SourceNatConfig" + } + } + }, + "SledConfig": { + "type": "object", + "required": [ + "disks", + "zones" + ], + "properties": { + "disks": { + "description": "Control plane disks configured for this sled", + "allOf": [ + { + "$ref": "#/definitions/OmicronPhysicalDisksConfig" + } + ] + }, + "zones": { + "description": "zones configured for this sled", + "type": "array", + "items": { + "$ref": "#/definitions/BlueprintZoneConfig" + } + } + } + }, + "SourceNatConfig": { + "description": "An IP address and port range used for source NAT, i.e., making outbound network connections from guests or services.", + "type": "object", + "required": [ + "first_port", + "ip", + "last_port" + ], + "properties": { + "first_port": { + "description": "The first port used for source NAT, inclusive.", + "type": "integer", + "format": "uint16", + "minimum": 0.0 + }, + "ip": { + "description": "The external address provided to the instance or service.", + "type": "string", + "format": "ip" + }, + "last_port": { + "description": "The last port used for source NAT, also inclusive.", + "type": "integer", + "format": "uint16", + "minimum": 0.0 + } + } + }, + "Srv": { + "description": "Srv\n\n
JSON schema\n\n```json { \"type\": \"object\", \"required\": [ \"port\", \"prio\", \"target\", \"weight\" ], \"properties\": { \"port\": { \"type\": \"integer\", \"format\": \"uint16\", \"minimum\": 0.0 }, \"prio\": { \"type\": \"integer\", \"format\": \"uint16\", \"minimum\": 0.0 }, \"target\": { \"type\": \"string\" }, \"weight\": { \"type\": \"integer\", \"format\": \"uint16\", \"minimum\": 0.0 } } } ```
", + "type": "object", + "required": [ + "port", + "prio", + "target", + "weight" + ], + "properties": { + "port": { + "type": "integer", + "format": "uint16", + "minimum": 0.0 + }, + "prio": { + "type": "integer", + "format": "uint16", + "minimum": 0.0 + }, + "target": { + "type": "string" + }, + "weight": { + "type": "integer", + "format": "uint16", + "minimum": 0.0 + } + } + }, + "TypedUuidForExternalIpKind": { + "type": "string", + "format": "uuid" + }, + "TypedUuidForOmicronZoneKind": { + "type": "string", + "format": "uuid" + }, + "TypedUuidForZpoolKind": { + "type": "string", + "format": "uuid" + }, + "Vni": { + "description": "A Geneve Virtual Network Identifier", + "type": "integer", + "format": "uint32", + "minimum": 0.0 + }, + "ZpoolName": { + "title": "The name of a Zpool", + "description": "Zpool names are of the format ox{i,p}_. They are either Internal or External, and should be unique", + "type": "string", + "pattern": "^ox[ip]_[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$" + } + } +} \ No newline at end of file diff --git a/sled-agent/src/rack_setup/mod.rs b/sled-agent/src/rack_setup/mod.rs index 0ec14138fc..e1b12d6b2b 100644 --- a/sled-agent/src/rack_setup/mod.rs +++ b/sled-agent/src/rack_setup/mod.rs @@ -9,3 +9,8 @@ mod plan; pub mod service; pub use plan::service::SledConfig; +pub use plan::service::{ + from_ipaddr_to_external_floating_ip, + from_sockaddr_to_external_floating_addr, + from_source_nat_config_to_external_snat_ip, +}; diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 8c26d0bf58..a376096a87 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -10,7 +10,13 @@ use illumos_utils::zpool::ZpoolName; use internal_dns::config::{Host, Zone}; use internal_dns::ServiceName; use nexus_sled_agent_shared::inventory::{ - Inventory, OmicronZoneConfig, OmicronZoneDataset, OmicronZoneType, SledRole, + Inventory, OmicronZoneDataset, SledRole, +}; +use nexus_types::deployment::{ + blueprint_zone_type, BlueprintPhysicalDisksConfig, BlueprintZoneConfig, + BlueprintZoneDisposition, BlueprintZoneType, + OmicronZoneExternalFloatingAddr, OmicronZoneExternalFloatingIp, + OmicronZoneExternalSnatIp, }; use omicron_common::address::{ get_sled_address, get_switch_zone_address, Ipv6Subnet, ReservedRackSubnet, @@ -33,7 +39,9 @@ use omicron_common::policy::{ BOUNDARY_NTP_REDUNDANCY, COCKROACHDB_REDUNDANCY, DNS_REDUNDANCY, MAX_DNS_REDUNDANCY, NEXUS_REDUNDANCY, }; -use omicron_uuid_kinds::{GenericUuid, OmicronZoneUuid, SledUuid, ZpoolUuid}; +use omicron_uuid_kinds::{ + ExternalIpUuid, GenericUuid, OmicronZoneUuid, SledUuid, ZpoolUuid, +}; use rand::prelude::SliceRandom; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -120,10 +128,10 @@ pub enum PlanError { #[derive(Clone, Debug, Default, Serialize, Deserialize, JsonSchema)] pub struct SledConfig { /// Control plane disks configured for this sled - pub disks: OmicronPhysicalDisksConfig, + pub disks: BlueprintPhysicalDisksConfig, /// zones configured for this sled - pub zones: Vec, + pub zones: Vec, } #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] @@ -140,7 +148,53 @@ impl Ledgerable for Plan { } const RSS_SERVICE_PLAN_V1_FILENAME: &str = "rss-service-plan.json"; const RSS_SERVICE_PLAN_V2_FILENAME: &str = "rss-service-plan-v2.json"; -const RSS_SERVICE_PLAN_FILENAME: &str = "rss-service-plan-v3.json"; +const RSS_SERVICE_PLAN_V3_FILENAME: &str = "rss-service-plan-v3.json"; +const 
RSS_SERVICE_PLAN_FILENAME: &str = "rss-service-plan-v4.json"; + +pub fn from_sockaddr_to_external_floating_addr( + addr: SocketAddr, +) -> OmicronZoneExternalFloatingAddr { + // This is pretty weird: IP IDs don't exist yet, so it's fine for us + // to make them up (Nexus will record them as a part of the + // handoff). We could pass `None` here for some zone types, but it's + // a little simpler to just always pass a new ID, which will only be + // used if the zone type has an external IP. + // + // This should all go away once RSS starts using blueprints more + // directly (instead of this conversion after the fact): + // https://github.com/oxidecomputer/omicron/issues/5272 + OmicronZoneExternalFloatingAddr { id: ExternalIpUuid::new_v4(), addr } +} + +pub fn from_ipaddr_to_external_floating_ip( + ip: IpAddr, +) -> OmicronZoneExternalFloatingIp { + // This is pretty weird: IP IDs don't exist yet, so it's fine for us + // to make them up (Nexus will record them as a part of the + // handoff). We could pass `None` here for some zone types, but it's + // a little simpler to just always pass a new ID, which will only be + // used if the zone type has an external IP. + // + // This should all go away once RSS starts using blueprints more + // directly (instead of this conversion after the fact): + // https://github.com/oxidecomputer/omicron/issues/5272 + OmicronZoneExternalFloatingIp { id: ExternalIpUuid::new_v4(), ip } +} + +pub fn from_source_nat_config_to_external_snat_ip( + snat_cfg: SourceNatConfig, +) -> OmicronZoneExternalSnatIp { + // This is pretty weird: IP IDs don't exist yet, so it's fine for us + // to make them up (Nexus will record them as a part of the + // handoff). We could pass `None` here for some zone types, but it's + // a little simpler to just always pass a new ID, which will only be + // used if the zone type has an external IP. + // + // This should all go away once RSS starts using blueprints more + // directly (instead of this conversion after the fact): + // https://github.com/oxidecomputer/omicron/issues/5272 + OmicronZoneExternalSnatIp { id: ExternalIpUuid::new_v4(), snat_cfg } +} impl Plan { pub async fn load( @@ -200,6 +254,14 @@ impl Plan { } })? { Err(PlanError::FoundV2) + } else if Self::has_v3(storage_manager).await.map_err(|err| { + // Same as the comment above, but for version 3. + PlanError::Io { + message: String::from("looking for v3 RSS plan"), + err, + } + })? { + Err(PlanError::FoundV2) } else { Ok(None) } @@ -243,6 +305,25 @@ impl Plan { Ok(false) } + async fn has_v3( + storage_manager: &StorageHandle, + ) -> Result { + let paths = storage_manager + .get_latest_disks() + .await + .all_m2_mountpoints(CONFIG_DATASET) + .into_iter() + .map(|p| p.join(RSS_SERVICE_PLAN_V3_FILENAME)); + + for p in paths { + if p.try_exists()? 
{ + return Ok(true); + } + } + + Ok(false) + } + async fn is_sled_scrimlet( log: &Logger, address: SocketAddrV6, @@ -419,20 +500,22 @@ impl Plan { sled.alloc_dataset_from_u2s(DatasetType::InternalDns)?; let filesystem_pool = Some(dataset_name.pool().clone()); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: ip, - zone_type: OmicronZoneType::InternalDns { - dataset: OmicronZoneDataset { - pool_name: dataset_name.pool().clone(), - }, - http_address, - dns_address, - gz_address: dns_subnet.gz_address(), - gz_address_index: i.try_into().expect("Giant indices?"), - }, filesystem_pool, + zone_type: BlueprintZoneType::InternalDns( + blueprint_zone_type::InternalDns { + dataset: OmicronZoneDataset { + pool_name: dataset_name.pool().clone(), + }, + http_address, + dns_address, + gz_address: dns_subnet.gz_address(), + gz_address_index: i.try_into().expect("Giant indices?"), + }, + ), }); } @@ -458,16 +541,18 @@ impl Plan { let dataset_name = sled.alloc_dataset_from_u2s(DatasetType::CockroachDb)?; let filesystem_pool = Some(dataset_name.pool().clone()); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: ip, - zone_type: OmicronZoneType::CockroachDb { - dataset: OmicronZoneDataset { - pool_name: dataset_name.pool().clone(), + zone_type: BlueprintZoneType::CockroachDb( + blueprint_zone_type::CockroachDb { + address, + dataset: OmicronZoneDataset { + pool_name: dataset_name.pool().clone(), + }, }, - address, - }, + ), filesystem_pool, }); } @@ -499,23 +584,27 @@ impl Plan { ) .unwrap(); let dns_port = omicron_common::address::DNS_PORT; - let dns_address = SocketAddr::new(external_ip, dns_port); + let dns_address = from_sockaddr_to_external_floating_addr( + SocketAddr::new(external_ip, dns_port), + ); let dataset_kind = DatasetType::ExternalDns; let dataset_name = sled.alloc_dataset_from_u2s(dataset_kind)?; let filesystem_pool = Some(dataset_name.pool().clone()); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: *http_address.ip(), - zone_type: OmicronZoneType::ExternalDns { - dataset: OmicronZoneDataset { - pool_name: dataset_name.pool().clone(), + zone_type: BlueprintZoneType::ExternalDns( + blueprint_zone_type::ExternalDns { + dataset: OmicronZoneDataset { + pool_name: dataset_name.pool().clone(), + }, + http_address, + dns_address, + nic, }, - http_address, - dns_address, - nic, - }, + ), filesystem_pool, }); } @@ -539,28 +628,32 @@ impl Plan { .unwrap(); let (nic, external_ip) = svc_port_builder.next_nexus(id)?; let filesystem_pool = Some(sled.alloc_zpool_from_u2s()?); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: address, - zone_type: OmicronZoneType::Nexus { - internal_address: SocketAddrV6::new( - address, - omicron_common::address::NEXUS_INTERNAL_PORT, - 0, - 0, - ), - external_ip, - nic, - // Tell Nexus to use TLS if and only if the 
caller - // provided TLS certificates. This effectively - // determines the status of TLS for the lifetime of - // the rack. In production-like deployments, we'd - // always expect TLS to be enabled. It's only in - // development that it might not be. - external_tls: !config.external_certificates.is_empty(), - external_dns_servers: config.dns_servers.clone(), - }, + zone_type: BlueprintZoneType::Nexus( + blueprint_zone_type::Nexus { + internal_address: SocketAddrV6::new( + address, + omicron_common::address::NEXUS_INTERNAL_PORT, + 0, + 0, + ), + external_ip: from_ipaddr_to_external_floating_ip( + external_ip, + ), + nic, + // Tell Nexus to use TLS if and only if the caller + // provided TLS certificates. This effectively + // determines the status of TLS for the lifetime of + // the rack. In production-like deployments, we'd + // always expect TLS to be enabled. It's only in + // development that it might not be. + external_tls: !config.external_certificates.is_empty(), + external_dns_servers: config.dns_servers.clone(), + }, + ), filesystem_pool, }); } @@ -584,18 +677,20 @@ impl Plan { ) .unwrap(); let filesystem_pool = Some(sled.alloc_zpool_from_u2s()?); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: address, - zone_type: OmicronZoneType::Oximeter { - address: SocketAddrV6::new( - address, - omicron_common::address::OXIMETER_PORT, - 0, - 0, - ), - }, + zone_type: BlueprintZoneType::Oximeter( + blueprint_zone_type::Oximeter { + address: SocketAddrV6::new( + address, + omicron_common::address::OXIMETER_PORT, + 0, + 0, + ), + }, + ), filesystem_pool, }) } @@ -623,16 +718,18 @@ impl Plan { let dataset_name = sled.alloc_dataset_from_u2s(DatasetType::Clickhouse)?; let filesystem_pool = Some(dataset_name.pool().clone()); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: ip, - zone_type: OmicronZoneType::Clickhouse { - address, - dataset: OmicronZoneDataset { - pool_name: dataset_name.pool().clone(), + zone_type: BlueprintZoneType::Clickhouse( + blueprint_zone_type::Clickhouse { + address, + dataset: OmicronZoneDataset { + pool_name: dataset_name.pool().clone(), + }, }, - }, + ), filesystem_pool, }); } @@ -664,16 +761,18 @@ impl Plan { let dataset_name = sled.alloc_dataset_from_u2s(DatasetType::ClickhouseServer)?; let filesystem_pool = Some(dataset_name.pool().clone()); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: ip, - zone_type: OmicronZoneType::ClickhouseServer { - address, - dataset: OmicronZoneDataset { - pool_name: dataset_name.pool().clone(), + zone_type: BlueprintZoneType::ClickhouseServer( + blueprint_zone_type::ClickhouseServer { + address, + dataset: OmicronZoneDataset { + pool_name: dataset_name.pool().clone(), + }, }, - }, + ), filesystem_pool, }); } @@ -703,16 +802,18 @@ impl Plan { let dataset_name = sled.alloc_dataset_from_u2s(DatasetType::ClickhouseKeeper)?; let filesystem_pool = Some(dataset_name.pool().clone()); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - 
id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: ip, - zone_type: OmicronZoneType::ClickhouseKeeper { - address, - dataset: OmicronZoneDataset { - pool_name: dataset_name.pool().clone(), + zone_type: BlueprintZoneType::ClickhouseKeeper( + blueprint_zone_type::ClickhouseKeeper { + address, + dataset: OmicronZoneDataset { + pool_name: dataset_name.pool().clone(), + }, }, - }, + ), filesystem_pool, }); } @@ -737,13 +838,15 @@ impl Plan { port, ) .unwrap(); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: address, - zone_type: OmicronZoneType::CruciblePantry { - address: SocketAddrV6::new(address, port, 0, 0), - }, + zone_type: BlueprintZoneType::CruciblePantry( + blueprint_zone_type::CruciblePantry { + address: SocketAddrV6::new(address, port, 0, 0), + }, + ), filesystem_pool, }); } @@ -765,14 +868,18 @@ impl Plan { ) .unwrap(); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: ip, - zone_type: OmicronZoneType::Crucible { - address, - dataset: OmicronZoneDataset { pool_name: pool.clone() }, - }, + zone_type: BlueprintZoneType::Crucible( + blueprint_zone_type::Crucible { + address, + dataset: OmicronZoneDataset { + pool_name: pool.clone(), + }, + }, + ), filesystem_pool: Some(pool.clone()), }); } @@ -793,24 +900,31 @@ impl Plan { .push(Host::for_zone(Zone::Other(id)).fqdn()); let (nic, snat_cfg) = svc_port_builder.next_snat(id)?; ( - OmicronZoneType::BoundaryNtp { - address: ntp_address, - ntp_servers: config.ntp_servers.clone(), - dns_servers: config.dns_servers.clone(), - domain: None, - nic, - snat_cfg, - }, + BlueprintZoneType::BoundaryNtp( + blueprint_zone_type::BoundaryNtp { + address: ntp_address, + ntp_servers: config.ntp_servers.clone(), + dns_servers: config.dns_servers.clone(), + domain: None, + nic, + external_ip: + from_source_nat_config_to_external_snat_ip( + snat_cfg, + ), + }, + ), ServiceName::BoundaryNtp, ) } else { ( - OmicronZoneType::InternalNtp { - address: ntp_address, - ntp_servers: boundary_ntp_servers.clone(), - dns_servers: rack_dns_servers.clone(), - domain: None, - }, + BlueprintZoneType::InternalNtp( + blueprint_zone_type::InternalNtp { + address: ntp_address, + ntp_servers: boundary_ntp_servers.clone(), + dns_servers: rack_dns_servers.clone(), + domain: None, + }, + ), ServiceName::InternalNtp, ) }; @@ -819,9 +933,9 @@ impl Plan { .host_zone_with_one_backend(id, address, svcname, NTP_PORT) .unwrap(); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: address, zone_type, filesystem_pool, @@ -1379,10 +1493,10 @@ mod tests { } #[test] - fn test_rss_service_plan_v3_schema() { + fn test_rss_service_plan_v4_schema() { let schema = schemars::schema_for!(Plan); expectorate::assert_contents( - "../schema/rss-service-plan-v3.json", + "../schema/rss-service-plan-v4.json", &serde_json::to_string_pretty(&schema).unwrap(), ); } diff --git a/sled-agent/src/rack_setup/service.rs 
b/sled-agent/src/rack_setup/service.rs index 20cd5646c0..3f73e55d0f 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -71,7 +71,6 @@ use crate::bootstrap::early_networking::{ }; use crate::bootstrap::rss_handle::BootstrapAgentHandle; use crate::nexus::d2n_params; -use crate::params::OmicronZoneTypeExt; use crate::rack_setup::plan::service::{ Plan as ServicePlan, PlanError as ServicePlanError, }; @@ -91,9 +90,8 @@ use nexus_sled_agent_shared::inventory::{ OmicronZoneConfig, OmicronZoneType, OmicronZonesConfig, }; use nexus_types::deployment::{ - Blueprint, BlueprintPhysicalDisksConfig, BlueprintZoneConfig, - BlueprintZoneDisposition, BlueprintZonesConfig, - CockroachDbPreserveDowngrade, InvalidOmicronZoneType, + blueprint_zone_type, Blueprint, BlueprintZoneType, BlueprintZonesConfig, + CockroachDbPreserveDowngrade, }; use nexus_types::external_api::views::SledState; use omicron_common::address::get_sled_address; @@ -108,8 +106,8 @@ use omicron_common::disk::{ }; use omicron_common::ledger::{self, Ledger, Ledgerable}; use omicron_ddm_admin_client::{Client as DdmAdminClient, DdmError}; +use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::SledUuid; -use omicron_uuid_kinds::{ExternalIpUuid, GenericUuid}; use serde::{Deserialize, Serialize}; use sled_agent_client::{ types as SledAgentTypes, Client as SledAgentClient, Error as SledAgentError, @@ -533,7 +531,7 @@ impl ServiceInner { .iter() .filter_map(|zone_config| { match &zone_config.zone_type { - OmicronZoneType::InternalDns { http_address, .. } + BlueprintZoneType::InternalDns(blueprint_zone_type::InternalDns{ http_address, .. }) => { Some(*http_address) }, @@ -719,15 +717,17 @@ impl ServiceInner { let mut datasets: Vec = vec![]; for sled_config in service_plan.services.values() { for zone in &sled_config.zones { - if let Some((dataset_name, dataset_address)) = - zone.dataset_name_and_address() - { + if let Some(dataset) = zone.zone_type.durable_dataset() { datasets.push(NexusTypes::DatasetCreateRequest { - zpool_id: dataset_name.pool().id().into_untyped_uuid(), - dataset_id: zone.id, + zpool_id: dataset + .dataset + .pool_name + .id() + .into_untyped_uuid(), + dataset_id: zone.id.into_untyped_uuid(), request: NexusTypes::DatasetPutRequest { - address: dataset_address.to_string(), - kind: dataset_name.dataset().kind(), + address: dataset.address.to_string(), + kind: dataset.kind, }, }) } @@ -981,7 +981,7 @@ impl ServiceInner { if sled_config.zones.iter().any(|zone_config| { matches!( &zone_config.zone_type, - OmicronZoneType::CockroachDb { .. } + BlueprintZoneType::CockroachDb(_) ) }) { Some(sled_address) @@ -1398,7 +1398,7 @@ fn build_initial_blueprint_from_plan( let blueprint = build_initial_blueprint_from_sled_configs( sled_configs_by_id, internal_dns_version, - )?; + ); Ok(blueprint) } @@ -1406,47 +1406,11 @@ fn build_initial_blueprint_from_plan( pub(crate) fn build_initial_blueprint_from_sled_configs( sled_configs_by_id: &BTreeMap, internal_dns_version: Generation, -) -> Result { - // Helper to convert an `OmicronZoneConfig` into a `BlueprintZoneConfig`. - // This is separate primarily so rustfmt doesn't lose its mind. - let to_bp_zone_config = |z: &OmicronZoneConfig| { - // All initial zones are in-service. 
- let disposition = BlueprintZoneDisposition::InService; - BlueprintZoneConfig::from_omicron_zone_config( - z.clone(), - disposition, - // This is pretty weird: IP IDs don't exist yet, so it's fine for us - // to make them up (Nexus will record them as a part of the - // handoff). We could pass `None` here for some zone types, but it's - // a little simpler to just always pass a new ID, which will only be - // used if the zone type has an external IP. - // - // This should all go away once RSS starts using blueprints more - // directly (instead of this conversion after the fact): - // https://github.com/oxidecomputer/omicron/issues/5272 - Some(ExternalIpUuid::new_v4()), - ) - }; - - let mut blueprint_disks = BTreeMap::new(); - for (sled_id, sled_config) in sled_configs_by_id { - blueprint_disks.insert( - *sled_id, - BlueprintPhysicalDisksConfig { - generation: sled_config.disks.generation, - disks: sled_config - .disks - .disks - .iter() - .map(|d| OmicronPhysicalDiskConfig { - identity: d.identity.clone(), - id: d.id, - pool_id: d.pool_id, - }) - .collect(), - }, - ); - } +) -> Blueprint { + let blueprint_disks: BTreeMap<_, _> = sled_configs_by_id + .iter() + .map(|(sled_id, sled_config)| (*sled_id, sled_config.disks.clone())) + .collect(); let mut blueprint_zones = BTreeMap::new(); let mut sled_state = BTreeMap::new(); @@ -1463,18 +1427,14 @@ pub(crate) fn build_initial_blueprint_from_sled_configs( // value, we will need to revisit storing this in the serialized // RSS plan. generation: DeployStepVersion::V5_EVERYTHING, - zones: sled_config - .zones - .iter() - .map(to_bp_zone_config) - .collect::>()?, + zones: sled_config.zones.clone(), }; blueprint_zones.insert(*sled_id, zones_config); sled_state.insert(*sled_id, SledState::Active); } - Ok(Blueprint { + Blueprint { id: Uuid::new_v4(), blueprint_zones, blueprint_disks, @@ -1492,7 +1452,7 @@ pub(crate) fn build_initial_blueprint_from_sled_configs( time_created: Utc::now(), creator: "RSS".to_string(), comment: "initial blueprint from rack setup".to_string(), - }) + } } /// Facilitates creating a sequence of OmicronZonesConfig objects for each sled @@ -1570,11 +1530,14 @@ impl<'a> OmicronZonesConfigGenerator<'a> { sled_config .zones .iter() + .cloned() + .map(|bp_zone_config| { + OmicronZoneConfig::from(bp_zone_config) + }) .filter(|z| { !zones_already.contains(&z.id) && zone_filter(&z.zone_type) - }) - .cloned(), + }), ); let config = OmicronZonesConfig { generation: version, zones }; diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs index 189f775adb..b546025654 100644 --- a/sled-agent/src/sim/server.rs +++ b/sled-agent/src/sim/server.rs @@ -12,6 +12,10 @@ use crate::nexus::d2n_params; use crate::nexus::NexusClient; use crate::rack_setup::service::build_initial_blueprint_from_sled_configs; use crate::rack_setup::SledConfig; +use crate::rack_setup::{ + from_ipaddr_to_external_floating_ip, + from_sockaddr_to_external_floating_addr, +}; use anyhow::anyhow; use crucible_agent_client::types::State as RegionState; use illumos_utils::zpool::ZpoolName; @@ -19,9 +23,11 @@ use internal_dns::ServiceName; use nexus_client::types as NexusTypes; use nexus_client::types::{IpRange, Ipv4Range, Ipv6Range}; use nexus_config::NUM_INITIAL_RESERVED_IP_ADDRESSES; -use nexus_sled_agent_shared::inventory::OmicronZoneConfig; use nexus_sled_agent_shared::inventory::OmicronZoneDataset; -use nexus_sled_agent_shared::inventory::OmicronZoneType; +use nexus_types::deployment::blueprint_zone_type; +use nexus_types::deployment::{ + 
BlueprintZoneConfig, BlueprintZoneDisposition, BlueprintZoneType, +}; use nexus_types::inventory::NetworkInterfaceKind; use omicron_common::address::DNS_OPTE_IPV4_SUBNET; use omicron_common::address::NEXUS_OPTE_IPV4_SUBNET; @@ -36,6 +42,7 @@ use omicron_common::backoff::{ use omicron_common::disk::DiskIdentity; use omicron_common::FileKv; use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::ZpoolUuid; use oxnet::Ipv6Net; @@ -375,19 +382,22 @@ pub async fn run_standalone_server( SocketAddr::V6(a) => a, }; let pool_name = ZpoolName::new_external(ZpoolUuid::new_v4()); - let mut zones = vec![OmicronZoneConfig { - id: Uuid::new_v4(), + let mut zones = vec![BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id: OmicronZoneUuid::new_v4(), underlay_address: *http_bound.ip(), - zone_type: OmicronZoneType::InternalDns { - dataset: OmicronZoneDataset { pool_name: pool_name.clone() }, - http_address: http_bound, - dns_address: match dns.dns_server.local_address() { - SocketAddr::V4(_) => panic!("did not expect v4 address"), - SocketAddr::V6(a) => a, + zone_type: BlueprintZoneType::InternalDns( + blueprint_zone_type::InternalDns { + dataset: OmicronZoneDataset { pool_name: pool_name.clone() }, + http_address: http_bound, + dns_address: match dns.dns_server.local_address() { + SocketAddr::V4(_) => panic!("did not expect v4 address"), + SocketAddr::V6(a) => a, + }, + gz_address: Ipv6Addr::LOCALHOST, + gz_address_index: 0, }, - gz_address: Ipv6Addr::LOCALHOST, - gz_address_index: 0, - }, + ), // Co-locate the filesystem pool with the dataset filesystem_pool: Some(pool_name), }]; @@ -396,23 +406,26 @@ pub async fn run_standalone_server( let mut macs = MacAddr::iter_system(); if let Some(nexus_external_addr) = rss_args.nexus_external_addr { let ip = nexus_external_addr.ip(); - let id = Uuid::new_v4(); + let id = OmicronZoneUuid::new_v4(); - zones.push(OmicronZoneConfig { + zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, id, underlay_address: match ip { IpAddr::V4(_) => panic!("did not expect v4 address"), IpAddr::V6(a) => a, }, - zone_type: OmicronZoneType::Nexus { + zone_type: BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { internal_address: match config.nexus_address { SocketAddr::V4(_) => panic!("did not expect v4 address"), SocketAddr::V6(a) => a, }, - external_ip: ip, + external_ip: from_ipaddr_to_external_floating_ip(ip), nic: nexus_types::inventory::NetworkInterface { id: Uuid::new_v4(), - kind: NetworkInterfaceKind::Service { id }, + kind: NetworkInterfaceKind::Service { + id: id.into_untyped_uuid(), + }, name: "nexus".parse().unwrap(), ip: NEXUS_OPTE_IPV4_SUBNET .nth(NUM_INITIAL_RESERVED_IP_ADDRESSES + 1) @@ -427,7 +440,7 @@ pub async fn run_standalone_server( }, external_tls: false, external_dns_servers: vec![], - }, + }), filesystem_pool: Some(get_random_zpool()), }); @@ -445,31 +458,40 @@ pub async fn run_standalone_server( rss_args.external_dns_internal_addr { let ip = *external_dns_internal_addr.ip(); - let id = Uuid::new_v4(); + let id = OmicronZoneUuid::new_v4(); let pool_name = ZpoolName::new_external(ZpoolUuid::new_v4()); - zones.push(OmicronZoneConfig { + zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, id, underlay_address: ip, - zone_type: OmicronZoneType::ExternalDns { - dataset: OmicronZoneDataset { pool_name: pool_name.clone() }, - http_address: external_dns_internal_addr, - dns_address: 
SocketAddr::V6(external_dns_internal_addr), - nic: nexus_types::inventory::NetworkInterface { - id: Uuid::new_v4(), - kind: NetworkInterfaceKind::Service { id }, - name: "external-dns".parse().unwrap(), - ip: DNS_OPTE_IPV4_SUBNET - .nth(NUM_INITIAL_RESERVED_IP_ADDRESSES + 1) - .unwrap() - .into(), - mac: macs.next().unwrap(), - subnet: (*DNS_OPTE_IPV4_SUBNET).into(), - vni: Vni::SERVICES_VNI, - primary: true, - slot: 0, - transit_ips: vec![], + zone_type: BlueprintZoneType::ExternalDns( + blueprint_zone_type::ExternalDns { + dataset: OmicronZoneDataset { + pool_name: pool_name.clone(), + }, + http_address: external_dns_internal_addr, + dns_address: from_sockaddr_to_external_floating_addr( + SocketAddr::V6(external_dns_internal_addr), + ), + nic: nexus_types::inventory::NetworkInterface { + id: Uuid::new_v4(), + kind: NetworkInterfaceKind::Service { + id: id.into_untyped_uuid(), + }, + name: "external-dns".parse().unwrap(), + ip: DNS_OPTE_IPV4_SUBNET + .nth(NUM_INITIAL_RESERVED_IP_ADDRESSES + 1) + .unwrap() + .into(), + mac: macs.next().unwrap(), + subnet: (*DNS_OPTE_IPV4_SUBNET).into(), + vni: Vni::SERVICES_VNI, + primary: true, + slot: 0, + transit_ips: vec![], + }, }, - }, + ), // Co-locate the filesystem pool with the dataset filesystem_pool: Some(pool_name), }); @@ -530,8 +552,7 @@ pub async fn run_standalone_server( blueprint: build_initial_blueprint_from_sled_configs( &sled_configs, internal_dns_version, - ) - .expect("failed to construct initial blueprint"), + ), physical_disks, zpools, datasets, From 31ea57ea400f2ee68d9b2a3348f881bbad955069 Mon Sep 17 00:00:00 2001 From: Ryan Goodfellow Date: Fri, 23 Aug 2024 23:23:45 -0600 Subject: [PATCH 08/10] bgp: check md5 passkey length on create/update (#6428) --- nexus/src/app/switch_port.rs | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/nexus/src/app/switch_port.rs b/nexus/src/app/switch_port.rs index 9726a59d33..b616531f53 100644 --- a/nexus/src/app/switch_port.rs +++ b/nexus/src/app/switch_port.rs @@ -30,6 +30,7 @@ impl super::Nexus { params: params::SwitchPortSettingsCreate, ) -> CreateResult { opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + Self::switch_port_settings_validate(¶ms)?; //TODO race conditions on exists check versus update/create. // Normally I would use a DB lock here, but not sure what @@ -54,6 +55,36 @@ impl super::Nexus { } } + // TODO: more validation wanted + fn switch_port_settings_validate( + params: ¶ms::SwitchPortSettingsCreate, + ) -> CreateResult<()> { + for x in params.bgp_peers.values() { + for p in x.peers.iter() { + if let Some(ref key) = p.md5_auth_key { + if key.len() > 80 { + return Err(Error::invalid_value( + "md5_auth_key", + format!("md5 auth key for {} is longer than 80 characters", p.addr) + )); + } + for c in key.chars() { + if !c.is_ascii() || c.is_ascii_control() { + return Err(Error::invalid_value( + "md5_auth_key", + format!( + "md5 auth key for {} must be printable ascii", + p.addr + ), + )); + } + } + } + } + } + Ok(()) + } + pub async fn switch_port_settings_create( self: &Arc, opctx: &OpContext, From 5afa0de7b3e91ee23168b12758f3451f46043032 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 24 Aug 2024 11:16:41 -0700 Subject: [PATCH 09/10] [gateway] ingest sensor measurements from SPs into oximeter (#6354) This branch adds code to the Management Gateway Service for periodically polling sensor measurements from SPs and emitting it to Oximeter. 
In particular, this consists of:

- a task for managing the metrics endpoint, waiting until MGS knows its
  underlay network address to bind the endpoint and register it with the
  control plane,
- tasks for polling sensor measurements from each individual SP that MGS
  knows about,
- a task that waits until SP discovery has completed and the rack ID is
  known, and then spawns a poller task for every discovered SP slot

The SP poller tasks send samples to the Oximeter producer endpoint using
a `tokio::sync::broadcast` channel, which I've chosen primarily because
it can be used as a bounded ring buffer that actually overwrites the
*oldest* value when the buffer is full. This way, we use a bounded
amount of memory for samples, but prioritize the most recent samples if
we have to throw anything away because Oximeter hasn't come along to
collect them recently.

The poller tasks cache the component inventory and identifying
information from the SP, so that we don't have to re-read all this data
from the SP on every poll. While MGS, running on a host, would probably
be fine with doing this, it seems better to avoid making the SP do
unnecessary work at a 1Hz poll frequency, especially when *both* switch
zones are polling them. Instead, every time we poll sensor data from an
SP, we first ask it for its current state, and only invalidate our
cached understanding of the SP when the state changes. This way, if an
SP starts reporting new metrics due to a firmware update, or gets
replaced with a different chassis with a new serial number, revision,
etc., we won't continue to report metrics for stale targets, but we
don't have to reload all of that once per second. To detect scenarios
where the SP's state and/or identity has changed in the midst of polling
its sensors (which may result in mislabeled metrics), we check whether
the SP's state at the end of the poll matches its state at the
beginning, and if it does not, we poll again immediately with its new
identity.

At present, the timestamps for these metric samples are generated by MGS
--- each is the time at which MGS received the sensor data from the SP,
as MGS understands it. Because we don't currently collect data that was
recorded prior to the switch zone coming up, we don't need to worry
about figuring out timestamps for data recorded by the SP prior to the
existence of a wall clock. Figuring out the SP/MGS timebase
synchronization is probably a lot of additional work, although it would
be nice to do in the future. At present, [metrics emitted by sled-agent
prior to NTP sync will also be from 1987][1], so I think it's fine to do
something similar here, especially because the potential solutions to
that [also have their fair share of tradeoffs][2].

The new metrics use a schema in
`oximeter/oximeter/schema/hardware-component.toml`. The target of these
metrics is a `hardware_component` that includes:

- the rack ID and the identity of the MGS instance that collected the
  metric,
- information identifying the chassis[^1] and the SP that recorded them
  (its serial number, model number, revision, and whether it's a switch,
  a sled, or a power shelf),
- the SP's Hubris archive version (since the reported sensor data may
  change in future firmware releases)
- the SP's ID for the hardware component (e.g. "dev-7"), the kind of
  device (e.g. "tmp117", "max5970"), and the human-readable description
  (e.g. "Southeast temperature sensor", "U.2 Sharkfin A hot swap
  controller", etc.)
reported by the SP Each kind of sensor reading has an individual metric (`hardware_component:temperature`, `hardware_component:current`, `hardware_component:voltage`, and so on). These metrics are labeled with the SP-reported name of the individual sensor measurement channel. For instance, a MAX5970 hotswap controller on sharkfin will have a voltage and current metric named "V12_U2A_A0" for the 12V rail, and a voltage and current metric named "V3P3_U2A_A0" for the 3.3V rail. Finally, a `hardware_component:sensor_errors` metric records sensor errors reported by the SP, labeled with the sensor name, what kind of sensor it is, and a string representation of the error. [1]: https://github.com/oxidecomputer/omicron/pull/6354#issuecomment-2308019422 [2]: https://github.com/oxidecomputer/omicron/pull/6354#issuecomment-2308475741 [^1]: I'm using "chassis" as a generic term to refer to "switch, sled, or power shelf". --- Cargo.lock | 4 + clients/nexus-client/src/lib.rs | 4 + clients/oximeter-client/src/lib.rs | 1 + common/src/api/internal/nexus.rs | 2 + dev-tools/mgs-dev/Cargo.toml | 1 + dev-tools/mgs-dev/src/main.rs | 24 +- dev-tools/omdb/tests/successes.out | 25 +- gateway-test-utils/configs/config.test.toml | 9 + .../configs/sp_sim_config.test.toml | 166 +++ gateway-test-utils/src/setup.rs | 21 +- gateway/Cargo.toml | 3 + gateway/examples/config.toml | 9 + gateway/src/config.rs | 7 +- gateway/src/lib.rs | 12 +- gateway/src/metrics.rs | 1159 +++++++++++++++++ .../tests/integration_tests/component_list.rs | 157 ++- nexus/db-model/src/producer_endpoint.rs | 7 + nexus/db-model/src/schema_versions.rs | 3 +- nexus/tests/integration_tests/metrics.rs | 181 ++- nexus/tests/integration_tests/sp_updater.rs | 20 +- openapi/nexus-internal.json | 7 + openapi/nexus.json | 1 + openapi/oximeter.json | 7 + .../oximeter/schema/hardware-component.toml | 183 +++ oximeter/schema/src/codegen.rs | 1 + oximeter/types/src/schema.rs | 1 + .../up.sql | 2 + schema/crdb/dbinit.sql | 6 +- 28 files changed, 1990 insertions(+), 33 deletions(-) create mode 100644 gateway/src/metrics.rs create mode 100644 oximeter/oximeter/schema/hardware-component.toml create mode 100644 schema/crdb/add-management-gateway-producer-kind/up.sql diff --git a/Cargo.lock b/Cargo.lock index 2630aa2a25..249b7c5cea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4720,6 +4720,7 @@ dependencies = [ "gateway-messages", "gateway-test-utils", "libc", + "omicron-gateway", "omicron-workspace-hack", "signal-hook-tokio", "tokio", @@ -5962,6 +5963,7 @@ dependencies = [ "anyhow", "base64 0.22.1", "camino", + "chrono", "clap", "dropshot", "expectorate", @@ -5980,6 +5982,8 @@ dependencies = [ "omicron-test-utils", "omicron-workspace-hack", "once_cell", + "oximeter", + "oximeter-producer", "schemars", "serde", "serde_json", diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 62366c45e1..a55c5d4013 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -213,6 +213,7 @@ impl From fn from(kind: omicron_common::api::internal::nexus::ProducerKind) -> Self { use omicron_common::api::internal::nexus::ProducerKind; match kind { + ProducerKind::ManagementGateway => Self::ManagementGateway, ProducerKind::SledAgent => Self::SledAgent, ProducerKind::Service => Self::Service, ProducerKind::Instance => Self::Instance, @@ -390,6 +391,9 @@ impl From fn from(kind: types::ProducerKind) -> Self { use omicron_common::api::internal::nexus::ProducerKind; match kind { + types::ProducerKind::ManagementGateway => { + 
ProducerKind::ManagementGateway + } types::ProducerKind::SledAgent => ProducerKind::SledAgent, types::ProducerKind::Instance => ProducerKind::Instance, types::ProducerKind::Service => ProducerKind::Service, diff --git a/clients/oximeter-client/src/lib.rs b/clients/oximeter-client/src/lib.rs index 74fc6968e8..c23e5177a0 100644 --- a/clients/oximeter-client/src/lib.rs +++ b/clients/oximeter-client/src/lib.rs @@ -26,6 +26,7 @@ impl From fn from(kind: omicron_common::api::internal::nexus::ProducerKind) -> Self { use omicron_common::api::internal::nexus; match kind { + nexus::ProducerKind::ManagementGateway => Self::ManagementGateway, nexus::ProducerKind::Service => Self::Service, nexus::ProducerKind::SledAgent => Self::SledAgent, nexus::ProducerKind::Instance => Self::Instance, diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 7f4eb358a4..4daea6a198 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -223,6 +223,8 @@ pub enum ProducerKind { Service, /// The producer is a Propolis VMM managing a guest instance. Instance, + /// The producer is a management gateway service. + ManagementGateway, } /// Information announced by a metric server, used so that clients can contact it and collect diff --git a/dev-tools/mgs-dev/Cargo.toml b/dev-tools/mgs-dev/Cargo.toml index d5f61f4b96..70382c0469 100644 --- a/dev-tools/mgs-dev/Cargo.toml +++ b/dev-tools/mgs-dev/Cargo.toml @@ -14,6 +14,7 @@ futures.workspace = true gateway-messages.workspace = true gateway-test-utils.workspace = true libc.workspace = true +omicron-gateway.workspace = true omicron-workspace-hack.workspace = true signal-hook-tokio.workspace = true tokio.workspace = true diff --git a/dev-tools/mgs-dev/src/main.rs b/dev-tools/mgs-dev/src/main.rs index 85b1313d68..77947999d9 100644 --- a/dev-tools/mgs-dev/src/main.rs +++ b/dev-tools/mgs-dev/src/main.rs @@ -8,6 +8,7 @@ use clap::{Args, Parser, Subcommand}; use futures::StreamExt; use libc::SIGINT; use signal_hook_tokio::Signals; +use std::net::SocketAddr; #[tokio::main] async fn main() -> anyhow::Result<()> { @@ -36,7 +37,12 @@ enum MgsDevCmd { } #[derive(Clone, Debug, Args)] -struct MgsRunArgs {} +struct MgsRunArgs { + /// Override the address of the Nexus instance to use when registering the + /// Oximeter producer. + #[clap(long)] + nexus_address: Option, +} impl MgsRunArgs { async fn exec(&self) -> Result<(), anyhow::Error> { @@ -46,9 +52,23 @@ impl MgsRunArgs { let mut signal_stream = signals.fuse(); println!("mgs-dev: setting up MGS ... 
"); - let gwtestctx = gateway_test_utils::setup::test_setup( + let (mut mgs_config, sp_sim_config) = + gateway_test_utils::setup::load_test_config(); + if let Some(addr) = self.nexus_address { + mgs_config.metrics = + Some(gateway_test_utils::setup::MetricsConfig { + disabled: false, + dev_nexus_address: Some(addr), + dev_bind_loopback: true, + }); + } + + let gwtestctx = gateway_test_utils::setup::test_setup_with_config( "mgs-dev", gateway_messages::SpPort::One, + mgs_config, + &sp_sim_config, + None, ) .await; println!("mgs-dev: MGS is running."); diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 2a9c9c8051..e939bfa864 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -141,9 +141,16 @@ SP DETAILS: type "Sled" slot 0 COMPONENTS - NAME DESCRIPTION DEVICE PRESENCE SERIAL - sp3-host-cpu FAKE host cpu sp3-host-cpu Present None - dev-0 FAKE temperature sensor fake-tmp-sensor Failed None + NAME DESCRIPTION DEVICE PRESENCE SERIAL + sp3-host-cpu FAKE host cpu sp3-host-cpu Present None + dev-0 FAKE temperature sensor fake-tmp-sensor Failed None + dev-1 FAKE temperature sensor tmp117 Present None + dev-2 FAKE Southeast temperature sensor tmp117 Present None + dev-6 FAKE U.2 Sharkfin A VPD at24csw080 Present None + dev-7 FAKE U.2 Sharkfin A hot swap controller max5970 Present None + dev-8 FAKE U.2 A NVMe Basic Management Command nvme_bmc Present None + dev-39 FAKE T6 temperature sensor tmp451 Present None + dev-53 FAKE Fan controller max31790 Present None CABOOSES: none found @@ -167,8 +174,16 @@ SP DETAILS: type "Sled" slot 1 COMPONENTS - NAME DESCRIPTION DEVICE PRESENCE SERIAL - sp3-host-cpu FAKE host cpu sp3-host-cpu Present None + NAME DESCRIPTION DEVICE PRESENCE SERIAL + sp3-host-cpu FAKE host cpu sp3-host-cpu Present None + dev-0 FAKE temperature sensor tmp117 Present None + dev-1 FAKE temperature sensor tmp117 Present None + dev-2 FAKE Southeast temperature sensor tmp117 Present None + dev-6 FAKE U.2 Sharkfin A VPD at24csw080 Present None + dev-7 FAKE U.2 Sharkfin A hot swap controller max5970 Present None + dev-8 FAKE U.2 A NVMe Basic Management Command nvme_bmc Present None + dev-39 FAKE T6 temperature sensor tmp451 Present None + dev-53 FAKE Fan controller max31790 Present None CABOOSES: none found diff --git a/gateway-test-utils/configs/config.test.toml b/gateway-test-utils/configs/config.test.toml index 79975f4611..4e3e9c6e6e 100644 --- a/gateway-test-utils/configs/config.test.toml +++ b/gateway-test-utils/configs/config.test.toml @@ -88,6 +88,15 @@ addr = "[::1]:0" ignition-target = 3 location = { switch0 = ["sled", 1], switch1 = ["sled", 1] } +# +# Configuration for SP sensor metrics polling +# +[metrics] +# Allow the Oximeter metrics endpoint to bind on the loopback IP. This is +# useful in local testing and development, when the gateway service is not +# given a "real" underlay network IP. +dev_bind_loopback = true + # # NOTE: for the test suite, if mode = "file", the file path MUST be the sentinel # string "UNUSED". 
The actual path will be generated by the test suite for each diff --git a/gateway-test-utils/configs/sp_sim_config.test.toml b/gateway-test-utils/configs/sp_sim_config.test.toml index cc08eec30b..4f370a167c 100644 --- a/gateway-test-utils/configs/sp_sim_config.test.toml +++ b/gateway-test-utils/configs/sp_sim_config.test.toml @@ -20,6 +20,9 @@ device = "fake-tmp-sensor" description = "FAKE temperature sensor 1" capabilities = 0x2 presence = "Present" +sensors = [ + {name = "Southwest", kind = "Temperature", last_data.value = 41.7890625, last_data.timestamp = 1234 }, +] [[simulated_sps.sidecar.components]] id = "dev-1" @@ -27,6 +30,9 @@ device = "fake-tmp-sensor" description = "FAKE temperature sensor 2" capabilities = 0x2 presence = "Failed" +sensors = [ + { name = "South", kind = "Temperature", last_error.value = "DeviceError", last_error.timestamp = 1234 }, +] [[simulated_sps.sidecar]] multicast_addr = "::1" @@ -56,6 +62,82 @@ device = "fake-tmp-sensor" description = "FAKE temperature sensor" capabilities = 0x2 presence = "Failed" +sensors = [ + { name = "Southwest", kind = "Temperature", last_error.value = "DeviceError", last_error.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-1" +device = "tmp117" +description = "FAKE temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "South", kind = "Temperature", last_data.value = 42.5625, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-2" +device = "tmp117" +description = "FAKE Southeast temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "Southeast", kind = "Temperature", last_data.value = 41.570313, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-6" +device = "at24csw080" +description = "FAKE U.2 Sharkfin A VPD" +capabilities = 0x0 +presence = "Present" + +[[simulated_sps.gimlet.components]] +id = "dev-7" +device = "max5970" +description = "FAKE U.2 Sharkfin A hot swap controller" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "V12_U2A_A0", kind = "Current", last_data.value = 0.45898438, last_data.timestamp = 1234 }, + { name = "V3P3_U2A_A0", kind = "Current", last_data.value = 0.024414063, last_data.timestamp = 1234 }, + { name = "V12_U2A_A0", kind = "Voltage", last_data.value = 12.03125, last_data.timestamp = 1234 }, + { name = "V3P3_U2A_A0", kind = "Voltage", last_data.value = 3.328125, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-8" +device = "nvme_bmc" +description = "FAKE U.2 A NVMe Basic Management Command" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "U2_N0", kind = "Temperature", last_data.value = 56.0, last_data.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-39" +device = "tmp451" +description = "FAKE T6 temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "t6", kind = "Temperature", last_data.value = 70.625, last_data.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-53" +device = "max31790" +description = "FAKE Fan controller" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "Southeast", kind = "Speed", last_data.value = 2607.0, last_data.timestamp = 1234 }, + { name = "Northeast", kind = "Speed", last_data.value = 2476.0, last_data.timestamp = 1234 }, + { name = "South", kind = "Speed", last_data.value = 2553.0, last_data.timestamp = 1234 }, + { name = "North", kind = "Speed", last_data.value = 2265.0, 
last_data.timestamp = 1234 }, + { name = "Southwest", kind = "Speed", last_data.value = 2649.0, last_data.timestamp = 1234 }, + { name = "Northwest", kind = "Speed", last_data.value = 2275.0, last_data.timestamp = 1234 }, +] + [[simulated_sps.gimlet]] multicast_addr = "::1" @@ -72,6 +154,90 @@ capabilities = 0 presence = "Present" serial_console = "[::1]:0" + +[[simulated_sps.gimlet.components]] +id = "dev-0" +device = "tmp117" +description = "FAKE temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "Southwest", kind = "Temperature", last_data.value = 41.3629, last_data.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-1" +device = "tmp117" +description = "FAKE temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "South", kind = "Temperature", last_data.value = 42.5625, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-2" +device = "tmp117" +description = "FAKE Southeast temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "Southeast", kind = "Temperature", last_data.value = 41.570313, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-6" +device = "at24csw080" +description = "FAKE U.2 Sharkfin A VPD" +capabilities = 0x0 +presence = "Present" + +[[simulated_sps.gimlet.components]] +id = "dev-7" +device = "max5970" +description = "FAKE U.2 Sharkfin A hot swap controller" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "V12_U2A_A0", kind = "Current", last_data.value = 0.41893438, last_data.timestamp = 1234 }, + { name = "V3P3_U2A_A0", kind = "Current", last_data.value = 0.025614603, last_data.timestamp = 1234 }, + { name = "V12_U2A_A0", kind = "Voltage", last_data.value = 12.02914, last_data.timestamp = 1234 }, + { name = "V3P3_U2A_A0", kind = "Voltage", last_data.value = 3.2618, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-8" +device = "nvme_bmc" +description = "FAKE U.2 A NVMe Basic Management Command" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "U2_N0", kind = "Temperature", last_data.value = 56.0, last_data.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-39" +device = "tmp451" +description = "FAKE T6 temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "t6", kind = "Temperature", last_data.value = 70.625, last_data.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-53" +device = "max31790" +description = "FAKE Fan controller" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "Southeast", kind = "Speed", last_data.value = 2510.0, last_data.timestamp = 1234 }, + { name = "Northeast", kind = "Speed", last_data.value = 2390.0, last_data.timestamp = 1234 }, + { name = "South", kind = "Speed", last_data.value = 2467.0, last_data.timestamp = 1234 }, + { name = "North", kind = "Speed", last_data.value = 2195.0, last_data.timestamp = 1234 }, + { name = "Southwest", kind = "Speed", last_data.value = 2680.0, last_data.timestamp = 1234 }, + { name = "Northwest", kind = "Speed", last_data.value = 2212.0, last_data.timestamp = 1234 }, +] + + # # NOTE: for the test suite, the [log] section is ignored; sp-sim logs are rolled # into the gateway logfile. 
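Each `sensors` entry above uses TOML dotted keys, so `last_data.value` and
`last_data.timestamp` (or `last_error.*`, for failed sensors) deserialize as a
small nested table. A minimal serde sketch of the shape being parsed, with
hypothetical struct names rather than the actual sp-sim types (assumes the
`serde` and `toml` crates as dependencies):

    use serde::Deserialize;

    // Hypothetical mirror of one entry in a `sensors = [...]` array above.
    #[derive(Debug, Deserialize)]
    struct SensorConfig {
        name: String,
        kind: String, // "Temperature", "Current", "Voltage", or "Speed"
        // `last_data.value` / `last_data.timestamp`: most recent reading.
        last_data: Option<Measurement>,
        // `last_error.value` / `last_error.timestamp`: a sensor error.
        last_error: Option<ErrorRecord>,
    }

    #[derive(Debug, Deserialize)]
    struct Measurement {
        value: f32,
        timestamp: u64,
    }

    #[derive(Debug, Deserialize)]
    struct ErrorRecord {
        value: String, // e.g. "DeviceError"
        timestamp: u64,
    }

    fn main() {
        let entry = r#"
            name = "Southwest"
            kind = "Temperature"
            last_data.value = 41.7890625
            last_data.timestamp = 1234
        "#;
        let sensor: SensorConfig = toml::from_str(entry).unwrap();
        println!("{sensor:?}");
    }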
diff --git a/gateway-test-utils/src/setup.rs b/gateway-test-utils/src/setup.rs index 46bc55805a..056bb451f7 100644 --- a/gateway-test-utils/src/setup.rs +++ b/gateway-test-utils/src/setup.rs @@ -8,6 +8,7 @@ use camino::Utf8Path; use dropshot::test_util::ClientTestContext; use dropshot::test_util::LogContext; use gateway_messages::SpPort; +pub use omicron_gateway::metrics::MetricsConfig; use omicron_gateway::MgsArguments; use omicron_gateway::SpType; use omicron_gateway::SwitchPortConfig; @@ -33,6 +34,7 @@ pub struct GatewayTestContext { pub server: omicron_gateway::Server, pub simrack: SimRack, pub logctx: LogContext, + pub gateway_id: Uuid, } impl GatewayTestContext { @@ -48,13 +50,18 @@ pub fn load_test_config() -> (omicron_gateway::Config, sp_sim::Config) { let manifest_dir = Utf8Path::new(env!("CARGO_MANIFEST_DIR")); let server_config_file_path = manifest_dir.join("configs/config.test.toml"); let server_config = - omicron_gateway::Config::from_file(&server_config_file_path) - .expect("failed to load config.test.toml"); + match omicron_gateway::Config::from_file(&server_config_file_path) { + Ok(config) => config, + Err(e) => panic!("failed to load MGS config: {e}"), + }; let sp_sim_config_file_path = manifest_dir.join("configs/sp_sim_config.test.toml"); - let sp_sim_config = sp_sim::Config::from_file(&sp_sim_config_file_path) - .expect("failed to load sp_sim_config.test.toml"); + let sp_sim_config = + match sp_sim::Config::from_file(&sp_sim_config_file_path) { + Ok(config) => config, + Err(e) => panic!("failed to load SP simulator config: {e}"), + }; (server_config, sp_sim_config) } @@ -143,8 +150,8 @@ pub async fn test_setup_with_config( // Start gateway server let rack_id = Some(Uuid::parse_str(RACK_UUID).unwrap()); - - let args = MgsArguments { id: Uuid::new_v4(), addresses, rack_id }; + let gateway_id = Uuid::new_v4(); + let args = MgsArguments { id: gateway_id, addresses, rack_id }; let server = omicron_gateway::Server::start( server_config.clone(), args, @@ -206,5 +213,5 @@ pub async fn test_setup_with_config( log.new(o!("component" => "client test context")), ); - GatewayTestContext { client, server, simrack, logctx } + GatewayTestContext { client, server, simrack, logctx, gateway_id } } diff --git a/gateway/Cargo.toml b/gateway/Cargo.toml index 3cfd1d447b..2dce15892d 100644 --- a/gateway/Cargo.toml +++ b/gateway/Cargo.toml @@ -11,6 +11,7 @@ workspace = true anyhow.workspace = true base64.workspace = true camino.workspace = true +chrono.workspace = true clap.workspace = true dropshot.workspace = true futures.workspace = true @@ -39,6 +40,8 @@ tokio-tungstenite.workspace = true toml.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true +oximeter.workspace = true +oximeter-producer.workspace = true [dev-dependencies] expectorate.workspace = true diff --git a/gateway/examples/config.toml b/gateway/examples/config.toml index d29d9508b9..a76edcd7b5 100644 --- a/gateway/examples/config.toml +++ b/gateway/examples/config.toml @@ -71,6 +71,15 @@ addr = "[::1]:33320" ignition-target = 3 location = { switch0 = ["sled", 1], switch1 = ["sled", 1] } +# +# Configuration for SP sensor metrics polling +# +[metrics] +# Allow the Oximeter metrics endpoint to bind on the loopback IP. This is +# useful in local testing and development, when the gateway service is not +# given a "real" underlay network IP. 
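+# (Nexus cannot reach a loopback address, so production configs should omit
+# this setting or leave it false.)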
+dev_bind_loopback = true + [log] # Show log messages of this level and more severe level = "debug" diff --git a/gateway/src/config.rs b/gateway/src/config.rs index afdb046881..edf895ef59 100644 --- a/gateway/src/config.rs +++ b/gateway/src/config.rs @@ -6,6 +6,7 @@ //! configuration use crate::management_switch::SwitchConfig; +use crate::metrics::MetricsConfig; use camino::Utf8Path; use camino::Utf8PathBuf; use dropshot::ConfigLogging; @@ -25,6 +26,8 @@ pub struct Config { pub switch: SwitchConfig, /// Server-wide logging configuration. pub log: ConfigLogging, + /// Configuration for SP sensor metrics. + pub metrics: Option, } impl Config { @@ -47,13 +50,13 @@ pub struct PartialDropshotConfig { #[derive(Debug, Error, SlogInlineError)] pub enum LoadError { - #[error("error reading \"{path}\"")] + #[error("error reading \"{path}\": {err}")] Io { path: Utf8PathBuf, #[source] err: std::io::Error, }, - #[error("error parsing \"{path}\"")] + #[error("error parsing \"{path}\": {err}")] Parse { path: Utf8PathBuf, #[source] diff --git a/gateway/src/lib.rs b/gateway/src/lib.rs index e1eed05334..8e764dc63f 100644 --- a/gateway/src/lib.rs +++ b/gateway/src/lib.rs @@ -6,6 +6,7 @@ mod config; mod context; mod error; mod management_switch; +pub mod metrics; mod serial_console; pub mod http_entrypoints; // TODO pub only for testing - is this right? @@ -62,6 +63,8 @@ pub struct Server { /// `http_servers` all_servers_shutdown: FuturesUnordered, request_body_max_bytes: usize, + /// handle to the SP sensor metrics subsystem + metrics: metrics::Metrics, log: Logger, } @@ -151,6 +154,9 @@ impl Server { let mut http_servers = HashMap::with_capacity(args.addresses.len()); let all_servers_shutdown = FuturesUnordered::new(); + let metrics = + metrics::Metrics::new(&log, &args, config.metrics, apictx.clone()); + for addr in args.addresses { start_dropshot_server( &apictx, @@ -167,6 +173,7 @@ impl Server { http_servers, all_servers_shutdown, request_body_max_bytes: config.dropshot.request_body_max_bytes, + metrics, log, }) } @@ -275,12 +282,14 @@ impl Server { server.close().await?; } + self.metrics.update_server_addrs(addresses).await; + Ok(()) } /// The rack_id will be set on a refresh of the SMF property when the sled /// agent starts. - pub fn set_rack_id(&self, rack_id: Option) { + pub fn set_rack_id(&mut self, rack_id: Option) { if let Some(rack_id) = rack_id { let val = self.apictx.rack_id.get_or_init(|| rack_id); if *val != rack_id { @@ -291,6 +300,7 @@ impl Server { "ignored_new_rack_id" => %rack_id); } else { info!(self.apictx.log, "Set rack_id"; "rack_id" => %rack_id); + self.metrics.set_rack_id(rack_id); } } else { warn!(self.apictx.log, "SMF refresh called without a rack id"); diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs new file mode 100644 index 0000000000..d4e0795ae0 --- /dev/null +++ b/gateway/src/metrics.rs @@ -0,0 +1,1159 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
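+
+//! Oximeter metrics produced by the management gateway: background tasks
+//! that poll sensor measurements from SPs and serve them to the Oximeter
+//! collector.
+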
+use crate::error::CommunicationError;
+use crate::management_switch::SpIdentifier;
+use crate::management_switch::SpType;
+use crate::MgsArguments;
+use crate::ServerContext;
+use anyhow::Context;
+use gateway_messages::measurement::MeasurementError;
+use gateway_messages::measurement::MeasurementKind;
+use gateway_messages::ComponentDetails;
+use gateway_messages::DeviceCapabilities;
+use gateway_sp_comms::SingleSp;
+use gateway_sp_comms::SpComponent;
+use gateway_sp_comms::VersionedSpState;
+use omicron_common::api::internal::nexus::ProducerEndpoint;
+use omicron_common::api::internal::nexus::ProducerKind;
+use omicron_common::backoff;
+use oximeter::types::Cumulative;
+use oximeter::types::ProducerRegistry;
+use oximeter::types::Sample;
+use oximeter::MetricsError;
+use std::borrow::Cow;
+use std::collections::hash_map;
+use std::collections::hash_map::HashMap;
+use std::net::IpAddr;
+use std::net::SocketAddr;
+use std::net::SocketAddrV6;
+use std::sync::Arc;
+use std::time::Duration;
+use tokio::sync::broadcast;
+use tokio::sync::oneshot;
+use tokio::sync::watch;
+use tokio::task::JoinHandle;
+use uuid::Uuid;
+
+oximeter::use_timeseries!("hardware-component.toml");
+use hardware_component as metric;
+
+/// Handle to the metrics tasks.
+pub struct Metrics {
+    /// If the metrics subsystem is disabled, this is `None`.
+    inner: Option<Handles>,
+}
+
+struct Handles {
+    addrs_tx: watch::Sender<Vec<SocketAddrV6>>,
+    rack_id_tx: Option<oneshot::Sender<Uuid>>,
+    server: JoinHandle<anyhow::Result<()>>,
+}
+
+/// Configuration for metrics.
+///
+/// In order to reduce the risk of a bad config file taking down the whole
+/// management network, we try to keep the metrics-specific portion of the
+/// config file as minimal as possible. At present, it only includes
+/// development configurations that shouldn't be present in production
+/// configs.
+#[derive(
+    Clone, Debug, Default, PartialEq, Eq, serde::Deserialize, serde::Serialize,
+)]
+#[serde(deny_unknown_fields)]
+pub struct MetricsConfig {
+    /// Completely disable the metrics subsystem.
+    ///
+    /// If `disabled = true`, sensor data metrics will not be collected, and
+    /// the metrics polling tasks will not be started.
+    #[serde(default)]
+    pub disabled: bool,
+
+    /// Override the Nexus address used to register the SP metrics Oximeter
+    /// producer. This is intended for use in development and testing.
+    ///
+    /// If this argument is not present, Nexus is discovered through DNS.
+    #[serde(default)]
+    pub dev_nexus_address: Option<SocketAddr>,
+
+    /// Allow the metrics producer endpoint to bind on loopback.
+    ///
+    /// This should be disabled in production, as Nexus will not be able to
+    /// reach the loopback interface, but is necessary for local development
+    /// and test purposes.
+    #[serde(default)]
+    pub dev_bind_loopback: bool,
+}
+
+/// Polls sensor readings from an individual SP.
+struct SpPoller {
+    spid: SpIdentifier,
+    known_state: Option<VersionedSpState>,
+    components: HashMap<SpComponent, ComponentMetrics>,
+    log: slog::Logger,
+    rack_id: Uuid,
+    mgs_id: Uuid,
+    sample_tx: broadcast::Sender<Vec<Sample>>,
+}
+
+struct ComponentMetrics {
+    target: metric::HardwareComponent,
+    /// Counts of errors reported by sensors on this component.
+    sensor_errors: HashMap<SensorErrorKey, Cumulative<u64>>,
+    /// Counts of errors that occurred whilst polling the SP for measurements
+    /// from this component.
+    poll_errors: HashMap<&'static str, Cumulative<u64>>,
+}
+
+#[derive(Eq, PartialEq, Hash)]
+struct SensorErrorKey {
+    name: Cow<'static, str>,
+    kind: &'static str,
+    error: &'static str,
+}
+
+/// Manages a metrics server and stuff.
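+///
+/// ("Stuff", judging by the fields below: this owns the `ProducerRegistry`
+/// and watches the set of MGS underlay addresses, so that its `run` task can
+/// rebind the Oximeter producer endpoint whenever those addresses change.)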
+struct ServerManager {
+    log: slog::Logger,
+    addrs: watch::Receiver<Vec<SocketAddrV6>>,
+    registry: ProducerRegistry,
+}
+
+#[derive(Debug)]
+struct Producer {
+    /// Receiver for samples produced by SP pollers.
+    sample_rx: broadcast::Receiver<Vec<Sample>>,
+    /// Logging context.
+    ///
+    /// We stick this on the producer because we would like to be able to log
+    /// when stale samples are dropped.
+    log: slog::Logger,
+}
+
+/// The maximum Dropshot request size for the metrics server.
+const METRIC_REQUEST_MAX_SIZE: usize = 10 * 1024 * 1024;
+
+/// Poll interval for requesting sensor readings from SPs.
+///
+/// Bryan wants to try polling at 1Hz, so let's do that for now.
+const SP_POLL_INTERVAL: Duration = Duration::from_secs(1);
+
+/// The interval at which we will ask Oximeter to collect our metric samples.
+///
+/// Every ten seconds seems good.
+const OXIMETER_COLLECTION_INTERVAL: Duration = Duration::from_secs(10);
+
+/// The expected number of SPs in a fully-loaded rack.
+///
+/// N.B. that there *might* be more than this; we shouldn't ever panic or
+/// otherwise misbehave if we see more than this number. This is just intended
+/// for sizing buffers/map allocations and so forth; we can always realloc if
+/// we see a bonus SP or two. That's why it's called "normal number of SPs"
+/// and not "MAX_SPS" or similar.
+///
+/// Additionally, note that we always determine the channel capacity based on
+/// the assumption that *someday*, the rack might be fully loaded with compute
+/// sleds, even if it isn't *right now*. A rack with 16 sleds could always
+/// grow another 16 later!
+const NORMAL_NUMBER_OF_SPS: usize =
+    32 // 32 compute sleds
+    + 2 // two switches
+    + 2 // two power shelves, someday.
+    ;
+
+/// What size should we make the buffer of sample chunks?
+const MAX_BUFFERED_SAMPLE_CHUNKS: usize = {
+    // Roughly how many times will we poll SPs for each metrics collection
+    // interval?
+    let polls_per_metrics_interval = {
+        let collection_interval_secs: usize =
+            OXIMETER_COLLECTION_INTERVAL.as_secs() as usize;
+        let poll_interval_secs: usize = SP_POLL_INTERVAL.as_secs() as usize;
+
+        collection_interval_secs / poll_interval_secs
+    };
+
+    // How many sample collection intervals do we want to allow to elapse
+    // before we start putting stuff on the floor?
+    //
+    // Let's say 16. Chosen totally arbitrarily but seems reasonable-ish.
+    let sloppiness = 16;
+    let capacity =
+        NORMAL_NUMBER_OF_SPS * polls_per_metrics_interval * sloppiness;
+    // Finally, the buffer capacity will probably be allocated in a power of
+    // two anyway, so let's make sure our thing is a power of two so we don't
+    // waste the allocation we're gonna get anyway.
+    capacity.next_power_of_two()
+};
+
+impl Metrics {
+    pub fn new(
+        log: &slog::Logger,
+        args: &MgsArguments,
+        cfg: Option<MetricsConfig>,
+        apictx: Arc<ServerContext>,
+    ) -> Self {
+        let &MgsArguments { id, rack_id, ref addresses } = args;
+
+        if cfg.as_ref().map(|c| c.disabled).unwrap_or(false) {
+            slog::warn!(&log, "metrics subsystem disabled by config");
+            return Self { inner: None };
+        }
+
+        // Create a channel for the SP poller tasks to send samples to the
+        // Oximeter producer endpoint.
+        //
+        // A broadcast channel is used here, not because we are actually
+        // multi-consumer (`Producer::produce` is never called concurrently),
+        // but because the broadcast channel has properly ring-buffer-like
+        // behavior, where earlier messages are discarded, rather than
+        // exerting backpressure on senders (as Tokio's MPSC channel does).
+        // This is what we want, as we would prefer a full buffer to result
+        // in clobbering the oldest measurements, rather than leaving the
+        // newest ones on the floor.
+        let (sample_tx, sample_rx) =
+            broadcast::channel(MAX_BUFFERED_SAMPLE_CHUNKS);
+
+        // Using a channel for this is, admittedly, a bit of an end-run around
+        // the `OnceLock` on the `ServerContext` that *also* stores the rack
+        // ID, but it has the nice benefit of allowing the `PollerManager`
+        // task to _await_ the rack ID being set...we might want to change
+        // other code to use a similar approach in the future.
+        let (rack_id_tx, rack_id_rx) = oneshot::channel();
+        let rack_id_tx = if let Some(rack_id) = rack_id {
+            rack_id_tx.send(rack_id).expect(
+                "we just created the channel; it therefore will not be \
+                 closed",
+            );
+            None
+        } else {
+            Some(rack_id_tx)
+        };
+
+        tokio::spawn(start_pollers(
+            log.new(slog::o!("component" => "sensor-poller")),
+            apictx.clone(),
+            rack_id_rx,
+            id,
+            sample_tx,
+        ));
+
+        let (addrs_tx, addrs_rx) =
+            tokio::sync::watch::channel(addresses.clone());
+        let server = {
+            let log = log.new(slog::o!("component" => "producer-server"));
+            let registry = ProducerRegistry::with_id(id);
+            registry
+                .register_producer(Producer { sample_rx, log: log.clone() })
+                // TODO(ben): when you change `register_producer` to not
+                // return a `Result`, delete this `expect`. thanks in
+                // advance! :)
+                .expect(
+                    "`ProducerRegistry::register_producer()` will never \
+                     actually return an `Err`, so this shouldn't ever \
+                     happen...",
+                );
+
+            tokio::spawn(
+                ServerManager { log, addrs: addrs_rx, registry }.run(cfg),
+            )
+        };
+        Self { inner: Some(Handles { addrs_tx, rack_id_tx, server }) }
+    }
+
+    pub fn set_rack_id(&mut self, rack_id: Uuid) {
+        let tx = self.inner.as_mut().and_then(|i| i.rack_id_tx.take());
+        if let Some(tx) = tx {
+            // If the task that starts sensor pollers has gone away already,
+            // we're probably shutting down, and shouldn't panic.
+            let _ = tx.send(rack_id);
+        }
+        // Ignoring duplicate attempt to set the rack ID...
+    }
+
+    pub async fn update_server_addrs(&self, new_addrs: &[SocketAddrV6]) {
+        if let Some(ref inner) = self.inner {
+            inner.addrs_tx.send_if_modified(|current_addrs| {
+                if current_addrs.len() == new_addrs.len()
+                    // N.B. that we could make this "faster" with a `HashSet`,
+                    // but...the size of this Vec of addresses is probably
+                    // going to be two or three items, max, so the linear scan
+                    // actually probably outperforms it...
+                    && current_addrs.iter().all(|addr| new_addrs.contains(addr))
+                {
+                    return false;
+                }
+
+                // Reuse existing `Vec` capacity if possible. This is almost
+                // certainly not performance-critical, but it makes me feel
+                // happy.
+                current_addrs.clear();
+                current_addrs.extend_from_slice(new_addrs);
+                true
+            });
+        }
+    }
+}
+
+impl Drop for Metrics {
+    fn drop(&mut self) {
+        // Clean up our children on drop.
+        if let Some(ref mut inner) = self.inner {
+            inner.server.abort();
+        }
+    }
+}
+
+impl oximeter::Producer for Producer {
+    fn produce(
+        &mut self,
+    ) -> Result<Box<dyn Iterator<Item = Sample> + 'static>, MetricsError> {
+        // Drain all samples currently in the queue into a `Vec`.
+        //
+        // N.B. it may be tempting to pursue an alternative design where we
+        // implement `Iterator` for a `broadcast::Receiver<Vec<Sample>>` and
+        // just return that using `Receiver::resubscribe`...DON'T DO THAT! The
+        // `resubscribe` function creates a receiver at the current *tail* of
+        // the ringbuffer, so it won't see any samples produced *before* now.
+        // Which is the opposite of what we want!
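+        //
+        // (`broadcast::Receiver::len()` counts queued sample *chunks*, not
+        // individual samples, so this pre-sizes `samples` with one slot per
+        // chunk.)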
+        let mut samples = Vec::with_capacity(self.sample_rx.len());
+        // Because we receive the individual samples in a `Vec` of all samples
+        // produced by a poller, let's also sum the length of each of those
+        // `Vec`s here, so we can log it later.
+        let mut total_samples = 0;
+        // Also, track whether any sample chunks were dropped off the end of
+        // the ring buffer.
+        let mut dropped_chunks = 0;
+
+        use broadcast::error::TryRecvError;
+        loop {
+            match self.sample_rx.try_recv() {
+                Ok(sample_chunk) => {
+                    total_samples += sample_chunk.len();
+                    samples.push(sample_chunk)
+                }
+                // This error indicates that an old ringbuffer entry was
+                // overwritten. That's fine, just get the next one.
+                Err(TryRecvError::Lagged(dropped)) => {
+                    dropped_chunks += dropped;
+                }
+                // We've drained all currently available samples! We're done here!
+                Err(TryRecvError::Empty) => break,
+                // This should only happen when shutting down.
+                Err(TryRecvError::Closed) => {
+                    slog::debug!(&self.log, "sample producer channel closed");
+                    break;
+                }
+            }
+        }
+
+        if dropped_chunks > 0 {
+            slog::info!(
+                &self.log,
+                "produced metric samples. some old sample chunks were dropped!";
+                "samples" => total_samples,
+                "sample_chunks" => samples.len(),
+                "dropped_chunks" => dropped_chunks,
+            );
+        } else {
+            slog::debug!(
+                &self.log,
+                "produced metric samples";
+                "samples" => total_samples,
+                "sample_chunks" => samples.len(),
+            );
+        }
+
+        // There you go, that's all I've got.
+        Ok(Box::new(samples.into_iter().flatten()))
+    }
+}
+
+async fn start_pollers(
+    log: slog::Logger,
+    apictx: Arc<ServerContext>,
+    rack_id: oneshot::Receiver<Uuid>,
+    mgs_id: Uuid,
+    sample_tx: broadcast::Sender<Vec<Sample>>,
+) -> anyhow::Result<()> {
+    let switch = &apictx.mgmt_switch;
+
+    // First, wait until the rack ID is known...
+    let rack_id = rack_id
+        .await
+        .context("rack ID sender has gone away...we must be shutting down")?;
+
+    // Wait for SP discovery to complete, if it hasn't already.
+    // TODO(eliza): presently, we busy-poll here. It would be nicer to
+    // replace the `OnceLock` in `ManagementSwitch`
+    // with a `tokio::sync::watch`
+    let sps = backoff::retry_notify_ext(
+        backoff::retry_policy_local(),
+        || async { switch.all_sps().map_err(backoff::BackoffError::transient) },
+        |err, _, elapsed| {
+            let secs = elapsed.as_secs();
+            if secs < 30 {
+                slog::debug!(
+                    &log,
+                    "waiting for SP discovery to complete...";
+                    "elapsed" => ?elapsed,
+                    "error" => err,
+                );
+            } else if secs < 180 {
+                slog::info!(
+                    &log,
+                    "still waiting for SP discovery to complete...";
+                    "elapsed" => ?elapsed,
+                    "error" => err,
+                )
+            } else {
+                slog::warn!(
+                    &log,
+                    "we have been waiting for SP discovery to complete \
+                     for a pretty long time!";
+                    "elapsed" => ?elapsed,
+                    "error" => err,
+                )
+            }
+        },
+    )
+    .await
+    .context("we should never return a fatal error here")?;
+
+    slog::info!(
+        &log,
+        "starting to poll SP sensor data every {SP_POLL_INTERVAL:?}"
+    );
+
+    for (spid, _) in sps {
+        slog::info!(
+            &log,
+            "found a new little friend!";
+            "sp_slot" => ?spid.slot,
+            "chassis_type" => ?spid.typ,
+        );
+
+        let poller = SpPoller {
+            spid,
+            rack_id,
+            mgs_id,
+            log: log.new(slog::o!(
+                "sp_slot" => spid.slot,
+                "chassis_type" => format!("{:?}", spid.typ),
+            )),
+            components: HashMap::new(),
+            known_state: None,
+            sample_tx: sample_tx.clone(),
+        };
+        tokio::spawn(poller.run(apictx.clone()));
+    }
+
+    Ok(())
+}
+
+impl SpPoller {
+    async fn run(mut self, apictx: Arc<ServerContext>) {
+        let mut interval = tokio::time::interval(SP_POLL_INTERVAL);
+        let switch = &apictx.mgmt_switch;
+        let sp = match switch.sp(self.spid) {
+            Ok(sp) => sp,
+            Err(e) => {
+                // This should never happen, but it's not worth taking down the
+                // entire management network over that...
+                const MSG: &'static str =
+                    "the `SpPoller::run` function is only called after \
+                     discovery completes successfully, and the `SpIdentifier` \
+                     used was returned by the management switch, \
+                     so it should be valid.";
+                if cfg!(debug_assertions) {
+                    unreachable!(
+                        "{MSG} nonetheless, we saw a {e:?} error when looking \
+                         up {:?}",
+                        self.spid
+                    );
+                } else {
+                    slog::error!(
+                        &self.log,
+                        "THIS SHOULDN'T HAPPEN: {MSG}";
+                        "error" => e,
+                        "sp" => ?self.spid,
+                    );
+                    return;
+                }
+            }
+        };
+        loop {
+            interval.tick().await;
+            slog::trace!(&self.log, "interval elapsed, polling SP...");
+
+            match self.poll(sp).await {
+                // No sense cluttering the ringbuffer with empty vecs...
+                Ok(samples) if samples.is_empty() => {
+                    slog::trace!(
+                        &self.log,
+                        "polled SP, no samples returned";
+                        "num_samples" => 0usize
+                    );
+                }
+                Ok(samples) => {
+                    slog::trace!(
+                        &self.log,
+                        "polled SP successfully";
+                        "num_samples" => samples.len(),
+                    );
+
+                    if let Err(_) = self.sample_tx.send(samples) {
+                        slog::debug!(
+                            &self.log,
+                            "all sample receiver handles have been dropped! \
+                             presumably we are shutting down...";
+                        );
+                        return;
+                    }
+                }
+                // No SP is currently present for this ID. This may change in
+                // the future: a cubby that is not populated at present may
+                // have a sled added to it in the future. So, let's wait until
+                // it changes.
+                Err(CommunicationError::NoSpDiscovered) => {
+                    slog::info!(
+                        &self.log,
+                        "no SP is present for this slot. waiting for a \
+                         little buddy to appear...";
+                    );
+                    let mut watch = sp.sp_addr_watch().clone();
+                    loop {
+                        if let Some((addr, port)) = *watch.borrow_and_update() {
+                            // Ladies and gentlemen...we got him!
+                            slog::info!(
+                                &self.log,
+                                "found an SP, resuming polling.";
+                                "sp_addr" => ?addr,
+                                "sp_port" => ?port,
+                            );
+                            break;
+                        }
+
+                        // Wait for an address to be discovered.
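+                        // (`watch::Receiver::changed` completes once a new
+                        // value is sent, and returns `Err` only if the sender
+                        // side has been dropped.)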
+                        slog::debug!(&self.log, "waiting for an SP to appear.");
+                        if watch.changed().await.is_err() {
+                            slog::debug!(
+                                &self.log,
+                                "SP address watch has been closed, presumably \
+                                 we are shutting down";
+                            );
+                            return;
+                        }
+                    }
+                }
+                Err(error) => {
+                    slog::warn!(
+                        &self.log,
+                        "failed to poll SP, will try again momentarily...";
+                        "error" => %error,
+                    );
+                    // TODO(eliza): we should probably have a metric for failed
+                    // SP polls.
+                }
+            }
+        }
+    }
+
+    async fn poll(
+        &mut self,
+        sp: &SingleSp,
+    ) -> Result<Vec<Sample>, CommunicationError> {
+        let mut current_state = SpUnderstanding::from(sp.state().await?);
+        let mut samples = Vec::new();
+        // If the SP's state changes dramatically *during* a poll, it may be
+        // necessary to re-do the metrics scrape, thus the loop. Normally, we
+        // will only loop a single time, but may retry if necessary.
+        loop {
+            // Check if the SP's state has changed. If it has, we need to make
+            // sure we still know what all of its sensors are.
+            if Some(&current_state) != self.known_state.as_ref() {
+                // The SP's state appears to have changed. Time to make sure
+                // our understanding of its devices and identity is up to date!
+
+                let chassis_kind = match self.spid.typ {
+                    SpType::Sled => "sled",
+                    SpType::Switch => "switch",
+                    SpType::Power => "power",
+                };
+                let model = stringify_byte_string(&current_state.model[..]);
+                let serial =
+                    stringify_byte_string(&current_state.serial_number[..]);
+                let hubris_archive_id =
+                    hex::encode(&current_state.hubris_archive_id);
+
+                slog::debug!(
+                    &self.log,
+                    "our little friend seems to have changed in some kind of way";
+                    "current_state" => ?current_state,
+                    "known_state" => ?self.known_state,
+                    "new_model" => %model,
+                    "new_serial" => %serial,
+                    "new_hubris_archive_id" => %hubris_archive_id,
+                );
+
+                let inv_devices = sp.inventory().await?.devices;
+
+                // Clear out any previously-known devices, and preallocate
+                // capacity for all the new ones.
+                self.components.clear();
+                self.components.reserve(inv_devices.len());
+
+                for dev in inv_devices {
+                    // Skip devices which have nothing interesting for us.
+                    if !dev
+                        .capabilities
+                        .contains(DeviceCapabilities::HAS_MEASUREMENT_CHANNELS)
+                    {
+                        continue;
+                    }
+                    let component_id = match dev.component.as_str() {
+                        Some(c) => Cow::Owned(c.to_string()),
+                        None => {
+                            // These are supposed to always be strings. But, if
+                            // we see one that's not a string, fall back to the
+                            // hex representation rather than panicking.
+                            let hex = hex::encode(dev.component.id);
+                            slog::warn!(
+                                &self.log,
+                                "an SP component ID was not a string! this \
+                                 isn't supposed to happen!";
+                                "component" => %hex,
+                                "device" => ?dev,
+                            );
+                            Cow::Owned(hex)
+                        }
+                    };
+
+                    // TODO(eliza): i hate having to clone all these strings
+                    // for every device on the SP...it would be cool if
+                    // Oximeter let us reference count them...
+                    let target = metric::HardwareComponent {
+                        rack_id: self.rack_id,
+                        gateway_id: self.mgs_id,
+                        chassis_model: Cow::Owned(model.clone()),
+                        chassis_revision: current_state.revision,
+                        chassis_kind: Cow::Borrowed(chassis_kind),
+                        chassis_serial: Cow::Owned(serial.clone()),
+                        hubris_archive_id: Cow::Owned(
+                            hubris_archive_id.clone(),
+                        ),
+                        slot: self.spid.slot as u32,
+                        component_kind: Cow::Owned(dev.device),
+                        component_id,
+                        description: Cow::Owned(dev.description),
+                    };
+                    match self.components.entry(dev.component) {
+                        // Found a new device!
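+                        // (The map is keyed by the raw `SpComponent` ID, so a
+                        // device that was replaced but kept the same ID is
+                        // handled by the target-comparison arm below.)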
+                        hash_map::Entry::Vacant(entry) => {
+                            slog::debug!(
+                                &self.log,
+                                "discovered a new component!";
+                                "component_id" => %target.component_id,
+                                "component_kind" => %target.component_kind,
+                                "description" => %target.description,
+                            );
+                            entry.insert(ComponentMetrics {
+                                target,
+                                sensor_errors: HashMap::new(),
+                                poll_errors: HashMap::new(),
+                            });
+                        }
+                        // We previously had a known device for this thing, but
+                        // the metrics target has changed, so we should reset
+                        // its cumulative metrics.
+                        hash_map::Entry::Occupied(mut entry)
+                            if entry.get().target != target =>
+                        {
+                            slog::trace!(
+                                &self.log,
+                                "target has changed, resetting cumulative \
+                                 metrics for component";
+                                "component" => ?dev.component,
+                            );
+                            entry.insert(ComponentMetrics {
+                                target,
+                                sensor_errors: HashMap::new(),
+                                poll_errors: HashMap::new(),
+                            });
+                        }
+
+                        // The target for this device hasn't changed, don't
+                        // reset it.
+                        hash_map::Entry::Occupied(_) => {}
+                    }
+                }
+
+                self.known_state = Some(current_state);
+            }
+
+            // We will need capacity for *at least* the number of components on
+            // the SP --- it will probably be more, as several components have
+            // multiple measurement channels which will produce independent
+            // samples (e.g. a power rail will likely have both voltage and
+            // current measurements, and a device may have multiple rails...)
+            // but, this way, we can avoid *some* amount of reallocating...
+            samples.reserve(self.components.len());
+            for (c, metrics) in &mut self.components {
+                // Metrics samples *should* always be well-formed. If we ever
+                // emit a messed up one, this is a programmer error, and
+                // therefore should fail in test, but should probably *not*
+                // take down the whole management gateway in a real-life rack,
+                // especially because it's probably going to happen again if we
+                // were to get restarted.
+                const BAD_SAMPLE: &str =
+                    "we emitted a bad metrics sample! this should never happen";
+                macro_rules! try_sample {
+                    ($sample:expr) => {
+                        match $sample {
+                            Ok(sample) => samples.push(sample),
+
+                            Err(err) => {
+                                slog::error!(
+                                    &self.log,
+                                    "{BAD_SAMPLE}!";
+                                    "error" => %err,
+                                );
+                                #[cfg(debug_assertions)]
+                                unreachable!("{BAD_SAMPLE}: {err}");
+                            }
+                        }
+                    }
+                }
+                let details = match sp.component_details(*c).await {
+                    Ok(deets) => deets,
+                    // SP seems gone!
+                    Err(CommunicationError::NoSpDiscovered) => {
+                        return Err(CommunicationError::NoSpDiscovered)
+                    }
+                    Err(error) => {
+                        slog::warn!(
+                            &self.log,
+                            "failed to read details on SP component";
+                            "sp_component" => %c,
+                            "error" => %error,
+                        );
+                        try_sample!(metrics.poll_error(comms_error_str(error)));
+                        continue;
+                    }
+                };
+                if details.entries.is_empty() {
+                    slog::warn!(
+                        &self.log,
+                        "a component which claimed to have measurement \
+                         channels had empty details. this seems weird...";
+                        "sp_component" => %c,
+                    );
+                    try_sample!(metrics.poll_error("no_measurement_channels"));
+                    continue;
+                }
+
+                let ComponentMetrics { sensor_errors, target, .. } = metrics;
+                for d in details.entries {
+                    let ComponentDetails::Measurement(m) = d else {
+                        // If the component details are switch port details
+                        // rather than measurement channels, ignore it for now.
+                        continue;
+                    };
+                    let sensor: Cow<'static, str> = Cow::Owned(m.name);
+
+                    // First, if there's a measurement error, increment the
+                    // error count metric. We will synthesize a missing sample
+                    // for the sensor's metric as well, after we produce the
+                    // measurement error sample.
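+                    // (The missing sample itself is produced further below,
+                    // in the big `match` over `(m.value, m.kind)`.)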
+ // + // We do this first so that we only have to clone the + // sensor's name if there's an error, rather than always + // cloning it in *case* there's an error. + if let Err(error) = m.value { + let kind = match m.kind { + MeasurementKind::Temperature => "temperature", + MeasurementKind::Current => "current", + MeasurementKind::Voltage => "voltage", + MeasurementKind::Power => "power", + MeasurementKind::InputCurrent => "input_current", + MeasurementKind::InputVoltage => "input_voltage", + MeasurementKind::Speed => "fan_speed", + }; + let error = match error { + MeasurementError::InvalidSensor => "invalid_sensor", + MeasurementError::NoReading => "no_reading", + MeasurementError::NotPresent => "not_present", + MeasurementError::DeviceError => "device_error", + MeasurementError::DeviceUnavailable => { + "device_unavailable" + } + MeasurementError::DeviceTimeout => "device_timeout", + MeasurementError::DeviceOff => "device_off", + }; + let datum = sensor_errors + .entry(SensorErrorKey { + name: sensor.clone(), + kind, + error, + }) + .or_insert(Cumulative::new(0)); + // TODO(eliza): perhaps we should treat this as + // "level-triggered" and only increment the counter + // when the sensor has *changed* to an errored + // state after we have seen at least one good + // measurement from it since the last time the error + // was observed? + datum.increment(); + try_sample!(Sample::new( + target, + &metric::SensorErrorCount { + error: Cow::Borrowed(error), + sensor: sensor.clone(), + datum: *datum, + sensor_kind: Cow::Borrowed(kind), + }, + )); + } + + // I don't love this massive `match`, but because the + // `Sample::new_missing` constructor is a different function + // from `Sample::new`, we need separate branches for the + // error and not-error cases, rather than just doing + // something to produce a datum from both the `Ok` and + // `Error` cases... 
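+                    // (Each `Err` arm below produces a *missing* sample via
+                    // `Sample::new_missing`; its `0.0` datum is just a
+                    // placeholder and is not recorded as a real measurement.)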
+ let sample = match (m.value, m.kind) { + (Ok(datum), MeasurementKind::Temperature) => { + Sample::new( + target, + &metric::Temperature { sensor, datum }, + ) + } + (Err(_), MeasurementKind::Temperature) => { + Sample::new_missing( + target, + &metric::Temperature { sensor, datum: 0.0 }, + ) + } + (Ok(datum), MeasurementKind::Current) => Sample::new( + target, + &metric::Current { sensor, datum }, + ), + (Err(_), MeasurementKind::Current) => { + Sample::new_missing( + target, + &metric::Current { sensor, datum: 0.0 }, + ) + } + (Ok(datum), MeasurementKind::Voltage) => Sample::new( + target, + &metric::Voltage { sensor, datum }, + ), + + (Err(_), MeasurementKind::Voltage) => { + Sample::new_missing( + target, + &metric::Voltage { sensor, datum: 0.0 }, + ) + } + (Ok(datum), MeasurementKind::Power) => Sample::new( + target, + &metric::Power { sensor, datum }, + ), + (Err(_), MeasurementKind::Power) => { + Sample::new_missing( + target, + &metric::Power { sensor, datum: 0.0 }, + ) + } + (Ok(datum), MeasurementKind::InputCurrent) => { + Sample::new( + target, + &metric::InputCurrent { sensor, datum }, + ) + } + (Err(_), MeasurementKind::InputCurrent) => { + Sample::new_missing( + target, + &metric::InputCurrent { sensor, datum: 0.0 }, + ) + } + (Ok(datum), MeasurementKind::InputVoltage) => { + Sample::new( + target, + &metric::InputVoltage { sensor, datum }, + ) + } + (Err(_), MeasurementKind::InputVoltage) => { + Sample::new_missing( + target, + &metric::InputVoltage { sensor, datum: 0.0 }, + ) + } + (Ok(datum), MeasurementKind::Speed) => Sample::new( + target, + &metric::FanSpeed { sensor, datum }, + ), + (Err(_), MeasurementKind::Speed) => { + Sample::new_missing( + target, + &metric::FanSpeed { sensor, datum: 0.0 }, + ) + } + }; + try_sample!(sample); + } + } + + // Now, fetch the SP's state *again*. It is possible that, while we + // were scraping the SP's samples, the SP's identity changed in some + // way: perhaps its version was updated during the poll, or it + // was removed from the rack and replaced with an entirely different + // chassis! If that's the case, some of the samples we collected may + // have a metrics target describing the wrong thing (e.g. they could + // still have the previous firmware's `hubris_archive_id`, if the SP + // was updated). In that case, we need to throw away the samples we + // collected and try again, potentially rebuilding our understanding + // of the SP's inventory. + let state = SpUnderstanding::from(sp.state().await?); + if state == current_state { + // All good, the SP is still who we thought it was! We can + // "commit" this batch of samples + return Ok(samples); + } + + slog::info!( + &self.log, + "SP's state changed mid-poll! discarding current samples and \ + starting over!"; + "new_state" => ?state, + "current_state" => ?current_state, + ); + // Let's reuse the buffer we already have for the next batch of + // samples. + samples.clear(); + //...and try again with the new state. + current_state = state; + } + } +} + +/// The fields of the `gateway_messages` `VersionedSpState` and +/// `SpStateV1`/`SpStateV2`/`SpStateV3` that we actually care about for purposes +/// of determining whether our understanding of the SP's components are still +/// valid. +/// +/// In particular, we throw out the RoT state and the SP's power state, because +/// those changing won't actually invalidate our understanding of the SP's +/// components. 
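+///
+/// (Concretely: two states with the same model, serial number, revision, and
+/// Hubris archive ID compare equal, even if the RoT was reset or the power
+/// state changed between polls.)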
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+struct SpUnderstanding {
+    hubris_archive_id: [u8; 8],
+    serial_number: [u8; 32],
+    model: [u8; 32],
+    revision: u32,
+}
+
+impl From<VersionedSpState> for SpUnderstanding {
+    fn from(v: VersionedSpState) -> Self {
+        match v {
+            VersionedSpState::V1(gateway_messages::SpStateV1 {
+                hubris_archive_id,
+                serial_number,
+                model,
+                revision,
+                ..
+            }) => Self { hubris_archive_id, serial_number, model, revision },
+            VersionedSpState::V2(gateway_messages::SpStateV2 {
+                hubris_archive_id,
+                serial_number,
+                model,
+                revision,
+                ..
+            }) => Self { hubris_archive_id, serial_number, model, revision },
+            VersionedSpState::V3(gateway_messages::SpStateV3 {
+                hubris_archive_id,
+                serial_number,
+                model,
+                revision,
+                ..
+            }) => Self { hubris_archive_id, serial_number, model, revision },
+        }
+    }
+}
+
+// Reimplement this ourselves because we don't really care about
+// reading the RoT state at present. This is unfortunately copied
+// from `gateway_messages`.
+fn stringify_byte_string(bytes: &[u8]) -> String {
+    // We expect serial and model numbers to be ASCII and 0-padded: find the
+    // first 0 byte and convert to a string. If that fails, hexlify the entire
+    // slice.
+    let first_zero = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
+
+    std::str::from_utf8(&bytes[..first_zero])
+        .map(|s| s.to_string())
+        .unwrap_or_else(|_err| hex::encode(bytes))
+}
+
+impl ServerManager {
+    async fn run(mut self, cfg: Option<MetricsConfig>) -> anyhow::Result<()> {
+        let (registration_address, bind_loopback) =
+            if let Some(MetricsConfig {
+                dev_bind_loopback,
+                dev_nexus_address,
+                ..
+            }) = cfg
+            {
+                if dev_bind_loopback || dev_nexus_address.is_some() {
+                    slog::warn!(
+                        &self.log,
+                        "using development metrics configuration overrides!";
+                        "nexus_address" => ?dev_nexus_address,
+                        "bind_loopback" => dev_bind_loopback,
+                    );
+                }
+                (dev_nexus_address, dev_bind_loopback)
+            } else {
+                (None, false)
+            };
+        let id = self.registry.producer_id();
+
+        let mut current_server: Option<oximeter_producer::Server> = None;
+        loop {
+            let current_ip = current_server.as_ref().map(|s| s.address().ip());
+            let mut new_ip = None;
+            for addr in self.addrs.borrow_and_update().iter() {
+                let &ip = addr.ip();
+                // Don't bind the metrics endpoint on ::1
+                if ip.is_loopback() && !bind_loopback {
+                    continue;
+                }
+                // If our current address is contained in the new addresses,
+                // no need to rebind.
+                if current_ip == Some(IpAddr::V6(ip)) {
+                    new_ip = None;
+                    break;
+                } else {
+                    new_ip = Some(ip);
+                }
+            }
+
+            if let Some(ip) = new_ip {
+                slog::debug!(
+                    &self.log,
+                    "rebinding producer server on new IP";
+                    "new_ip" => ?ip,
+                    "current_ip" => ?current_ip,
+                    "collection_interval" => ?OXIMETER_COLLECTION_INTERVAL,
+                    "producer_id" => ?id,
+                );
+                let server = {
+                    // Listen on any available socket, using the provided
+                    // underlay IP.
+                    let address = SocketAddr::new(ip.into(), 0);
+
+                    let server_info = ProducerEndpoint {
+                        id,
+                        kind: ProducerKind::ManagementGateway,
+                        address,
+                        interval: OXIMETER_COLLECTION_INTERVAL,
+                    };
+                    let config = oximeter_producer::Config {
+                        server_info,
+                        registration_address,
+                        request_body_max_bytes: METRIC_REQUEST_MAX_SIZE,
+                        log: oximeter_producer::LogConfig::Logger(
+                            self.log.clone(),
+                        ),
+                    };
+                    oximeter_producer::Server::with_registry(
+                        self.registry.clone(),
+                        &config,
+                    )
+                    .context("failed to start producer server")?
+                };
+
+                slog::info!(
+                    &self.log,
+                    "bound metrics producer server";
+                    "collection_interval" => ?OXIMETER_COLLECTION_INTERVAL,
+                    "producer_id" => ?id,
+                    "address" => %server.address(),
+                );
+
+                if let Some(old_server) = current_server.replace(server) {
+                    let old_addr = old_server.address();
+                    if let Err(error) = old_server.close().await {
+                        slog::error!(
+                            &self.log,
+                            "failed to close old metrics producer server";
+                            "address" => %old_addr,
+                            "error" => %error,
+                        );
+                    } else {
+                        slog::debug!(
+                            &self.log,
+                            "old metrics producer server shut down";
+                            "address" => %old_addr,
+                        )
+                    }
+                }
+            }
+
+            // Wait for a subsequent address change.
+            self.addrs.changed().await?;
+        }
+    }
+}
+
+impl ComponentMetrics {
+    fn poll_error(
+        &mut self,
+        error_str: &'static str,
+    ) -> Result<Sample, MetricsError> {
+        let datum = self
+            .poll_errors
+            .entry(error_str)
+            .or_insert_with(|| Cumulative::new(0));
+        datum.increment();
+        Sample::new(
+            &self.target,
+            &metric::PollErrorCount {
+                error: Cow::Borrowed(error_str),
+                datum: *datum,
+            },
+        )
+    }
+}
+
+fn comms_error_str(error: CommunicationError) -> &'static str {
+    // TODO(eliza): a bunch of these probably can't be returned by the specific
+    // operations we try to do. It could be good to make the methods this code
+    // calls return a smaller enum of just the errors it might actually
+    // encounter? Figure this out later.
+    match error {
+        CommunicationError::NoSpDiscovered => "no_sp_discovered",
+        CommunicationError::InterfaceError(_) => "interface",
+        CommunicationError::ScopeIdChangingFrequently { .. } => {
+            "scope_id_changing_frequently"
+        }
+        CommunicationError::JoinMulticast { .. } => "join_multicast",
+        CommunicationError::UdpSendTo { .. } => "udp_send_to",
+        CommunicationError::UdpRecv(_) => "udp_recv",
+        CommunicationError::Deserialize { .. } => "deserialize",
+        CommunicationError::ExhaustedNumAttempts(_) => "exhausted_num_attempts",
+        CommunicationError::BadResponseType { .. } => "bad_response_type",
+        CommunicationError::SpError { .. } => "sp_error",
+        CommunicationError::BogusSerialConsoleState { .. } => {
+            "bogus_serial_console_state"
+        }
+        CommunicationError::VersionMismatch { .. } => {
+            "protocol_version_mismatch"
+        }
+        CommunicationError::TlvDeserialize { .. } => "tlv_deserialize",
+        CommunicationError::TlvDecode(_) => "tlv_decode",
+        CommunicationError::TlvPagination { .. } => "tlv_pagination",
+        CommunicationError::IpccKeyLookupValueTooLarge => {
+            "ipcc_key_lookup_value_too_large"
+        }
+        CommunicationError::UnexpectedTrailingData(_) => {
+            "unexpected_trailing_data"
+        }
+        CommunicationError::BadTrailingDataSize { ..
} => { + "bad_trailing_data_size" + } + } +} diff --git a/gateway/tests/integration_tests/component_list.rs b/gateway/tests/integration_tests/component_list.rs index ec876c0783..993dcc9e93 100644 --- a/gateway/tests/integration_tests/component_list.rs +++ b/gateway/tests/integration_tests/component_list.rs @@ -57,7 +57,71 @@ async fn component_list() { capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS .bits(), presence: SpComponentPresence::Failed, - } + }, + SpComponentInfo { + component: "dev-1".to_string(), + device: "tmp117".to_string(), + serial_number: None, + description: "FAKE temperature sensor".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-2".to_string(), + device: "tmp117".to_string(), + serial_number: None, + description: "FAKE Southeast temperature sensor".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-6".to_string(), + device: "at24csw080".to_string(), + serial_number: None, + description: "FAKE U.2 Sharkfin A VPD".to_string(), + capabilities: 0, + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-7".to_string(), + device: "max5970".to_string(), + serial_number: None, + description: "FAKE U.2 Sharkfin A hot swap controller" + .to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-8".to_string(), + device: "nvme_bmc".to_string(), + serial_number: None, + description: "FAKE U.2 A NVMe Basic Management Command" + .to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-39".to_string(), + device: "tmp451".to_string(), + serial_number: None, + description: "FAKE T6 temperature sensor".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-53".to_string(), + device: "max31790".to_string(), + serial_number: None, + description: "FAKE Fan controller".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, ] ); @@ -67,14 +131,89 @@ async fn component_list() { assert_eq!( resp.components, - &[SpComponentInfo { - component: SpComponent::SP3_HOST_CPU.const_as_str().to_string(), - device: SpComponent::SP3_HOST_CPU.const_as_str().to_string(), - serial_number: None, - description: "FAKE host cpu".to_string(), - capabilities: 0, - presence: SpComponentPresence::Present, - },] + &[ + SpComponentInfo { + component: SpComponent::SP3_HOST_CPU.const_as_str().to_string(), + device: SpComponent::SP3_HOST_CPU.const_as_str().to_string(), + serial_number: None, + description: "FAKE host cpu".to_string(), + capabilities: 0, + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-0".to_string(), + device: "tmp117".to_string(), + serial_number: None, + description: "FAKE temperature sensor".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-1".to_string(), + device: "tmp117".to_string(), + serial_number: None, + description: "FAKE temperature sensor".to_string(), 
+                capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS
+                    .bits(),
+                presence: SpComponentPresence::Present,
+            },
+            SpComponentInfo {
+                component: "dev-2".to_string(),
+                device: "tmp117".to_string(),
+                serial_number: None,
+                description: "FAKE Southeast temperature sensor".to_string(),
+                capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS
+                    .bits(),
+                presence: SpComponentPresence::Present,
+            },
+            SpComponentInfo {
+                component: "dev-6".to_string(),
+                device: "at24csw080".to_string(),
+                serial_number: None,
+                description: "FAKE U.2 Sharkfin A VPD".to_string(),
+                capabilities: 0,
+                presence: SpComponentPresence::Present,
+            },
+            SpComponentInfo {
+                component: "dev-7".to_string(),
+                device: "max5970".to_string(),
+                serial_number: None,
+                description: "FAKE U.2 Sharkfin A hot swap controller"
+                    .to_string(),
+                capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS
+                    .bits(),
+                presence: SpComponentPresence::Present,
+            },
+            SpComponentInfo {
+                component: "dev-8".to_string(),
+                device: "nvme_bmc".to_string(),
+                serial_number: None,
+                description: "FAKE U.2 A NVMe Basic Management Command"
+                    .to_string(),
+                capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS
+                    .bits(),
+                presence: SpComponentPresence::Present,
+            },
+            SpComponentInfo {
+                component: "dev-39".to_string(),
+                device: "tmp451".to_string(),
+                serial_number: None,
+                description: "FAKE T6 temperature sensor".to_string(),
+                capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS
+                    .bits(),
+                presence: SpComponentPresence::Present,
+            },
+            SpComponentInfo {
+                component: "dev-53".to_string(),
+                device: "max31790".to_string(),
+                serial_number: None,
+                description: "FAKE Fan controller".to_string(),
+                capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS
+                    .bits(),
+                presence: SpComponentPresence::Present,
+            },
+        ]
     );
 
     // Get the component list for switch 0.
diff --git a/nexus/db-model/src/producer_endpoint.rs b/nexus/db-model/src/producer_endpoint.rs
index 74a7356adb..c2fab2de5a 100644
--- a/nexus/db-model/src/producer_endpoint.rs
+++ b/nexus/db-model/src/producer_endpoint.rs
@@ -22,6 +22,7 @@ impl_enum_type!(
     #[diesel(sql_type = ProducerKindEnum)]
     pub enum ProducerKind;
 
+    ManagementGateway => b"management_gateway"
     SledAgent => b"sled_agent"
     Service => b"service"
     Instance => b"instance"
@@ -30,6 +31,9 @@ impl From<internal::nexus::ProducerKind> for ProducerKind {
     fn from(kind: internal::nexus::ProducerKind) -> Self {
         match kind {
+            internal::nexus::ProducerKind::ManagementGateway => {
+                ProducerKind::ManagementGateway
+            }
             internal::nexus::ProducerKind::SledAgent => ProducerKind::SledAgent,
             internal::nexus::ProducerKind::Service => ProducerKind::Service,
             internal::nexus::ProducerKind::Instance => ProducerKind::Instance,
@@ -40,6 +44,9 @@ impl From<ProducerKind> for internal::nexus::ProducerKind {
     fn from(kind: ProducerKind) -> Self {
         match kind {
+            ProducerKind::ManagementGateway => {
+                internal::nexus::ProducerKind::ManagementGateway
+            }
             ProducerKind::SledAgent => internal::nexus::ProducerKind::SledAgent,
             ProducerKind::Service => internal::nexus::ProducerKind::Service,
             ProducerKind::Instance => internal::nexus::ProducerKind::Instance,
diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs
index d0542874fb..aef95e6d53 100644
--- a/nexus/db-model/src/schema_versions.rs
+++ b/nexus/db-model/src/schema_versions.rs
@@ -17,7 +17,7 @@ use std::collections::BTreeMap;
 ///
 /// This must be updated when you change the database schema. Refer to
 /// schema/crdb/README.adoc in the root of this repository for details.
-pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(90, 0, 0);
+pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(91, 0, 0);
 
 /// List of all past database schema versions, in *reverse* order
 ///
@@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
     // |  leaving the first copy as an example for the next person.
     // v
     // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
+    KnownVersion::new(91, "add-management-gateway-producer-kind"),
     KnownVersion::new(90, "lookup-bgp-config-by-asn"),
     KnownVersion::new(89, "collapse_lldp_settings"),
     KnownVersion::new(88, "route-local-pref"),
diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs
index 3b808984ae..9f4652c2da 100644
--- a/nexus/tests/integration_tests/metrics.rs
+++ b/nexus/tests/integration_tests/metrics.rs
@@ -23,8 +23,11 @@ use nexus_types::external_api::views::OxqlQueryResult;
 use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError};
 use omicron_uuid_kinds::{GenericUuid, InstanceUuid};
 use oximeter::types::Datum;
+use oximeter::types::FieldValue;
 use oximeter::types::Measurement;
 use oximeter::TimeseriesSchema;
+use std::borrow::Borrow;
+use std::collections::HashMap;
 use uuid::Uuid;
 
 pub async fn query_for_metrics(
@@ -344,7 +347,6 @@ async fn test_instance_watcher_metrics(
         );
     }};
 }
-    use oximeter::types::FieldValue;
     const INSTANCE_ID_FIELD: &str = "instance_id";
     const STATE_FIELD: &str = "state";
     const STATE_STARTING: &str = "starting";
@@ -589,6 +591,183 @@ async fn test_instance_watcher_metrics(
     assert_gte!(ts2_running, 2);
 }
 
+#[nexus_test]
+async fn test_mgs_metrics(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    // Make an MGS
+    let (mut mgs_config, sp_sim_config) =
+        gateway_test_utils::setup::load_test_config();
+    let mgs = {
+        // munge the already-parsed MGS config file to point it at the test
+        // Nexus' address.
+        mgs_config.metrics = Some(gateway_test_utils::setup::MetricsConfig {
+            disabled: false,
+            dev_bind_loopback: true,
+            dev_nexus_address: Some(cptestctx.internal_client.bind_address),
+        });
+        gateway_test_utils::setup::test_setup_with_config(
+            "test_mgs_metrics",
+            gateway_messages::SpPort::One,
+            mgs_config,
+            &sp_sim_config,
+            None,
+        )
+        .await
+    };
+
+    // Let's look at all the simulated SP components in the config file which
+    // have sensor readings, so we can assert that there are timeseries for all
+    // of them.
+    let all_sp_configs = {
+        let gimlet_configs =
+            sp_sim_config.simulated_sps.gimlet.iter().map(|g| &g.common);
+        let sidecar_configs =
+            sp_sim_config.simulated_sps.sidecar.iter().map(|s| &s.common);
+        gimlet_configs.chain(sidecar_configs)
+    };
+    // XXX(eliza): yes, this code is repetitive. We could probably make it a
+    // little less ugly with nested hash maps, but like...I already wrote it,
+    // so you don't have to. :)
+    //
+    // TODO(eliza): presently, we just expect that the number of timeseries for
+    // each serial number and sensor type lines up. If we wanted to be *really*
+    // fancy, we could also assert that all the component IDs, component kinds,
+    // and measurement values line up with the config. But, honestly, it's
+    // pretty unlikely that a bug in MGS' sensor metrics subsystem would mess
+    // that up --- the most important thing is just to make sure that the
+    // sensor data is *present*, as that should catch most regressions.
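+    //
+    // Count how many sensors of each kind every simulated chassis should
+    // report, keyed by its serial number.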
+    let mut temp_sensors = HashMap::new();
+    let mut current_sensors = HashMap::new();
+    let mut voltage_sensors = HashMap::new();
+    let mut power_sensors = HashMap::new();
+    let mut input_voltage_sensors = HashMap::new();
+    let mut input_current_sensors = HashMap::new();
+    let mut fan_speed_sensors = HashMap::new();
+    for sp in all_sp_configs {
+        let mut temp = 0;
+        let mut current = 0;
+        let mut voltage = 0;
+        let mut input_voltage = 0;
+        let mut input_current = 0;
+        let mut power = 0;
+        let mut speed = 0;
+        for component in &sp.components {
+            for sensor in &component.sensors {
+                use gateway_messages::measurement::MeasurementKind as Kind;
+                match sensor.def.kind {
+                    Kind::Temperature => temp += 1,
+                    Kind::Current => current += 1,
+                    Kind::Voltage => voltage += 1,
+                    Kind::InputVoltage => input_voltage += 1,
+                    Kind::InputCurrent => input_current += 1,
+                    Kind::Speed => speed += 1,
+                    Kind::Power => power += 1,
+                }
+            }
+        }
+        temp_sensors.insert(sp.serial_number.clone(), temp);
+        current_sensors.insert(sp.serial_number.clone(), current);
+        voltage_sensors.insert(sp.serial_number.clone(), voltage);
+        input_voltage_sensors.insert(sp.serial_number.clone(), input_voltage);
+        input_current_sensors.insert(sp.serial_number.clone(), input_current);
+        fan_speed_sensors.insert(sp.serial_number.clone(), speed);
+        power_sensors.insert(sp.serial_number.clone(), power);
+    }
+
+    async fn check_all_timeseries_present(
+        cptestctx: &ControlPlaneTestContext,
+        name: &str,
+        expected: HashMap<String, usize>,
+    ) {
+        let metric_name = format!("hardware_component:{name}");
+        eprintln!("\n=== checking timeseries for {metric_name} ===\n");
+
+        if expected.values().all(|&v| v == 0) {
+            eprintln!(
+                "-> SP sim config contains no {name} sensors, skipping it"
+            );
+            return;
+        }
+
+        let table = timeseries_query(&cptestctx, &format!("get {metric_name}"))
+            .await
+            .into_iter()
+            .find(|t| t.name() == metric_name);
+        let table = match table {
+            Some(table) => table,
+            None => panic!("missing table for {metric_name}"),
+        };
+
+        let mut found = expected
+            .keys()
+            .map(|serial| (serial.clone(), 0))
+            .collect::<HashMap<_, _>>();
+        for timeseries in table.timeseries() {
+            let fields = &timeseries.fields;
+            let n_points = timeseries.points.len();
+            assert!(
+                n_points > 0,
+                "{metric_name} timeseries {fields:?} should have points"
+            );
+            let serial_str: &str = match timeseries.fields.get("chassis_serial")
+            {
+                Some(FieldValue::String(s)) => s.borrow(),
+                Some(x) => panic!(
+                    "{metric_name} `chassis_serial` field should be a string, \
+                     but got: {x:?}"
+                ),
+                None => {
+                    panic!(
+                        "{metric_name} timeseries should have a \
+                         `chassis_serial` field"
+                    )
+                }
+            };
+            if let Some(count) = found.get_mut(serial_str) {
+                *count += 1;
+            } else {
+                panic!(
+                    "{metric_name} timeseries had an unexpected chassis serial \
+                     number {serial_str:?} (not in the config file)",
+                );
+            }
+        }
+
+        eprintln!("-> {metric_name}: found timeseries: {found:#?}");
+        assert_eq!(
+            found, expected,
+            "number of {metric_name} timeseries didn't match expected in \
+             {table:#?}",
+        );
+        eprintln!("-> okay, looks good!");
+    }
+
+    // Wait until the MGS registers as a producer with Oximeter.
+    wait_for_producer(&cptestctx.oximeter, &mgs.gateway_id).await;
+
+    // ...and collect its samples.
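+    // (`force_collect` triggers an immediate collection pass instead of
+    // waiting out the 10-second collection interval.)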
+    cptestctx.oximeter.force_collect().await;
+
+    check_all_timeseries_present(&cptestctx, "temperature", temp_sensors).await;
+    check_all_timeseries_present(&cptestctx, "voltage", voltage_sensors).await;
+    check_all_timeseries_present(&cptestctx, "current", current_sensors).await;
+    check_all_timeseries_present(&cptestctx, "power", power_sensors).await;
+    check_all_timeseries_present(
+        &cptestctx,
+        "input_voltage",
+        input_voltage_sensors,
+    )
+    .await;
+    check_all_timeseries_present(
+        &cptestctx,
+        "input_current",
+        input_current_sensors,
+    )
+    .await;
+    check_all_timeseries_present(&cptestctx, "fan_speed", fan_speed_sensors)
+        .await;
+
+    // Because the `ControlPlaneTestContext` isn't managing the MGS we made for
+    // this test, we are responsible for removing its logs.
+    mgs.logctx.cleanup_successful();
+}
+
 /// Wait until a producer is registered with Oximeter.
 ///
 /// This blocks until the producer is registered, for up to 60s. It panics if
diff --git a/nexus/tests/integration_tests/sp_updater.rs b/nexus/tests/integration_tests/sp_updater.rs
index 8314d22173..6e482bc1ad 100644
--- a/nexus/tests/integration_tests/sp_updater.rs
+++ b/nexus/tests/integration_tests/sp_updater.rs
@@ -434,9 +434,23 @@ async fn test_sp_updater_switches_mgs_instances_on_failure() {
 #[tokio::test]
 async fn test_sp_updater_delivers_progress() {
     // Start MGS + Sim SP.
-    let mgstestctx =
-        mgs_setup::test_setup("test_sp_updater_delivers_progress", SpPort::One)
-            .await;
+    let mgstestctx = {
+        let (mut mgs_config, sp_sim_config) = mgs_setup::load_test_config();
+        // Enabling SP metrics collection makes this already-flaky test even
+        // flakier, so let's just turn it off.
+        // TODO(eliza): it would be nice if we didn't have to disable metrics in
+        // this test, so that we can better catch regressions that could be
+        // introduced by the metrics subsystem...
+        mgs_config.metrics.get_or_insert_with(Default::default).disabled = true;
+        mgs_setup::test_setup_with_config(
+            "test_sp_updater_delivers_progress",
+            SpPort::One,
+            mgs_config,
+            &sp_sim_config,
+            None,
+        )
+        .await
+    };
 
     // Configure an MGS client.
let mut mgs_clients = diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 54b4822e51..111bd552d0 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -4443,6 +4443,13 @@ "enum": [ "instance" ] + }, + { + "description": "The producer is a management gateway service.", + "type": "string", + "enum": [ + "management_gateway" + ] } ] }, diff --git a/openapi/nexus.json b/openapi/nexus.json index 2a8c227c64..f6d140ed05 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -19934,6 +19934,7 @@ "nanoseconds", "volts", "amps", + "watts", "degrees_celsius" ] }, diff --git a/openapi/oximeter.json b/openapi/oximeter.json index f596ac6ee6..327351d961 100644 --- a/openapi/oximeter.json +++ b/openapi/oximeter.json @@ -277,6 +277,13 @@ "enum": [ "instance" ] + }, + { + "description": "The producer is a management gateway service.", + "type": "string", + "enum": [ + "management_gateway" + ] } ] } diff --git a/oximeter/oximeter/schema/hardware-component.toml b/oximeter/oximeter/schema/hardware-component.toml new file mode 100644 index 0000000000..30a1d6510f --- /dev/null +++ b/oximeter/oximeter/schema/hardware-component.toml @@ -0,0 +1,183 @@ +format_version = 1 + +[target] +name = "hardware_component" +description = "A hardware component on a compute sled, switch, or power shelf" +authz_scope = "fleet" +versions = [ + { version = 1, fields = [ + "rack_id", + "slot", + "chassis_kind", + "chassis_serial", + "chassis_model", + "chassis_revision", + "hubris_archive_id", + "gateway_id", + "component_kind", + "component_id", + "description", + ]} +] + +[fields.rack_id] +type = "uuid" +description = "ID of the rack on which this measurement was recorded." + +[fields.slot] +type = "u32" +description = """ +The cubby number or switch slot of the service processor reporting the \ +measurement""" + +[fields.chassis_model] +type = "string" +description = "Model number of the sled, switch, or power shelf" + +[fields.chassis_revision] +type = "u32" +description = "Revision number of the sled, switch, or power shelf" + +[fields.chassis_serial] +type = "string" +description = "Serial number of the sled, switch, or power shelf" + +[fields.hubris_archive_id] +type = "string" +description = """ +Hubris firmware archive ID of the service processor when the measurement \ +was recorded.""" + +[fields.gateway_id] +type = "uuid" +description = """ +ID of the Management Gateway Service process which recorded the measurement.""" + +[fields.chassis_kind] +type = "string" +description = """ +What kind of thing the component resides on. + +This will be one of 'sled', for components on compute sleds; 'switch', for \ +components on rack switches; or 'power', for components on power shelves.""" + +[fields.component_id] +type = "string" +description = """ +The service processor component ID uniquely identifying the hardware \ +component on the sled, switch, or power shelf.""" + +[fields.component_kind] +type = "string" +description = "What type of hardware component this thing is." + +[fields.description] +type = "string" +description = """ +A human-readable description of the hardware component. This may include \ +its location or role in the system (e.g. 
a DIMM's number, or a temperature \ +sensor's location).""" + +[fields.sensor] +type = "string" +description = """The name of a sensor that recorded a sensor reading.""" + +[fields.error] +type = "string" +description = "The kind of sensor error that occurred" + +[fields.sensor_kind] +type = "string" +description = """ +Which kind of sensor could not be read due to a sensor error. + +This will be one of 'temperature', 'current', 'power', 'voltage', \ +'input_current', 'input_voltage', or 'fan_speed' (the same names as \ +the metrics emitted by these sensors when they are read successfully).""" + +[[metrics]] +name = "temperature" +description = "A temperature reading from a hardware component." +units = "degrees_celsius" +datum_type = "f32" +versions = [ + { added_in = 1, fields = ["sensor"]} +] + +[[metrics]] +name = "current" +description = "Output current reading in amperes" +units = "amps" +datum_type = "f32" +versions = [ + { added_in = 1, fields = ["sensor"]} +] + +[[metrics]] +name = "power" +description = "Power reading, in watts" +units = "watts" +datum_type = "f32" +versions = [ + { added_in = 1, fields = ["sensor"]} +] + +[[metrics]] +name = "voltage" +description = "Output voltage reading, in volts" +units = "volts" +datum_type = "f32" +versions = [ + { added_in = 1, fields = ["sensor"]} +] + +[[metrics]] +name = "input_current" +description = "Input electric current reading in amperes" +units = "amps" +datum_type = "f32" +versions = [ + { added_in = 1, fields = ["sensor"]} +] + +[[metrics]] +name = "input_voltage" +description = "Input electric voltage reading, in volts" +units = "volts" +datum_type = "f32" +versions = [ + { added_in = 1, fields = ["sensor"]} +] + + +[[metrics]] +name = "fan_speed" +description = "A fan speed measurement, in rotations per minute" +units = "rpm" +datum_type = "f32" +versions = [ + { added_in = 1, fields = ["sensor"]} +] + +[[metrics]] +name = "sensor_error_count" +description = "Cumulative count of errors reported by a sensor" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = ["sensor", "error", "sensor_kind"]} +] + +[[metrics]] +name = "poll_error_count" +description = """ +Cumulative count of errors encountered whilst polling a component's sensors. + +Unlike the `sensor_error_count` metric, this counts errors encountered by \ +the management gateway while polling the component, rather than errors \ +reported by the component itself.""" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = ["error"] } +] diff --git a/oximeter/schema/src/codegen.rs b/oximeter/schema/src/codegen.rs index c46c25c97d..1e6e352c15 100644 --- a/oximeter/schema/src/codegen.rs +++ b/oximeter/schema/src/codegen.rs @@ -512,6 +512,7 @@ fn quote_units(units: Units) -> TokenStream { } Units::Amps => quote! { ::oximeter::schema::Units::Amps }, Units::Volts => quote! { ::oximeter::schema::Units::Volts }, + Units::Watts => quote! { ::oximeter::schema::Units::Watts }, Units::DegreesCelsius => { quote! { ::oximeter::schema::Units::DegreesCelsius } } diff --git a/oximeter/types/src/schema.rs b/oximeter/types/src/schema.rs index e06e6e2b57..135c77462a 100644 --- a/oximeter/types/src/schema.rs +++ b/oximeter/types/src/schema.rs @@ -189,6 +189,7 @@ pub enum Units { Nanoseconds, Volts, Amps, + Watts, DegreesCelsius, /// Rotations per minute. 
Rpm, diff --git a/schema/crdb/add-management-gateway-producer-kind/up.sql b/schema/crdb/add-management-gateway-producer-kind/up.sql new file mode 100644 index 0000000000..e872278e2f --- /dev/null +++ b/schema/crdb/add-management-gateway-producer-kind/up.sql @@ -0,0 +1,2 @@ +ALTER TYPE omicron.public.producer_kind + ADD VALUE IF NOT EXISTS 'management_gateway' AFTER 'instance'; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index baef38e44f..1457532c49 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1334,7 +1334,9 @@ CREATE TYPE IF NOT EXISTS omicron.public.producer_kind AS ENUM ( -- removed). 'service', -- A Propolis VMM for an instance in the omicron.public.instance table - 'instance' + 'instance', + -- A management gateway service on a scrimlet. + 'management_gateway' ); /* @@ -4212,7 +4214,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '90.0.0', NULL) + (TRUE, NOW(), NOW(), '91.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From 4a60d7843b0d5763021ce71b6a3410f8de0859e1 Mon Sep 17 00:00:00 2001 From: "oxide-reflector-bot[bot]" <130185838+oxide-reflector-bot[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 01:11:06 +0000 Subject: [PATCH 10/10] Update dendrite to 76c735d (#6434) Updated dendrite to commit 76c735d. Co-authored-by: reflector[bot] <130185838+reflector[bot]@users.noreply.github.com> --- package-manifest.toml | 12 ++++++------ tools/dendrite_openapi_version | 2 +- tools/dendrite_stub_checksums | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/package-manifest.toml b/package-manifest.toml index 125861f610..cab3c1877e 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -710,8 +710,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. source.type = "prebuilt" source.repo = "dendrite" -source.commit = "21b16567f28e103f145cd18d53fac6958429c4ff" -source.sha256 = "3771671f0069b33143774e560eb258db99253dba9b78fa3ca974f02a8e1145b4" +source.commit = "76c735d472e3badaeca08982e22496fccb1ce210" +source.sha256 = "3ee6cfe770da2855b4eb44c048637d56f8d72de45c8c396186dfe7232d8548fa" output.type = "zone" output.intermediate_only = true @@ -737,8 +737,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. source.type = "prebuilt" source.repo = "dendrite" -source.commit = "21b16567f28e103f145cd18d53fac6958429c4ff" -source.sha256 = "ad02632713a57fe8c5371316320309e1fad52f0ce2f7e6f768859aa94dfbb1d9" +source.commit = "76c735d472e3badaeca08982e22496fccb1ce210" +source.sha256 = "0e68ea8fbb609bbe2c643fc8cadc0197bd641006a323149159893bfd0d816805" output.type = "zone" output.intermediate_only = true @@ -757,8 +757,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. 
source.type = "prebuilt" source.repo = "dendrite" -source.commit = "21b16567f28e103f145cd18d53fac6958429c4ff" -source.sha256 = "23bca3873cdb0441cd18c0cf071b86d49755be06837479661876ac95d2f10f27" +source.commit = "76c735d472e3badaeca08982e22496fccb1ce210" +source.sha256 = "45484d6d8557a0656984d0e6db879589d841d43ab6a11116cb1da314b928a425" output.type = "zone" output.intermediate_only = true diff --git a/tools/dendrite_openapi_version b/tools/dendrite_openapi_version index 2d0f4d4887..a9e13c083a 100755 --- a/tools/dendrite_openapi_version +++ b/tools/dendrite_openapi_version @@ -1,2 +1,2 @@ -COMMIT="21b16567f28e103f145cd18d53fac6958429c4ff" +COMMIT="76c735d472e3badaeca08982e22496fccb1ce210" SHA2="3a54305ab4b1270c9a5fb0603f481fce199f3767c174a03559ff642f7f44687e" diff --git a/tools/dendrite_stub_checksums b/tools/dendrite_stub_checksums index e3d16d779c..075ead4752 100644 --- a/tools/dendrite_stub_checksums +++ b/tools/dendrite_stub_checksums @@ -1,3 +1,3 @@ -CIDL_SHA256_ILLUMOS="3771671f0069b33143774e560eb258db99253dba9b78fa3ca974f02a8e1145b4" -CIDL_SHA256_LINUX_DPD="6aa070ab0590aca7458f2555012acc5571e61b3b1523de862d4bbb04b9d34135" +CIDL_SHA256_ILLUMOS="3ee6cfe770da2855b4eb44c048637d56f8d72de45c8c396186dfe7232d8548fa" +CIDL_SHA256_LINUX_DPD="5c70318c6feb7595bdbf41d8b33827100d28fcdf34ad738a5af10e0411463f64" CIDL_SHA256_LINUX_SWADM="e1e35784538a4fdd76dc257cc636ac3f43f7ef2842dabfe981f17f8ce6b8e1a2"