From 57dc5b43e5931bc0dc67296a484c0e21d76d9af1 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Tue, 14 Nov 2023 19:52:14 +0000 Subject: [PATCH] Add a producer kind to oximeter metric producers - Adds the `kind` enum to metric producer information, including DB schema, model, and various client parameter types. This records the supported types of metric producers, and is intended to aid debugging and future work around updates and instance lifecycle management. - Add schema update files which create the DB enum type and add it as a column to the `metric_producer` table. This currently _drops_ the existing table and recreates it with the new column, rather than adding the column using `ALTER TABLE`. That is intended to remove old entries in bulk, since nothing previously removed the records for Propolis servers when their instance was stopped. --- clients/nexus-client/src/lib.rs | 14 ++++++++ clients/oximeter-client/src/lib.rs | 14 ++++++++ common/src/api/internal/nexus.rs | 21 +++++++++++ nexus/db-model/src/producer_endpoint.rs | 37 +++++++++++++++++++ nexus/db-model/src/schema.rs | 3 +- nexus/src/app/oximeter.rs | 2 ++ nexus/test-utils/src/lib.rs | 2 ++ openapi/nexus-internal.json | 47 +++++++++++++++++++++++-- openapi/oximeter.json | 47 +++++++++++++++++++++++-- oximeter/collector/src/agent.rs | 4 +++ oximeter/producer/examples/producer.rs | 2 ++ schema/crdb/11.0.0/up01.sql | 23 ++++++++++++ schema/crdb/11.0.0/up02.sql | 11 ++++++ schema/crdb/11.0.0/up03.sql | 17 +++++++++ schema/crdb/11.0.0/up04.sql | 8 +++++ schema/crdb/dbinit.sql | 15 +++++++- sled-agent/src/sim/disk.rs | 2 ++ sled-agent/src/sled_agent.rs | 2 ++ 18 files changed, 265 insertions(+), 6 deletions(-) create mode 100644 schema/crdb/11.0.0/up01.sql create mode 100644 schema/crdb/11.0.0/up02.sql create mode 100644 schema/crdb/11.0.0/up03.sql create mode 100644 schema/crdb/11.0.0/up04.sql diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 9f81492d106..3236d698251 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -202,6 +202,19 @@ impl From<&types::InstanceState> } } +impl From + for types::ProducerKind +{ + fn from(kind: omicron_common::api::internal::nexus::ProducerKind) -> Self { + use omicron_common::api::internal::nexus::ProducerKind; + match kind { + ProducerKind::SledAgent => Self::SledAgent, + ProducerKind::Service => Self::Service, + ProducerKind::Instance => Self::Instance, + } + } +} + impl From<&omicron_common::api::internal::nexus::ProducerEndpoint> for types::ProducerEndpoint { @@ -212,6 +225,7 @@ impl From<&omicron_common::api::internal::nexus::ProducerEndpoint> address: s.address.to_string(), base_route: s.base_route.clone(), id: s.id, + kind: s.kind.into(), interval: s.interval.into(), } } diff --git a/clients/oximeter-client/src/lib.rs b/clients/oximeter-client/src/lib.rs index 7bd17d7e767..11aa1452f82 100644 --- a/clients/oximeter-client/src/lib.rs +++ b/clients/oximeter-client/src/lib.rs @@ -20,6 +20,19 @@ impl From for types::Duration { } } +impl From + for types::ProducerKind +{ + fn from(kind: omicron_common::api::internal::nexus::ProducerKind) -> Self { + use omicron_common::api::internal::nexus; + match kind { + nexus::ProducerKind::Service => Self::Service, + nexus::ProducerKind::SledAgent => Self::SledAgent, + nexus::ProducerKind::Instance => Self::Instance, + } + } +} + impl From<&omicron_common::api::internal::nexus::ProducerEndpoint> for types::ProducerEndpoint { @@ -30,6 +43,7 @@ impl From<&omicron_common::api::internal::nexus::ProducerEndpoint> address: s.address.to_string(), base_route: s.base_route.clone(), id: s.id, + kind: s.kind.into(), interval: s.interval.into(), } } diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index a4a539ad9b1..d7d9fec2055 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -84,13 +84,34 @@ pub struct SledInstanceState { // Oximeter producer/collector objects. +/// The kind of metric producer this is. +#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ProducerKind { + /// The producer is a sled-agent. + SledAgent, + /// The producer is an Oxide-managed service. + Service, + /// The producer is a Propolis VMM managing a guest instance. + Instance, +} + /// Information announced by a metric server, used so that clients can contact it and collect /// available metric data from it. #[derive(Clone, Debug, Deserialize, JsonSchema, Serialize, PartialEq)] pub struct ProducerEndpoint { + /// A unique ID for this producer. pub id: Uuid, + /// The kind of producer. + pub kind: ProducerKind, + /// The IP address and port at which `oximeter` can collect metrics from the + /// producer. pub address: SocketAddr, + /// The API base route from which `oximeter` can collect metrics. + /// + /// The full route is `{base_route}/{id}`. pub base_route: String, + /// The interval on which `oximeter` should collect metrics. pub interval: Duration, } diff --git a/nexus/db-model/src/producer_endpoint.rs b/nexus/db-model/src/producer_endpoint.rs index 29e57b0877e..1d26499c01a 100644 --- a/nexus/db-model/src/producer_endpoint.rs +++ b/nexus/db-model/src/producer_endpoint.rs @@ -3,12 +3,47 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use super::SqlU16; +use crate::impl_enum_type; use crate::schema::metric_producer; use db_macros::Asset; use nexus_types::identity::Asset; use omicron_common::api::internal; use uuid::Uuid; +impl_enum_type!( + #[derive(SqlType, Clone, Debug, QueryId)] + #[diesel(postgres_type(name = "producer_kind"))] + pub struct ProducerKindEnum; + + #[derive(AsExpression, Clone, Debug, FromSqlRow, PartialEq)] + #[diesel(sql_type = ProducerKindEnum)] + pub enum ProducerKind; + + SledAgent => b"sled_agent" + Service => b"service" + Instance => b"instance" +); + +impl From for ProducerKind { + fn from(kind: internal::nexus::ProducerKind) -> Self { + match kind { + internal::nexus::ProducerKind::SledAgent => ProducerKind::SledAgent, + internal::nexus::ProducerKind::Service => ProducerKind::Service, + internal::nexus::ProducerKind::Instance => ProducerKind::Instance, + } + } +} + +impl From for internal::nexus::ProducerKind { + fn from(kind: ProducerKind) -> Self { + match kind { + ProducerKind::SledAgent => internal::nexus::ProducerKind::SledAgent, + ProducerKind::Service => internal::nexus::ProducerKind::Service, + ProducerKind::Instance => internal::nexus::ProducerKind::Instance, + } + } +} + /// Information announced by a metric server, used so that clients can contact it and collect /// available metric data from it. #[derive(Queryable, Insertable, Debug, Clone, Selectable, Asset)] @@ -17,6 +52,7 @@ pub struct ProducerEndpoint { #[diesel(embed)] identity: ProducerEndpointIdentity, + pub kind: ProducerKind, pub ip: ipnetwork::IpNetwork, pub port: SqlU16, pub interval: f64, @@ -33,6 +69,7 @@ impl ProducerEndpoint { ) -> Self { Self { identity: ProducerEndpointIdentity::new(endpoint.id), + kind: endpoint.kind.into(), ip: endpoint.address.ip().into(), port: endpoint.address.port().into(), base_route: endpoint.base_route.clone(), diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 7c6b8bbd0a2..63fd780fa13 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -399,6 +399,7 @@ table! { id -> Uuid, time_created -> Timestamptz, time_modified -> Timestamptz, + kind -> crate::ProducerKindEnum, ip -> Inet, port -> Int4, interval -> Float8, @@ -1243,7 +1244,7 @@ table! { /// /// This should be updated whenever the schema is changed. For more details, /// refer to: schema/crdb/README.adoc -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(10, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(11, 0, 0); allow_tables_to_appear_in_same_query!( system_update, diff --git a/nexus/src/app/oximeter.rs b/nexus/src/app/oximeter.rs index 03f833b087e..29033643217 100644 --- a/nexus/src/app/oximeter.rs +++ b/nexus/src/app/oximeter.rs @@ -116,6 +116,7 @@ impl super::Nexus { for producer in producers.into_iter() { let producer_info = oximeter_client::types::ProducerEndpoint { id: producer.id(), + kind: nexus::ProducerKind::from(producer.kind).into(), address: SocketAddr::new( producer.ip.ip(), producer.port.try_into().unwrap(), @@ -139,6 +140,7 @@ impl super::Nexus { pub(crate) async fn register_as_producer(&self, address: SocketAddr) { let producer_endpoint = nexus::ProducerEndpoint { id: self.id, + kind: nexus::ProducerKind::Service, address, base_route: String::from("/metrics/collect"), interval: Duration::from_secs(10), diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 647232031d8..52ff8910f91 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -30,6 +30,7 @@ use omicron_common::address::NEXUS_OPTE_IPV4_SUBNET; use omicron_common::api::external::MacAddr; use omicron_common::api::external::{IdentityMetadata, Name}; use omicron_common::api::internal::nexus::ProducerEndpoint; +use omicron_common::api::internal::nexus::ProducerKind; use omicron_common::api::internal::shared::SwitchLocation; use omicron_common::nexus_config; use omicron_common::nexus_config::NUM_INITIAL_RESERVED_IP_ADDRESSES; @@ -1092,6 +1093,7 @@ pub async fn start_producer_server( let producer_address = SocketAddr::new(Ipv6Addr::LOCALHOST.into(), 0); let server_info = ProducerEndpoint { id, + kind: ProducerKind::Service, address: producer_address, base_route: "/collect".to_string(), interval: Duration::from_secs(1), diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index f83cf68a8a0..86e9b7fe014 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -4224,24 +4224,67 @@ "type": "object", "properties": { "address": { + "description": "The IP address and port at which `oximeter` can collect metrics from the producer.", "type": "string" }, "base_route": { + "description": "The API base route from which `oximeter` can collect metrics.\n\nThe full route is `{base_route}/{id}`.", "type": "string" }, "id": { + "description": "A unique ID for this producer.", "type": "string", "format": "uuid" }, "interval": { - "$ref": "#/components/schemas/Duration" + "description": "The interval on which `oximeter` should collect metrics.", + "allOf": [ + { + "$ref": "#/components/schemas/Duration" + } + ] + }, + "kind": { + "description": "The kind of producer.", + "allOf": [ + { + "$ref": "#/components/schemas/ProducerKind" + } + ] } }, "required": [ "address", "base_route", "id", - "interval" + "interval", + "kind" + ] + }, + "ProducerKind": { + "description": "The kind of metric producer this is.", + "oneOf": [ + { + "description": "The producer is a sled-agent.", + "type": "string", + "enum": [ + "sled_agent" + ] + }, + { + "description": "The producer is an Oxide-managed service.", + "type": "string", + "enum": [ + "service" + ] + }, + { + "description": "The producer is a Propolis VMM managing a guest instance.", + "type": "string", + "enum": [ + "instance" + ] + } ] }, "ProducerResultsItem": { diff --git a/openapi/oximeter.json b/openapi/oximeter.json index 529d20e921c..5205a368c5d 100644 --- a/openapi/oximeter.json +++ b/openapi/oximeter.json @@ -191,24 +191,41 @@ "type": "object", "properties": { "address": { + "description": "The IP address and port at which `oximeter` can collect metrics from the producer.", "type": "string" }, "base_route": { + "description": "The API base route from which `oximeter` can collect metrics.\n\nThe full route is `{base_route}/{id}`.", "type": "string" }, "id": { + "description": "A unique ID for this producer.", "type": "string", "format": "uuid" }, "interval": { - "$ref": "#/components/schemas/Duration" + "description": "The interval on which `oximeter` should collect metrics.", + "allOf": [ + { + "$ref": "#/components/schemas/Duration" + } + ] + }, + "kind": { + "description": "The kind of producer.", + "allOf": [ + { + "$ref": "#/components/schemas/ProducerKind" + } + ] } }, "required": [ "address", "base_route", "id", - "interval" + "interval", + "kind" ] }, "ProducerEndpointResultsPage": { @@ -231,6 +248,32 @@ "required": [ "items" ] + }, + "ProducerKind": { + "description": "The kind of metric producer this is.", + "oneOf": [ + { + "description": "The producer is a sled-agent.", + "type": "string", + "enum": [ + "sled_agent" + ] + }, + { + "description": "The producer is an Oxide-managed service.", + "type": "string", + "enum": [ + "service" + ] + }, + { + "description": "The producer is a Propolis VMM managing a guest instance.", + "type": "string", + "enum": [ + "instance" + ] + } + ] } }, "responses": { diff --git a/oximeter/collector/src/agent.rs b/oximeter/collector/src/agent.rs index 23ff32ed668..5866375fa10 100644 --- a/oximeter/collector/src/agent.rs +++ b/oximeter/collector/src/agent.rs @@ -648,6 +648,7 @@ mod tests { use hyper::Response; use hyper::Server; use hyper::StatusCode; + use omicron_common::api::internal::nexus::ProducerKind; use omicron_test_utils::dev::test_setup_log; use std::convert::Infallible; use std::net::Ipv6Addr; @@ -694,6 +695,7 @@ mod tests { let interval = Duration::from_secs(1); let endpoint = ProducerEndpoint { id: Uuid::new_v4(), + kind: ProducerKind::Service, address, base_route: String::from("/"), interval, @@ -752,6 +754,7 @@ mod tests { let interval = Duration::from_secs(1); let endpoint = ProducerEndpoint { id: Uuid::new_v4(), + kind: ProducerKind::Service, address: SocketAddr::V6(SocketAddrV6::new( Ipv6Addr::LOCALHOST, 0, @@ -840,6 +843,7 @@ mod tests { let interval = Duration::from_secs(1); let endpoint = ProducerEndpoint { id: Uuid::new_v4(), + kind: ProducerKind::Service, address, base_route: String::from("/"), interval, diff --git a/oximeter/producer/examples/producer.rs b/oximeter/producer/examples/producer.rs index dd9722c80a9..8dbe0b6ad97 100644 --- a/oximeter/producer/examples/producer.rs +++ b/oximeter/producer/examples/producer.rs @@ -15,6 +15,7 @@ use dropshot::ConfigLogging; use dropshot::ConfigLoggingLevel; use dropshot::HandlerTaskMode; use omicron_common::api::internal::nexus::ProducerEndpoint; +use omicron_common::api::internal::nexus::ProducerKind; use oximeter::types::Cumulative; use oximeter::types::ProducerRegistry; use oximeter::types::Sample; @@ -124,6 +125,7 @@ async fn main() -> anyhow::Result<()> { registry.register_producer(producer).unwrap(); let server_info = ProducerEndpoint { id: registry.producer_id(), + kind: ProducerKind::Service, address: args.address, base_route: "/collect".to_string(), interval: Duration::from_secs(10), diff --git a/schema/crdb/11.0.0/up01.sql b/schema/crdb/11.0.0/up01.sql new file mode 100644 index 00000000000..acc666a98fc --- /dev/null +++ b/schema/crdb/11.0.0/up01.sql @@ -0,0 +1,23 @@ +/* + * Drop the entire metric producer assignment table. + * + * Programs wishing to produce metrics need to register with Nexus. That creates + * an assignment of the producer to a collector, which is recorded in this + * table. That registration is idempotent, and every _current_ producer will + * register when it restarts. For example, `dpd` includes a task that registers + * with Nexus, so each time it (re)starts, that registration will happen. + * + * With that in mind, dropping this table is safe, _because all updates are + * currently offline_. The current metric producers are: + * + * - `dpd` + * - Each `nexus` instance + * - Each `sled-agent` instance + * - The Propolis server for each guest Instance + * + * Each of this either does not exist at the time of an update, or will be + * restarted afterwards. Each will re-register, and so dropping this table one + * time is safe. It will be recreated in later schema upgrade files in this same + * update. + */ +DROP TABLE IF EXISTS omicron.public.metric_producer; diff --git a/schema/crdb/11.0.0/up02.sql b/schema/crdb/11.0.0/up02.sql new file mode 100644 index 00000000000..96c4c5d6b47 --- /dev/null +++ b/schema/crdb/11.0.0/up02.sql @@ -0,0 +1,11 @@ +/* + * The kind of metric producer each record corresponds to. + */ +CREATE TYPE IF NOT EXISTS omicron.public.producer_kind AS ENUM ( + -- A sled agent for an entry in the sled table. + 'sled_agent', + -- A service in the omicron.public.service table + 'service', + -- A Propolis VMM for an instance in the omicron.public.instance table + 'instance' +); diff --git a/schema/crdb/11.0.0/up03.sql b/schema/crdb/11.0.0/up03.sql new file mode 100644 index 00000000000..e5a3fceb984 --- /dev/null +++ b/schema/crdb/11.0.0/up03.sql @@ -0,0 +1,17 @@ +/* + * Recreate the metric producer assignment table. + * + * Note that we're adding the `kind` column here, using the new enum in the + * previous update SQL file. + */ +CREATE TABLE IF NOT EXISTS omicron.public.metric_producer ( + id UUID PRIMARY KEY, + time_created TIMESTAMPTZ NOT NULL, + time_modified TIMESTAMPTZ NOT NULL, + kind omicron.public.producer_kind NOT NULL, + ip INET NOT NULL, + port INT4 CHECK (port BETWEEN 0 AND 65535) NOT NULL, + interval FLOAT NOT NULL, + base_route STRING(512) NOT NULL, + oximeter_id UUID NOT NULL +); diff --git a/schema/crdb/11.0.0/up04.sql b/schema/crdb/11.0.0/up04.sql new file mode 100644 index 00000000000..cad33ddcf2d --- /dev/null +++ b/schema/crdb/11.0.0/up04.sql @@ -0,0 +1,8 @@ +/* + * Recreate index to support looking up a producer by its assigned oximeter + * collector ID. + */ +CREATE UNIQUE INDEX IF NOT EXISTS lookup_producer_by_oximeter ON omicron.public.metric_producer ( + oximeter_id, + id +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 875877ee967..f39e0d57dbb 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1108,6 +1108,18 @@ CREATE TABLE IF NOT EXISTS omicron.public.oximeter ( port INT4 CHECK (port BETWEEN 0 AND 65535) NOT NULL ); +/* + * The kind of metric producer each record corresponds to. + */ +CREATE TYPE IF NOT EXISTS omicron.public.producer_kind AS ENUM ( + -- A sled agent for an entry in the sled table. + 'sled_agent', + -- A service in the omicron.public.service table + 'service', + -- A Propolis VMM for an instance in the omicron.public.instance table + 'instance' +); + /* * Information about registered metric producers. */ @@ -1115,6 +1127,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.metric_producer ( id UUID PRIMARY KEY, time_created TIMESTAMPTZ NOT NULL, time_modified TIMESTAMPTZ NOT NULL, + kind omicron.public.producer_kind NOT NULL, ip INET NOT NULL, port INT4 CHECK (port BETWEEN 0 AND 65535) NOT NULL, interval FLOAT NOT NULL, @@ -2838,7 +2851,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '10.0.0', NULL) + ( TRUE, NOW(), NOW(), '11.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/sled-agent/src/sim/disk.rs b/sled-agent/src/sim/disk.rs index 2d2c18be258..2b648267790 100644 --- a/sled-agent/src/sim/disk.rs +++ b/sled-agent/src/sim/disk.rs @@ -17,6 +17,7 @@ use omicron_common::api::external::Generation; use omicron_common::api::external::ResourceType; use omicron_common::api::internal::nexus::DiskRuntimeState; use omicron_common::api::internal::nexus::ProducerEndpoint; +use omicron_common::api::internal::nexus::ProducerKind; use oximeter_producer::LogConfig; use oximeter_producer::Server as ProducerServer; use propolis_client::api::DiskAttachmentState as PropolisDiskState; @@ -168,6 +169,7 @@ impl SimDisk { let producer_address = SocketAddr::new(Ipv6Addr::LOCALHOST.into(), 0); let server_info = ProducerEndpoint { id, + kind: ProducerKind::SledAgent, address: producer_address, base_route: "/collect".to_string(), interval: Duration::from_millis(200), diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index aec64a1349f..d94d69a2948 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -43,6 +43,7 @@ use omicron_common::address::{ }; use omicron_common::api::external::Vni; use omicron_common::api::internal::nexus::ProducerEndpoint; +use omicron_common::api::internal::nexus::ProducerKind; use omicron_common::api::internal::nexus::{ SledInstanceState, VmmRuntimeState, }; @@ -504,6 +505,7 @@ impl SledAgent { // Nexus. This should not block progress here. let endpoint = ProducerEndpoint { id: request.body.id, + kind: ProducerKind::SledAgent, address: sled_address.into(), base_route: String::from("/metrics/collect"), interval: crate::metrics::METRIC_COLLECTION_INTERVAL,