From 3211651ee50ee82bd4da5d08c71e40906617a6f6 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Mon, 22 Jan 2024 22:28:38 +0000 Subject: [PATCH] WIP This is a WIP to test publishing this data from Propolis itself, rather than the sled-agent. Should we take that path, we'll end up deleting most of this diff in the sled-agent itself. --- oximeter/instruments/Cargo.toml | 6 ++++-- oximeter/instruments/src/kstat/mod.rs | 2 ++ oximeter/instruments/src/lib.rs | 5 ++++- sled-agent/src/instance.rs | 31 +++++++++++++++++++++------ sled-agent/src/metrics.rs | 2 ++ 5 files changed, 36 insertions(+), 10 deletions(-) diff --git a/oximeter/instruments/Cargo.toml b/oximeter/instruments/Cargo.toml index 8372b7c560..62524fd6c6 100644 --- a/oximeter/instruments/Cargo.toml +++ b/oximeter/instruments/Cargo.toml @@ -18,9 +18,11 @@ uuid.workspace = true omicron-workspace-hack.workspace = true [features] -default = ["http-instruments", "kstat"] +default = ["http-instruments", "datalink", "virtual-machine"] http-instruments = ["http"] -kstat = ["kstat-rs"] +datalink = ["dep:kstat-rs"] +virtual-machine = ["dep:kstat-rs"] + [dev-dependencies] rand.workspace = true diff --git a/oximeter/instruments/src/kstat/mod.rs b/oximeter/instruments/src/kstat/mod.rs index d8b0c23887..53ce5e17af 100644 --- a/oximeter/instruments/src/kstat/mod.rs +++ b/oximeter/instruments/src/kstat/mod.rs @@ -87,8 +87,10 @@ use std::cmp::Ordering; use std::collections::BTreeMap; use std::time::Duration; +#[cfg(feature = "datalink")] pub mod link; mod sampler; +#[cfg(feature = "virtual-machine")] pub mod virtual_machine; pub use sampler::CollectionDetails; diff --git a/oximeter/instruments/src/lib.rs b/oximeter/instruments/src/lib.rs index d003e71739..ec1d28713f 100644 --- a/oximeter/instruments/src/lib.rs +++ b/oximeter/instruments/src/lib.rs @@ -9,5 +9,8 @@ #[cfg(feature = "http-instruments")] pub mod http; -#[cfg(all(feature = "kstat", target_os = "illumos"))] +#[cfg(all( + any(feature = "link", feature = "virtual-machine"), + target_os = "illumos" +))] pub mod kstat; diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 641a4bb6d3..d94db80ad5 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -9,7 +9,6 @@ use crate::common::instance::{ PublishedVmmState, }; use crate::instance_manager::{InstanceManagerServices, InstanceTicket}; -use crate::metrics::Error as MetricsError; use crate::metrics::MetricsManager; use crate::metrics::INSTANCE_SAMPLE_INTERVAL; use crate::nexus::NexusClientWithResolver; @@ -111,9 +110,6 @@ pub enum Error { #[error("I/O error")] Io(#[from] std::io::Error), - - #[error("Failed to track instance metrics")] - Metrics(#[source] MetricsError), } // Issues read-only, idempotent HTTP requests at propolis until it responds with @@ -239,7 +235,7 @@ struct InstanceInner { // Object used to collect zone bundles from this instance when terminated. zone_bundler: ZoneBundler, - // Object used to start / stop collection of instance-related metrics. + // Data used to start / stop collection of instance-related metrics. metrics_manager: MetricsManager, // Object representing membership in the "instance manager". @@ -394,14 +390,35 @@ impl InstanceInner { Ok(Reaction::Terminate) } None => { - self.metrics_manager + // TODO-robustness: If we fail to register the actual kstats for + // this instance, we should produce missing samples, rather than + // nothing at all. + // + // See https://github.com/oxidecomputer/omicron/issues/4863. + // + // TODO(ben) -- need to do this once, or ignore duplicate target + // error, not every time we get a state update from propolis. + // + // TODO(ben) -- need to take number of vcpus. It looks like + // Propolis and the kernel hypervisor currently creates a kstat + // for all _possible_ vcpus, even those that don't exist? + if let Err(e) = self + .metrics_manager .track_instance( &self.id(), &self.metadata, + self.properties.vcpus.into(), INSTANCE_SAMPLE_INTERVAL, ) .await - .map_err(Error::Metrics)?; + { + error!( + self.log, + "failed to track instance metrics, \ + no samples will be produced"; + "error" => ?e, + ); + } Ok(Reaction::Continue) } } diff --git a/sled-agent/src/metrics.rs b/sled-agent/src/metrics.rs index 6fdbdef6cf..23edfa1f4e 100644 --- a/sled-agent/src/metrics.rs +++ b/sled-agent/src/metrics.rs @@ -217,6 +217,7 @@ impl MetricsManager { &self, instance_id: &Uuid, metadata: &InstanceMetadata, + _n_vcpus: u32, interval: Duration, ) -> Result<(), Error> { let vm = virtual_machine::VirtualMachine { @@ -303,6 +304,7 @@ impl MetricsManager { &self, _instance_id: &Uuid, _metadata: &InstanceMetadata, + _n_vcpus: u32, _interval: Duration, ) -> Result<(), Error> { Err(Error::Kstat(anyhow!(