diff --git a/oximeter/instruments/Cargo.toml b/oximeter/instruments/Cargo.toml index 8372b7c5602..62524fd6c66 100644 --- a/oximeter/instruments/Cargo.toml +++ b/oximeter/instruments/Cargo.toml @@ -18,9 +18,11 @@ uuid.workspace = true omicron-workspace-hack.workspace = true [features] -default = ["http-instruments", "kstat"] +default = ["http-instruments", "datalink", "virtual-machine"] http-instruments = ["http"] -kstat = ["kstat-rs"] +datalink = ["dep:kstat-rs"] +virtual-machine = ["dep:kstat-rs"] + [dev-dependencies] rand.workspace = true diff --git a/oximeter/instruments/src/kstat/mod.rs b/oximeter/instruments/src/kstat/mod.rs index d8b0c23887e..53ce5e17af0 100644 --- a/oximeter/instruments/src/kstat/mod.rs +++ b/oximeter/instruments/src/kstat/mod.rs @@ -87,8 +87,10 @@ use std::cmp::Ordering; use std::collections::BTreeMap; use std::time::Duration; +#[cfg(feature = "datalink")] pub mod link; mod sampler; +#[cfg(feature = "virtual-machine")] pub mod virtual_machine; pub use sampler::CollectionDetails; diff --git a/oximeter/instruments/src/lib.rs b/oximeter/instruments/src/lib.rs index d003e717395..ec1d28713f3 100644 --- a/oximeter/instruments/src/lib.rs +++ b/oximeter/instruments/src/lib.rs @@ -9,5 +9,8 @@ #[cfg(feature = "http-instruments")] pub mod http; -#[cfg(all(feature = "kstat", target_os = "illumos"))] +#[cfg(all( + any(feature = "link", feature = "virtual-machine"), + target_os = "illumos" +))] pub mod kstat; diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index cffdbf9e979..3795eca4419 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -9,7 +9,6 @@ use crate::common::instance::{ PublishedVmmState, }; use crate::instance_manager::{InstanceManagerServices, InstanceTicket}; -use crate::metrics::Error as MetricsError; use crate::metrics::MetricsManager; use crate::metrics::INSTANCE_SAMPLE_INTERVAL; use crate::nexus::NexusClientWithResolver; @@ -111,9 +110,6 @@ pub enum Error { #[error("I/O error")] Io(#[from] std::io::Error), - - #[error("Failed to track instance metrics")] - Metrics(#[source] MetricsError), } // Issues read-only, idempotent HTTP requests at propolis until it responds with @@ -239,7 +235,7 @@ struct InstanceInner { // Object used to collect zone bundles from this instance when terminated. zone_bundler: ZoneBundler, - // Object used to start / stop collection of instance-related metrics. + // Data used to start / stop collection of instance-related metrics. metrics_manager: MetricsManager, // Object representing membership in the "instance manager". @@ -392,14 +388,35 @@ impl InstanceInner { Ok(Reaction::Terminate) } None => { - self.metrics_manager + // TODO-robustness: If we fail to register the actual kstats for + // this instance, we should produce missing samples, rather than + // nothing at all. + // + // See https://github.com/oxidecomputer/omicron/issues/4863. + // + // TODO(ben) -- need to do this once, or ignore duplicate target + // error, not every time we get a state update from propolis. + // + // TODO(ben) -- need to take number of vcpus. It looks like + // Propolis and the kernel hypervisor currently creates a kstat + // for all _possible_ vcpus, even those that don't exist? + if let Err(e) = self + .metrics_manager .track_instance( &self.id(), &self.metadata, + self.properties.vcpus.into(), INSTANCE_SAMPLE_INTERVAL, ) .await - .map_err(Error::Metrics)?; + { + error!( + self.log, + "failed to track instance metrics, \ + no samples will be produced"; + "error" => ?e, + ); + } Ok(Reaction::Continue) } } diff --git a/sled-agent/src/metrics.rs b/sled-agent/src/metrics.rs index 6fdbdef6cfe..23edfa1f4e1 100644 --- a/sled-agent/src/metrics.rs +++ b/sled-agent/src/metrics.rs @@ -217,6 +217,7 @@ impl MetricsManager { &self, instance_id: &Uuid, metadata: &InstanceMetadata, + _n_vcpus: u32, interval: Duration, ) -> Result<(), Error> { let vm = virtual_machine::VirtualMachine { @@ -303,6 +304,7 @@ impl MetricsManager { &self, _instance_id: &Uuid, _metadata: &InstanceMetadata, + _n_vcpus: u32, _interval: Duration, ) -> Result<(), Error> { Err(Error::Kstat(anyhow!(