diff --git a/oximeter/instruments/Cargo.toml b/oximeter/instruments/Cargo.toml index 62524fd6c6..0d96fc961a 100644 --- a/oximeter/instruments/Cargo.toml +++ b/oximeter/instruments/Cargo.toml @@ -23,7 +23,6 @@ http-instruments = ["http"] datalink = ["dep:kstat-rs"] virtual-machine = ["dep:kstat-rs"] - [dev-dependencies] rand.workspace = true slog-async.workspace = true diff --git a/oximeter/instruments/src/kstat/virtual_machine.rs b/oximeter/instruments/src/kstat/virtual_machine.rs index 1e1637cd56..72f991cf60 100644 --- a/oximeter/instruments/src/kstat/virtual_machine.rs +++ b/oximeter/instruments/src/kstat/virtual_machine.rs @@ -18,13 +18,15 @@ use kstat_rs::Kstat; use kstat_rs::Named; use kstat_rs::NamedData; use oximeter::types::Cumulative; +use oximeter::FieldType; +use oximeter::FieldValue; use oximeter::Metric; use oximeter::Sample; use oximeter::Target; use uuid::Uuid; -/// A single virtual machine -#[derive(Clone, Debug, Target)] +/// A single virtual machine instance. +#[derive(Clone, Debug)] pub struct VirtualMachine { /// The silo to which the instance belongs. pub silo_id: Uuid, @@ -32,6 +34,35 @@ pub struct VirtualMachine { pub project_id: Uuid, /// The ID of the instance. pub instance_id: Uuid, + + // This field is not published as part of the target field definitions. It + // is needed because the hypervisor currently creates kstats for each vCPU, + // regardless of whether they're activated. There is no way to tell from + // userland today which vCPU kstats are "real". We include this value here, + // and implement `oximeter::Target` manually. + pub n_vcpus: u32, +} + +impl Target for VirtualMachine { + fn name(&self) -> &'static str { + "virtual_machine" + } + + fn field_names(&self) -> &'static [&'static str] { + &["silo_id", "project_id", "instance_id"] + } + + fn field_types(&self) -> Vec { + vec![FieldType::Uuid, FieldType::Uuid, FieldType::Uuid] + } + + fn field_values(&self) -> Vec { + vec![ + self.silo_id.into(), + self.project_id.into(), + self.instance_id.into(), + ] + } } /// Metric tracking vCPU usage by state. @@ -92,6 +123,9 @@ impl KstatTarget for VirtualMachine { // First, we need to map the instance's control plane UUID to the // instance ID. We'll find this through the `vmm::vm:vm_name` // kstat, which lists the instance's UUID as its name. + // + // Note that if this code is run from within a Propolis zone, there is + // exactly one `vmm` kstat instance in any case. let instance_id = self.instance_id.to_string(); let instance = kstats .iter() @@ -104,8 +138,21 @@ impl KstatTarget for VirtualMachine { // this particular VM. For now, we produce only vCPU usage metrics, but // others may be chained in the future. let vcpu_stats = kstats.iter().filter(|(_, kstat, _)| { - kstat.ks_instance == instance - && kstat.ks_name.starts_with(VCPU_KSTAT_PREFIX) + // Filter out those that don't match our kstat instance. + if kstat.ks_instance != instance { + return false; + } + + // Filter out those which are neither a vCPU stat of any kind, nor + // for one of the vCPU IDs we know to be active. + let Some(suffix) = kstat.ks_name.strip_prefix(VCPU_KSTAT_PREFIX) + else { + return false; + }; + let Ok(vcpu_id) = suffix.parse::() else { + return false; + }; + vcpu_id < self.n_vcpus }); produce_vcpu_usage(self, vcpu_stats) } diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index d94db80ad5..92336b616b 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -236,6 +236,7 @@ struct InstanceInner { zone_bundler: ZoneBundler, // Data used to start / stop collection of instance-related metrics. + tracking_instance_metrics: bool, metrics_manager: MetricsManager, // Object representing membership in the "instance manager". @@ -398,26 +399,26 @@ impl InstanceInner { // // TODO(ben) -- need to do this once, or ignore duplicate target // error, not every time we get a state update from propolis. - // - // TODO(ben) -- need to take number of vcpus. It looks like - // Propolis and the kernel hypervisor currently creates a kstat - // for all _possible_ vcpus, even those that don't exist? - if let Err(e) = self - .metrics_manager - .track_instance( - &self.id(), - &self.metadata, - self.properties.vcpus.into(), - INSTANCE_SAMPLE_INTERVAL, - ) - .await - { - error!( - self.log, - "failed to track instance metrics, \ - no samples will be produced"; - "error" => ?e, - ); + if !self.tracking_instance_metrics { + if let Err(e) = self + .metrics_manager + .track_instance( + &self.id(), + &self.metadata, + self.properties.vcpus.into(), + INSTANCE_SAMPLE_INTERVAL, + ) + .await + { + error!( + self.log, + "failed to track instance metrics, \ + no samples will be produced"; + "error" => ?e, + ); + } else { + self.tracking_instance_metrics = true; + } } Ok(Reaction::Continue) } @@ -745,6 +746,7 @@ impl Instance { storage, zone_builder_factory, zone_bundler, + tracking_instance_metrics: false, metrics_manager, instance_ticket: ticket, metadata, diff --git a/sled-agent/src/metrics.rs b/sled-agent/src/metrics.rs index 23edfa1f4e..bbff8e96be 100644 --- a/sled-agent/src/metrics.rs +++ b/sled-agent/src/metrics.rs @@ -217,13 +217,14 @@ impl MetricsManager { &self, instance_id: &Uuid, metadata: &InstanceMetadata, - _n_vcpus: u32, + n_vcpus: u32, interval: Duration, ) -> Result<(), Error> { let vm = virtual_machine::VirtualMachine { silo_id: metadata.silo_id, project_id: metadata.project_id, instance_id: *instance_id, + n_vcpus, }; let details = CollectionDetails::never(interval); let id = self