Skip to content

Commit

Permalink
More WIP
Browse files Browse the repository at this point in the history
- Store number of vcpus in virtual machine
- impl target manually
- only produce samples from expected vcpu kstats
- track instance metrics once
  • Loading branch information
bnaecker committed Jan 23, 2024
1 parent ce47e33 commit 0c8226f
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 26 deletions.
1 change: 0 additions & 1 deletion oximeter/instruments/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ http-instruments = ["http"]
datalink = ["dep:kstat-rs"]
virtual-machine = ["dep:kstat-rs"]


[dev-dependencies]
rand.workspace = true
slog-async.workspace = true
Expand Down
55 changes: 51 additions & 4 deletions oximeter/instruments/src/kstat/virtual_machine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,51 @@ use kstat_rs::Kstat;
use kstat_rs::Named;
use kstat_rs::NamedData;
use oximeter::types::Cumulative;
use oximeter::FieldType;
use oximeter::FieldValue;
use oximeter::Metric;
use oximeter::Sample;
use oximeter::Target;
use uuid::Uuid;

/// A single virtual machine
#[derive(Clone, Debug, Target)]
/// A single virtual machine instance.
#[derive(Clone, Debug)]
pub struct VirtualMachine {
/// The silo to which the instance belongs.
pub silo_id: Uuid,
/// The project to which the instance belongs.
pub project_id: Uuid,
/// The ID of the instance.
pub instance_id: Uuid,

// This field is not published as part of the target field definitions. It
// is needed because the hypervisor currently creates kstats for each vCPU,
// regardless of whether they're activated. There is no way to tell from
// userland today which vCPU kstats are "real". We include this value here,
// and implement `oximeter::Target` manually.
pub n_vcpus: u32,
}

impl Target for VirtualMachine {
fn name(&self) -> &'static str {
"virtual_machine"
}

fn field_names(&self) -> &'static [&'static str] {
&["silo_id", "project_id", "instance_id"]
}

fn field_types(&self) -> Vec<FieldType> {
vec![FieldType::Uuid, FieldType::Uuid, FieldType::Uuid]
}

fn field_values(&self) -> Vec<FieldValue> {
vec![
self.silo_id.into(),
self.project_id.into(),
self.instance_id.into(),
]
}
}

/// Metric tracking vCPU usage by state.
Expand Down Expand Up @@ -92,6 +123,9 @@ impl KstatTarget for VirtualMachine {
// First, we need to map the instance's control plane UUID to the
// instance ID. We'll find this through the `vmm:<instance>:vm:vm_name`
// kstat, which lists the instance's UUID as its name.
//
// Note that if this code is run from within a Propolis zone, there is
// exactly one `vmm` kstat instance in any case.
let instance_id = self.instance_id.to_string();
let instance = kstats
.iter()
Expand All @@ -104,8 +138,21 @@ impl KstatTarget for VirtualMachine {
// this particular VM. For now, we produce only vCPU usage metrics, but
// others may be chained in the future.
let vcpu_stats = kstats.iter().filter(|(_, kstat, _)| {
kstat.ks_instance == instance
&& kstat.ks_name.starts_with(VCPU_KSTAT_PREFIX)
// Filter out those that don't match our kstat instance.
if kstat.ks_instance != instance {
return false;
}

// Filter out those which are neither a vCPU stat of any kind, nor
// for one of the vCPU IDs we know to be active.
let Some(suffix) = kstat.ks_name.strip_prefix(VCPU_KSTAT_PREFIX)
else {
return false;
};
let Ok(vcpu_id) = suffix.parse::<u32>() else {
return false;
};
vcpu_id < self.n_vcpus
});
produce_vcpu_usage(self, vcpu_stats)
}
Expand Down
42 changes: 22 additions & 20 deletions sled-agent/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ struct InstanceInner {
zone_bundler: ZoneBundler,

// Data used to start / stop collection of instance-related metrics.
tracking_instance_metrics: bool,
metrics_manager: MetricsManager,

// Object representing membership in the "instance manager".
Expand Down Expand Up @@ -396,26 +397,26 @@ impl InstanceInner {
//
// TODO(ben) -- need to do this once, or ignore duplicate target
// error, not every time we get a state update from propolis.
//
// TODO(ben) -- need to take number of vcpus. It looks like
// Propolis and the kernel hypervisor currently creates a kstat
// for all _possible_ vcpus, even those that don't exist?
if let Err(e) = self
.metrics_manager
.track_instance(
&self.id(),
&self.metadata,
self.properties.vcpus.into(),
INSTANCE_SAMPLE_INTERVAL,
)
.await
{
error!(
self.log,
"failed to track instance metrics, \
no samples will be produced";
"error" => ?e,
);
if !self.tracking_instance_metrics {
if let Err(e) = self
.metrics_manager
.track_instance(
&self.id(),
&self.metadata,
self.properties.vcpus.into(),
INSTANCE_SAMPLE_INTERVAL,
)
.await
{
error!(
self.log,
"failed to track instance metrics, \
no samples will be produced";
"error" => ?e,
);
} else {
self.tracking_instance_metrics = true;
}
}
Ok(Reaction::Continue)
}
Expand Down Expand Up @@ -743,6 +744,7 @@ impl Instance {
storage,
zone_builder_factory,
zone_bundler,
tracking_instance_metrics: false,
metrics_manager,
instance_ticket: ticket,
metadata,
Expand Down
3 changes: 2 additions & 1 deletion sled-agent/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -217,13 +217,14 @@ impl MetricsManager {
&self,
instance_id: &Uuid,
metadata: &InstanceMetadata,
_n_vcpus: u32,
n_vcpus: u32,
interval: Duration,
) -> Result<(), Error> {
let vm = virtual_machine::VirtualMachine {
silo_id: metadata.silo_id,
project_id: metadata.project_id,
instance_id: *instance_id,
n_vcpus,
};
let details = CollectionDetails::never(interval);
let id = self
Expand Down

0 comments on commit 0c8226f

Please sign in to comment.