Skip to content

Commit

Permalink
Publish instance vCPU usage statistics to oximeter
Browse files Browse the repository at this point in the history
- Adds the silo and project IDs to the instance-ensure request from
  Nexus to the sled-agent. These are used as fields on the
  instance-related statistics.
- Defines a `VirtualMachine` oximeter target and `VcpuUsage` metric. The
  latter has a `state` field which corresponds to the named kstats
  published by the hypervisor that accumulate the time spent in a number
  of vCPU microstates. The combination of these should allow us to
  aggregate or break down vCPU usage by silo, project, instance, vCPU
  ID, and CPU state.
- Adds APIs to the `MetricsManager` for starting / stopping tracking
  instance-related metrics, and plumbs the type through the
  `InstanceManager` and `Instance` (and their internal friends), so that
  new instances can control when data is produced from them. Currently,
  we'll start producing as soon as we get a non-terminate response from
  Propolis in the `instance_state_monitor()` task, and stop when the
  instance is terminated.
  • Loading branch information
bnaecker committed Jan 21, 2024
1 parent 1ae97e4 commit a900cdd
Show file tree
Hide file tree
Showing 12 changed files with 482 additions and 93 deletions.
16 changes: 16 additions & 0 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1135,6 +1135,21 @@ impl super::Nexus {
.map(|ssh_key| ssh_key.public_key)
.collect::<Vec<String>>();

// Construct instance metadata used to track its statistics.
//
// This currently means fetching the current silo ID, since we have all
// the other metadata already.
let silo_id = self
.current_silo_lookup(opctx)?
.lookup_for(authz::Action::Read)
.await?
.0
.id();
let metadata = sled_agent_client::types::InstanceMetadata {
silo_id,
project_id: db_instance.project_id,
};

// Ask the sled agent to begin the state change. Then update the
// database to reflect the new intermediate state. If this update is
// not the newest one, that's fine. That might just mean the sled agent
Expand Down Expand Up @@ -1178,6 +1193,7 @@ impl super::Nexus {
PROPOLIS_PORT,
)
.to_string(),
metadata,
},
)
.await
Expand Down
27 changes: 27 additions & 0 deletions openapi/sled-agent.json
Original file line number Diff line number Diff line change
Expand Up @@ -4515,6 +4515,14 @@
}
]
},
"metadata": {
"description": "Metadata used to track instance statistics.",
"allOf": [
{
"$ref": "#/components/schemas/InstanceMetadata"
}
]
},
"propolis_addr": {
"description": "The address at which this VMM should serve a Propolis server API.",
"type": "string"
Expand All @@ -4536,6 +4544,7 @@
"required": [
"hardware",
"instance_runtime",
"metadata",
"propolis_addr",
"propolis_id",
"vmm_runtime"
Expand Down Expand Up @@ -4624,6 +4633,24 @@
"snapshot_id"
]
},
"InstanceMetadata": {
"description": "Metadata used to track statistics about an instance.",
"type": "object",
"properties": {
"project_id": {
"type": "string",
"format": "uuid"
},
"silo_id": {
"type": "string",
"format": "uuid"
}
},
"required": [
"project_id",
"silo_id"
]
},
"InstanceMigrationSourceParams": {
"description": "Instance runtime state to update for a migration.",
"type": "object",
Expand Down
1 change: 1 addition & 0 deletions oximeter/instruments/src/kstat/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ use std::time::Duration;

pub mod link;
mod sampler;
pub mod virtual_machine;

pub use sampler::CollectionDetails;
pub use sampler::ExpirationBehavior;
Expand Down
185 changes: 185 additions & 0 deletions oximeter/instruments/src/kstat/virtual_machine.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

// Copyright 2023 Oxide Computer Company

//! Types for tracking statistics about virtual machine instances.
use crate::kstat::hrtime_to_utc;
use crate::kstat::ConvertNamedData;
use crate::kstat::Error;
use crate::kstat::KstatList;
use crate::kstat::KstatTarget;
use chrono::DateTime;
use chrono::Utc;
use kstat_rs::Data;
use kstat_rs::Kstat;
use kstat_rs::Named;
use kstat_rs::NamedData;
use oximeter::types::Cumulative;
use oximeter::Metric;
use oximeter::Sample;
use oximeter::Target;
use uuid::Uuid;

/// A single virtual machine.
///
/// This is the `oximeter` [`Target`] for instance-related statistics: its
/// fields identify which instance a sample belongs to, allowing usage to be
/// aggregated or broken down by silo, project, or individual instance.
#[derive(Clone, Debug, Target)]
pub struct VirtualMachine {
    /// The silo to which the instance belongs.
    pub silo_id: Uuid,
    /// The project to which the instance belongs.
    pub project_id: Uuid,
    /// The ID of the instance.
    pub instance_id: Uuid,
}

/// Metric tracking vCPU usage by state.
///
/// Each sample accumulates the time one vCPU has spent in a single named
/// microstate, as reported by the hypervisor's kstats.
#[derive(Clone, Debug, Metric)]
pub struct VcpuUsage {
    /// The vCPU ID.
    pub vcpu_id: u32,
    /// The state of the vCPU. This is the hypervisor's microstate name with
    /// the `time_` prefix stripped (see `produce_vcpu_usage()`).
    pub state: String,
    /// The cumulative time spent in this state, in nanoseconds.
    pub datum: Cumulative<u64>,
}

// The name of the kstat module containing virtual machine kstats.
const VMM_KSTAT_MODULE_NAME: &str = "vmm";

// The name of the kstat with virtual machine metadata (currently, the VM
// name).
const VM_KSTAT_NAME: &str = "vm";

// The named kstat holding the virtual machine's name. This is currently the
// UUID assigned by the control plane to the virtual machine instance.
const VM_NAME_KSTAT: &str = "vm_name";

// The prefix of kstat names containing vCPU usage data (`vcpu0`, `vcpu1`,
// ...). This is also the name of the named-data item that holds the vCPU ID
// within each such kstat.
const VCPU_KSTAT_PREFIX: &str = "vcpu";

// Prefix for all named data with a valid vCPU microstate that we track.
const VCPU_MICROSTATE_PREFIX: &str = "time_";

// The number of expected vCPU microstates we track. This isn't load-bearing,
// and is only used to help preallocate the vector holding the `VcpuUsage`
// samples.
const N_VCPU_MICROSTATES: usize = 6;

impl KstatTarget for VirtualMachine {
// The VMM kstats are organized like so:
//
// - module: vmm
// - instance: a kernel-assigned integer
// - name: vm -> generic VM info, vcpuX -> info for each vCPU
//
// At this part of the code, we don't have that kstat instance, only the
// virtual machine instance's control plane UUID. However, the VM's "name"
// is assigned to be that control plane UUID in the hypervisor. See
// https://github.com/oxidecomputer/propolis/blob/759bf4a19990404c135e608afbe0d38b70bfa370/bin/propolis-server/src/lib/vm/mod.rs#L420
// for the current code which does that.
//
// So we need to indicate interest in any VMM-related kstat here, and we are
// forced to filter to the right instance by looking up the VM name inside
// the `to_samples()` method below.
fn interested(&self, kstat: &Kstat<'_>) -> bool {
kstat.ks_module == VMM_KSTAT_MODULE_NAME
}

fn to_samples(
&self,
kstats: KstatList<'_, '_>,
) -> Result<Vec<Sample>, Error> {
// First, we need to map the instance's control plane UUID to the
// instance ID. We'll find this through the `vmm:<instance>:vm:vm_name`
// kstat, which lists the instance's UUID as its name.
let instance_id = self.instance_id.to_string();
let instance = kstats
.iter()
.find_map(|(_, kstat, data)| {
kstat_instance_from_instance_id(kstat, data, &instance_id)
})
.ok_or_else(|| Error::NoSuchKstat)?;

// Armed with the kstat instance, find all relevant metrics related to
// this particular VM. For now, we produce only vCPU usage metrics, but
// others may be chained in the future.
let vcpu_stats = kstats.iter().filter(|(_, kstat, _)| {
kstat.ks_instance == instance
&& kstat.ks_name.starts_with(VCPU_KSTAT_PREFIX)
});
produce_vcpu_usage(self, vcpu_stats)
}
}

// Given a kstat and an instance's ID, return the kstat instance if it matches.
pub fn kstat_instance_from_instance_id(
    kstat: &Kstat<'_>,
    data: &Data<'_>,
    instance_id: &str,
) -> Option<i32> {
    // Only the `vmm:<instance>:vm` kstat carries the VM-name metadata, so
    // anything else can never match.
    if kstat.ks_module != VMM_KSTAT_MODULE_NAME
        || kstat.ks_name != VM_KSTAT_NAME
    {
        return None;
    }
    let Data::Named(named) = data else {
        return None;
    };
    // The kstat matches when its `vm_name` named datum is a string equal to
    // the control plane UUID we're looking for.
    let name_matches = named.iter().any(|item| {
        item.name == VM_NAME_KSTAT
            && matches!(&item.value, NamedData::String(name) if instance_id == *name)
    });
    name_matches.then_some(kstat.ks_instance)
}

// Produce `Sample`s for the `VcpuUsage` metric from the relevant kstats.
pub fn produce_vcpu_usage<'a>(
vm: &'a VirtualMachine,
vcpu_stats: impl Iterator<Item = &'a (DateTime<Utc>, Kstat<'a>, Data<'a>)> + 'a,
) -> Result<Vec<Sample>, Error> {
let mut out = Vec::with_capacity(N_VCPU_MICROSTATES);
for (creation_time, kstat, data) in vcpu_stats {
let Data::Named(named) = data else {
return Err(Error::ExpectedNamedKstat);
};
let snapshot_time = hrtime_to_utc(kstat.ks_snaptime)?;

// Find the vCPU ID, from the relevant named data item.
let vcpu_id = named
.iter()
.find_map(|named| {
if named.name == VCPU_KSTAT_PREFIX {
named.value.as_u32().ok()
} else {
None
}
})
.ok_or_else(|| Error::NoSuchKstat)?;

// We'll track all statistics starting with `time_` as the microstate.
for Named { name, value } in named
.iter()
.filter(|nv| nv.name.starts_with(VCPU_MICROSTATE_PREFIX))
{
// Safety: We're filtering in the loop on this prefix, so it must
// exist.
let state =
name.strip_prefix(VCPU_MICROSTATE_PREFIX).unwrap().to_string();
let datum =
Cumulative::with_start_time(*creation_time, value.as_u64()?);
let metric = VcpuUsage { vcpu_id, state, datum };
let sample =
Sample::new_with_timestamp(snapshot_time, vm, &metric)?;
out.push(sample);
}
}
Ok(out)
}
1 change: 1 addition & 0 deletions sled-agent/src/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,7 @@ async fn instance_register(
body_args.instance_runtime,
body_args.vmm_runtime,
body_args.propolis_addr,
body_args.metadata,
)
.await?,
))
Expand Down
46 changes: 44 additions & 2 deletions sled-agent/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,14 @@ use crate::common::instance::{
PublishedVmmState,
};
use crate::instance_manager::{InstanceManagerServices, InstanceTicket};
use crate::metrics::Error as MetricsError;
use crate::metrics::MetricsManager;
use crate::metrics::INSTANCE_SAMPLE_INTERVAL;
use crate::nexus::NexusClientWithResolver;
use crate::params::ZoneBundleCause;
use crate::params::ZoneBundleMetadata;
use crate::params::{
InstanceHardware, InstanceMigrationSourceParams,
InstanceHardware, InstanceMetadata, InstanceMigrationSourceParams,
InstanceMigrationTargetParams, InstanceStateRequested, VpcFirewallRule,
};
use crate::profile::*;
Expand Down Expand Up @@ -108,6 +111,9 @@ pub enum Error {

#[error("I/O error")]
Io(#[from] std::io::Error),

#[error("Failed to track instance metrics")]
Metrics(#[source] MetricsError),
}

// Issues read-only, idempotent HTTP requests at propolis until it responds with
Expand Down Expand Up @@ -233,8 +239,14 @@ struct InstanceInner {
// Object used to collect zone bundles from this instance when terminated.
zone_bundler: ZoneBundler,

// Object used to start / stop collection of instance-related metrics.
metrics_manager: MetricsManager,

// Object representing membership in the "instance manager".
instance_ticket: InstanceTicket,

// Metadata used to track statistics for this instance.
metadata: InstanceMetadata,
}

impl InstanceInner {
Expand Down Expand Up @@ -367,6 +379,10 @@ impl InstanceInner {
// state to Nexus. This ensures that the instance is actually gone from
// the sled when Nexus receives the state update saying it's actually
// destroyed.
//
// In addition, we'll start or stop collecting metrics solely on the
// basis of whether the instance is terminated. All other states imply
// we start (or continue) to collect instance metrics.
match action {
Some(InstanceAction::Destroy) => {
info!(self.log, "terminating VMM that has exited";
Expand All @@ -375,7 +391,17 @@ impl InstanceInner {
self.terminate().await?;
Ok(Reaction::Terminate)
}
None => Ok(Reaction::Continue),
None => {
self.metrics_manager
.track_instance(
&self.id(),
&self.metadata,
INSTANCE_SAMPLE_INTERVAL,
)
.await
.map_err(Error::Metrics)?;
Ok(Reaction::Continue)
}
}
}

Expand Down Expand Up @@ -537,6 +563,18 @@ impl InstanceInner {
);
}

// Stop tracking instance-related metrics.
if let Err(e) =
self.metrics_manager.stop_tracking_instance(self.id()).await
{
error!(
self.log,
"Failed to stop tracking instance metrics";
"instance_id" => %self.id(),
"error" => ?e,
);
}

// Ensure that no zone exists. This succeeds even if no zone was ever
// created.
// NOTE: we call`Zones::halt_and_remove_logged` directly instead of
Expand Down Expand Up @@ -596,6 +634,7 @@ impl Instance {
ticket: InstanceTicket,
state: InstanceInitialState,
services: InstanceManagerServices,
metadata: InstanceMetadata,
) -> Result<Self, Error> {
info!(log, "initializing new Instance";
"instance_id" => %id,
Expand All @@ -615,6 +654,7 @@ impl Instance {
port_manager,
storage,
zone_bundler,
metrics_manager,
zone_builder_factory,
} = services;

Expand Down Expand Up @@ -686,7 +726,9 @@ impl Instance {
storage,
zone_builder_factory,
zone_bundler,
metrics_manager,
instance_ticket: ticket,
metadata,
};

let inner = Arc::new(Mutex::new(instance));
Expand Down
Loading

0 comments on commit a900cdd

Please sign in to comment.