From c9f1dddc98b1a0e8b00ca45ce0e242dc501221c5 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Wed, 10 Jul 2024 10:14:32 -0700 Subject: [PATCH] Move instance health check timeseries to TOML (#6034) --- .../app/background/tasks/instance_watcher.rs | 67 ++------------ oximeter/oximeter/schema/vm-health-check.toml | 87 +++++++++++++++++++ 2 files changed, 93 insertions(+), 61 deletions(-) create mode 100644 oximeter/oximeter/schema/vm-health-check.toml diff --git a/nexus/src/app/background/tasks/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs index ce202a2a08..8a41e2d062 100644 --- a/nexus/src/app/background/tasks/instance_watcher.rs +++ b/nexus/src/app/background/tasks/instance_watcher.rs @@ -26,12 +26,14 @@ use sled_agent_client::Client as SledAgentClient; use std::borrow::Cow; use std::collections::BTreeMap; use std::future::Future; -use std::net::IpAddr; use std::num::NonZeroU32; use std::sync::Arc; use std::sync::Mutex; use uuid::Uuid; +oximeter::use_timeseries!("vm-health-check.toml"); +use virtual_machine::VirtualMachine; + /// Background task that periodically checks instance states. pub(crate) struct InstanceWatcher { datastore: Arc, @@ -211,30 +213,6 @@ pub struct WatcherIdentity { pub rack_id: Uuid, } -#[derive( - Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, oximeter::Target, -)] -struct VirtualMachine { - /// The rack ID of the Nexus process which performed the health check. - rack_id: Uuid, - /// The ID of the Nexus process which performed the health check. - nexus_id: Uuid, - /// The instance's ID. - instance_id: Uuid, - /// The silo ID of the instance's silo. - silo_id: Uuid, - /// The project ID of the instance. - project_id: Uuid, - /// The VMM ID of the instance's virtual machine manager. - vmm_id: Uuid, - /// The sled-agent's ID. - sled_agent_id: Uuid, - /// The sled agent's IP address. - sled_agent_ip: IpAddr, - /// The sled agent's port. 
- sled_agent_port: u16, -} - impl VirtualMachine { fn new( WatcherIdentity { rack_id, nexus_id }: WatcherIdentity, @@ -497,12 +475,12 @@ impl BackgroundTask for InstanceWatcher { } mod metrics { + use super::virtual_machine::Check; + use super::virtual_machine::IncompleteCheck; use super::{CheckOutcome, Incomplete, VirtualMachine}; use oximeter::types::Cumulative; - use oximeter::Metric; use oximeter::MetricsError; use oximeter::Sample; - use std::borrow::Cow; use std::collections::BTreeMap; use std::sync::Arc; use std::sync::Mutex; @@ -539,7 +517,7 @@ mod metrics { .check_errors .entry(error) .or_insert_with(|| IncompleteCheck { - reason: error.as_str(), + failure_reason: error.as_str(), datum: Cumulative::default(), }) .datum += 1; @@ -592,37 +570,4 @@ mod metrics { Ok(()) } } - - /// The number of successful checks for a single instance, VMM, and sled agent. - #[derive(Clone, Debug, Metric)] - struct Check { - /// The string representation of the instance's state as understood by - /// the VMM. If the check failed, this will generally be "failed". - state: Cow<'static, str>, - /// `Why the instance was marked as being in this state. - /// - /// If an instance was marked as "failed" due to a check failure, this - /// will be a string representation of the failure reason. Otherwise, if - /// the check was successful, this will be "success". Note that this may - /// be "success" even if the instance's state is "failed", which - /// indicates that we successfully queried the instance's state from the - /// sled-agent, and the *sled-agent* reported that the instance has - /// failed --- which is distinct from the instance watcher marking an - /// instance as failed due to a failed check. - reason: Cow<'static, str>, - /// The number of checks for this instance and sled agent which recorded - /// this state for this reason. - datum: Cumulative, - } - - /// The number of unsuccessful checks for an instance and sled agent pair. 
- #[derive(Clone, Debug, Metric)] - struct IncompleteCheck { - /// The reason why the check was unsuccessful. - /// - /// This is generated from the [`Incomplete`] enum's `Display` implementation. - reason: Cow<'static, str>, - /// The number of failed checks for this instance and sled agent. - datum: Cumulative, - } } diff --git a/oximeter/oximeter/schema/vm-health-check.toml b/oximeter/oximeter/schema/vm-health-check.toml new file mode 100644 index 0000000000..62a5e68ca0 --- /dev/null +++ b/oximeter/oximeter/schema/vm-health-check.toml @@ -0,0 +1,87 @@ +format_version = 1 + +[target] +name = "virtual_machine" +description = "A virtual machine instance" +authz_scope = "fleet" +versions = [ + { version = 1, fields = [ "rack_id", "nexus_id", "instance_id", "silo_id", "project_id", "vmm_id", "sled_agent_id", "sled_agent_ip", "sled_agent_port" ] }, +] + +[[metrics]] +name = "check" +description = "The number of successful checks of an instance's health" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "state", "reason" ] } +] + +[[metrics]] +name = "incomplete_check" +description = "The number of unsuccessful checks of an instance's health" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "failure_reason" ] } +] + +[fields.rack_id] +type = "uuid" +description = "The rack ID of the Nexus process which performed the health check" + +[fields.nexus_id] +type = "uuid" +description = "The ID of the Nexus process which performed the health check" + +[fields.instance_id] +type = "uuid" +description = "The instance's ID" + +[fields.silo_id] +type = "uuid" +description = "The ID of the instance's silo" + +[fields.project_id] +type = "uuid" +description = "The ID of the instance's project" + +[fields.vmm_id] +type = "uuid" +description = "The VMM ID of the instance's virtual machine manager" + +[fields.sled_agent_id] +type = "uuid" +description = "The ID of the sled-agent managing the instance" + 
+[fields.sled_agent_ip] +type = "ip_addr" +description = "The IP address of the sled-agent managing the instance" + +[fields.sled_agent_port] +type = "u16" +description = "The port of the sled-agent managing the instance" + +[fields.state] +type = "string" +description = """ +The string representation of the instance's state as understood by \ +the VMM. If the check failed, this will generally be "failed".""" + +[fields.reason] +type = "string" +description = """ +Why the instance was marked as being in this state. + +If an instance was marked as "failed" due to a check failure, this \ +will be a string representation of the failure reason. Otherwise, if \ +the check was successful, this will be "success". Note that this may \ +be "success" even if the instance's state is "failed", which \ +indicates that we successfully queried the instance's state from the \ +sled-agent, and the *sled-agent* reported that the instance has \ +failed -- which is distinct from the instance watcher marking an \ +instance as failed due to a failed check.""" + +[fields.failure_reason] +type = "string" +description = "The reason why the instance health check failed"