Skip to content

Commit

Permalink
Move instance health check timeseries to TOML (#6034)
Browse files Browse the repository at this point in the history
  • Loading branch information
bnaecker authored Jul 10, 2024
1 parent f8dbe85 commit c9f1ddd
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 61 deletions.
67 changes: 6 additions & 61 deletions nexus/src/app/background/tasks/instance_watcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,14 @@ use sled_agent_client::Client as SledAgentClient;
use std::borrow::Cow;
use std::collections::BTreeMap;
use std::future::Future;
use std::net::IpAddr;
use std::num::NonZeroU32;
use std::sync::Arc;
use std::sync::Mutex;
use uuid::Uuid;

oximeter::use_timeseries!("vm-health-check.toml");
use virtual_machine::VirtualMachine;

/// Background task that periodically checks instance states.
pub(crate) struct InstanceWatcher {
datastore: Arc<DataStore>,
Expand Down Expand Up @@ -211,30 +213,6 @@ pub struct WatcherIdentity {
pub rack_id: Uuid,
}

/// Oximeter target identifying a virtual machine health check: the Nexus
/// that performed the check, the instance (with its silo, project, and VMM),
/// and the sled agent (ID, IP address, and port).
#[derive(
Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, oximeter::Target,
)]
struct VirtualMachine {
/// The rack ID of the Nexus process which performed the health check.
rack_id: Uuid,
/// The ID of the Nexus process which performed the health check.
nexus_id: Uuid,
/// The instance's ID.
instance_id: Uuid,
/// The silo ID of the instance's silo.
silo_id: Uuid,
/// The project ID of the instance.
project_id: Uuid,
/// The VMM ID of the instance's virtual machine manager.
vmm_id: Uuid,
/// The sled agent's ID.
sled_agent_id: Uuid,
/// The sled agent's IP address.
sled_agent_ip: IpAddr,
/// The sled agent's port.
sled_agent_port: u16,
}

impl VirtualMachine {
fn new(
WatcherIdentity { rack_id, nexus_id }: WatcherIdentity,
Expand Down Expand Up @@ -497,12 +475,12 @@ impl BackgroundTask for InstanceWatcher {
}

mod metrics {
use super::virtual_machine::Check;
use super::virtual_machine::IncompleteCheck;
use super::{CheckOutcome, Incomplete, VirtualMachine};
use oximeter::types::Cumulative;
use oximeter::Metric;
use oximeter::MetricsError;
use oximeter::Sample;
use std::borrow::Cow;
use std::collections::BTreeMap;
use std::sync::Arc;
use std::sync::Mutex;
Expand Down Expand Up @@ -539,7 +517,7 @@ mod metrics {
.check_errors
.entry(error)
.or_insert_with(|| IncompleteCheck {
reason: error.as_str(),
failure_reason: error.as_str(),
datum: Cumulative::default(),
})
.datum += 1;
Expand Down Expand Up @@ -592,37 +570,4 @@ mod metrics {
Ok(())
}
}

/// The number of successful checks for a single instance, VMM, and sled agent.
#[derive(Clone, Debug, Metric)]
struct Check {
/// The string representation of the instance's state as understood by
/// the VMM. If the check failed, this will generally be "failed".
state: Cow<'static, str>,
/// Why the instance was marked as being in this state.
///
/// If an instance was marked as "failed" due to a check failure, this
/// will be a string representation of the failure reason. Otherwise, if
/// the check was successful, this will be "success". Note that this may
/// be "success" even if the instance's state is "failed", which
/// indicates that we successfully queried the instance's state from the
/// sled-agent, and the *sled-agent* reported that the instance has
/// failed --- which is distinct from the instance watcher marking an
/// instance as failed due to a failed check.
reason: Cow<'static, str>,
/// The number of checks for this instance and sled agent which recorded
/// this state for this reason.
datum: Cumulative<u64>,
}

/// The number of unsuccessful checks for an instance and sled agent pair.
#[derive(Clone, Debug, Metric)]
struct IncompleteCheck {
/// The reason why the check was unsuccessful.
///
/// This is generated from the [`Incomplete`] enum's `Display` implementation.
// NOTE(review): the visible construction site populates this field via
// `error.as_str()` rather than `Display` — confirm the two produce the
// same string.
reason: Cow<'static, str>,
/// The number of failed checks for this instance and sled agent.
datum: Cumulative<u64>,
}
}
87 changes: 87 additions & 0 deletions oximeter/oximeter/schema/vm-health-check.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
format_version = 1

[target]
name = "virtual_machine"
description = "A virtual machine instance"
authz_scope = "fleet"
versions = [
{ version = 1, fields = [ "rack_id", "nexus_id", "instance_id", "silo_id", "project_id", "vmm_id", "sled_agent_id", "sled_agent_ip", "sled_agent_port" ] },
]

[[metrics]]
name = "check"
description = "The number of successful checks of an instance's health"
units = "count"
datum_type = "cumulative_u64"
versions = [
{ added_in = 1, fields = [ "state", "reason" ] }
]

[[metrics]]
name = "incomplete_check"
description = "The number of unsuccessful checks of an instance's health"
units = "count"
datum_type = "cumulative_u64"
versions = [
{ added_in = 1, fields = [ "failure_reason" ] }
]

[fields.rack_id]
type = "uuid"
description = "The rack ID of the Nexus process which performed the health check"

[fields.nexus_id]
type = "uuid"
description = "The ID of the Nexus process which performed the health check"

[fields.instance_id]
type = "uuid"
description = "The instance's ID"

[fields.silo_id]
type = "uuid"
description = "The ID of the instance's silo"

[fields.project_id]
type = "uuid"
description = "The ID of the instance's project"

[fields.vmm_id]
type = "uuid"
description = "The VMM ID of the instance's virtual machine manager"

[fields.sled_agent_id]
type = "uuid"
description = "The ID of the sled-agent managing the instance"

[fields.sled_agent_ip]
type = "ip_addr"
description = "The IP address of the sled-agent managing the instance"

[fields.sled_agent_port]
type = "u16"
description = "The port of the sled-agent managing the instance"

[fields.state]
type = "string"
description = """
The string representation of the instance's state as understood by \
the VMM. If the check failed, this will generally be "failed"."""

[fields.reason]
type = "string"
description = """
Why the instance was marked as being in this state.
If an instance was marked as "failed" due to a check failure, this \
will be a string representation of the failure reason. Otherwise, if \
the check was successful, this will be "success". Note that this may \
be "success" even if the instance's state is "failed", which \
indicates that we successfully queried the instance's state from the \
sled-agent, and the *sled-agent* reported that the instance has \
failed -- which is distinct from the instance watcher marking an \
instance as failed due to a failed check."""

[fields.failure_reason]
type = "string"
description = "The reason why the instance health check failed"

0 comments on commit c9f1ddd

Please sign in to comment.