From 9e555fe7ed708dd3e98121a31f6c652236823af1 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Sat, 20 Jan 2024 01:21:32 +0000 Subject: [PATCH] Groundwork for oximeter instance vCPU metrics - Adds the silo and project IDs to the instance-ensure request from Nexus to the sled-agent. These are used as fields on the instance-related statistics. This metadata is currently unused, but will be forwarded to Propolis in the instance-ensure request once the server is updated to accept it. - Defines a `VirtualMachine` oximeter target and `VcpuUsage` metric. The latter has a `state` field which corresponds to the named kstats published by the hypervisor that accumulate the time spent in a number of vCPU microstates. The combination of these should allow us to aggregate or break down vCPU usage by silo, project, instance, vCPU ID, and CPU state. Adds some simple mocks and tests for these. - Adds more fine-grained feature flags to the `oximeter-instruments` crate. Add names for silo, project and instance to instance metadata for statistics tracking Remove actual VM and kstat implementations, they'll live in Propolis Make named data conversion trait public --- Cargo.lock | 4 +- nexus/src/app/instance.rs | 23 +++++ openapi/sled-agent.json | 39 +++++++++ oximeter/instruments/Cargo.toml | 47 +++++++--- oximeter/instruments/src/kstat/mod.rs | 9 +- oximeter/instruments/src/kstat/sampler.rs | 4 +- oximeter/instruments/src/lib.rs | 2 +- oximeter/producer/src/lib.rs | 22 +++-- sled-agent/src/http_entrypoints.rs | 1 + sled-agent/src/instance.rs | 11 ++- sled-agent/src/instance_manager.rs | 14 ++- sled-agent/src/metrics.rs | 101 +++++++++++++--------- sled-agent/src/params.rs | 13 +++ sled-agent/src/sim/http_entrypoints.rs | 1 + sled-agent/src/sim/sled_agent.rs | 5 +- sled-agent/src/sled_agent.rs | 95 +++++++++++--------- 16 files changed, 273 insertions(+), 118 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e82fd6a60f..b2c86e7bdc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1960,7 +1960,7 @@ dependencies = [ [[package]] name = "dropshot" version = "0.9.1-dev" -source = "git+https://github.com/oxidecomputer/dropshot?branch=main#711a7490d81416731cfe0f9fef366ed5f266a0ee" +source = "git+https://github.com/oxidecomputer/dropshot?branch=main#29ae98d1f909c6832661408a4c03f929e8afa6e9" dependencies = [ "async-stream", "async-trait", @@ -2006,7 +2006,7 @@ dependencies = [ [[package]] name = "dropshot_endpoint" version = "0.9.1-dev" -source = "git+https://github.com/oxidecomputer/dropshot?branch=main#711a7490d81416731cfe0f9fef366ed5f266a0ee" +source = "git+https://github.com/oxidecomputer/dropshot?branch=main#29ae98d1f909c6832661408a4c03f929e8afa6e9" dependencies = [ "proc-macro2", "quote", diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 4b52b597ba..769f1c5de8 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1183,6 +1183,28 @@ impl super::Nexus { let ssh_keys: Vec = ssh_keys.map(|ssh_key| ssh_key.public_key).collect(); + // Construct instance metadata used to track its statistics. + // + // This requires another fetch on the silo and project, to extract their + // IDs and names. + let (.., db_project) = self + .project_lookup( + opctx, + params::ProjectSelector { + project: NameOrId::Id(db_instance.project_id), + }, + )? + .fetch() + .await?; + let (_, db_silo) = self.current_silo_lookup(opctx)?.fetch().await?; + let metadata = sled_agent_client::types::InstanceMetadata { + silo_id: db_silo.id(), + silo_name: db_silo.name().to_string(), + project_id: db_project.id(), + project_name: db_project.name().to_string(), + instance_name: db_instance.name().to_string(), + }; + // Ask the sled agent to begin the state change. Then update the // database to reflect the new intermediate state. If this update is // not the newest one, that's fine. That might just mean the sled agent @@ -1226,6 +1248,7 @@ impl super::Nexus { PROPOLIS_PORT, ) .to_string(), + metadata, }, ) .await diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 99156fffd4..1fc89216b4 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -4708,6 +4708,14 @@ } ] }, + "metadata": { + "description": "Metadata used to track instance statistics.", + "allOf": [ + { + "$ref": "#/components/schemas/InstanceMetadata" + } + ] + }, "propolis_addr": { "description": "The address at which this VMM should serve a Propolis server API.", "type": "string" @@ -4729,6 +4737,7 @@ "required": [ "hardware", "instance_runtime", + "metadata", "propolis_addr", "propolis_id", "vmm_runtime" @@ -4860,6 +4869,36 @@ "snapshot_id" ] }, + "InstanceMetadata": { + "description": "Metadata used to track statistics about an instance.", + "type": "object", + "properties": { + "instance_name": { + "type": "string" + }, + "project_id": { + "type": "string", + "format": "uuid" + }, + "project_name": { + "type": "string" + }, + "silo_id": { + "type": "string", + "format": "uuid" + }, + "silo_name": { + "type": "string" + } + }, + "required": [ + "instance_name", + "project_id", + "project_name", + "silo_id", + "silo_name" + ] + }, "InstanceMigrationSourceParams": { "description": "Instance runtime state to update for a migration.", "type": "object", diff --git a/oximeter/instruments/Cargo.toml b/oximeter/instruments/Cargo.toml index 8372b7c560..12827699f1 100644 --- a/oximeter/instruments/Cargo.toml +++ b/oximeter/instruments/Cargo.toml @@ -5,27 +5,48 @@ edition = "2021" license = "MPL-2.0" [dependencies] -cfg-if.workspace = true -chrono.workspace = true -dropshot.workspace = true -futures.workspace = true +cfg-if = { workspace = true, optional = true } +chrono = { workspace = true, optional = true } +dropshot = { workspace = true, optional = true } +futures = { workspace = true, optional = true } http = { workspace = true, optional = true } -oximeter.workspace = true -slog.workspace = true -tokio.workspace = true -thiserror.workspace = true -uuid.workspace = true -omicron-workspace-hack.workspace = true +oximeter = { workspace = true, optional = true } +slog = { workspace = true, optional = true } +tokio = { workspace = true, optional = true } +thiserror = { workspace = true, optional = true } +uuid = { workspace = true, optional = true } +omicron-workspace-hack = { workspace = true, optional = true } [features] -default = ["http-instruments", "kstat"] -http-instruments = ["http"] -kstat = ["kstat-rs"] +default = ["http-instruments", "datalink"] +http-instruments = [ + "dep:chrono", + "dep:dropshot", + "dep:futures", + "dep:http", + "dep:omicron-workspace-hack", + "dep:oximeter", + "dep:uuid" +] +kstat = [ + "dep:cfg-if", + "dep:chrono", + "dep:futures", + "dep:kstat-rs", + "dep:omicron-workspace-hack", + "dep:oximeter", + "dep:slog", + "dep:tokio", + "dep:thiserror", + "dep:uuid" +] +datalink = ["kstat"] [dev-dependencies] rand.workspace = true slog-async.workspace = true slog-term.workspace = true +oximeter.workspace = true [target.'cfg(target_os = "illumos")'.dependencies] kstat-rs = { workspace = true, optional = true } diff --git a/oximeter/instruments/src/kstat/mod.rs b/oximeter/instruments/src/kstat/mod.rs index 90f34acae8..c792a51408 100644 --- a/oximeter/instruments/src/kstat/mod.rs +++ b/oximeter/instruments/src/kstat/mod.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2023 Oxide Computer Company +// Copyright 2024 Oxide Computer Company //! Types for publishing kernel statistics via oximeter. //! @@ -87,6 +87,7 @@ use std::cmp::Ordering; use std::collections::BTreeMap; use std::time::Duration; +#[cfg(any(feature = "datalink", test))] pub mod link; mod sampler; @@ -206,9 +207,9 @@ pub fn hrtime_to_utc(hrtime: i64) -> Result, Error> { } } -// Helper trait for converting a `NamedData` item into a specific contained data -// type, if possible. -pub(crate) trait ConvertNamedData { +/// Helper trait for converting a `NamedData` item into a specific contained data +/// type, if possible. +pub trait ConvertNamedData { fn as_i32(&self) -> Result; fn as_u32(&self) -> Result; fn as_i64(&self) -> Result; diff --git a/oximeter/instruments/src/kstat/sampler.rs b/oximeter/instruments/src/kstat/sampler.rs index bab8ad0ba5..af1b3ba7cf 100644 --- a/oximeter/instruments/src/kstat/sampler.rs +++ b/oximeter/instruments/src/kstat/sampler.rs @@ -671,7 +671,7 @@ impl KstatSamplerWorker { let n_current_samples = current_samples.len(); let n_total_samples = n_new_samples + n_current_samples; let n_overflow_samples = - n_total_samples.checked_sub(self.sample_limit).unwrap_or(0); + n_total_samples.saturating_sub(self.sample_limit); if n_overflow_samples > 0 { warn!( self.log, @@ -788,7 +788,7 @@ impl KstatSamplerWorker { // or the time we added the kstat if not. let start = sampled_kstat .time_of_last_collection - .unwrap_or_else(|| sampled_kstat.time_added); + .unwrap_or(sampled_kstat.time_added); let expire_at = start + duration; cfg_if::cfg_if! { if #[cfg(test)] { diff --git a/oximeter/instruments/src/lib.rs b/oximeter/instruments/src/lib.rs index d003e71739..c1f839c85d 100644 --- a/oximeter/instruments/src/lib.rs +++ b/oximeter/instruments/src/lib.rs @@ -4,7 +4,7 @@ //! General-purpose types for instrumenting code to producer oximeter metrics. -// Copyright 2023 Oxide Computer Company +// Copyright 2024 Oxide Computer Company #[cfg(feature = "http-instruments")] pub mod http; diff --git a/oximeter/producer/src/lib.rs b/oximeter/producer/src/lib.rs index 2354f9c217..3fecaadf4f 100644 --- a/oximeter/producer/src/lib.rs +++ b/oximeter/producer/src/lib.rs @@ -38,8 +38,8 @@ pub enum Error { #[error("Error running producer HTTP server: {0}")] Server(String), - #[error("Error registering as metric producer: {0}")] - RegistrationError(String), + #[error("Error registering as metric producer: {msg}")] + RegistrationError { retryable: bool, msg: String }, #[error("Producer registry and config UUIDs do not match")] UuidMismatch, @@ -251,11 +251,19 @@ pub async fn register( ) -> Result<(), Error> { let client = nexus_client::Client::new(&format!("http://{}", address), log.clone()); - client - .cpapi_producers_post(&server_info.into()) - .await - .map(|_| ()) - .map_err(|msg| Error::RegistrationError(msg.to_string())) + client.cpapi_producers_post(&server_info.into()).await.map(|_| ()).map_err( + |err| { + let retryable = match &err { + nexus_client::Error::CommunicationError(..) => true, + nexus_client::Error::ErrorResponse(resp) => { + resp.status().is_server_error() + } + _ => false, + }; + let msg = err.to_string(); + Error::RegistrationError { retryable, msg } + }, + ) } /// Handle a request to pull available metric data from a [`ProducerRegistry`]. diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 5f888504db..6ce9d59b1c 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -413,6 +413,7 @@ async fn instance_register( body_args.instance_runtime, body_args.vmm_runtime, body_args.propolis_addr, + body_args.metadata, ) .await?, )) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 7a6033b4bb..5ddca90403 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -15,7 +15,7 @@ use crate::nexus::NexusClientWithResolver; use crate::params::ZoneBundleMetadata; use crate::params::{InstanceExternalIpBody, ZoneBundleCause}; use crate::params::{ - InstanceHardware, InstanceMigrationSourceParams, + InstanceHardware, InstanceMetadata, InstanceMigrationSourceParams, InstanceMigrationTargetParams, InstancePutStateResponse, InstanceStateRequested, InstanceUnregisterResponse, VpcFirewallRule, }; @@ -311,6 +311,11 @@ struct InstanceRunner { // Properties visible to Propolis properties: propolis_client::types::InstanceProperties, + // This is currently unused, but will be sent to Propolis as part of the + // work tracked in https://github.com/oxidecomputer/omicron/issues/4851. It + // will be included in the InstanceProperties above, most likely. + _metadata: InstanceMetadata, + // The ID of the Propolis server (and zone) running this instance propolis_id: Uuid, @@ -928,6 +933,7 @@ impl Instance { ticket: InstanceTicket, state: InstanceInitialState, services: InstanceManagerServices, + metadata: InstanceMetadata, ) -> Result { info!(log, "initializing new Instance"; "instance_id" => %id, @@ -1005,6 +1011,9 @@ impl Instance { // InstanceCpuCount here, to avoid any casting... vcpus: hardware.properties.ncpus.0 as u8, }, + // This will be used in a follow up, tracked under + // https://github.com/oxidecomputer/omicron/issues/4851. + _metadata: metadata, propolis_id, propolis_addr, vnic_allocator, diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 80c62be234..666d970538 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -8,6 +8,7 @@ use crate::instance::propolis_zone_name; use crate::instance::Instance; use crate::nexus::NexusClientWithResolver; use crate::params::InstanceExternalIpBody; +use crate::params::InstanceMetadata; use crate::params::ZoneBundleMetadata; use crate::params::{ InstanceHardware, InstanceMigrationSourceParams, InstancePutStateResponse, @@ -227,6 +228,7 @@ impl InstanceManager { *self.inner.reservoir_size.lock().unwrap() } + #[allow(clippy::too_many_arguments)] pub async fn ensure_registered( &self, instance_id: Uuid, @@ -235,6 +237,7 @@ impl InstanceManager { instance_runtime: InstanceRuntimeState, vmm_runtime: VmmRuntimeState, propolis_addr: SocketAddr, + metadata: InstanceMetadata, ) -> Result { let (tx, rx) = oneshot::channel(); self.inner @@ -246,6 +249,7 @@ impl InstanceManager { instance_runtime, vmm_runtime, propolis_addr, + metadata, tx, }) .await @@ -396,6 +400,7 @@ enum InstanceManagerRequest { instance_runtime: InstanceRuntimeState, vmm_runtime: VmmRuntimeState, propolis_addr: SocketAddr, + metadata: InstanceMetadata, tx: oneshot::Sender>, }, EnsureUnregistered { @@ -509,9 +514,10 @@ impl InstanceManagerRunner { instance_runtime, vmm_runtime, propolis_addr, + metadata, tx, }) => { - tx.send(self.ensure_registered(instance_id, propolis_id, hardware, instance_runtime, vmm_runtime, propolis_addr).await).map_err(|_| Error::FailedSendClientClosed) + tx.send(self.ensure_registered(instance_id, propolis_id, hardware, instance_runtime, vmm_runtime, propolis_addr, metadata).await).map_err(|_| Error::FailedSendClientClosed) }, Some(EnsureUnregistered { instance_id, tx }) => { self.ensure_unregistered(tx, instance_id).await @@ -574,7 +580,8 @@ impl InstanceManagerRunner { /// (instance ID, Propolis ID) pair multiple times, but will fail if the /// instance is registered with a Propolis ID different from the one the /// caller supplied. - async fn ensure_registered( + #[allow(clippy::too_many_arguments)] + pub async fn ensure_registered( &mut self, instance_id: Uuid, propolis_id: Uuid, @@ -582,6 +589,7 @@ impl InstanceManagerRunner { instance_runtime: InstanceRuntimeState, vmm_runtime: VmmRuntimeState, propolis_addr: SocketAddr, + metadata: InstanceMetadata, ) -> Result { info!( &self.log, @@ -592,6 +600,7 @@ impl InstanceManagerRunner { "instance_runtime" => ?instance_runtime, "vmm_runtime" => ?vmm_runtime, "propolis_addr" => ?propolis_addr, + "metadata" => ?metadata, ); let instance = { @@ -646,6 +655,7 @@ impl InstanceManagerRunner { ticket, state, services, + metadata, )?; let _old = self.instances.insert(instance_id, (propolis_id, instance)); diff --git a/sled-agent/src/metrics.rs b/sled-agent/src/metrics.rs index 6c3383c88f..33941ab6fe 100644 --- a/sled-agent/src/metrics.rs +++ b/sled-agent/src/metrics.rs @@ -8,6 +8,7 @@ use oximeter::types::MetricsError; use oximeter::types::ProducerRegistry; use sled_hardware::Baseboard; use slog::Logger; +use std::sync::Arc; use std::time::Duration; use uuid::Uuid; @@ -19,6 +20,7 @@ cfg_if::cfg_if! { use oximeter_instruments::kstat::KstatSampler; use oximeter_instruments::kstat::TargetId; use std::collections::BTreeMap; + use std::sync::Mutex; } else { use anyhow::anyhow; } @@ -46,6 +48,18 @@ pub enum Error { #[error("Failed to fetch hostname")] Hostname(#[source] std::io::Error), + + #[error("Non-UTF8 hostname")] + NonUtf8Hostname, +} + +// Basic metadata about the sled agent used when publishing metrics. +#[derive(Clone, Debug)] +#[cfg_attr(not(target_os = "illumos"), allow(dead_code))] +struct Metadata { + sled_id: Uuid, + rack_id: Uuid, + baseboard: Baseboard, } /// Type managing all oximeter metrics produced by the sled-agent. @@ -61,10 +75,7 @@ pub enum Error { // the name of fields that are not yet used. #[cfg_attr(not(target_os = "illumos"), allow(dead_code))] pub struct MetricsManager { - sled_id: Uuid, - rack_id: Uuid, - baseboard: Baseboard, - hostname: Option, + metadata: Arc, _log: Logger, #[cfg(target_os = "illumos")] kstat_sampler: KstatSampler, @@ -78,7 +89,7 @@ pub struct MetricsManager { // namespace them internally, e.g., `"datalink:{link_name}"` would be the // real key. #[cfg(target_os = "illumos")] - tracked_links: BTreeMap, + tracked_links: Arc>>, registry: ProducerRegistry, } @@ -101,14 +112,11 @@ impl MetricsManager { registry .register_producer(kstat_sampler.clone()) .map_err(Error::Registry)?; - let tracked_links = BTreeMap::new(); + let tracked_links = Arc::new(Mutex::new(BTreeMap::new())); } } Ok(Self { - sled_id, - rack_id, - baseboard, - hostname: None, + metadata: Arc::new(Metadata { sled_id, rack_id, baseboard }), _log: log, #[cfg(target_os = "illumos")] kstat_sampler, @@ -128,14 +136,14 @@ impl MetricsManager { impl MetricsManager { /// Track metrics for a physical datalink. pub async fn track_physical_link( - &mut self, + &self, link_name: impl AsRef, interval: Duration, ) -> Result<(), Error> { - let hostname = self.hostname().await?; + let hostname = hostname()?; let link = link::PhysicalDataLink { - rack_id: self.rack_id, - sled_id: self.sled_id, + rack_id: self.metadata.rack_id, + sled_id: self.metadata.sled_id, serial: self.serial_number(), hostname, link_name: link_name.as_ref().to_string(), @@ -146,7 +154,10 @@ impl MetricsManager { .add_target(link, details) .await .map_err(Error::Kstat)?; - self.tracked_links.insert(link_name.as_ref().to_string(), id); + self.tracked_links + .lock() + .unwrap() + .insert(link_name.as_ref().to_string(), id); Ok(()) } @@ -155,10 +166,12 @@ impl MetricsManager { /// This works for both physical and virtual links. #[allow(dead_code)] pub async fn stop_tracking_link( - &mut self, + &self, link_name: impl AsRef, ) -> Result<(), Error> { - if let Some(id) = self.tracked_links.remove(link_name.as_ref()) { + let maybe_id = + self.tracked_links.lock().unwrap().remove(link_name.as_ref()); + if let Some(id) = maybe_id { self.kstat_sampler.remove_target(id).await.map_err(Error::Kstat) } else { Ok(()) @@ -174,8 +187,8 @@ impl MetricsManager { interval: Duration, ) -> Result<(), Error> { let link = link::VirtualDataLink { - rack_id: self.rack_id, - sled_id: self.sled_id, + rack_id: self.metadata.rack_id, + sled_id: self.metadata.sled_id, serial: self.serial_number(), hostname: hostname.as_ref().to_string(), link_name: link_name.as_ref().to_string(), @@ -190,40 +203,19 @@ impl MetricsManager { // Return the serial number out of the baseboard, if one exists. fn serial_number(&self) -> String { - match &self.baseboard { + match &self.metadata.baseboard { Baseboard::Gimlet { identifier, .. } => identifier.clone(), Baseboard::Unknown => String::from("unknown"), Baseboard::Pc { identifier, .. } => identifier.clone(), } } - - // Return the system's hostname. - // - // If we've failed to get it previously, we try again. If _that_ fails, - // return an error. - // - // TODO-cleanup: This will become much simpler once - // `OnceCell::get_or_try_init` is stabilized. - async fn hostname(&mut self) -> Result { - if let Some(hn) = &self.hostname { - return Ok(hn.clone()); - } - let hn = tokio::process::Command::new("hostname") - .env_clear() - .output() - .await - .map(|out| String::from_utf8_lossy(&out.stdout).trim().to_string()) - .map_err(Error::Hostname)?; - self.hostname.replace(hn.clone()); - Ok(hn) - } } #[cfg(not(target_os = "illumos"))] impl MetricsManager { /// Track metrics for a physical datalink. pub async fn track_physical_link( - &mut self, + &self, _link_name: impl AsRef, _interval: Duration, ) -> Result<(), Error> { @@ -237,7 +229,7 @@ impl MetricsManager { /// This works for both physical and virtual links. #[allow(dead_code)] pub async fn stop_tracking_link( - &mut self, + &self, _link_name: impl AsRef, ) -> Result<(), Error> { Err(Error::Kstat(anyhow!( @@ -258,3 +250,26 @@ impl MetricsManager { ))) } } + +// Return the current hostname if possible. +#[cfg(target_os = "illumos")] +fn hostname() -> Result { + // See netdb.h + const MAX_LEN: usize = 256; + let mut out = vec![0u8; MAX_LEN + 1]; + if unsafe { + libc::gethostname(out.as_mut_ptr() as *mut libc::c_char, MAX_LEN) + } == 0 + { + // Split into subslices by NULL bytes. + // + // Safety: We've passed an extra NULL to the gethostname(2) call, so + // there must be at least one of these. + let chunk = out.split(|x| *x == 0).next().unwrap(); + let s = std::ffi::CString::new(chunk) + .map_err(|_| Error::NonUtf8Hostname)?; + s.into_string().map_err(|_| Error::NonUtf8Hostname) + } else { + Err(std::io::Error::last_os_error()).map_err(|_| Error::NonUtf8Hostname) + } +} diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index fda952ca87..728a6521f2 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -82,6 +82,16 @@ pub struct InstanceHardware { pub cloud_init_bytes: Option, } +/// Metadata used to track statistics about an instance. +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct InstanceMetadata { + pub silo_id: Uuid, + pub silo_name: String, + pub project_id: Uuid, + pub project_name: String, + pub instance_name: String, +} + /// The body of a request to ensure that a instance and VMM are known to a sled /// agent. #[derive(Serialize, Deserialize, JsonSchema)] @@ -103,6 +113,9 @@ pub struct InstanceEnsureBody { /// The address at which this VMM should serve a Propolis server API. pub propolis_addr: SocketAddr, + + /// Metadata used to track instance statistics. + pub metadata: InstanceMetadata, } /// The body of a request to move a previously-ensured instance into a specific diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 09ffdf5dc4..494db58828 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -98,6 +98,7 @@ async fn instance_register( body_args.hardware, body_args.instance_runtime, body_args.vmm_runtime, + body_args.metadata, ) .await?, )) diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index d4c92fca51..3beda7b578 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -13,7 +13,7 @@ use super::storage::Storage; use crate::nexus::NexusClient; use crate::params::{ DiskStateRequested, InstanceExternalIpBody, InstanceHardware, - InstanceMigrationSourceParams, InstancePutStateResponse, + InstanceMetadata, InstanceMigrationSourceParams, InstancePutStateResponse, InstanceStateRequested, InstanceUnregisterResponse, Inventory, OmicronZonesConfig, SledRole, }; @@ -240,6 +240,9 @@ impl SledAgent { hardware: InstanceHardware, instance_runtime: InstanceRuntimeState, vmm_runtime: VmmRuntimeState, + // This is currently unused, but will be included as part of work + // tracked in https://github.com/oxidecomputer/omicron/issues/4851. + _metadata: InstanceMetadata, ) -> Result { // respond with a fake 500 level failure if asked to ensure an instance // with more than 16 CPUs. diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 1a634a6346..9613521714 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -17,7 +17,7 @@ use crate::metrics::MetricsManager; use crate::nexus::{ConvertInto, NexusClientWithResolver, NexusRequestQueue}; use crate::params::{ DiskStateRequested, InstanceExternalIpBody, InstanceHardware, - InstanceMigrationSourceParams, InstancePutStateResponse, + InstanceMetadata, InstanceMigrationSourceParams, InstancePutStateResponse, InstanceStateRequested, InstanceUnregisterResponse, Inventory, OmicronZonesConfig, SledRole, TimeSync, VpcFirewallRule, ZoneBundleMetadata, Zpool, @@ -397,6 +397,55 @@ impl SledAgent { let underlay_nics = underlay::find_nics(&config.data_links)?; illumos_utils::opte::initialize_xde_driver(&log, &underlay_nics)?; + // Start collecting metric data. + // + // First, we're creating a shareable type for managing the metrics + // themselves early on, so that we can pass it to other components of + // the sled agent that need it. + // + // Then we'll start tracking physical links and register as a producer + // with Nexus in the background. + let metrics_manager = MetricsManager::new( + request.body.id, + request.body.rack_id, + long_running_task_handles.hardware_manager.baseboard(), + log.new(o!("component" => "MetricsManager")), + )?; + + // Start tracking the underlay physical links. + for nic in underlay::find_nics(&config.data_links)? { + let link_name = nic.interface(); + if let Err(e) = metrics_manager + .track_physical_link( + link_name, + crate::metrics::LINK_SAMPLE_INTERVAL, + ) + .await + { + error!( + log, + "failed to start tracking physical link metrics"; + "link_name" => link_name, + "error" => ?e, + ); + } + } + + // Spawn a task in the background to register our metric producer with + // Nexus. This should not block progress here. + let endpoint = ProducerEndpoint { + id: request.body.id, + kind: ProducerKind::SledAgent, + address: sled_address.into(), + base_route: String::from("/metrics/collect"), + interval: crate::metrics::METRIC_COLLECTION_INTERVAL, + }; + tokio::task::spawn(register_metric_producer_with_nexus( + log.clone(), + nexus_client.clone(), + endpoint, + )); + // Create the PortManager to manage all the OPTE ports on the sled. let port_manager = PortManager::new( parent_log.new(o!("component" => "PortManager")), @@ -517,47 +566,6 @@ impl SledAgent { rack_network_config.clone(), )?; - let mut metrics_manager = MetricsManager::new( - request.body.id, - request.body.rack_id, - long_running_task_handles.hardware_manager.baseboard(), - log.new(o!("component" => "MetricsManager")), - )?; - - // Start tracking the underlay physical links. - for nic in underlay::find_nics(&config.data_links)? { - let link_name = nic.interface(); - if let Err(e) = metrics_manager - .track_physical_link( - link_name, - crate::metrics::LINK_SAMPLE_INTERVAL, - ) - .await - { - error!( - log, - "failed to start tracking physical link metrics"; - "link_name" => link_name, - "error" => ?e, - ); - } - } - - // Spawn a task in the background to register our metric producer with - // Nexus. This should not block progress here. - let endpoint = ProducerEndpoint { - id: request.body.id, - kind: ProducerKind::SledAgent, - address: sled_address.into(), - base_route: String::from("/metrics/collect"), - interval: crate::metrics::METRIC_COLLECTION_INTERVAL, - }; - tokio::task::spawn(register_metric_producer_with_nexus( - log.clone(), - nexus_client.clone(), - endpoint, - )); - let sled_agent = SledAgent { inner: Arc::new(SledAgentInner { id: request.body.id, @@ -941,6 +949,7 @@ impl SledAgent { /// Idempotently ensures that a given instance is registered with this sled, /// i.e., that it can be addressed by future calls to /// [`Self::instance_ensure_state`]. + #[allow(clippy::too_many_arguments)] pub async fn instance_ensure_registered( &self, instance_id: Uuid, @@ -949,6 +958,7 @@ impl SledAgent { instance_runtime: InstanceRuntimeState, vmm_runtime: VmmRuntimeState, propolis_addr: SocketAddr, + metadata: InstanceMetadata, ) -> Result { self.inner .instances @@ -959,6 +969,7 @@ impl SledAgent { instance_runtime, vmm_runtime, propolis_addr, + metadata, ) .await .map_err(|e| Error::Instance(e))