diff --git a/Cargo.lock b/Cargo.lock
index 2f13d1e9906..c97fe37de5b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1960,7 +1960,7 @@ dependencies = [
 [[package]]
 name = "dropshot"
 version = "0.9.1-dev"
-source = "git+https://github.com/oxidecomputer/dropshot?branch=main#711a7490d81416731cfe0f9fef366ed5f266a0ee"
+source = "git+https://github.com/oxidecomputer/dropshot?branch=main#29ae98d1f909c6832661408a4c03f929e8afa6e9"
 dependencies = [
  "async-stream",
  "async-trait",
@@ -2006,7 +2006,7 @@ dependencies = [
 [[package]]
 name = "dropshot_endpoint"
 version = "0.9.1-dev"
-source = "git+https://github.com/oxidecomputer/dropshot?branch=main#711a7490d81416731cfe0f9fef366ed5f266a0ee"
+source = "git+https://github.com/oxidecomputer/dropshot?branch=main#29ae98d1f909c6832661408a4c03f929e8afa6e9"
 dependencies = [
  "proc-macro2",
  "quote",
diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs
index 4b52b597ba8..769f1c5de87 100644
--- a/nexus/src/app/instance.rs
+++ b/nexus/src/app/instance.rs
@@ -1183,6 +1183,28 @@ impl super::Nexus {
         let ssh_keys: Vec =
             ssh_keys.map(|ssh_key| ssh_key.public_key).collect();
 
+        // Construct instance metadata used to track its statistics.
+        //
+        // This requires another fetch on the silo and project, to extract their
+        // IDs and names.
+        let (.., db_project) = self
+            .project_lookup(
+                opctx,
+                params::ProjectSelector {
+                    project: NameOrId::Id(db_instance.project_id),
+                },
+            )?
+            .fetch()
+            .await?;
+        let (_, db_silo) = self.current_silo_lookup(opctx)?.fetch().await?;
+        let metadata = sled_agent_client::types::InstanceMetadata {
+            silo_id: db_silo.id(),
+            silo_name: db_silo.name().to_string(),
+            project_id: db_project.id(),
+            project_name: db_project.name().to_string(),
+            instance_name: db_instance.name().to_string(),
+        };
+
         // Ask the sled agent to begin the state change. Then update the
         // database to reflect the new intermediate state. If this update is
         // not the newest one, that's fine. That might just mean the sled agent
@@ -1226,6 +1248,7 @@ impl super::Nexus {
                         PROPOLIS_PORT,
                     )
                     .to_string(),
+                    metadata,
                 },
             )
             .await
diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json
index 99156fffd4c..1fc89216b41 100644
--- a/openapi/sled-agent.json
+++ b/openapi/sled-agent.json
@@ -4708,6 +4708,14 @@
             }
           ]
         },
+        "metadata": {
+          "description": "Metadata used to track instance statistics.",
+          "allOf": [
+            {
+              "$ref": "#/components/schemas/InstanceMetadata"
+            }
+          ]
+        },
         "propolis_addr": {
           "description": "The address at which this VMM should serve a Propolis server API.",
           "type": "string"
@@ -4729,6 +4737,7 @@
       "required": [
         "hardware",
         "instance_runtime",
+        "metadata",
         "propolis_addr",
         "propolis_id",
         "vmm_runtime"
       ]
     },
@@ -4860,6 +4869,36 @@
         "snapshot_id"
       ]
     },
+    "InstanceMetadata": {
+      "description": "Metadata used to track statistics about an instance.",
+      "type": "object",
+      "properties": {
+        "instance_name": {
+          "type": "string"
+        },
+        "project_id": {
+          "type": "string",
+          "format": "uuid"
+        },
+        "project_name": {
+          "type": "string"
+        },
+        "silo_id": {
+          "type": "string",
+          "format": "uuid"
+        },
+        "silo_name": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "instance_name",
+        "project_id",
+        "project_name",
+        "silo_id",
+        "silo_name"
+      ]
+    },
     "InstanceMigrationSourceParams": {
       "description": "Instance runtime state to update for a migration.",
       "type": "object",
diff --git a/oximeter/instruments/Cargo.toml b/oximeter/instruments/Cargo.toml
index 8372b7c5602..12827699f1f 100644
--- a/oximeter/instruments/Cargo.toml
+++ b/oximeter/instruments/Cargo.toml
@@ -5,27 +5,48 @@ edition = "2021"
 license = "MPL-2.0"
 
 [dependencies]
-cfg-if.workspace = true
-chrono.workspace = true
-dropshot.workspace = true
-futures.workspace = true
+cfg-if = { workspace = true, optional = true }
+chrono = { workspace = true, optional = true }
+dropshot = { workspace = true, optional = true }
+futures = { workspace = true, optional = true }
 http = { workspace = true, optional = true }
-oximeter.workspace = true
-slog.workspace = true
-tokio.workspace = true
-thiserror.workspace = true
-uuid.workspace = true
-omicron-workspace-hack.workspace = true
+oximeter = { workspace = true, optional = true }
+slog = { workspace = true, optional = true }
+tokio = { workspace = true, optional = true }
+thiserror = { workspace = true, optional = true }
+uuid = { workspace = true, optional = true }
+omicron-workspace-hack = { workspace = true, optional = true }
 
 [features]
-default = ["http-instruments", "kstat"]
-http-instruments = ["http"]
-kstat = ["kstat-rs"]
+default = ["http-instruments", "datalink"]
+http-instruments = [
+    "dep:chrono",
+    "dep:dropshot",
+    "dep:futures",
+    "dep:http",
+    "dep:omicron-workspace-hack",
+    "dep:oximeter",
+    "dep:uuid"
+]
+kstat = [
+    "dep:cfg-if",
+    "dep:chrono",
+    "dep:futures",
+    "dep:kstat-rs",
+    "dep:omicron-workspace-hack",
+    "dep:oximeter",
+    "dep:slog",
+    "dep:tokio",
+    "dep:thiserror",
+    "dep:uuid"
+]
+datalink = ["kstat"]
 
 [dev-dependencies]
 rand.workspace = true
 slog-async.workspace = true
 slog-term.workspace = true
+oximeter.workspace = true
 
 [target.'cfg(target_os = "illumos")'.dependencies]
 kstat-rs = { workspace = true, optional = true }
diff --git a/oximeter/instruments/src/kstat/mod.rs b/oximeter/instruments/src/kstat/mod.rs
index 90f34acae85..c792a514087 100644
--- a/oximeter/instruments/src/kstat/mod.rs
+++ b/oximeter/instruments/src/kstat/mod.rs
@@ -2,7 +2,7 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
-// Copyright 2023 Oxide Computer Company
+// Copyright 2024 Oxide Computer Company
 
 //! Types for publishing kernel statistics via oximeter.
 //!
@@ -87,6 +87,7 @@ use std::cmp::Ordering;
 use std::collections::BTreeMap;
 use std::time::Duration;
 
+#[cfg(any(feature = "datalink", test))]
 pub mod link;
 mod sampler;
@@ -206,9 +207,9 @@ pub fn hrtime_to_utc(hrtime: i64) -> Result, Error> {
     }
 }
 
-// Helper trait for converting a `NamedData` item into a specific contained data
-// type, if possible.
-pub(crate) trait ConvertNamedData {
+/// Helper trait for converting a `NamedData` item into a specific contained data
+/// type, if possible.
+pub trait ConvertNamedData {
     fn as_i32(&self) -> Result;
     fn as_u32(&self) -> Result;
     fn as_i64(&self) -> Result;
diff --git a/oximeter/instruments/src/kstat/sampler.rs b/oximeter/instruments/src/kstat/sampler.rs
index bab8ad0ba51..af1b3ba7cf9 100644
--- a/oximeter/instruments/src/kstat/sampler.rs
+++ b/oximeter/instruments/src/kstat/sampler.rs
@@ -671,7 +671,7 @@ impl KstatSamplerWorker {
         let n_current_samples = current_samples.len();
         let n_total_samples = n_new_samples + n_current_samples;
         let n_overflow_samples =
-            n_total_samples.checked_sub(self.sample_limit).unwrap_or(0);
+            n_total_samples.saturating_sub(self.sample_limit);
         if n_overflow_samples > 0 {
             warn!(
                 self.log,
@@ -788,7 +788,7 @@ impl KstatSamplerWorker {
             // or the time we added the kstat if not.
             let start = sampled_kstat
                 .time_of_last_collection
-                .unwrap_or_else(|| sampled_kstat.time_added);
+                .unwrap_or(sampled_kstat.time_added);
             let expire_at = start + duration;
             cfg_if::cfg_if! {
                 if #[cfg(test)] {
diff --git a/oximeter/instruments/src/lib.rs b/oximeter/instruments/src/lib.rs
index d003e717395..c1f839c85d2 100644
--- a/oximeter/instruments/src/lib.rs
+++ b/oximeter/instruments/src/lib.rs
@@ -4,7 +4,7 @@
 
 //! General-purpose types for instrumenting code to producer oximeter metrics.
 
-// Copyright 2023 Oxide Computer Company
+// Copyright 2024 Oxide Computer Company
 
 #[cfg(feature = "http-instruments")]
 pub mod http;
diff --git a/oximeter/producer/src/lib.rs b/oximeter/producer/src/lib.rs
index 2354f9c2176..3fecaadf4fe 100644
--- a/oximeter/producer/src/lib.rs
+++ b/oximeter/producer/src/lib.rs
@@ -38,8 +38,8 @@ pub enum Error {
     #[error("Error running producer HTTP server: {0}")]
     Server(String),
 
-    #[error("Error registering as metric producer: {0}")]
-    RegistrationError(String),
+    #[error("Error registering as metric producer: {msg}")]
+    RegistrationError { retryable: bool, msg: String },
 
     #[error("Producer registry and config UUIDs do not match")]
     UuidMismatch,
@@ -251,11 +251,19 @@
 ) -> Result<(), Error> {
     let client =
         nexus_client::Client::new(&format!("http://{}", address), log.clone());
-    client
-        .cpapi_producers_post(&server_info.into())
-        .await
-        .map(|_| ())
-        .map_err(|msg| Error::RegistrationError(msg.to_string()))
+    client.cpapi_producers_post(&server_info.into()).await.map(|_| ()).map_err(
+        |err| {
+            let retryable = match &err {
+                nexus_client::Error::CommunicationError(..) => true,
+                nexus_client::Error::ErrorResponse(resp) => {
+                    resp.status().is_server_error()
+                }
+                _ => false,
+            };
+            let msg = err.to_string();
+            Error::RegistrationError { retryable, msg }
+        },
+    )
 }
 
 /// Handle a request to pull available metric data from a [`ProducerRegistry`].
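(Not part of the patch.) The reworked `RegistrationError { retryable, msg }` variant above lets a caller distinguish transient failures (communication errors and 5xx responses from Nexus) from permanent ones. Below is a minimal sketch of how a caller might act on that flag, assuming the crate is imported as `oximeter_producer` and that tokio is available for the pause; `should_retry` and `register_with_backoff` are illustrative names, not functions added by this change:

```rust
use oximeter_producer::Error;
use std::time::Duration;

/// Illustrative helper: decide whether a failed registration is worth retrying.
fn should_retry(err: &Error) -> bool {
    match err {
        // `retryable` is computed inside `register()` above: true for
        // communication errors and 5xx responses from Nexus.
        Error::RegistrationError { retryable, .. } => *retryable,
        // Everything else (bad config, UUID mismatch, server setup failures)
        // is treated as permanent.
        _ => false,
    }
}

/// Illustrative retry loop: re-run a registration attempt while the failure
/// remains retryable, pausing briefly between attempts.
async fn register_with_backoff<F, Fut>(mut attempt: F) -> Result<(), Error>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<(), Error>>,
{
    loop {
        match attempt().await {
            Ok(()) => return Ok(()),
            Err(err) if should_retry(&err) => {
                tokio::time::sleep(Duration::from_secs(1)).await;
            }
            Err(err) => return Err(err),
        }
    }
}
```

The wildcard arm matters because the error enum has more variants than the hunk above shows.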
diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs
index 5f888504dbc..6ce9d59b1c9 100644
--- a/sled-agent/src/http_entrypoints.rs
+++ b/sled-agent/src/http_entrypoints.rs
@@ -413,6 +413,7 @@ async fn instance_register(
                 body_args.instance_runtime,
                 body_args.vmm_runtime,
                 body_args.propolis_addr,
+                body_args.metadata,
             )
             .await?,
     ))
diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs
index 5b9bf54dd96..1fdc47f94e9 100644
--- a/sled-agent/src/instance.rs
+++ b/sled-agent/src/instance.rs
@@ -15,7 +15,7 @@ use crate::nexus::NexusClientWithResolver;
 use crate::params::ZoneBundleMetadata;
 use crate::params::{InstanceExternalIpBody, ZoneBundleCause};
 use crate::params::{
-    InstanceHardware, InstanceMigrationSourceParams,
+    InstanceHardware, InstanceMetadata, InstanceMigrationSourceParams,
     InstanceMigrationTargetParams, InstancePutStateResponse,
     InstanceStateRequested, InstanceUnregisterResponse, VpcFirewallRule,
 };
@@ -311,6 +311,11 @@ struct InstanceRunner {
     // Properties visible to Propolis
     properties: propolis_client::types::InstanceProperties,
 
+    // This is currently unused, but will be sent to Propolis as part of the
+    // work tracked in https://github.com/oxidecomputer/omicron/issues/4851. It
+    // will be included in the InstanceProperties above, most likely.
+    _metadata: InstanceMetadata,
+
     // The ID of the Propolis server (and zone) running this instance
     propolis_id: Uuid,
@@ -596,6 +601,10 @@ impl InstanceRunner {
         // state to Nexus. This ensures that the instance is actually gone from
         // the sled when Nexus receives the state update saying it's actually
         // destroyed.
+        //
+        // In addition, we'll start or stop collecting metrics solely on the
+        // basis of whether the instance is terminated. All other states imply
+        // we start (or continue) to collect instance metrics.
         match action {
             Some(InstanceAction::Destroy) => {
                 info!(self.log, "terminating VMM that has exited";
@@ -927,6 +936,7 @@ impl Instance {
         ticket: InstanceTicket,
         state: InstanceInitialState,
         services: InstanceManagerServices,
+        metadata: InstanceMetadata,
     ) -> Result {
         info!(log, "initializing new Instance";
             "instance_id" => %id,
@@ -1004,6 +1014,9 @@ impl Instance {
                 // InstanceCpuCount here, to avoid any casting...
                 vcpus: hardware.properties.ncpus.0 as u8,
             },
+            // This will be used in a follow up, tracked under
+            // https://github.com/oxidecomputer/omicron/issues/4851.
+            _metadata: metadata,
             propolis_id,
             propolis_addr,
             vnic_allocator,
diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs
index 80c62be234a..666d9705381 100644
--- a/sled-agent/src/instance_manager.rs
+++ b/sled-agent/src/instance_manager.rs
@@ -8,6 +8,7 @@ use crate::instance::propolis_zone_name;
 use crate::instance::Instance;
 use crate::nexus::NexusClientWithResolver;
 use crate::params::InstanceExternalIpBody;
+use crate::params::InstanceMetadata;
 use crate::params::ZoneBundleMetadata;
 use crate::params::{
     InstanceHardware, InstanceMigrationSourceParams, InstancePutStateResponse,
@@ -227,6 +228,7 @@ impl InstanceManager {
         *self.inner.reservoir_size.lock().unwrap()
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub async fn ensure_registered(
         &self,
         instance_id: Uuid,
@@ -235,6 +237,7 @@ impl InstanceManager {
         instance_runtime: InstanceRuntimeState,
         vmm_runtime: VmmRuntimeState,
         propolis_addr: SocketAddr,
+        metadata: InstanceMetadata,
     ) -> Result {
         let (tx, rx) = oneshot::channel();
         self.inner
@@ -246,6 +249,7 @@ impl InstanceManager {
                 instance_runtime,
                 vmm_runtime,
                 propolis_addr,
+                metadata,
                 tx,
             })
             .await
@@ -396,6 +400,7 @@ enum InstanceManagerRequest {
         instance_runtime: InstanceRuntimeState,
         vmm_runtime: VmmRuntimeState,
         propolis_addr: SocketAddr,
+        metadata: InstanceMetadata,
         tx: oneshot::Sender>,
     },
     EnsureUnregistered {
@@ -509,9 +514,10 @@ impl InstanceManagerRunner {
                     instance_runtime,
                     vmm_runtime,
                     propolis_addr,
+                    metadata,
                     tx,
                 }) => {
-                    tx.send(self.ensure_registered(instance_id, propolis_id, hardware, instance_runtime, vmm_runtime, propolis_addr).await).map_err(|_| Error::FailedSendClientClosed)
+                    tx.send(self.ensure_registered(instance_id, propolis_id, hardware, instance_runtime, vmm_runtime, propolis_addr, metadata).await).map_err(|_| Error::FailedSendClientClosed)
                 },
                 Some(EnsureUnregistered { instance_id, tx }) => {
                     self.ensure_unregistered(tx, instance_id).await
@@ -574,7 +580,8 @@ impl InstanceManagerRunner {
     /// (instance ID, Propolis ID) pair multiple times, but will fail if the
     /// instance is registered with a Propolis ID different from the one the
     /// caller supplied.
-    async fn ensure_registered(
+    #[allow(clippy::too_many_arguments)]
+    pub async fn ensure_registered(
         &mut self,
         instance_id: Uuid,
         propolis_id: Uuid,
@@ -582,6 +589,7 @@ impl InstanceManagerRunner {
         instance_runtime: InstanceRuntimeState,
         vmm_runtime: VmmRuntimeState,
         propolis_addr: SocketAddr,
+        metadata: InstanceMetadata,
     ) -> Result {
         info!(
             &self.log,
@@ -592,6 +600,7 @@ impl InstanceManagerRunner {
             "instance_runtime" => ?instance_runtime,
             "vmm_runtime" => ?vmm_runtime,
             "propolis_addr" => ?propolis_addr,
+            "metadata" => ?metadata,
         );
 
         let instance = {
@@ -646,6 +655,7 @@ impl InstanceManagerRunner {
                 ticket,
                 state,
                 services,
+                metadata,
             )?;
             let _old =
                 self.instances.insert(instance_id, (propolis_id, instance));
diff --git a/sled-agent/src/metrics.rs b/sled-agent/src/metrics.rs
index 6c3383c88f0..33941ab6fe7 100644
--- a/sled-agent/src/metrics.rs
+++ b/sled-agent/src/metrics.rs
@@ -8,6 +8,7 @@ use oximeter::types::MetricsError;
 use oximeter::types::ProducerRegistry;
 use sled_hardware::Baseboard;
 use slog::Logger;
+use std::sync::Arc;
 use std::time::Duration;
 use uuid::Uuid;
@@ -19,6 +20,7 @@ cfg_if::cfg_if! {
         use oximeter_instruments::kstat::KstatSampler;
         use oximeter_instruments::kstat::TargetId;
         use std::collections::BTreeMap;
+        use std::sync::Mutex;
     } else {
         use anyhow::anyhow;
     }
@@ -46,6 +48,18 @@ pub enum Error {
 
     #[error("Failed to fetch hostname")]
     Hostname(#[source] std::io::Error),
+
+    #[error("Non-UTF8 hostname")]
+    NonUtf8Hostname,
+}
+
+// Basic metadata about the sled agent used when publishing metrics.
+#[derive(Clone, Debug)]
+#[cfg_attr(not(target_os = "illumos"), allow(dead_code))]
+struct Metadata {
+    sled_id: Uuid,
+    rack_id: Uuid,
+    baseboard: Baseboard,
 }
 
 /// Type managing all oximeter metrics produced by the sled-agent.
@@ -61,10 +75,7 @@ pub enum Error {
 // the name of fields that are not yet used.
 #[cfg_attr(not(target_os = "illumos"), allow(dead_code))]
 pub struct MetricsManager {
-    sled_id: Uuid,
-    rack_id: Uuid,
-    baseboard: Baseboard,
-    hostname: Option,
+    metadata: Arc,
     _log: Logger,
     #[cfg(target_os = "illumos")]
     kstat_sampler: KstatSampler,
@@ -78,7 +89,7 @@ pub struct MetricsManager {
     // namespace them internally, e.g., `"datalink:{link_name}"` would be the
     // real key.
     #[cfg(target_os = "illumos")]
-    tracked_links: BTreeMap,
+    tracked_links: Arc>>,
     registry: ProducerRegistry,
 }
 
@@ -101,14 +112,11 @@ impl MetricsManager {
                 registry
                     .register_producer(kstat_sampler.clone())
                     .map_err(Error::Registry)?;
-                let tracked_links = BTreeMap::new();
+                let tracked_links = Arc::new(Mutex::new(BTreeMap::new()));
             }
         }
         Ok(Self {
-            sled_id,
-            rack_id,
-            baseboard,
-            hostname: None,
+            metadata: Arc::new(Metadata { sled_id, rack_id, baseboard }),
            _log: log,
            #[cfg(target_os = "illumos")]
            kstat_sampler,
@@ -128,14 +136,14 @@ impl MetricsManager {
     /// Track metrics for a physical datalink.
     pub async fn track_physical_link(
-        &mut self,
+        &self,
         link_name: impl AsRef,
         interval: Duration,
     ) -> Result<(), Error> {
-        let hostname = self.hostname().await?;
+        let hostname = hostname()?;
         let link = link::PhysicalDataLink {
-            rack_id: self.rack_id,
-            sled_id: self.sled_id,
+            rack_id: self.metadata.rack_id,
+            sled_id: self.metadata.sled_id,
             serial: self.serial_number(),
             hostname,
             link_name: link_name.as_ref().to_string(),
@@ -146,7 +154,10 @@ impl MetricsManager {
             .add_target(link, details)
             .await
             .map_err(Error::Kstat)?;
-        self.tracked_links.insert(link_name.as_ref().to_string(), id);
+        self.tracked_links
+            .lock()
+            .unwrap()
+            .insert(link_name.as_ref().to_string(), id);
         Ok(())
     }
 
@@ -155,10 +166,12 @@ impl MetricsManager {
     /// This works for both physical and virtual links.
     #[allow(dead_code)]
     pub async fn stop_tracking_link(
-        &mut self,
+        &self,
         link_name: impl AsRef,
     ) -> Result<(), Error> {
-        if let Some(id) = self.tracked_links.remove(link_name.as_ref()) {
+        let maybe_id =
+            self.tracked_links.lock().unwrap().remove(link_name.as_ref());
+        if let Some(id) = maybe_id {
             self.kstat_sampler.remove_target(id).await.map_err(Error::Kstat)
         } else {
             Ok(())
         }
@@ -174,8 +187,8 @@ impl MetricsManager {
         interval: Duration,
     ) -> Result<(), Error> {
         let link = link::VirtualDataLink {
-            rack_id: self.rack_id,
-            sled_id: self.sled_id,
+            rack_id: self.metadata.rack_id,
+            sled_id: self.metadata.sled_id,
             serial: self.serial_number(),
             hostname: hostname.as_ref().to_string(),
             link_name: link_name.as_ref().to_string(),
@@ -190,40 +203,19 @@ impl MetricsManager {
 
     // Return the serial number out of the baseboard, if one exists.
     fn serial_number(&self) -> String {
-        match &self.baseboard {
+        match &self.metadata.baseboard {
             Baseboard::Gimlet { identifier, .. } => identifier.clone(),
             Baseboard::Unknown => String::from("unknown"),
             Baseboard::Pc { identifier, .. } => identifier.clone(),
         }
     }
-
-    // Return the system's hostname.
-    //
-    // If we've failed to get it previously, we try again. If _that_ fails,
-    // return an error.
-    //
-    // TODO-cleanup: This will become much simpler once
-    // `OnceCell::get_or_try_init` is stabilized.
-    async fn hostname(&mut self) -> Result {
-        if let Some(hn) = &self.hostname {
-            return Ok(hn.clone());
-        }
-        let hn = tokio::process::Command::new("hostname")
-            .env_clear()
-            .output()
-            .await
-            .map(|out| String::from_utf8_lossy(&out.stdout).trim().to_string())
-            .map_err(Error::Hostname)?;
-        self.hostname.replace(hn.clone());
-        Ok(hn)
-    }
 }
 
 #[cfg(not(target_os = "illumos"))]
 impl MetricsManager {
     /// Track metrics for a physical datalink.
     pub async fn track_physical_link(
-        &mut self,
+        &self,
         _link_name: impl AsRef,
         _interval: Duration,
     ) -> Result<(), Error> {
@@ -237,7 +229,7 @@ impl MetricsManager {
     /// This works for both physical and virtual links.
     #[allow(dead_code)]
     pub async fn stop_tracking_link(
-        &mut self,
+        &self,
         _link_name: impl AsRef,
     ) -> Result<(), Error> {
         Err(Error::Kstat(anyhow!(
@@ -258,3 +250,26 @@ impl MetricsManager {
         )))
     }
 }
+
+// Return the current hostname if possible.
+#[cfg(target_os = "illumos")]
+fn hostname() -> Result {
+    // See netdb.h
+    const MAX_LEN: usize = 256;
+    let mut out = vec![0u8; MAX_LEN + 1];
+    if unsafe {
+        libc::gethostname(out.as_mut_ptr() as *mut libc::c_char, MAX_LEN)
+    } == 0
+    {
+        // Split into subslices by NULL bytes.
+        //
+        // Safety: We've passed an extra NULL to the gethostname(2) call, so
+        // there must be at least one of these.
+        let chunk = out.split(|x| *x == 0).next().unwrap();
+        let s = std::ffi::CString::new(chunk)
+            .map_err(|_| Error::NonUtf8Hostname)?;
+        s.into_string().map_err(|_| Error::NonUtf8Hostname)
+    } else {
+        Err(std::io::Error::last_os_error()).map_err(|_| Error::NonUtf8Hostname)
+    }
+}
diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs
index fda952ca871..728a6521f28 100644
--- a/sled-agent/src/params.rs
+++ b/sled-agent/src/params.rs
@@ -82,6 +82,16 @@ pub struct InstanceHardware {
     pub cloud_init_bytes: Option,
 }
 
+/// Metadata used to track statistics about an instance.
+#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)]
+pub struct InstanceMetadata {
+    pub silo_id: Uuid,
+    pub silo_name: String,
+    pub project_id: Uuid,
+    pub project_name: String,
+    pub instance_name: String,
+}
+
 /// The body of a request to ensure that a instance and VMM are known to a sled
 /// agent.
 #[derive(Serialize, Deserialize, JsonSchema)]
@@ -103,6 +113,9 @@ pub struct InstanceEnsureBody {
 
     /// The address at which this VMM should serve a Propolis server API.
     pub propolis_addr: SocketAddr,
+
+    /// Metadata used to track instance statistics.
+    pub metadata: InstanceMetadata,
 }
 
 /// The body of a request to move a previously-ensured instance into a specific
diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs
index 09ffdf5dc40..494db588288 100644
--- a/sled-agent/src/sim/http_entrypoints.rs
+++ b/sled-agent/src/sim/http_entrypoints.rs
@@ -98,6 +98,7 @@ async fn instance_register(
                 body_args.hardware,
                 body_args.instance_runtime,
                 body_args.vmm_runtime,
+                body_args.metadata,
             )
             .await?,
     ))
diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs
index d4c92fca513..3beda7b578f 100644
--- a/sled-agent/src/sim/sled_agent.rs
+++ b/sled-agent/src/sim/sled_agent.rs
@@ -13,7 +13,7 @@ use super::storage::Storage;
 use crate::nexus::NexusClient;
 use crate::params::{
     DiskStateRequested, InstanceExternalIpBody, InstanceHardware,
-    InstanceMigrationSourceParams, InstancePutStateResponse,
+    InstanceMetadata, InstanceMigrationSourceParams, InstancePutStateResponse,
     InstanceStateRequested, InstanceUnregisterResponse, Inventory,
     OmicronZonesConfig, SledRole,
 };
@@ -240,6 +240,9 @@ impl SledAgent {
         hardware: InstanceHardware,
         instance_runtime: InstanceRuntimeState,
         vmm_runtime: VmmRuntimeState,
+        // This is currently unused, but will be included as part of work
+        // tracked in https://github.com/oxidecomputer/omicron/issues/4851.
+        _metadata: InstanceMetadata,
     ) -> Result {
         // respond with a fake 500 level failure if asked to ensure an instance
         // with more than 16 CPUs.
diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs
index 1a634a63461..9613521714c 100644
--- a/sled-agent/src/sled_agent.rs
+++ b/sled-agent/src/sled_agent.rs
@@ -17,7 +17,7 @@ use crate::metrics::MetricsManager;
 use crate::nexus::{ConvertInto, NexusClientWithResolver, NexusRequestQueue};
 use crate::params::{
     DiskStateRequested, InstanceExternalIpBody, InstanceHardware,
-    InstanceMigrationSourceParams, InstancePutStateResponse,
+    InstanceMetadata, InstanceMigrationSourceParams, InstancePutStateResponse,
     InstanceStateRequested, InstanceUnregisterResponse, Inventory,
     OmicronZonesConfig, SledRole, TimeSync, VpcFirewallRule,
     ZoneBundleMetadata, Zpool,
@@ -397,6 +397,55 @@ impl SledAgent {
         let underlay_nics = underlay::find_nics(&config.data_links)?;
         illumos_utils::opte::initialize_xde_driver(&log, &underlay_nics)?;
 
+        // Start collecting metric data.
+        //
+        // First, we create a shareable type for managing the metrics early
+        // on, so that we can pass it to other components of the sled agent
+        // that need it.
+        //
+        // Then we'll start tracking physical links and register as a producer
+        // with Nexus in the background.
+        let metrics_manager = MetricsManager::new(
+            request.body.id,
+            request.body.rack_id,
+            long_running_task_handles.hardware_manager.baseboard(),
+            log.new(o!("component" => "MetricsManager")),
+        )?;
+
+        // Start tracking the underlay physical links.
+        for nic in underlay::find_nics(&config.data_links)? {
+            let link_name = nic.interface();
+            if let Err(e) = metrics_manager
+                .track_physical_link(
+                    link_name,
+                    crate::metrics::LINK_SAMPLE_INTERVAL,
+                )
+                .await
+            {
+                error!(
+                    log,
+                    "failed to start tracking physical link metrics";
+                    "link_name" => link_name,
+                    "error" => ?e,
+                );
+            }
+        }
+
+        // Spawn a task in the background to register our metric producer with
+        // Nexus. This should not block progress here.
+        let endpoint = ProducerEndpoint {
+            id: request.body.id,
+            kind: ProducerKind::SledAgent,
+            address: sled_address.into(),
+            base_route: String::from("/metrics/collect"),
+            interval: crate::metrics::METRIC_COLLECTION_INTERVAL,
+        };
+        tokio::task::spawn(register_metric_producer_with_nexus(
+            log.clone(),
+            nexus_client.clone(),
+            endpoint,
+        ));
+
         // Create the PortManager to manage all the OPTE ports on the sled.
         let port_manager = PortManager::new(
             parent_log.new(o!("component" => "PortManager")),
@@ -517,47 +566,6 @@ impl SledAgent {
             rack_network_config.clone(),
         )?;
 
-        let mut metrics_manager = MetricsManager::new(
-            request.body.id,
-            request.body.rack_id,
-            long_running_task_handles.hardware_manager.baseboard(),
-            log.new(o!("component" => "MetricsManager")),
-        )?;
-
-        // Start tracking the underlay physical links.
-        for nic in underlay::find_nics(&config.data_links)? {
-            let link_name = nic.interface();
-            if let Err(e) = metrics_manager
-                .track_physical_link(
-                    link_name,
-                    crate::metrics::LINK_SAMPLE_INTERVAL,
-                )
-                .await
-            {
-                error!(
-                    log,
-                    "failed to start tracking physical link metrics";
-                    "link_name" => link_name,
-                    "error" => ?e,
-                );
-            }
-        }
-
-        // Spawn a task in the background to register our metric producer with
-        // Nexus. This should not block progress here.
-        let endpoint = ProducerEndpoint {
-            id: request.body.id,
-            kind: ProducerKind::SledAgent,
-            address: sled_address.into(),
-            base_route: String::from("/metrics/collect"),
-            interval: crate::metrics::METRIC_COLLECTION_INTERVAL,
-        };
-        tokio::task::spawn(register_metric_producer_with_nexus(
-            log.clone(),
-            nexus_client.clone(),
-            endpoint,
-        ));
-
         let sled_agent = SledAgent {
             inner: Arc::new(SledAgentInner {
                 id: request.body.id,
@@ -941,6 +949,7 @@ impl SledAgent {
     /// Idempotently ensures that a given instance is registered with this sled,
     /// i.e., that it can be addressed by future calls to
     /// [`Self::instance_ensure_state`].
+    #[allow(clippy::too_many_arguments)]
     pub async fn instance_ensure_registered(
         &self,
         instance_id: Uuid,
@@ -949,6 +958,7 @@ impl SledAgent {
         instance_runtime: InstanceRuntimeState,
         vmm_runtime: VmmRuntimeState,
         propolis_addr: SocketAddr,
+        metadata: InstanceMetadata,
     ) -> Result {
         self.inner
             .instances
@@ -959,6 +969,7 @@ impl SledAgent {
                 instance_runtime,
                 vmm_runtime,
                 propolis_addr,
+                metadata,
             )
             .await
             .map_err(|e| Error::Instance(e))
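(Not part of the patch.) The sled agent stores the new `InstanceMetadata` but does not publish anything with it yet; the comments above defer that to issue 4851. For a sense of why the silo and project identifiers are worth carrying down from Nexus, here is a sketch of the kind of oximeter target they could eventually populate, using the `Target`/`Metric` derives and `Cumulative` type this tree already uses for the kstat link metrics. The `VirtualMachine` and `VcpuTime` names and shapes are hypothetical, not the schema that ships:

```rust
use oximeter::types::Cumulative;
use oximeter::{Metric, Target};
use uuid::Uuid;

// Hypothetical target: the identity of one running instance, built from
// `InstanceMetadata` plus the instance ID the sled agent already knows.
#[derive(Clone, Debug, Target)]
pub struct VirtualMachine {
    pub silo_id: Uuid,
    pub silo_name: String,
    pub project_id: Uuid,
    pub project_name: String,
    pub instance_id: Uuid,
    pub instance_name: String,
}

// Hypothetical metric: cumulative vCPU time consumed by the instance, in
// nanoseconds.
#[derive(Clone, Debug, Metric)]
pub struct VcpuTime {
    pub datum: Cumulative<u64>,
}
```

Every sample produced against such a target would carry the silo and project names as fields, which is what makes per-project roll-ups of instance statistics possible without a join back through Nexus.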