Skip to content

Commit

Permalink
Groundwork for oximeter instance vCPU metrics
Browse files Browse the repository at this point in the history
- Adds the silo and project IDs to the instance-ensure request from
  Nexus to the sled-agent. These are used as fields on the
  instance-related statistics. This metadata is currently unused, but
  will be forwarded to Propolis in the instance-ensure request once the
  server is updated to accept it.
- Defines a `VirtualMachine` oximeter target and `VcpuUsage` metric. The
  latter has a `state` field which corresponds to the named kstats
  published by the hypervisor that accumulate the time spent in a number
  of vCPU microstates. The combination of these should allow us to
  aggregate or break down vCPU usage by silo, project, instance, vCPU
  ID, and CPU state. Adds some simple mocks and tests for these.
- Adds more fine-grained feature flags to the `oximeter-instruments`
  crate.

Add names for silo, project and instance to instance metadata for statistics tracking

Remove actual VM and kstat implementations, they'll live in Propolis

Make named data conversion trait public
  • Loading branch information
bnaecker committed Feb 15, 2024
1 parent eda0550 commit 9e555fe
Show file tree
Hide file tree
Showing 16 changed files with 273 additions and 118 deletions.
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 23 additions & 0 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1183,6 +1183,28 @@ impl super::Nexus {
let ssh_keys: Vec<String> =
ssh_keys.map(|ssh_key| ssh_key.public_key).collect();

// Construct instance metadata used to track its statistics.
//
// This requires another fetch on the silo and project, to extract their
// IDs and names.
let (.., db_project) = self
.project_lookup(
opctx,
params::ProjectSelector {
project: NameOrId::Id(db_instance.project_id),
},
)?
.fetch()
.await?;
let (_, db_silo) = self.current_silo_lookup(opctx)?.fetch().await?;
let metadata = sled_agent_client::types::InstanceMetadata {
silo_id: db_silo.id(),
silo_name: db_silo.name().to_string(),
project_id: db_project.id(),
project_name: db_project.name().to_string(),
instance_name: db_instance.name().to_string(),
};

// Ask the sled agent to begin the state change. Then update the
// database to reflect the new intermediate state. If this update is
// not the newest one, that's fine. That might just mean the sled agent
Expand Down Expand Up @@ -1226,6 +1248,7 @@ impl super::Nexus {
PROPOLIS_PORT,
)
.to_string(),
metadata,
},
)
.await
Expand Down
39 changes: 39 additions & 0 deletions openapi/sled-agent.json
Original file line number Diff line number Diff line change
Expand Up @@ -4708,6 +4708,14 @@
}
]
},
"metadata": {
"description": "Metadata used to track instance statistics.",
"allOf": [
{
"$ref": "#/components/schemas/InstanceMetadata"
}
]
},
"propolis_addr": {
"description": "The address at which this VMM should serve a Propolis server API.",
"type": "string"
Expand All @@ -4729,6 +4737,7 @@
"required": [
"hardware",
"instance_runtime",
"metadata",
"propolis_addr",
"propolis_id",
"vmm_runtime"
Expand Down Expand Up @@ -4860,6 +4869,36 @@
"snapshot_id"
]
},
"InstanceMetadata": {
"description": "Metadata used to track statistics about an instance.",
"type": "object",
"properties": {
"instance_name": {
"type": "string"
},
"project_id": {
"type": "string",
"format": "uuid"
},
"project_name": {
"type": "string"
},
"silo_id": {
"type": "string",
"format": "uuid"
},
"silo_name": {
"type": "string"
}
},
"required": [
"instance_name",
"project_id",
"project_name",
"silo_id",
"silo_name"
]
},
"InstanceMigrationSourceParams": {
"description": "Instance runtime state to update for a migration.",
"type": "object",
Expand Down
47 changes: 34 additions & 13 deletions oximeter/instruments/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,48 @@ edition = "2021"
license = "MPL-2.0"

[dependencies]
cfg-if.workspace = true
chrono.workspace = true
dropshot.workspace = true
futures.workspace = true
cfg-if = { workspace = true, optional = true }
chrono = { workspace = true, optional = true }
dropshot = { workspace = true, optional = true }
futures = { workspace = true, optional = true }
http = { workspace = true, optional = true }
oximeter.workspace = true
slog.workspace = true
tokio.workspace = true
thiserror.workspace = true
uuid.workspace = true
omicron-workspace-hack.workspace = true
oximeter = { workspace = true, optional = true }
slog = { workspace = true, optional = true }
tokio = { workspace = true, optional = true }
thiserror = { workspace = true, optional = true }
uuid = { workspace = true, optional = true }
omicron-workspace-hack = { workspace = true, optional = true }

[features]
default = ["http-instruments", "kstat"]
http-instruments = ["http"]
kstat = ["kstat-rs"]
default = ["http-instruments", "datalink"]
http-instruments = [
"dep:chrono",
"dep:dropshot",
"dep:futures",
"dep:http",
"dep:omicron-workspace-hack",
"dep:oximeter",
"dep:uuid"
]
kstat = [
"dep:cfg-if",
"dep:chrono",
"dep:futures",
"dep:kstat-rs",
"dep:omicron-workspace-hack",
"dep:oximeter",
"dep:slog",
"dep:tokio",
"dep:thiserror",
"dep:uuid"
]
datalink = ["kstat"]

[dev-dependencies]
rand.workspace = true
slog-async.workspace = true
slog-term.workspace = true
oximeter.workspace = true

[target.'cfg(target_os = "illumos")'.dependencies]
kstat-rs = { workspace = true, optional = true }
9 changes: 5 additions & 4 deletions oximeter/instruments/src/kstat/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

// Copyright 2023 Oxide Computer Company
// Copyright 2024 Oxide Computer Company

//! Types for publishing kernel statistics via oximeter.
//!
Expand Down Expand Up @@ -87,6 +87,7 @@ use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::time::Duration;

#[cfg(any(feature = "datalink", test))]
pub mod link;
mod sampler;

Expand Down Expand Up @@ -206,9 +207,9 @@ pub fn hrtime_to_utc(hrtime: i64) -> Result<DateTime<Utc>, Error> {
}
}

// Helper trait for converting a `NamedData` item into a specific contained data
// type, if possible.
pub(crate) trait ConvertNamedData {
/// Helper trait for converting a `NamedData` item into a specific contained data
/// type, if possible.
pub trait ConvertNamedData {
fn as_i32(&self) -> Result<i32, Error>;
fn as_u32(&self) -> Result<u32, Error>;
fn as_i64(&self) -> Result<i64, Error>;
Expand Down
4 changes: 2 additions & 2 deletions oximeter/instruments/src/kstat/sampler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -671,7 +671,7 @@ impl KstatSamplerWorker {
let n_current_samples = current_samples.len();
let n_total_samples = n_new_samples + n_current_samples;
let n_overflow_samples =
n_total_samples.checked_sub(self.sample_limit).unwrap_or(0);
n_total_samples.saturating_sub(self.sample_limit);
if n_overflow_samples > 0 {
warn!(
self.log,
Expand Down Expand Up @@ -788,7 +788,7 @@ impl KstatSamplerWorker {
// or the time we added the kstat if not.
let start = sampled_kstat
.time_of_last_collection
.unwrap_or_else(|| sampled_kstat.time_added);
.unwrap_or(sampled_kstat.time_added);
let expire_at = start + duration;
cfg_if::cfg_if! {
if #[cfg(test)] {
Expand Down
2 changes: 1 addition & 1 deletion oximeter/instruments/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

//! General-purpose types for instrumenting code to producer oximeter metrics.
// Copyright 2023 Oxide Computer Company
// Copyright 2024 Oxide Computer Company

#[cfg(feature = "http-instruments")]
pub mod http;
Expand Down
22 changes: 15 additions & 7 deletions oximeter/producer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ pub enum Error {
#[error("Error running producer HTTP server: {0}")]
Server(String),

#[error("Error registering as metric producer: {0}")]
RegistrationError(String),
#[error("Error registering as metric producer: {msg}")]
RegistrationError { retryable: bool, msg: String },

#[error("Producer registry and config UUIDs do not match")]
UuidMismatch,
Expand Down Expand Up @@ -251,11 +251,19 @@ pub async fn register(
) -> Result<(), Error> {
let client =
nexus_client::Client::new(&format!("http://{}", address), log.clone());
client
.cpapi_producers_post(&server_info.into())
.await
.map(|_| ())
.map_err(|msg| Error::RegistrationError(msg.to_string()))
client.cpapi_producers_post(&server_info.into()).await.map(|_| ()).map_err(
|err| {
let retryable = match &err {
nexus_client::Error::CommunicationError(..) => true,
nexus_client::Error::ErrorResponse(resp) => {
resp.status().is_server_error()
}
_ => false,
};
let msg = err.to_string();
Error::RegistrationError { retryable, msg }
},
)
}

/// Handle a request to pull available metric data from a [`ProducerRegistry`].
Expand Down
1 change: 1 addition & 0 deletions sled-agent/src/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,7 @@ async fn instance_register(
body_args.instance_runtime,
body_args.vmm_runtime,
body_args.propolis_addr,
body_args.metadata,
)
.await?,
))
Expand Down
11 changes: 10 additions & 1 deletion sled-agent/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use crate::nexus::NexusClientWithResolver;
use crate::params::ZoneBundleMetadata;
use crate::params::{InstanceExternalIpBody, ZoneBundleCause};
use crate::params::{
InstanceHardware, InstanceMigrationSourceParams,
InstanceHardware, InstanceMetadata, InstanceMigrationSourceParams,
InstanceMigrationTargetParams, InstancePutStateResponse,
InstanceStateRequested, InstanceUnregisterResponse, VpcFirewallRule,
};
Expand Down Expand Up @@ -311,6 +311,11 @@ struct InstanceRunner {
// Properties visible to Propolis
properties: propolis_client::types::InstanceProperties,

// This is currently unused, but will be sent to Propolis as part of the
// work tracked in https://github.com/oxidecomputer/omicron/issues/4851. It
// will be included in the InstanceProperties above, most likely.
_metadata: InstanceMetadata,

// The ID of the Propolis server (and zone) running this instance
propolis_id: Uuid,

Expand Down Expand Up @@ -928,6 +933,7 @@ impl Instance {
ticket: InstanceTicket,
state: InstanceInitialState,
services: InstanceManagerServices,
metadata: InstanceMetadata,
) -> Result<Self, Error> {
info!(log, "initializing new Instance";
"instance_id" => %id,
Expand Down Expand Up @@ -1005,6 +1011,9 @@ impl Instance {
// InstanceCpuCount here, to avoid any casting...
vcpus: hardware.properties.ncpus.0 as u8,
},
// This will be used in a follow up, tracked under
// https://github.com/oxidecomputer/omicron/issues/4851.
_metadata: metadata,
propolis_id,
propolis_addr,
vnic_allocator,
Expand Down
Loading

0 comments on commit 9e555fe

Please sign in to comment.