Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
This is a WIP to test publishing this data from Propolis itself, rather
than the sled-agent. Should we take that path, we'll end up deleting
most of this diff in the sled-agent itself.
  • Loading branch information
bnaecker committed Jan 23, 2024
1 parent 4c127c2 commit 7f30715
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 10 deletions.
6 changes: 4 additions & 2 deletions oximeter/instruments/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,11 @@ uuid.workspace = true
omicron-workspace-hack.workspace = true

[features]
default = ["http-instruments", "kstat"]
default = ["http-instruments", "datalink", "virtual-machine"]
http-instruments = ["http"]
kstat = ["kstat-rs"]
datalink = ["dep:kstat-rs"]
virtual-machine = ["dep:kstat-rs"]


[dev-dependencies]
rand.workspace = true
Expand Down
2 changes: 2 additions & 0 deletions oximeter/instruments/src/kstat/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,10 @@ use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::time::Duration;

#[cfg(feature = "datalink")]
pub mod link;
mod sampler;
#[cfg(feature = "virtual-machine")]
pub mod virtual_machine;

pub use sampler::CollectionDetails;
Expand Down
5 changes: 4 additions & 1 deletion oximeter/instruments/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,8 @@
#[cfg(feature = "http-instruments")]
pub mod http;

#[cfg(all(feature = "kstat", target_os = "illumos"))]
#[cfg(all(
any(feature = "link", feature = "virtual-machine"),
target_os = "illumos"
))]
pub mod kstat;
31 changes: 24 additions & 7 deletions sled-agent/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ use crate::common::instance::{
PublishedVmmState,
};
use crate::instance_manager::{InstanceManagerServices, InstanceTicket};
use crate::metrics::Error as MetricsError;
use crate::metrics::MetricsManager;
use crate::metrics::INSTANCE_SAMPLE_INTERVAL;
use crate::nexus::NexusClientWithResolver;
Expand Down Expand Up @@ -111,9 +110,6 @@ pub enum Error {

#[error("I/O error")]
Io(#[from] std::io::Error),

#[error("Failed to track instance metrics")]
Metrics(#[source] MetricsError),
}

// Issues read-only, idempotent HTTP requests at propolis until it responds with
Expand Down Expand Up @@ -239,7 +235,7 @@ struct InstanceInner {
// Object used to collect zone bundles from this instance when terminated.
zone_bundler: ZoneBundler,

// Object used to start / stop collection of instance-related metrics.
// Data used to start / stop collection of instance-related metrics.
metrics_manager: MetricsManager,

// Object representing membership in the "instance manager".
Expand Down Expand Up @@ -392,14 +388,35 @@ impl InstanceInner {
Ok(Reaction::Terminate)
}
None => {
self.metrics_manager
// TODO-robustness: If we fail to register the actual kstats for
// this instance, we should produce missing samples, rather than
// nothing at all.
//
// See https://github.com/oxidecomputer/omicron/issues/4863.
//
// TODO(ben) -- need to do this once, or ignore duplicate target
// error, not every time we get a state update from propolis.
//
// TODO(ben) -- need to take number of vcpus. It looks like
// Propolis and the kernel hypervisor currently creates a kstat
// for all _possible_ vcpus, even those that don't exist?
if let Err(e) = self
.metrics_manager
.track_instance(
&self.id(),
&self.metadata,
self.properties.vcpus.into(),
INSTANCE_SAMPLE_INTERVAL,
)
.await
.map_err(Error::Metrics)?;
{
error!(
self.log,
"failed to track instance metrics, \
no samples will be produced";
"error" => ?e,
);
}
Ok(Reaction::Continue)
}
}
Expand Down
2 changes: 2 additions & 0 deletions sled-agent/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ impl MetricsManager {
&self,
instance_id: &Uuid,
metadata: &InstanceMetadata,
_n_vcpus: u32,
interval: Duration,
) -> Result<(), Error> {
let vm = virtual_machine::VirtualMachine {
Expand Down Expand Up @@ -303,6 +304,7 @@ impl MetricsManager {
&self,
_instance_id: &Uuid,
_metadata: &InstanceMetadata,
_n_vcpus: u32,
_interval: Duration,
) -> Result<(), Error> {
Err(Error::Kstat(anyhow!(
Expand Down

0 comments on commit 7f30715

Please sign in to comment.