Skip to content

Commit

Permalink
More wip
Browse files Browse the repository at this point in the history
  • Loading branch information
bnaecker committed Jan 24, 2024
1 parent 225b07e commit d1b954f
Show file tree
Hide file tree
Showing 9 changed files with 50 additions and 85 deletions.
18 changes: 9 additions & 9 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 9 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -301,9 +301,15 @@ prettyplease = "0.2.16"
proc-macro2 = "1.0"
progenitor = { git = "https://github.com/oxidecomputer/progenitor", branch = "main" }
progenitor-client = { git = "https://github.com/oxidecomputer/progenitor", branch = "main" }
bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "1e25649e8c2ac274bd04adfe0513dd14a482058c" }
propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "1e25649e8c2ac274bd04adfe0513dd14a482058c" }
propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "1e25649e8c2ac274bd04adfe0513dd14a482058c" }

# TODO(ben): Put back to main before merge.
#bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "1e25649e8c2ac274bd04adfe0513dd14a482058c" }
#propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "1e25649e8c2ac274bd04adfe0513dd14a482058c" }
#propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "1e25649e8c2ac274bd04adfe0513dd14a482058c" }
bhyve_api = { git = "https://github.com/oxidecomputer/propolis", branch = "vcpu-usage-stats" }
propolis-client = { git = "https://github.com/oxidecomputer/propolis", branch = "vcpu-usage-stats" }
propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", branch = "vcpu-usage-stats" }

proptest = "1.4.0"
quote = "1.0"
rand = "0.8.5"
Expand Down
2 changes: 2 additions & 0 deletions oximeter/instruments/src/kstat/virtual_machine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ use oximeter::Sample;
use oximeter::Target;
use uuid::Uuid;

// TODO(ben): Add tests for changes to the kstat-based schema.

/// A single virtual machine instance.
#[derive(Clone, Debug)]
pub struct VirtualMachine {
Expand Down
22 changes: 15 additions & 7 deletions oximeter/producer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ pub enum Error {
#[error("Error running producer HTTP server: {0}")]
Server(String),

#[error("Error registering as metric producer: {0}")]
RegistrationError(String),
#[error("Error registering as metric producer: {msg}")]
RegistrationError { retryable: bool, msg: String },

#[error("Producer registry and config UUIDs do not match")]
UuidMismatch,
Expand Down Expand Up @@ -251,11 +251,19 @@ pub async fn register(
) -> Result<(), Error> {
let client =
nexus_client::Client::new(&format!("http://{}", address), log.clone());
client
.cpapi_producers_post(&server_info.into())
.await
.map(|_| ())
.map_err(|msg| Error::RegistrationError(msg.to_string()))
client.cpapi_producers_post(&server_info.into()).await.map(|_| ()).map_err(
|err| {
let retryable = match &err {
nexus_client::Error::CommunicationError(..) => true,
nexus_client::Error::ErrorResponse(resp) => {
resp.status().is_server_error()
}
_ => false,
};
let msg = err.to_string();
Error::RegistrationError { retryable, msg }
},
)
}

/// Handle a request to pull available metric data from a [`ProducerRegistry`].
Expand Down
61 changes: 4 additions & 57 deletions sled-agent/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@ use crate::common::instance::{
PublishedVmmState,
};
use crate::instance_manager::{InstanceManagerServices, InstanceTicket};
use crate::metrics::MetricsManager;
use crate::metrics::INSTANCE_SAMPLE_INTERVAL;
use crate::nexus::NexusClientWithResolver;
use crate::params::ZoneBundleCause;
use crate::params::ZoneBundleMetadata;
Expand Down Expand Up @@ -235,15 +233,8 @@ struct InstanceInner {
// Object used to collect zone bundles from this instance when terminated.
zone_bundler: ZoneBundler,

// Data used to start / stop collection of instance-related metrics.
tracking_instance_metrics: bool,
metrics_manager: MetricsManager,

// Object representing membership in the "instance manager".
instance_ticket: InstanceTicket,

// Metadata used to track statistics for this instance.
metadata: InstanceMetadata,
}

impl InstanceInner {
Expand Down Expand Up @@ -390,38 +381,7 @@ impl InstanceInner {
self.terminate().await?;
Ok(Reaction::Terminate)
}
None => {
// TODO-robustness: If we fail to register the actual kstats for
// this instance, we should produce missing samples, rather than
// nothing at all.
//
// See https://github.com/oxidecomputer/omicron/issues/4863.
//
// TODO(ben) -- need to do this once, or ignore duplicate target
// error, not every time we get a state update from propolis.
if !self.tracking_instance_metrics {
if let Err(e) = self
.metrics_manager
.track_instance(
&self.id(),
&self.metadata,
self.properties.vcpus.into(),
INSTANCE_SAMPLE_INTERVAL,
)
.await
{
error!(
self.log,
"failed to track instance metrics, \
no samples will be produced";
"error" => ?e,
);
} else {
self.tracking_instance_metrics = true;
}
}
Ok(Reaction::Continue)
}
None => Ok(Reaction::Continue),
}
}

Expand Down Expand Up @@ -583,18 +543,6 @@ impl InstanceInner {
);
}

// Stop tracking instance-related metrics.
if let Err(e) =
self.metrics_manager.stop_tracking_instance(self.id()).await
{
error!(
self.log,
"Failed to stop tracking instance metrics";
"instance_id" => %self.id(),
"error" => ?e,
);
}

// Ensure that no zone exists. This succeeds even if no zone was ever
// created.
// NOTE: we call`Zones::halt_and_remove_logged` directly instead of
Expand Down Expand Up @@ -674,7 +622,6 @@ impl Instance {
port_manager,
storage,
zone_bundler,
metrics_manager,
zone_builder_factory,
} = services;

Expand Down Expand Up @@ -716,6 +663,9 @@ impl Instance {
id,
name: hardware.properties.hostname.clone(),
description: "Test description".to_string(),
metadata: propolis_client::types::InstanceMetadata::from(
metadata.clone(),
),
image_id: Uuid::nil(),
bootrom_id: Uuid::nil(),
// TODO: Align the byte type w/propolis.
Expand Down Expand Up @@ -746,10 +696,7 @@ impl Instance {
storage,
zone_builder_factory,
zone_bundler,
tracking_instance_metrics: false,
metrics_manager,
instance_ticket: ticket,
metadata,
};

let inner = Arc::new(Mutex::new(instance));
Expand Down
7 changes: 0 additions & 7 deletions sled-agent/src/instance_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
use crate::instance::propolis_zone_name;
use crate::instance::Instance;
use crate::metrics::MetricsManager;
use crate::nexus::NexusClientWithResolver;
use crate::params::InstanceMetadata;
use crate::params::ZoneBundleMetadata;
Expand Down Expand Up @@ -79,7 +78,6 @@ struct InstanceManagerInternal {
port_manager: PortManager,
storage: StorageHandle,
zone_bundler: ZoneBundler,
metrics_manager: MetricsManager,
zone_builder_factory: ZoneBuilderFactory,
}

Expand All @@ -89,7 +87,6 @@ pub(crate) struct InstanceManagerServices {
pub port_manager: PortManager,
pub storage: StorageHandle,
pub zone_bundler: ZoneBundler,
pub metrics_manager: MetricsManager,
pub zone_builder_factory: ZoneBuilderFactory,
}

Expand All @@ -100,15 +97,13 @@ pub struct InstanceManager {

impl InstanceManager {
/// Initializes a new [`InstanceManager`] object.
#[allow(clippy::too_many_arguments)]
pub fn new(
log: Logger,
nexus_client: NexusClientWithResolver,
etherstub: Etherstub,
port_manager: PortManager,
storage: StorageHandle,
zone_bundler: ZoneBundler,
metrics_manager: MetricsManager,
zone_builder_factory: ZoneBuilderFactory,
) -> Result<InstanceManager, Error> {
Ok(InstanceManager {
Expand All @@ -123,7 +118,6 @@ impl InstanceManager {
port_manager,
storage,
zone_bundler,
metrics_manager,
zone_builder_factory,
}),
})
Expand Down Expand Up @@ -281,7 +275,6 @@ impl InstanceManager {
port_manager: self.inner.port_manager.clone(),
storage: self.inner.storage.clone(),
zone_bundler: self.inner.zone_bundler.clone(),
metrics_manager: self.inner.metrics_manager.clone(),
zone_builder_factory: self
.inner
.zone_builder_factory
Expand Down
9 changes: 9 additions & 0 deletions sled-agent/src/params.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,15 @@ pub struct InstanceMetadata {
pub project_id: Uuid,
}

impl From<InstanceMetadata> for propolis_client::types::InstanceMetadata {
fn from(md: InstanceMetadata) -> propolis_client::types::InstanceMetadata {
propolis_client::types::InstanceMetadata {
silo_id: md.silo_id,
project_id: md.project_id,
}
}
}

/// The body of a request to ensure that a instance and VMM are known to a sled
/// agent.
#[derive(Serialize, Deserialize, JsonSchema)]
Expand Down
3 changes: 2 additions & 1 deletion sled-agent/src/sim/sled_agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ impl SledAgent {
hardware: InstanceHardware,
instance_runtime: InstanceRuntimeState,
vmm_runtime: VmmRuntimeState,
_metadata: InstanceMetadata,
metadata: InstanceMetadata,
) -> Result<SledInstanceState, Error> {
// respond with a fake 500 level failure if asked to ensure an instance
// with more than 16 CPUs.
Expand Down Expand Up @@ -287,6 +287,7 @@ impl SledAgent {
id: propolis_id,
name: hardware.properties.hostname.clone(),
description: "sled-agent-sim created instance".to_string(),
metadata: metadata.into(),
image_id: Uuid::default(),
bootrom_id: Uuid::default(),
memory: hardware.properties.memory.to_whole_mebibytes(),
Expand Down
1 change: 0 additions & 1 deletion sled-agent/src/sled_agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,6 @@ impl SledAgent {
port_manager.clone(),
storage_manager.clone(),
long_running_task_handles.zone_bundler.clone(),
metrics_manager.clone(),
ZoneBuilderFactory::default(),
)?;

Expand Down

0 comments on commit d1b954f

Please sign in to comment.