From bdd4443a73e364ee3fafbf3845e3343c261dcda2 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Tue, 23 Jan 2024 20:49:47 +0000 Subject: [PATCH 1/5] Update progenitor from v0.4.0 -> v0.5.0 --- Cargo.lock | 152 +++++++++--------- clients/dns-service-client/src/lib.rs | 6 +- common/src/api/external/error.rs | 7 + .../app/sagas/switch_port_settings_apply.rs | 2 +- .../app/sagas/switch_port_settings_clear.rs | 2 +- sled-agent/src/instance.rs | 6 +- wicketd/src/preflight_check/uplink.rs | 9 +- workspace-hack/Cargo.toml | 8 +- 8 files changed, 103 insertions(+), 89 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a2d9601a38..ed5c82d4fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -168,7 +168,7 @@ dependencies = [ "omicron-workspace-hack", "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -269,7 +269,7 @@ checksum = "5fd55a5ba1179988837d24ab4c7cc8ed6efdeff578ede0416b4225a5fca35bd0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -291,7 +291,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -302,7 +302,7 @@ checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -353,7 +353,7 @@ dependencies = [ "quote", "serde", "serde_tokenstream 0.2.0", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -490,7 +490,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.46", + "syn 2.0.48", "which", ] @@ -986,7 +986,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -1412,7 +1412,7 @@ checksum = "83fdaf97f4804dcebfa5862639bc9ce4121e82140bec2a987ac5140294865b5b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -1460,7 +1460,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.10.0", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -1482,7 +1482,7 @@ checksum = "836a9bbc7ad63342d6d6e7b815ccab164bc77a2d95d84bc3117a8c0d5c98e2d5" dependencies = [ "darling_core 0.20.3", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -1514,7 +1514,7 @@ dependencies = [ "quote", "serde", "serde_tokenstream 0.2.0", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -1566,7 +1566,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -1599,7 +1599,7 @@ checksum = "5fe87ce4529967e0ba1dcf8450bab64d97dfd5010a6256187ffe2e43e6f0e049" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -1619,7 +1619,7 @@ checksum = "62d671cc41a825ebabc75757b62d3d168c577f9149b2d49ece1dad1f72119d25" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -1706,7 +1706,7 @@ dependencies = [ "diesel_table_macro_syntax", "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -1715,7 +1715,7 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5" dependencies = [ - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -1958,7 +1958,7 @@ dependencies = [ "quote", "serde", "serde_tokenstream 0.2.0", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -2313,7 +2313,7 @@ checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -2430,7 +2430,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -3604,7 +3604,7 @@ version = "0.1.0" source = "git+https://github.com/oxidecomputer/opte?rev=dd2b7b0306d3f01fa09170b8884d402209e49244#dd2b7b0306d3f01fa09170b8884d402209e49244" dependencies = [ "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -4007,7 +4007,7 @@ dependencies = [ "cfg-if", "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -4332,7 +4332,7 @@ version = "0.1.0" dependencies = [ "omicron-workspace-hack", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -4485,7 +4485,7 @@ checksum = "9e6a0fd4f737c707bd9086cc16c925f294943eb62eb71499e9fd4cf71f8b9f4e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -5204,7 +5204,7 @@ dependencies = [ "string_cache", "subtle", "syn 1.0.109", - "syn 2.0.46", + "syn 2.0.48", "time", "time-macros", "tokio", @@ -5318,7 +5318,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -5601,7 +5601,7 @@ dependencies = [ "omicron-workspace-hack", "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -5741,7 +5741,7 @@ dependencies = [ "regex", "regex-syntax 0.7.5", "structmeta", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -5887,7 +5887,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -5957,7 +5957,7 @@ checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -6201,7 +6201,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" dependencies = [ "proc-macro2", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -6249,17 +6249,17 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.74" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2de98502f212cfcea8d0bb305bd0f49d7ebdd75b64ba0a68f937d888f4e0d6db" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] [[package]] name = "progenitor" -version = "0.4.0" -source = "git+https://github.com/oxidecomputer/progenitor?branch=main#9339b57628e1e76b1d7131ef93a6c0db2ab0a762" +version = "0.5.0" +source = "git+https://github.com/oxidecomputer/progenitor?branch=main#2d3b9d0eb50a1907974c0b0ba7ee7893425b3e79" dependencies = [ "progenitor-client", "progenitor-impl", @@ -6269,8 +6269,8 @@ dependencies = [ [[package]] name = "progenitor-client" -version = "0.4.0" -source = "git+https://github.com/oxidecomputer/progenitor?branch=main#9339b57628e1e76b1d7131ef93a6c0db2ab0a762" +version = "0.5.0" +source = "git+https://github.com/oxidecomputer/progenitor?branch=main#2d3b9d0eb50a1907974c0b0ba7ee7893425b3e79" dependencies = [ "bytes", "futures-core", @@ -6283,8 +6283,8 @@ dependencies = [ [[package]] name = "progenitor-impl" -version = "0.4.0" -source = "git+https://github.com/oxidecomputer/progenitor?branch=main#9339b57628e1e76b1d7131ef93a6c0db2ab0a762" +version = "0.5.0" +source = "git+https://github.com/oxidecomputer/progenitor?branch=main#2d3b9d0eb50a1907974c0b0ba7ee7893425b3e79" dependencies = [ "getopts", "heck 0.4.1", @@ -6297,7 +6297,7 @@ dependencies = [ "schemars", "serde", "serde_json", - "syn 2.0.46", + "syn 2.0.48", "thiserror", "typify", "unicode-ident", @@ -6305,8 +6305,8 @@ dependencies = [ [[package]] name = "progenitor-macro" -version = "0.4.0" -source = "git+https://github.com/oxidecomputer/progenitor?branch=main#9339b57628e1e76b1d7131ef93a6c0db2ab0a762" +version = "0.5.0" +source = "git+https://github.com/oxidecomputer/progenitor?branch=main#2d3b9d0eb50a1907974c0b0ba7ee7893425b3e79" dependencies = [ "openapiv3", "proc-macro2", @@ -6317,7 +6317,7 @@ dependencies = [ "serde_json", "serde_tokenstream 0.2.0", "serde_yaml", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -6693,7 +6693,7 @@ checksum = "7f7473c2cfcf90008193dd0e3e16599455cb601a9fce322b5bb55de799664925" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -6940,7 +6940,7 @@ dependencies = [ "regex", "relative-path", "rustc_version 0.4.0", - "syn 2.0.46", + "syn 2.0.48", "unicode-ident", ] @@ -7501,7 +7501,7 @@ checksum = "46fe8f8603d81ba86327b23a2e9cdf49e1255fb94a4c5f297f6ee0547178ea2c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -7562,7 +7562,7 @@ checksum = "8725e1dfadb3a50f7e5ce0b1a540466f6ed3fe7a0fca2ac2b8b831d31316bd00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -7594,7 +7594,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -7635,7 +7635,7 @@ dependencies = [ "darling 0.20.3", "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -7949,7 +7949,7 @@ source = "git+https://github.com/oxidecomputer/slog-error-chain?branch=main#15f6 dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -8204,7 +8204,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -8301,7 +8301,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -8312,7 +8312,7 @@ checksum = "a60bcaff7397072dca0017d1db428e30d5002e00b6847703e2e42005c95fbe00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -8371,7 +8371,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -8419,9 +8419,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.46" +version = "2.0.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89456b690ff72fddcecf231caedbe615c59480c93358a93dfae7fc29e3ebbf0e" +checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" dependencies = [ "proc-macro2", "quote", @@ -8603,7 +8603,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -8628,22 +8628,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.49" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1177e8c6d7ede7afde3585fd2513e611227efd6481bd78d2e82ba1ce16557ed4" +checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.49" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc" +checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -8830,7 +8830,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -9075,7 +9075,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -9301,8 +9301,8 @@ checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "typify" -version = "0.0.14" -source = "git+https://github.com/oxidecomputer/typify#c9d6453fc3cf69726d539925b838b267f886cb53" +version = "0.0.15" +source = "git+https://github.com/oxidecomputer/typify#1f97f167923f001818d461b1286f8a5242abf8b1" dependencies = [ "typify-impl", "typify-macro", @@ -9310,8 +9310,8 @@ dependencies = [ [[package]] name = "typify-impl" -version = "0.0.14" -source = "git+https://github.com/oxidecomputer/typify#c9d6453fc3cf69726d539925b838b267f886cb53" +version = "0.0.15" +source = "git+https://github.com/oxidecomputer/typify#1f97f167923f001818d461b1286f8a5242abf8b1" dependencies = [ "heck 0.4.1", "log", @@ -9320,15 +9320,15 @@ dependencies = [ "regress", "schemars", "serde_json", - "syn 2.0.46", + "syn 2.0.48", "thiserror", "unicode-ident", ] [[package]] name = "typify-macro" -version = "0.0.14" -source = "git+https://github.com/oxidecomputer/typify#c9d6453fc3cf69726d539925b838b267f886cb53" +version = "0.0.15" +source = "git+https://github.com/oxidecomputer/typify#1f97f167923f001818d461b1286f8a5242abf8b1" dependencies = [ "proc-macro2", "quote", @@ -9336,7 +9336,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream 0.2.0", - "syn 2.0.46", + "syn 2.0.48", "typify-impl", ] @@ -9725,7 +9725,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", "wasm-bindgen-shared", ] @@ -9759,7 +9759,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -10302,7 +10302,7 @@ checksum = "56097d5b91d711293a42be9289403896b68654625021732067eac7a4ca388a1f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -10313,7 +10313,7 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] @@ -10333,7 +10333,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.46", + "syn 2.0.48", ] [[package]] diff --git a/clients/dns-service-client/src/lib.rs b/clients/dns-service-client/src/lib.rs index 931e68322f..e437f1a7f6 100644 --- a/clients/dns-service-client/src/lib.rs +++ b/clients/dns-service-client/src/lib.rs @@ -29,8 +29,10 @@ pub fn is_retryable(error: &DnsConfigError) -> bool { let response_value = match error { DnsConfigError::CommunicationError(_) => return true, DnsConfigError::InvalidRequest(_) - | DnsConfigError::InvalidResponsePayload(_) - | DnsConfigError::UnexpectedResponse(_) => return false, + | DnsConfigError::InvalidResponsePayload(_, _) + | DnsConfigError::UnexpectedResponse(_) + | DnsConfigError::InvalidUpgrade(_) + | DnsConfigError::ResponseBodyError(_) => return false, DnsConfigError::ErrorResponse(response_value) => response_value, }; diff --git a/common/src/api/external/error.rs b/common/src/api/external/error.rs index 2661db7bb6..c125c7f52f 100644 --- a/common/src/api/external/error.rs +++ b/common/src/api/external/error.rs @@ -523,6 +523,7 @@ impl From> for Error { // our version constraints (i.e. that the call was to a newer // service with an incompatible response). progenitor::progenitor_client::Error::InvalidResponsePayload( + _bytes, ee, ) => Error::internal_error(&format!( "InvalidResponsePayload: {}", @@ -539,6 +540,12 @@ impl From> for Error { r.status(), )) } + progenitor::progenitor_client::Error::InvalidUpgrade(e) => { + Error::internal_error(&format!("InvalidUpgrade: {e}",)) + } + progenitor::progenitor_client::Error::ResponseBodyError(_) => { + Error::internal_error(&format!("ResponseBodyError: {e}",)) + } } } } diff --git a/nexus/src/app/sagas/switch_port_settings_apply.rs b/nexus/src/app/sagas/switch_port_settings_apply.rs index 0d6bb52421..ff07ef8ea4 100644 --- a/nexus/src/app/sagas/switch_port_settings_apply.rs +++ b/nexus/src/app/sagas/switch_port_settings_apply.rs @@ -223,7 +223,7 @@ async fn spa_undo_ensure_switch_port_settings( let log = sagactx.user_data().log(); let port_id: PortId = PortId::from_str(¶ms.switch_port_name) - .map_err(|e| external::Error::internal_error(e))?; + .map_err(|e| external::Error::internal_error(e.to_string().as_str()))?; let orig_port_settings_id = sagactx .lookup::>("original_switch_port_settings_id") diff --git a/nexus/src/app/sagas/switch_port_settings_clear.rs b/nexus/src/app/sagas/switch_port_settings_clear.rs index 0d876f8159..fff2e71e8e 100644 --- a/nexus/src/app/sagas/switch_port_settings_clear.rs +++ b/nexus/src/app/sagas/switch_port_settings_clear.rs @@ -179,7 +179,7 @@ async fn spa_undo_clear_switch_port_settings( let log = sagactx.user_data().log(); let port_id: PortId = PortId::from_str(¶ms.port_name) - .map_err(|e| external::Error::internal_error(e))?; + .map_err(|e| external::Error::internal_error(e.to_string().as_str()))?; let orig_port_settings_id = sagactx .lookup::>("original_switch_port_settings_id") diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 057402c57a..86182d8473 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -274,8 +274,10 @@ impl InstanceInner { )) } nexus_client::Error::InvalidRequest(_) - | nexus_client::Error::InvalidResponsePayload(_) - | nexus_client::Error::UnexpectedResponse(_) => { + | nexus_client::Error::InvalidResponsePayload(..) + | nexus_client::Error::UnexpectedResponse(_) + | nexus_client::Error::InvalidUpgrade(_) + | nexus_client::Error::ResponseBodyError(_) => { BackoffError::permanent(Error::Notification( err, )) diff --git a/wicketd/src/preflight_check/uplink.rs b/wicketd/src/preflight_check/uplink.rs index 25411f17a5..47995f0c10 100644 --- a/wicketd/src/preflight_check/uplink.rs +++ b/wicketd/src/preflight_check/uplink.rs @@ -161,8 +161,11 @@ fn add_steps_for_single_local_uplink_preflight_check<'a>( |_cx| async { // Check that the port name is valid and that it has no links // configured already. - let port_id = PortId::from_str(&uplink.port) - .map_err(UplinkPreflightTerminalError::InvalidPortName)?; + let port_id = PortId::from_str(&uplink.port).map_err(|_| { + UplinkPreflightTerminalError::InvalidPortName( + uplink.port.clone(), + ) + })?; let links = dpd_client .link_list(&port_id) .await @@ -892,7 +895,7 @@ type DpdError = dpd_client::Error; #[derive(Debug, Error)] pub(crate) enum UplinkPreflightTerminalError { #[error("invalid port name: {0}")] - InvalidPortName(&'static str), + InvalidPortName(String), #[error("failed to connect to dpd to check for current configuration")] GetCurrentConfig(#[source] DpdError), #[error("uplink already configured - is rack already initialized?")] diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index b574a292d1..738ef44bbd 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -78,7 +78,7 @@ petgraph = { version = "0.6.4", features = ["serde-1"] } postgres-types = { version = "0.2.6", default-features = false, features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } ppv-lite86 = { version = "0.2.17", default-features = false, features = ["simd", "std"] } predicates = { version = "3.1.0" } -proc-macro2 = { version = "1.0.74" } +proc-macro2 = { version = "1.0.78" } rand = { version = "0.8.5" } rand_chacha = { version = "0.3.1", default-features = false, features = ["std"] } regex = { version = "1.10.2" } @@ -98,7 +98,7 @@ spin = { version = "0.9.8" } string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } syn-dff4ba8e3ae991db = { package = "syn", version = "1.0.109", features = ["extra-traits", "fold", "full", "visit"] } -syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.46", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } +syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.48", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time = { version = "0.3.27", features = ["formatting", "local-offset", "macros", "parsing"] } tokio = { version = "1.35.1", features = ["full", "test-util"] } tokio-postgres = { version = "0.7.10", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } @@ -182,7 +182,7 @@ petgraph = { version = "0.6.4", features = ["serde-1"] } postgres-types = { version = "0.2.6", default-features = false, features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } ppv-lite86 = { version = "0.2.17", default-features = false, features = ["simd", "std"] } predicates = { version = "3.1.0" } -proc-macro2 = { version = "1.0.74" } +proc-macro2 = { version = "1.0.78" } rand = { version = "0.8.5" } rand_chacha = { version = "0.3.1", default-features = false, features = ["std"] } regex = { version = "1.10.2" } @@ -202,7 +202,7 @@ spin = { version = "0.9.8" } string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } syn-dff4ba8e3ae991db = { package = "syn", version = "1.0.109", features = ["extra-traits", "fold", "full", "visit"] } -syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.46", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } +syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.48", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time = { version = "0.3.27", features = ["formatting", "local-offset", "macros", "parsing"] } time-macros = { version = "0.2.13", default-features = false, features = ["formatting", "parsing"] } tokio = { version = "1.35.1", features = ["full", "test-util"] } From 25041e81778fb0fea7c153acfc87f7c2d3efe2c6 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Sat, 20 Jan 2024 01:21:32 +0000 Subject: [PATCH 2/5] Publish instance vCPU usage statistics to `oximeter` - Adds the silo and project IDs to the instance-ensure request from Nexus to the sled-agent. These are used as fields on the instance-related statistics. - Defines a `VirtualMachine` oximeter target and `VcpuUsage` metric. The latter has a `state` field which corresponds to the named kstats published by the hypervisor that accumulate the time spent in a number of vCPU microstates. The combination of these should allow us to aggregate or break down vCPU usage by silo, project, instance, vCPU ID, and CPU state. - Adds APIs to the `MetricsManager` for starting / stopping tracking instance-related metrics, and plumbs the type through the `InstanceManager` and `Instance` (and their internal friends), so that new instances can control when data is produced from them. Currently, we'll start producing as soon as we get a non-terminate response from Propolis in the `instance_state_monitor()` task, and stop when the instance is terminated. --- nexus/src/app/instance.rs | 16 ++ openapi/sled-agent.json | 27 +++ oximeter/instruments/src/kstat/mod.rs | 1 + .../instruments/src/kstat/virtual_machine.rs | 185 ++++++++++++++++++ sled-agent/src/http_entrypoints.rs | 1 + sled-agent/src/instance.rs | 46 ++++- sled-agent/src/instance_manager.rs | 12 ++ sled-agent/src/metrics.rs | 172 ++++++++++++---- sled-agent/src/params.rs | 10 + sled-agent/src/sim/http_entrypoints.rs | 1 + sled-agent/src/sim/sled_agent.rs | 8 +- sled-agent/src/sled_agent.rs | 103 +++++----- 12 files changed, 489 insertions(+), 93 deletions(-) create mode 100644 oximeter/instruments/src/kstat/virtual_machine.rs diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 778c5e2fe1..45eab1de91 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1135,6 +1135,21 @@ impl super::Nexus { .map(|ssh_key| ssh_key.public_key) .collect::>(); + // Construct instance metadata used to track its statistics. + // + // This current means fetching the current silo ID, since we have all + // the other metadata already. + let silo_id = self + .current_silo_lookup(opctx)? + .lookup_for(authz::Action::Read) + .await? + .0 + .id(); + let metadata = sled_agent_client::types::InstanceMetadata { + silo_id, + project_id: db_instance.project_id, + }; + // Ask the sled agent to begin the state change. Then update the // database to reflect the new intermediate state. If this update is // not the newest one, that's fine. That might just mean the sled agent @@ -1178,6 +1193,7 @@ impl super::Nexus { PROPOLIS_PORT, ) .to_string(), + metadata, }, ) .await diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index b5b9d3fd5b..2667be5356 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -4515,6 +4515,14 @@ } ] }, + "metadata": { + "description": "Metadata used to track instance statistics.", + "allOf": [ + { + "$ref": "#/components/schemas/InstanceMetadata" + } + ] + }, "propolis_addr": { "description": "The address at which this VMM should serve a Propolis server API.", "type": "string" @@ -4536,6 +4544,7 @@ "required": [ "hardware", "instance_runtime", + "metadata", "propolis_addr", "propolis_id", "vmm_runtime" @@ -4624,6 +4633,24 @@ "snapshot_id" ] }, + "InstanceMetadata": { + "description": "Metadata used to track statistics about an instance.", + "type": "object", + "properties": { + "project_id": { + "type": "string", + "format": "uuid" + }, + "silo_id": { + "type": "string", + "format": "uuid" + } + }, + "required": [ + "project_id", + "silo_id" + ] + }, "InstanceMigrationSourceParams": { "description": "Instance runtime state to update for a migration.", "type": "object", diff --git a/oximeter/instruments/src/kstat/mod.rs b/oximeter/instruments/src/kstat/mod.rs index 90f34acae8..d8b0c23887 100644 --- a/oximeter/instruments/src/kstat/mod.rs +++ b/oximeter/instruments/src/kstat/mod.rs @@ -89,6 +89,7 @@ use std::time::Duration; pub mod link; mod sampler; +pub mod virtual_machine; pub use sampler::CollectionDetails; pub use sampler::ExpirationBehavior; diff --git a/oximeter/instruments/src/kstat/virtual_machine.rs b/oximeter/instruments/src/kstat/virtual_machine.rs new file mode 100644 index 0000000000..1e1637cd56 --- /dev/null +++ b/oximeter/instruments/src/kstat/virtual_machine.rs @@ -0,0 +1,185 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2023 Oxide Computer Company + +//! Types for tracking statistics about virtual machine instances. + +use crate::kstat::hrtime_to_utc; +use crate::kstat::ConvertNamedData; +use crate::kstat::Error; +use crate::kstat::KstatList; +use crate::kstat::KstatTarget; +use chrono::DateTime; +use chrono::Utc; +use kstat_rs::Data; +use kstat_rs::Kstat; +use kstat_rs::Named; +use kstat_rs::NamedData; +use oximeter::types::Cumulative; +use oximeter::Metric; +use oximeter::Sample; +use oximeter::Target; +use uuid::Uuid; + +/// A single virtual machine +#[derive(Clone, Debug, Target)] +pub struct VirtualMachine { + /// The silo to which the instance belongs. + pub silo_id: Uuid, + /// The project to which the instance belongs. + pub project_id: Uuid, + /// The ID of the instance. + pub instance_id: Uuid, +} + +/// Metric tracking vCPU usage by state. +#[derive(Clone, Debug, Metric)] +pub struct VcpuUsage { + /// The vCPU ID. + pub vcpu_id: u32, + /// The state of the vCPU. + pub state: String, + /// The cumulative time spent in this state, in nanoseconds. + pub datum: Cumulative, +} + +// The name of the kstat module containing virtual machine kstats. +const VMM_KSTAT_MODULE_NAME: &str = "vmm"; + +// The name of the kstat with virtual machine metadata (VM name currently). +const VM_KSTAT_NAME: &str = "vm"; + +// The named kstat holding the virtual machine's name. This is currently the +// UUID assigned by the control plane to the virtual machine instance. +const VM_NAME_KSTAT: &str = "vm_name"; + +// The name of kstat containing vCPU usage data. +const VCPU_KSTAT_PREFIX: &str = "vcpu"; + +// Prefix for all named data with a valid vCPU microstate that we track. +const VCPU_MICROSTATE_PREFIX: &str = "time_"; + +// The number of expected vCPU microstates we track. This isn't load-bearing, +// and only used to help preallocate an array holding the `VcpuUsage` samples. +const N_VCPU_MICROSTATES: usize = 6; + +impl KstatTarget for VirtualMachine { + // The VMM kstats are organized like so: + // + // - module: vmm + // - instance: a kernel-assigned integer + // - name: vm -> generic VM info, vcpuX -> info for each vCPU + // + // At this part of the code, we don't have that kstat instance, only the + // virtual machine instance's control plane UUID. However, the VM's "name" + // is assigned to be that control plane UUID in the hypervisor. See + // https://github.com/oxidecomputer/propolis/blob/759bf4a19990404c135e608afbe0d38b70bfa370/bin/propolis-server/src/lib/vm/mod.rs#L420 + // for the current code which does that. + // + // So we need to indicate interest in any VMM-related kstat here, and we are + // forced to filter to the right instance by looking up the VM name inside + // the `to_samples()` method below. + fn interested(&self, kstat: &Kstat<'_>) -> bool { + kstat.ks_module == VMM_KSTAT_MODULE_NAME + } + + fn to_samples( + &self, + kstats: KstatList<'_, '_>, + ) -> Result, Error> { + // First, we need to map the instance's control plane UUID to the + // instance ID. We'll find this through the `vmm::vm:vm_name` + // kstat, which lists the instance's UUID as its name. + let instance_id = self.instance_id.to_string(); + let instance = kstats + .iter() + .find_map(|(_, kstat, data)| { + kstat_instance_from_instance_id(kstat, data, &instance_id) + }) + .ok_or_else(|| Error::NoSuchKstat)?; + + // Armed with the kstat instance, find all relevant metrics related to + // this particular VM. For now, we produce only vCPU usage metrics, but + // others may be chained in the future. + let vcpu_stats = kstats.iter().filter(|(_, kstat, _)| { + kstat.ks_instance == instance + && kstat.ks_name.starts_with(VCPU_KSTAT_PREFIX) + }); + produce_vcpu_usage(self, vcpu_stats) + } +} + +// Given a kstat and an instance's ID, return the kstat instance if it matches. +pub fn kstat_instance_from_instance_id( + kstat: &Kstat<'_>, + data: &Data<'_>, + instance_id: &str, +) -> Option { + if kstat.ks_module != VMM_KSTAT_MODULE_NAME { + return None; + } + if kstat.ks_name != VM_KSTAT_NAME { + return None; + } + let Data::Named(named) = data else { + return None; + }; + if named.iter().any(|nd| { + if nd.name != VM_NAME_KSTAT { + return false; + } + let NamedData::String(name) = &nd.value else { + return false; + }; + instance_id == *name + }) { + return Some(kstat.ks_instance); + } + None +} + +// Produce `Sample`s for the `VcpuUsage` metric from the relevant kstats. +pub fn produce_vcpu_usage<'a>( + vm: &'a VirtualMachine, + vcpu_stats: impl Iterator, Kstat<'a>, Data<'a>)> + 'a, +) -> Result, Error> { + let mut out = Vec::with_capacity(N_VCPU_MICROSTATES); + for (creation_time, kstat, data) in vcpu_stats { + let Data::Named(named) = data else { + return Err(Error::ExpectedNamedKstat); + }; + let snapshot_time = hrtime_to_utc(kstat.ks_snaptime)?; + + // Find the vCPU ID, from the relevant named data item. + let vcpu_id = named + .iter() + .find_map(|named| { + if named.name == VCPU_KSTAT_PREFIX { + named.value.as_u32().ok() + } else { + None + } + }) + .ok_or_else(|| Error::NoSuchKstat)?; + + // We'll track all statistics starting with `time_` as the microstate. + for Named { name, value } in named + .iter() + .filter(|nv| nv.name.starts_with(VCPU_MICROSTATE_PREFIX)) + { + // Safety: We're filtering in the loop on this prefix, so it must + // exist. + let state = + name.strip_prefix(VCPU_MICROSTATE_PREFIX).unwrap().to_string(); + let datum = + Cumulative::with_start_time(*creation_time, value.as_u64()?); + let metric = VcpuUsage { vcpu_id, state, datum }; + let sample = + Sample::new_with_timestamp(snapshot_time, vm, &metric)?; + out.push(sample); + } + } + Ok(out) +} diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 39d1ae26a0..886b683f49 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -410,6 +410,7 @@ async fn instance_register( body_args.instance_runtime, body_args.vmm_runtime, body_args.propolis_addr, + body_args.metadata, ) .await?, )) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 86182d8473..641a4bb6d3 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -9,11 +9,14 @@ use crate::common::instance::{ PublishedVmmState, }; use crate::instance_manager::{InstanceManagerServices, InstanceTicket}; +use crate::metrics::Error as MetricsError; +use crate::metrics::MetricsManager; +use crate::metrics::INSTANCE_SAMPLE_INTERVAL; use crate::nexus::NexusClientWithResolver; use crate::params::ZoneBundleCause; use crate::params::ZoneBundleMetadata; use crate::params::{ - InstanceHardware, InstanceMigrationSourceParams, + InstanceHardware, InstanceMetadata, InstanceMigrationSourceParams, InstanceMigrationTargetParams, InstanceStateRequested, VpcFirewallRule, }; use crate::profile::*; @@ -108,6 +111,9 @@ pub enum Error { #[error("I/O error")] Io(#[from] std::io::Error), + + #[error("Failed to track instance metrics")] + Metrics(#[source] MetricsError), } // Issues read-only, idempotent HTTP requests at propolis until it responds with @@ -233,8 +239,14 @@ struct InstanceInner { // Object used to collect zone bundles from this instance when terminated. zone_bundler: ZoneBundler, + // Object used to start / stop collection of instance-related metrics. + metrics_manager: MetricsManager, + // Object representing membership in the "instance manager". instance_ticket: InstanceTicket, + + // Metadata used to track statistics for this instance. + metadata: InstanceMetadata, } impl InstanceInner { @@ -369,6 +381,10 @@ impl InstanceInner { // state to Nexus. This ensures that the instance is actually gone from // the sled when Nexus receives the state update saying it's actually // destroyed. + // + // In addition, we'll start or stop collecting metrics soley on the + // basis of whether the instance is terminated. All other states imply + // we start (or continue) to collect instance metrics. match action { Some(InstanceAction::Destroy) => { info!(self.log, "terminating VMM that has exited"; @@ -377,7 +393,17 @@ impl InstanceInner { self.terminate().await?; Ok(Reaction::Terminate) } - None => Ok(Reaction::Continue), + None => { + self.metrics_manager + .track_instance( + &self.id(), + &self.metadata, + INSTANCE_SAMPLE_INTERVAL, + ) + .await + .map_err(Error::Metrics)?; + Ok(Reaction::Continue) + } } } @@ -539,6 +565,18 @@ impl InstanceInner { ); } + // Stop tracking instance-related metrics. + if let Err(e) = + self.metrics_manager.stop_tracking_instance(self.id()).await + { + error!( + self.log, + "Failed to stop tracking instance metrics"; + "instance_id" => %self.id(), + "error" => ?e, + ); + } + // Ensure that no zone exists. This succeeds even if no zone was ever // created. // NOTE: we call`Zones::halt_and_remove_logged` directly instead of @@ -598,6 +636,7 @@ impl Instance { ticket: InstanceTicket, state: InstanceInitialState, services: InstanceManagerServices, + metadata: InstanceMetadata, ) -> Result { info!(log, "initializing new Instance"; "instance_id" => %id, @@ -617,6 +656,7 @@ impl Instance { port_manager, storage, zone_bundler, + metrics_manager, zone_builder_factory, } = services; @@ -688,7 +728,9 @@ impl Instance { storage, zone_builder_factory, zone_bundler, + metrics_manager, instance_ticket: ticket, + metadata, }; let inner = Arc::new(Mutex::new(instance)); diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index c1b7e402a4..61acf6d15f 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -6,7 +6,9 @@ use crate::instance::propolis_zone_name; use crate::instance::Instance; +use crate::metrics::MetricsManager; use crate::nexus::NexusClientWithResolver; +use crate::params::InstanceMetadata; use crate::params::ZoneBundleMetadata; use crate::params::{ InstanceHardware, InstanceMigrationSourceParams, InstancePutStateResponse, @@ -77,6 +79,7 @@ struct InstanceManagerInternal { port_manager: PortManager, storage: StorageHandle, zone_bundler: ZoneBundler, + metrics_manager: MetricsManager, zone_builder_factory: ZoneBuilderFactory, } @@ -86,6 +89,7 @@ pub(crate) struct InstanceManagerServices { pub port_manager: PortManager, pub storage: StorageHandle, pub zone_bundler: ZoneBundler, + pub metrics_manager: MetricsManager, pub zone_builder_factory: ZoneBuilderFactory, } @@ -96,6 +100,7 @@ pub struct InstanceManager { impl InstanceManager { /// Initializes a new [`InstanceManager`] object. + #[allow(clippy::too_many_arguments)] pub fn new( log: Logger, nexus_client: NexusClientWithResolver, @@ -103,6 +108,7 @@ impl InstanceManager { port_manager: PortManager, storage: StorageHandle, zone_bundler: ZoneBundler, + metrics_manager: MetricsManager, zone_builder_factory: ZoneBuilderFactory, ) -> Result { Ok(InstanceManager { @@ -117,6 +123,7 @@ impl InstanceManager { port_manager, storage, zone_bundler, + metrics_manager, zone_builder_factory, }), }) @@ -215,6 +222,7 @@ impl InstanceManager { /// (instance ID, Propolis ID) pair multiple times, but will fail if the /// instance is registered with a Propolis ID different from the one the /// caller supplied. + #[allow(clippy::too_many_arguments)] pub async fn ensure_registered( &self, instance_id: Uuid, @@ -223,6 +231,7 @@ impl InstanceManager { instance_runtime: InstanceRuntimeState, vmm_runtime: VmmRuntimeState, propolis_addr: SocketAddr, + metadata: InstanceMetadata, ) -> Result { info!( &self.inner.log, @@ -233,6 +242,7 @@ impl InstanceManager { "instance_runtime" => ?instance_runtime, "vmm_runtime" => ?vmm_runtime, "propolis_addr" => ?propolis_addr, + "metadata" => ?metadata, ); let instance = { @@ -271,6 +281,7 @@ impl InstanceManager { port_manager: self.inner.port_manager.clone(), storage: self.inner.storage.clone(), zone_bundler: self.inner.zone_bundler.clone(), + metrics_manager: self.inner.metrics_manager.clone(), zone_builder_factory: self .inner .zone_builder_factory @@ -291,6 +302,7 @@ impl InstanceManager { ticket, state, services, + metadata, )?; let instance_clone = instance.clone(); let _old = diff --git a/sled-agent/src/metrics.rs b/sled-agent/src/metrics.rs index 6c3383c88f..6fdbdef6cf 100644 --- a/sled-agent/src/metrics.rs +++ b/sled-agent/src/metrics.rs @@ -8,17 +8,22 @@ use oximeter::types::MetricsError; use oximeter::types::ProducerRegistry; use sled_hardware::Baseboard; use slog::Logger; +use std::sync::Arc; use std::time::Duration; use uuid::Uuid; +use crate::params::InstanceMetadata; + cfg_if::cfg_if! { if #[cfg(target_os = "illumos")] { use oximeter_instruments::kstat::link; + use oximeter_instruments::kstat::virtual_machine; use oximeter_instruments::kstat::CollectionDetails; use oximeter_instruments::kstat::Error as KstatError; use oximeter_instruments::kstat::KstatSampler; use oximeter_instruments::kstat::TargetId; use std::collections::BTreeMap; + use std::sync::Mutex; } else { use anyhow::anyhow; } @@ -30,6 +35,9 @@ pub(crate) const METRIC_COLLECTION_INTERVAL: Duration = Duration::from_secs(30); /// The interval on which we sample link metrics. pub(crate) const LINK_SAMPLE_INTERVAL: Duration = Duration::from_secs(10); +/// The interval on which we sample instance-related metrics, e.g., vCPU usage. +pub(crate) const INSTANCE_SAMPLE_INTERVAL: Duration = Duration::from_secs(10); + /// An error during sled-agent metric production. #[derive(Debug, thiserror::Error)] pub enum Error { @@ -46,6 +54,18 @@ pub enum Error { #[error("Failed to fetch hostname")] Hostname(#[source] std::io::Error), + + #[error("Non-UTF8 hostname")] + NonUtf8Hostname, +} + +// Basic metadata about the sled agent used when publishing metrics. +#[derive(Clone, Debug)] +#[cfg_attr(not(target_os = "illumos"), allow(dead_code))] +struct Metadata { + sled_id: Uuid, + rack_id: Uuid, + baseboard: Baseboard, } /// Type managing all oximeter metrics produced by the sled-agent. @@ -61,10 +81,7 @@ pub enum Error { // the name of fields that are not yet used. #[cfg_attr(not(target_os = "illumos"), allow(dead_code))] pub struct MetricsManager { - sled_id: Uuid, - rack_id: Uuid, - baseboard: Baseboard, - hostname: Option, + metadata: Arc, _log: Logger, #[cfg(target_os = "illumos")] kstat_sampler: KstatSampler, @@ -78,7 +95,9 @@ pub struct MetricsManager { // namespace them internally, e.g., `"datalink:{link_name}"` would be the // real key. #[cfg(target_os = "illumos")] - tracked_links: BTreeMap, + tracked_links: Arc>>, + #[cfg(target_os = "illumos")] + tracked_instances: Arc>>, registry: ProducerRegistry, } @@ -101,19 +120,19 @@ impl MetricsManager { registry .register_producer(kstat_sampler.clone()) .map_err(Error::Registry)?; - let tracked_links = BTreeMap::new(); + let tracked_links = Arc::new(Mutex::new(BTreeMap::new())); + let tracked_instances = Arc::new(Mutex::new(BTreeMap::new())); } } Ok(Self { - sled_id, - rack_id, - baseboard, - hostname: None, + metadata: Arc::new(Metadata { sled_id, rack_id, baseboard }), _log: log, #[cfg(target_os = "illumos")] kstat_sampler, #[cfg(target_os = "illumos")] tracked_links, + #[cfg(target_os = "illumos")] + tracked_instances, registry, }) } @@ -128,14 +147,14 @@ impl MetricsManager { impl MetricsManager { /// Track metrics for a physical datalink. pub async fn track_physical_link( - &mut self, + &self, link_name: impl AsRef, interval: Duration, ) -> Result<(), Error> { - let hostname = self.hostname().await?; + let hostname = hostname()?; let link = link::PhysicalDataLink { - rack_id: self.rack_id, - sled_id: self.sled_id, + rack_id: self.metadata.rack_id, + sled_id: self.metadata.sled_id, serial: self.serial_number(), hostname, link_name: link_name.as_ref().to_string(), @@ -146,7 +165,10 @@ impl MetricsManager { .add_target(link, details) .await .map_err(Error::Kstat)?; - self.tracked_links.insert(link_name.as_ref().to_string(), id); + self.tracked_links + .lock() + .unwrap() + .insert(link_name.as_ref().to_string(), id); Ok(()) } @@ -155,10 +177,12 @@ impl MetricsManager { /// This works for both physical and virtual links. #[allow(dead_code)] pub async fn stop_tracking_link( - &mut self, + &self, link_name: impl AsRef, ) -> Result<(), Error> { - if let Some(id) = self.tracked_links.remove(link_name.as_ref()) { + let maybe_id = + self.tracked_links.lock().unwrap().remove(link_name.as_ref()); + if let Some(id) = maybe_id { self.kstat_sampler.remove_target(id).await.map_err(Error::Kstat) } else { Ok(()) @@ -174,8 +198,8 @@ impl MetricsManager { interval: Duration, ) -> Result<(), Error> { let link = link::VirtualDataLink { - rack_id: self.rack_id, - sled_id: self.sled_id, + rack_id: self.metadata.rack_id, + sled_id: self.metadata.sled_id, serial: self.serial_number(), hostname: hostname.as_ref().to_string(), link_name: link_name.as_ref().to_string(), @@ -188,42 +212,57 @@ impl MetricsManager { .map_err(Error::Kstat) } + /// Start tracking instance-related metrics. + pub async fn track_instance( + &self, + instance_id: &Uuid, + metadata: &InstanceMetadata, + interval: Duration, + ) -> Result<(), Error> { + let vm = virtual_machine::VirtualMachine { + silo_id: metadata.silo_id, + project_id: metadata.project_id, + instance_id: *instance_id, + }; + let details = CollectionDetails::never(interval); + let id = self + .kstat_sampler + .add_target(vm, details) + .await + .map_err(Error::Kstat)?; + self.tracked_instances.lock().unwrap().insert(*instance_id, id); + Ok(()) + } + + /// Stop tracking metrics associated with this instance. + pub async fn stop_tracking_instance( + &self, + instance_id: &Uuid, + ) -> Result<(), Error> { + let maybe_id = + self.tracked_instances.lock().unwrap().remove(instance_id); + if let Some(id) = maybe_id { + self.kstat_sampler.remove_target(id).await.map_err(Error::Kstat) + } else { + Ok(()) + } + } + // Return the serial number out of the baseboard, if one exists. fn serial_number(&self) -> String { - match &self.baseboard { + match &self.metadata.baseboard { Baseboard::Gimlet { identifier, .. } => identifier.clone(), Baseboard::Unknown => String::from("unknown"), Baseboard::Pc { identifier, .. } => identifier.clone(), } } - - // Return the system's hostname. - // - // If we've failed to get it previously, we try again. If _that_ fails, - // return an error. - // - // TODO-cleanup: This will become much simpler once - // `OnceCell::get_or_try_init` is stabilized. - async fn hostname(&mut self) -> Result { - if let Some(hn) = &self.hostname { - return Ok(hn.clone()); - } - let hn = tokio::process::Command::new("hostname") - .env_clear() - .output() - .await - .map(|out| String::from_utf8_lossy(&out.stdout).trim().to_string()) - .map_err(Error::Hostname)?; - self.hostname.replace(hn.clone()); - Ok(hn) - } } #[cfg(not(target_os = "illumos"))] impl MetricsManager { /// Track metrics for a physical datalink. pub async fn track_physical_link( - &mut self, + &self, _link_name: impl AsRef, _interval: Duration, ) -> Result<(), Error> { @@ -237,7 +276,7 @@ impl MetricsManager { /// This works for both physical and virtual links. #[allow(dead_code)] pub async fn stop_tracking_link( - &mut self, + &self, _link_name: impl AsRef, ) -> Result<(), Error> { Err(Error::Kstat(anyhow!( @@ -257,4 +296,51 @@ impl MetricsManager { "kstat metrics are not supported on this platform" ))) } + + /// Start tracking instance-related metrics. + #[allow(dead_code)] + pub async fn track_instance( + &self, + _instance_id: &Uuid, + _metadata: &InstanceMetadata, + _interval: Duration, + ) -> Result<(), Error> { + Err(Error::Kstat(anyhow!( + "kstat metrics are not supported on this platform" + ))) + } + + /// Stop tracking metrics associated with this instance. + #[allow(dead_code)] + pub async fn stop_tracking_instance( + &self, + _instance_id: &Uuid, + ) -> Result<(), Error> { + Err(Error::Kstat(anyhow!( + "kstat metrics are not supported on this platform" + ))) + } +} + +// Return the current hostname if possible. +#[cfg(target_os = "illumos")] +fn hostname() -> Result { + // See netdb.h + const MAX_LEN: usize = 256; + let mut out = vec![0u8; MAX_LEN + 1]; + if unsafe { + libc::gethostname(out.as_mut_ptr() as *mut libc::c_char, MAX_LEN) + } == 0 + { + // Split into subslices by NULL bytes. + // + // Safety: We've passed an extra NULL to the gethostname(2) call, so + // there must be at least one of these. + let chunk = out.split(|x| *x == 0).next().unwrap(); + let s = std::ffi::CString::new(chunk) + .map_err(|_| Error::NonUtf8Hostname)?; + s.into_string().map_err(|_| Error::NonUtf8Hostname) + } else { + Err(std::io::Error::last_os_error()).map_err(|_| Error::NonUtf8Hostname) + } } diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index 9120bafa9a..0d9df8723d 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -81,6 +81,13 @@ pub struct InstanceHardware { pub cloud_init_bytes: Option, } +/// Metadata used to track statistics about an instance. +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct InstanceMetadata { + pub silo_id: Uuid, + pub project_id: Uuid, +} + /// The body of a request to ensure that a instance and VMM are known to a sled /// agent. #[derive(Serialize, Deserialize, JsonSchema)] @@ -102,6 +109,9 @@ pub struct InstanceEnsureBody { /// The address at which this VMM should serve a Propolis server API. pub propolis_addr: SocketAddr, + + /// Metadata used to track instance statistics. + pub metadata: InstanceMetadata, } /// The body of a request to move a previously-ensured instance into a specific diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index e5d7752511..3063247942 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -95,6 +95,7 @@ async fn instance_register( body_args.hardware, body_args.instance_runtime, body_args.vmm_runtime, + body_args.metadata, ) .await?, )) diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 8a76bf6abc..2b0e84a012 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -12,9 +12,10 @@ use super::storage::CrucibleData; use super::storage::Storage; use crate::nexus::NexusClient; use crate::params::{ - DiskStateRequested, InstanceHardware, InstanceMigrationSourceParams, - InstancePutStateResponse, InstanceStateRequested, - InstanceUnregisterResponse, Inventory, OmicronZonesConfig, SledRole, + DiskStateRequested, InstanceHardware, InstanceMetadata, + InstanceMigrationSourceParams, InstancePutStateResponse, + InstanceStateRequested, InstanceUnregisterResponse, Inventory, + OmicronZonesConfig, SledRole, }; use crate::sim::simulatable::Simulatable; use crate::updates::UpdateManager; @@ -236,6 +237,7 @@ impl SledAgent { hardware: InstanceHardware, instance_runtime: InstanceRuntimeState, vmm_runtime: VmmRuntimeState, + _metadata: InstanceMetadata, ) -> Result { // respond with a fake 500 level failure if asked to ensure an instance // with more than 16 CPUs. diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 71fe3584f0..5539c721f3 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -16,10 +16,11 @@ use crate::long_running_tasks::LongRunningTaskHandles; use crate::metrics::MetricsManager; use crate::nexus::{ConvertInto, NexusClientWithResolver, NexusRequestQueue}; use crate::params::{ - DiskStateRequested, InstanceHardware, InstanceMigrationSourceParams, - InstancePutStateResponse, InstanceStateRequested, - InstanceUnregisterResponse, Inventory, OmicronZonesConfig, SledRole, - TimeSync, VpcFirewallRule, ZoneBundleMetadata, Zpool, + DiskStateRequested, InstanceHardware, InstanceMetadata, + InstanceMigrationSourceParams, InstancePutStateResponse, + InstanceStateRequested, InstanceUnregisterResponse, Inventory, + OmicronZonesConfig, SledRole, TimeSync, VpcFirewallRule, + ZoneBundleMetadata, Zpool, }; use crate::services::{self, ServiceManager}; use crate::storage_monitor::UnderlayAccess; @@ -393,6 +394,55 @@ impl SledAgent { let underlay_nics = underlay::find_nics(&config.data_links)?; illumos_utils::opte::initialize_xde_driver(&log, &underlay_nics)?; + // Start collecting metric data. + // + // First, we're creating a shareable type for managing the metrics + // themselves early on, so that we can pass it to other components of + // the sled agent that need it. + // + // Then we'll start tracking physical links and register as a producer + // with Nexus in the background. + let metrics_manager = MetricsManager::new( + request.body.id, + request.body.rack_id, + long_running_task_handles.hardware_manager.baseboard(), + log.new(o!("component" => "MetricsManager")), + )?; + + // Start tracking the underlay physical links. + for nic in underlay::find_nics(&config.data_links)? { + let link_name = nic.interface(); + if let Err(e) = metrics_manager + .track_physical_link( + link_name, + crate::metrics::LINK_SAMPLE_INTERVAL, + ) + .await + { + error!( + log, + "failed to start tracking physical link metrics"; + "link_name" => link_name, + "error" => ?e, + ); + } + } + + // Spawn a task in the background to register our metric producer with + // Nexus. This should not block progress here. + let endpoint = ProducerEndpoint { + id: request.body.id, + kind: ProducerKind::SledAgent, + address: sled_address.into(), + base_route: String::from("/metrics/collect"), + interval: crate::metrics::METRIC_COLLECTION_INTERVAL, + }; + tokio::task::spawn(register_metric_producer_with_nexus( + log.clone(), + nexus_client.clone(), + endpoint, + )); + // Create the PortManager to manage all the OPTE ports on the sled. let port_manager = PortManager::new( parent_log.new(o!("component" => "PortManager")), @@ -416,6 +466,7 @@ impl SledAgent { port_manager.clone(), storage_manager.clone(), long_running_task_handles.zone_bundler.clone(), + metrics_manager.clone(), ZoneBuilderFactory::default(), )?; @@ -513,47 +564,6 @@ impl SledAgent { rack_network_config.clone(), )?; - let mut metrics_manager = MetricsManager::new( - request.body.id, - request.body.rack_id, - long_running_task_handles.hardware_manager.baseboard(), - log.new(o!("component" => "MetricsManager")), - )?; - - // Start tracking the underlay physical links. - for nic in underlay::find_nics(&config.data_links)? { - let link_name = nic.interface(); - if let Err(e) = metrics_manager - .track_physical_link( - link_name, - crate::metrics::LINK_SAMPLE_INTERVAL, - ) - .await - { - error!( - log, - "failed to start tracking physical link metrics"; - "link_name" => link_name, - "error" => ?e, - ); - } - } - - // Spawn a task in the background to register our metric producer with - // Nexus. This should not block progress here. - let endpoint = ProducerEndpoint { - id: request.body.id, - kind: ProducerKind::SledAgent, - address: sled_address.into(), - base_route: String::from("/metrics/collect"), - interval: crate::metrics::METRIC_COLLECTION_INTERVAL, - }; - tokio::task::spawn(register_metric_producer_with_nexus( - log.clone(), - nexus_client.clone(), - endpoint, - )); - let sled_agent = SledAgent { inner: Arc::new(SledAgentInner { id: request.body.id, @@ -909,6 +919,7 @@ impl SledAgent { /// Idempotently ensures that a given instance is registered with this sled, /// i.e., that it can be addressed by future calls to /// [`Self::instance_ensure_state`]. + #[allow(clippy::too_many_arguments)] pub async fn instance_ensure_registered( &self, instance_id: Uuid, @@ -917,6 +928,7 @@ impl SledAgent { instance_runtime: InstanceRuntimeState, vmm_runtime: VmmRuntimeState, propolis_addr: SocketAddr, + metadata: InstanceMetadata, ) -> Result { self.inner .instances @@ -927,6 +939,7 @@ impl SledAgent { instance_runtime, vmm_runtime, propolis_addr, + metadata, ) .await .map_err(|e| Error::Instance(e)) From 3211651ee50ee82bd4da5d08c71e40906617a6f6 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Mon, 22 Jan 2024 22:28:38 +0000 Subject: [PATCH 3/5] WIP This is a WIP to test publishing this data from Propolis itself, rather than the sled-agent. Should we take that path, we'll end up deleting most of this diff in the sled-agent itself. --- oximeter/instruments/Cargo.toml | 6 ++++-- oximeter/instruments/src/kstat/mod.rs | 2 ++ oximeter/instruments/src/lib.rs | 5 ++++- sled-agent/src/instance.rs | 31 +++++++++++++++++++++------ sled-agent/src/metrics.rs | 2 ++ 5 files changed, 36 insertions(+), 10 deletions(-) diff --git a/oximeter/instruments/Cargo.toml b/oximeter/instruments/Cargo.toml index 8372b7c560..62524fd6c6 100644 --- a/oximeter/instruments/Cargo.toml +++ b/oximeter/instruments/Cargo.toml @@ -18,9 +18,11 @@ uuid.workspace = true omicron-workspace-hack.workspace = true [features] -default = ["http-instruments", "kstat"] +default = ["http-instruments", "datalink", "virtual-machine"] http-instruments = ["http"] -kstat = ["kstat-rs"] +datalink = ["dep:kstat-rs"] +virtual-machine = ["dep:kstat-rs"] + [dev-dependencies] rand.workspace = true diff --git a/oximeter/instruments/src/kstat/mod.rs b/oximeter/instruments/src/kstat/mod.rs index d8b0c23887..53ce5e17af 100644 --- a/oximeter/instruments/src/kstat/mod.rs +++ b/oximeter/instruments/src/kstat/mod.rs @@ -87,8 +87,10 @@ use std::cmp::Ordering; use std::collections::BTreeMap; use std::time::Duration; +#[cfg(feature = "datalink")] pub mod link; mod sampler; +#[cfg(feature = "virtual-machine")] pub mod virtual_machine; pub use sampler::CollectionDetails; diff --git a/oximeter/instruments/src/lib.rs b/oximeter/instruments/src/lib.rs index d003e71739..ec1d28713f 100644 --- a/oximeter/instruments/src/lib.rs +++ b/oximeter/instruments/src/lib.rs @@ -9,5 +9,8 @@ #[cfg(feature = "http-instruments")] pub mod http; -#[cfg(all(feature = "kstat", target_os = "illumos"))] +#[cfg(all( + any(feature = "link", feature = "virtual-machine"), + target_os = "illumos" +))] pub mod kstat; diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 641a4bb6d3..d94db80ad5 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -9,7 +9,6 @@ use crate::common::instance::{ PublishedVmmState, }; use crate::instance_manager::{InstanceManagerServices, InstanceTicket}; -use crate::metrics::Error as MetricsError; use crate::metrics::MetricsManager; use crate::metrics::INSTANCE_SAMPLE_INTERVAL; use crate::nexus::NexusClientWithResolver; @@ -111,9 +110,6 @@ pub enum Error { #[error("I/O error")] Io(#[from] std::io::Error), - - #[error("Failed to track instance metrics")] - Metrics(#[source] MetricsError), } // Issues read-only, idempotent HTTP requests at propolis until it responds with @@ -239,7 +235,7 @@ struct InstanceInner { // Object used to collect zone bundles from this instance when terminated. zone_bundler: ZoneBundler, - // Object used to start / stop collection of instance-related metrics. + // Data used to start / stop collection of instance-related metrics. metrics_manager: MetricsManager, // Object representing membership in the "instance manager". @@ -394,14 +390,35 @@ impl InstanceInner { Ok(Reaction::Terminate) } None => { - self.metrics_manager + // TODO-robustness: If we fail to register the actual kstats for + // this instance, we should produce missing samples, rather than + // nothing at all. + // + // See https://github.com/oxidecomputer/omicron/issues/4863. + // + // TODO(ben) -- need to do this once, or ignore duplicate target + // error, not every time we get a state update from propolis. + // + // TODO(ben) -- need to take number of vcpus. It looks like + // Propolis and the kernel hypervisor currently creates a kstat + // for all _possible_ vcpus, even those that don't exist? + if let Err(e) = self + .metrics_manager .track_instance( &self.id(), &self.metadata, + self.properties.vcpus.into(), INSTANCE_SAMPLE_INTERVAL, ) .await - .map_err(Error::Metrics)?; + { + error!( + self.log, + "failed to track instance metrics, \ + no samples will be produced"; + "error" => ?e, + ); + } Ok(Reaction::Continue) } } diff --git a/sled-agent/src/metrics.rs b/sled-agent/src/metrics.rs index 6fdbdef6cf..23edfa1f4e 100644 --- a/sled-agent/src/metrics.rs +++ b/sled-agent/src/metrics.rs @@ -217,6 +217,7 @@ impl MetricsManager { &self, instance_id: &Uuid, metadata: &InstanceMetadata, + _n_vcpus: u32, interval: Duration, ) -> Result<(), Error> { let vm = virtual_machine::VirtualMachine { @@ -303,6 +304,7 @@ impl MetricsManager { &self, _instance_id: &Uuid, _metadata: &InstanceMetadata, + _n_vcpus: u32, _interval: Duration, ) -> Result<(), Error> { Err(Error::Kstat(anyhow!( From 225b07e3fade4aa21c4fbc3d8639f95edce49907 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Tue, 23 Jan 2024 19:45:36 +0000 Subject: [PATCH 4/5] More WIP - Store number of vcpus in virtual machine - impl target manually - only produce samples from expected vcpu kstats - track instance metrics once --- oximeter/instruments/Cargo.toml | 1 - .../instruments/src/kstat/virtual_machine.rs | 55 +++++++++++++++++-- sled-agent/src/instance.rs | 42 +++++++------- sled-agent/src/metrics.rs | 3 +- 4 files changed, 75 insertions(+), 26 deletions(-) diff --git a/oximeter/instruments/Cargo.toml b/oximeter/instruments/Cargo.toml index 62524fd6c6..0d96fc961a 100644 --- a/oximeter/instruments/Cargo.toml +++ b/oximeter/instruments/Cargo.toml @@ -23,7 +23,6 @@ http-instruments = ["http"] datalink = ["dep:kstat-rs"] virtual-machine = ["dep:kstat-rs"] - [dev-dependencies] rand.workspace = true slog-async.workspace = true diff --git a/oximeter/instruments/src/kstat/virtual_machine.rs b/oximeter/instruments/src/kstat/virtual_machine.rs index 1e1637cd56..72f991cf60 100644 --- a/oximeter/instruments/src/kstat/virtual_machine.rs +++ b/oximeter/instruments/src/kstat/virtual_machine.rs @@ -18,13 +18,15 @@ use kstat_rs::Kstat; use kstat_rs::Named; use kstat_rs::NamedData; use oximeter::types::Cumulative; +use oximeter::FieldType; +use oximeter::FieldValue; use oximeter::Metric; use oximeter::Sample; use oximeter::Target; use uuid::Uuid; -/// A single virtual machine -#[derive(Clone, Debug, Target)] +/// A single virtual machine instance. +#[derive(Clone, Debug)] pub struct VirtualMachine { /// The silo to which the instance belongs. pub silo_id: Uuid, @@ -32,6 +34,35 @@ pub struct VirtualMachine { pub project_id: Uuid, /// The ID of the instance. pub instance_id: Uuid, + + // This field is not published as part of the target field definitions. It + // is needed because the hypervisor currently creates kstats for each vCPU, + // regardless of whether they're activated. There is no way to tell from + // userland today which vCPU kstats are "real". We include this value here, + // and implement `oximeter::Target` manually. + pub n_vcpus: u32, +} + +impl Target for VirtualMachine { + fn name(&self) -> &'static str { + "virtual_machine" + } + + fn field_names(&self) -> &'static [&'static str] { + &["silo_id", "project_id", "instance_id"] + } + + fn field_types(&self) -> Vec { + vec![FieldType::Uuid, FieldType::Uuid, FieldType::Uuid] + } + + fn field_values(&self) -> Vec { + vec![ + self.silo_id.into(), + self.project_id.into(), + self.instance_id.into(), + ] + } } /// Metric tracking vCPU usage by state. @@ -92,6 +123,9 @@ impl KstatTarget for VirtualMachine { // First, we need to map the instance's control plane UUID to the // instance ID. We'll find this through the `vmm::vm:vm_name` // kstat, which lists the instance's UUID as its name. + // + // Note that if this code is run from within a Propolis zone, there is + // exactly one `vmm` kstat instance in any case. let instance_id = self.instance_id.to_string(); let instance = kstats .iter() @@ -104,8 +138,21 @@ impl KstatTarget for VirtualMachine { // this particular VM. For now, we produce only vCPU usage metrics, but // others may be chained in the future. let vcpu_stats = kstats.iter().filter(|(_, kstat, _)| { - kstat.ks_instance == instance - && kstat.ks_name.starts_with(VCPU_KSTAT_PREFIX) + // Filter out those that don't match our kstat instance. + if kstat.ks_instance != instance { + return false; + } + + // Filter out those which are neither a vCPU stat of any kind, nor + // for one of the vCPU IDs we know to be active. + let Some(suffix) = kstat.ks_name.strip_prefix(VCPU_KSTAT_PREFIX) + else { + return false; + }; + let Ok(vcpu_id) = suffix.parse::() else { + return false; + }; + vcpu_id < self.n_vcpus }); produce_vcpu_usage(self, vcpu_stats) } diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index d94db80ad5..92336b616b 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -236,6 +236,7 @@ struct InstanceInner { zone_bundler: ZoneBundler, // Data used to start / stop collection of instance-related metrics. + tracking_instance_metrics: bool, metrics_manager: MetricsManager, // Object representing membership in the "instance manager". @@ -398,26 +399,26 @@ impl InstanceInner { // // TODO(ben) -- need to do this once, or ignore duplicate target // error, not every time we get a state update from propolis. - // - // TODO(ben) -- need to take number of vcpus. It looks like - // Propolis and the kernel hypervisor currently creates a kstat - // for all _possible_ vcpus, even those that don't exist? - if let Err(e) = self - .metrics_manager - .track_instance( - &self.id(), - &self.metadata, - self.properties.vcpus.into(), - INSTANCE_SAMPLE_INTERVAL, - ) - .await - { - error!( - self.log, - "failed to track instance metrics, \ - no samples will be produced"; - "error" => ?e, - ); + if !self.tracking_instance_metrics { + if let Err(e) = self + .metrics_manager + .track_instance( + &self.id(), + &self.metadata, + self.properties.vcpus.into(), + INSTANCE_SAMPLE_INTERVAL, + ) + .await + { + error!( + self.log, + "failed to track instance metrics, \ + no samples will be produced"; + "error" => ?e, + ); + } else { + self.tracking_instance_metrics = true; + } } Ok(Reaction::Continue) } @@ -745,6 +746,7 @@ impl Instance { storage, zone_builder_factory, zone_bundler, + tracking_instance_metrics: false, metrics_manager, instance_ticket: ticket, metadata, diff --git a/sled-agent/src/metrics.rs b/sled-agent/src/metrics.rs index 23edfa1f4e..bbff8e96be 100644 --- a/sled-agent/src/metrics.rs +++ b/sled-agent/src/metrics.rs @@ -217,13 +217,14 @@ impl MetricsManager { &self, instance_id: &Uuid, metadata: &InstanceMetadata, - _n_vcpus: u32, + n_vcpus: u32, interval: Duration, ) -> Result<(), Error> { let vm = virtual_machine::VirtualMachine { silo_id: metadata.silo_id, project_id: metadata.project_id, instance_id: *instance_id, + n_vcpus, }; let details = CollectionDetails::never(interval); let id = self From d1b954feaa52884f8ba68f8acb217dee494486f5 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Wed, 24 Jan 2024 20:02:47 +0000 Subject: [PATCH 5/5] More wip --- Cargo.lock | 18 +++--- Cargo.toml | 12 +++- .../instruments/src/kstat/virtual_machine.rs | 2 + oximeter/producer/src/lib.rs | 22 ++++--- sled-agent/src/instance.rs | 61 ++----------------- sled-agent/src/instance_manager.rs | 7 --- sled-agent/src/params.rs | 9 +++ sled-agent/src/sim/sled_agent.rs | 3 +- sled-agent/src/sled_agent.rs | 1 - 9 files changed, 50 insertions(+), 85 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ed5c82d4fe..941c867a26 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -455,7 +455,7 @@ dependencies = [ [[package]] name = "bhyve_api" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=1e25649e8c2ac274bd04adfe0513dd14a482058c#1e25649e8c2ac274bd04adfe0513dd14a482058c" +source = "git+https://github.com/oxidecomputer/propolis?branch=vcpu-usage-stats#8ed26acc9a7d06264ed7ac490741b0eea39f2d27" dependencies = [ "bhyve_api_sys", "libc", @@ -465,7 +465,7 @@ dependencies = [ [[package]] name = "bhyve_api_sys" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=1e25649e8c2ac274bd04adfe0513dd14a482058c#1e25649e8c2ac274bd04adfe0513dd14a482058c" +source = "git+https://github.com/oxidecomputer/propolis?branch=vcpu-usage-stats#8ed26acc9a7d06264ed7ac490741b0eea39f2d27" dependencies = [ "libc", "strum", @@ -1906,7 +1906,7 @@ dependencies = [ [[package]] name = "dropshot" version = "0.9.1-dev" -source = "git+https://github.com/oxidecomputer/dropshot?branch=main#711a7490d81416731cfe0f9fef366ed5f266a0ee" +source = "git+https://github.com/oxidecomputer/dropshot?branch=main#29ae98d1f909c6832661408a4c03f929e8afa6e9" dependencies = [ "async-stream", "async-trait", @@ -1952,7 +1952,7 @@ dependencies = [ [[package]] name = "dropshot_endpoint" version = "0.9.1-dev" -source = "git+https://github.com/oxidecomputer/dropshot?branch=main#711a7490d81416731cfe0f9fef366ed5f266a0ee" +source = "git+https://github.com/oxidecomputer/dropshot?branch=main#29ae98d1f909c6832661408a4c03f929e8afa6e9" dependencies = [ "proc-macro2", "quote", @@ -6323,7 +6323,7 @@ dependencies = [ [[package]] name = "propolis-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=1e25649e8c2ac274bd04adfe0513dd14a482058c#1e25649e8c2ac274bd04adfe0513dd14a482058c" +source = "git+https://github.com/oxidecomputer/propolis?branch=vcpu-usage-stats#8ed26acc9a7d06264ed7ac490741b0eea39f2d27" dependencies = [ "async-trait", "base64", @@ -6344,7 +6344,7 @@ dependencies = [ [[package]] name = "propolis-mock-server" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=1e25649e8c2ac274bd04adfe0513dd14a482058c#1e25649e8c2ac274bd04adfe0513dd14a482058c" +source = "git+https://github.com/oxidecomputer/propolis?branch=vcpu-usage-stats#8ed26acc9a7d06264ed7ac490741b0eea39f2d27" dependencies = [ "anyhow", "atty", @@ -6374,7 +6374,7 @@ dependencies = [ [[package]] name = "propolis_types" version = "0.0.0" -source = "git+https://github.com/oxidecomputer/propolis?rev=1e25649e8c2ac274bd04adfe0513dd14a482058c#1e25649e8c2ac274bd04adfe0513dd14a482058c" +source = "git+https://github.com/oxidecomputer/propolis?branch=vcpu-usage-stats#8ed26acc9a7d06264ed7ac490741b0eea39f2d27" dependencies = [ "schemars", "serde", @@ -9591,9 +9591,9 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "uuid" -version = "1.6.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560" +checksum = "f00cc9702ca12d3c81455259621e676d0f7251cec66a21e98fe2e9a37db93b2a" dependencies = [ "getrandom 0.2.10", "serde", diff --git a/Cargo.toml b/Cargo.toml index d97236b632..15288663cd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -301,9 +301,15 @@ prettyplease = "0.2.16" proc-macro2 = "1.0" progenitor = { git = "https://github.com/oxidecomputer/progenitor", branch = "main" } progenitor-client = { git = "https://github.com/oxidecomputer/progenitor", branch = "main" } -bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "1e25649e8c2ac274bd04adfe0513dd14a482058c" } -propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "1e25649e8c2ac274bd04adfe0513dd14a482058c" } -propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "1e25649e8c2ac274bd04adfe0513dd14a482058c" } + +# TODO(ben): Put back to main before merge. +#bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "1e25649e8c2ac274bd04adfe0513dd14a482058c" } +#propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "1e25649e8c2ac274bd04adfe0513dd14a482058c" } +#propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "1e25649e8c2ac274bd04adfe0513dd14a482058c" } +bhyve_api = { git = "https://github.com/oxidecomputer/propolis", branch = "vcpu-usage-stats" } +propolis-client = { git = "https://github.com/oxidecomputer/propolis", branch = "vcpu-usage-stats" } +propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", branch = "vcpu-usage-stats" } + proptest = "1.4.0" quote = "1.0" rand = "0.8.5" diff --git a/oximeter/instruments/src/kstat/virtual_machine.rs b/oximeter/instruments/src/kstat/virtual_machine.rs index 72f991cf60..7b3dcb1119 100644 --- a/oximeter/instruments/src/kstat/virtual_machine.rs +++ b/oximeter/instruments/src/kstat/virtual_machine.rs @@ -25,6 +25,8 @@ use oximeter::Sample; use oximeter::Target; use uuid::Uuid; +// TODO(ben): Add tests for changes to the kstat-based schema. + /// A single virtual machine instance. #[derive(Clone, Debug)] pub struct VirtualMachine { diff --git a/oximeter/producer/src/lib.rs b/oximeter/producer/src/lib.rs index 2354f9c217..3fecaadf4f 100644 --- a/oximeter/producer/src/lib.rs +++ b/oximeter/producer/src/lib.rs @@ -38,8 +38,8 @@ pub enum Error { #[error("Error running producer HTTP server: {0}")] Server(String), - #[error("Error registering as metric producer: {0}")] - RegistrationError(String), + #[error("Error registering as metric producer: {msg}")] + RegistrationError { retryable: bool, msg: String }, #[error("Producer registry and config UUIDs do not match")] UuidMismatch, @@ -251,11 +251,19 @@ pub async fn register( ) -> Result<(), Error> { let client = nexus_client::Client::new(&format!("http://{}", address), log.clone()); - client - .cpapi_producers_post(&server_info.into()) - .await - .map(|_| ()) - .map_err(|msg| Error::RegistrationError(msg.to_string())) + client.cpapi_producers_post(&server_info.into()).await.map(|_| ()).map_err( + |err| { + let retryable = match &err { + nexus_client::Error::CommunicationError(..) => true, + nexus_client::Error::ErrorResponse(resp) => { + resp.status().is_server_error() + } + _ => false, + }; + let msg = err.to_string(); + Error::RegistrationError { retryable, msg } + }, + ) } /// Handle a request to pull available metric data from a [`ProducerRegistry`]. diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 92336b616b..c20701e6c7 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -9,8 +9,6 @@ use crate::common::instance::{ PublishedVmmState, }; use crate::instance_manager::{InstanceManagerServices, InstanceTicket}; -use crate::metrics::MetricsManager; -use crate::metrics::INSTANCE_SAMPLE_INTERVAL; use crate::nexus::NexusClientWithResolver; use crate::params::ZoneBundleCause; use crate::params::ZoneBundleMetadata; @@ -235,15 +233,8 @@ struct InstanceInner { // Object used to collect zone bundles from this instance when terminated. zone_bundler: ZoneBundler, - // Data used to start / stop collection of instance-related metrics. - tracking_instance_metrics: bool, - metrics_manager: MetricsManager, - // Object representing membership in the "instance manager". instance_ticket: InstanceTicket, - - // Metadata used to track statistics for this instance. - metadata: InstanceMetadata, } impl InstanceInner { @@ -390,38 +381,7 @@ impl InstanceInner { self.terminate().await?; Ok(Reaction::Terminate) } - None => { - // TODO-robustness: If we fail to register the actual kstats for - // this instance, we should produce missing samples, rather than - // nothing at all. - // - // See https://github.com/oxidecomputer/omicron/issues/4863. - // - // TODO(ben) -- need to do this once, or ignore duplicate target - // error, not every time we get a state update from propolis. - if !self.tracking_instance_metrics { - if let Err(e) = self - .metrics_manager - .track_instance( - &self.id(), - &self.metadata, - self.properties.vcpus.into(), - INSTANCE_SAMPLE_INTERVAL, - ) - .await - { - error!( - self.log, - "failed to track instance metrics, \ - no samples will be produced"; - "error" => ?e, - ); - } else { - self.tracking_instance_metrics = true; - } - } - Ok(Reaction::Continue) - } + None => Ok(Reaction::Continue), } } @@ -583,18 +543,6 @@ impl InstanceInner { ); } - // Stop tracking instance-related metrics. - if let Err(e) = - self.metrics_manager.stop_tracking_instance(self.id()).await - { - error!( - self.log, - "Failed to stop tracking instance metrics"; - "instance_id" => %self.id(), - "error" => ?e, - ); - } - // Ensure that no zone exists. This succeeds even if no zone was ever // created. // NOTE: we call`Zones::halt_and_remove_logged` directly instead of @@ -674,7 +622,6 @@ impl Instance { port_manager, storage, zone_bundler, - metrics_manager, zone_builder_factory, } = services; @@ -716,6 +663,9 @@ impl Instance { id, name: hardware.properties.hostname.clone(), description: "Test description".to_string(), + metadata: propolis_client::types::InstanceMetadata::from( + metadata.clone(), + ), image_id: Uuid::nil(), bootrom_id: Uuid::nil(), // TODO: Align the byte type w/propolis. @@ -746,10 +696,7 @@ impl Instance { storage, zone_builder_factory, zone_bundler, - tracking_instance_metrics: false, - metrics_manager, instance_ticket: ticket, - metadata, }; let inner = Arc::new(Mutex::new(instance)); diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 61acf6d15f..cee3c9d4e3 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -6,7 +6,6 @@ use crate::instance::propolis_zone_name; use crate::instance::Instance; -use crate::metrics::MetricsManager; use crate::nexus::NexusClientWithResolver; use crate::params::InstanceMetadata; use crate::params::ZoneBundleMetadata; @@ -79,7 +78,6 @@ struct InstanceManagerInternal { port_manager: PortManager, storage: StorageHandle, zone_bundler: ZoneBundler, - metrics_manager: MetricsManager, zone_builder_factory: ZoneBuilderFactory, } @@ -89,7 +87,6 @@ pub(crate) struct InstanceManagerServices { pub port_manager: PortManager, pub storage: StorageHandle, pub zone_bundler: ZoneBundler, - pub metrics_manager: MetricsManager, pub zone_builder_factory: ZoneBuilderFactory, } @@ -100,7 +97,6 @@ pub struct InstanceManager { impl InstanceManager { /// Initializes a new [`InstanceManager`] object. - #[allow(clippy::too_many_arguments)] pub fn new( log: Logger, nexus_client: NexusClientWithResolver, @@ -108,7 +104,6 @@ impl InstanceManager { port_manager: PortManager, storage: StorageHandle, zone_bundler: ZoneBundler, - metrics_manager: MetricsManager, zone_builder_factory: ZoneBuilderFactory, ) -> Result { Ok(InstanceManager { @@ -123,7 +118,6 @@ impl InstanceManager { port_manager, storage, zone_bundler, - metrics_manager, zone_builder_factory, }), }) @@ -281,7 +275,6 @@ impl InstanceManager { port_manager: self.inner.port_manager.clone(), storage: self.inner.storage.clone(), zone_bundler: self.inner.zone_bundler.clone(), - metrics_manager: self.inner.metrics_manager.clone(), zone_builder_factory: self .inner .zone_builder_factory diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index 0d9df8723d..e6f5fae1ce 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -88,6 +88,15 @@ pub struct InstanceMetadata { pub project_id: Uuid, } +impl From for propolis_client::types::InstanceMetadata { + fn from(md: InstanceMetadata) -> propolis_client::types::InstanceMetadata { + propolis_client::types::InstanceMetadata { + silo_id: md.silo_id, + project_id: md.project_id, + } + } +} + /// The body of a request to ensure that a instance and VMM are known to a sled /// agent. #[derive(Serialize, Deserialize, JsonSchema)] diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 2b0e84a012..60783a8fc6 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -237,7 +237,7 @@ impl SledAgent { hardware: InstanceHardware, instance_runtime: InstanceRuntimeState, vmm_runtime: VmmRuntimeState, - _metadata: InstanceMetadata, + metadata: InstanceMetadata, ) -> Result { // respond with a fake 500 level failure if asked to ensure an instance // with more than 16 CPUs. @@ -287,6 +287,7 @@ impl SledAgent { id: propolis_id, name: hardware.properties.hostname.clone(), description: "sled-agent-sim created instance".to_string(), + metadata: metadata.into(), image_id: Uuid::default(), bootrom_id: Uuid::default(), memory: hardware.properties.memory.to_whole_mebibytes(), diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 5539c721f3..c9665e4993 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -466,7 +466,6 @@ impl SledAgent { port_manager.clone(), storage_manager.clone(), long_running_task_handles.zone_bundler.clone(), - metrics_manager.clone(), ZoneBuilderFactory::default(), )?;