From 8705d657d0de1cd7fcc3464b1a26906dec6c403b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 23 Aug 2024 11:22:32 -0700 Subject: [PATCH] make the test way fancier I realized the test could read the actual SP simulator config file, and figure out how many metrics should be present. That way, if someone changes the config file later, it'll still work. --- nexus/tests/integration_tests/metrics.rs | 169 +++++++++++++++++------ 1 file changed, 125 insertions(+), 44 deletions(-) diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 58ff6c6f64..6fac010d3c 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -27,6 +27,7 @@ use oximeter::types::FieldValue; use oximeter::types::Measurement; use oximeter::TimeseriesSchema; use std::borrow::Borrow; +use std::collections::HashMap; use uuid::Uuid; pub async fn query_for_metrics( @@ -595,9 +596,9 @@ async fn test_mgs_metrics( cptestctx: &ControlPlaneTestContext, ) { // Make a MGS + let (mut mgs_config, sp_sim_config) = + gateway_test_utils::setup::load_test_config(); let mgs = { - let (mut mgs_config, sp_sim_config) = - gateway_test_utils::setup::load_test_config(); // munge the already-parsed MGS config file to point it at the test // Nexus' address. mgs_config.metrics.get_or_insert_with(Default::default).dev = @@ -615,72 +616,152 @@ async fn test_mgs_metrics( .await }; - // Wait until the MGS registers as a producer with Oximeter. - wait_for_producer(&cptestctx.oximeter, &mgs.gateway_id).await; - cptestctx.oximeter.force_collect().await; + // Let's look at all the simulated SP components in the config file which + // have sensor readings, so we can assert that there are timeseries for all + // of them. + let all_sp_configs = { + let gimlet_configs = + sp_sim_config.simulated_sps.gimlet.iter().map(|g| &g.common); + let sidecar_configs = + sp_sim_config.simulated_sps.sidecar.iter().map(|s| &s.common); + gimlet_configs.chain(sidecar_configs) + }; + // XXX(eliza): yes, this code is repetitive. We could probably make it a + // little elss ugly with nested hash maps, but like...I already wrote it, so + // you don't have to. :) + // + // TODO(eliza): presently, we just expect that the number of timeseries for + // each serial number and sensor type lines up. If we wanted to be *really* + // fancy, we could also assert that all the component IDs, component kinds, + // and measurement values line up with the config. But, honestly, it's + // pretty unlikely that a bug in MGS' sensor metrics subsystem would mess + // that up --- the most important thing is just to make sure that the sensor + // data is *present*, as that should catch most regressions. + let mut temp_sensors = HashMap::new(); + let mut current_sensors = HashMap::new(); + let mut voltage_sensors = HashMap::new(); + let mut power_sensors = HashMap::new(); + let mut input_voltage_sensors = HashMap::new(); + let mut input_current_sensors = HashMap::new(); + let mut fan_speed_sensors = HashMap::new(); + for sp in all_sp_configs { + let mut temp = 0; + let mut current = 0; + let mut voltage = 0; + let mut input_voltage = 0; + let mut input_current = 0; + let mut power = 0; + let mut speed = 0; + for component in &sp.components { + for sensor in &component.sensors { + use gateway_messages::measurement::MeasurementKind as Kind; + match sensor.def.kind { + Kind::Temperature => temp += 1, + Kind::Current => current += 1, + Kind::Voltage => voltage += 1, + Kind::InputVoltage => input_voltage += 1, + Kind::InputCurrent => input_current += 1, + Kind::Speed => speed += 1, + Kind::Power => power += 1, + } + } + } + temp_sensors.insert(sp.serial_number.clone(), temp); + current_sensors.insert(sp.serial_number.clone(), current); + voltage_sensors.insert(sp.serial_number.clone(), voltage); + input_voltage_sensors.insert(sp.serial_number.clone(), input_voltage); + input_current_sensors.insert(sp.serial_number.clone(), input_current); + fan_speed_sensors.insert(sp.serial_number.clone(), speed); + power_sensors.insert(sp.serial_number.clone(), power); + } - async fn get_timeseries( + async fn check_all_timeseries_present( cptestctx: &ControlPlaneTestContext, name: &str, - ) -> oxql_types::Table { - let table = timeseries_query(&cptestctx, &format!("get {name}")) + expected: HashMap, + ) { + let metric_name = format!("hardware_component:{name}"); + eprintln!("\n=== checking timeseries for {metric_name} ===\n"); + + if expected.values().all(|&v| v == 0) { + eprintln!( + "-> SP sim config contains no {name} sensors, skipping it" + ); + return; + } + + let table = timeseries_query(&cptestctx, &format!("get {metric_name}")) .await .into_iter() - .find(|t| t.name() == name); - match table { + .find(|t| t.name() == metric_name); + let table = match table { Some(table) => table, - None => panic!("missing table for {name}"), - } - } + None => panic!("missing table for {metric_name}"), + }; - #[track_caller] - fn check_all_serials_present(table: oxql_types::Table) { - let mut sim_gimlet_00 = 0; - let mut sim_gimlet_01 = 0; + let mut found = expected + .keys() + .map(|serial| (serial.clone(), 0)) + .collect::>(); for timeseries in table.timeseries() { let fields = ×eries.fields; let n_points = timeseries.points.len(); - eprintln!("found timeseries: {fields:?} ({n_points} points)"); - assert!(n_points > 0, "timeseries {fields:?} should have points"); - let serial_str = match timeseries.fields.get("chassis_serial") { + assert!( + n_points > 0, + "{metric_name} timeseries {fields:?} should have points" + ); + let serial_str: &str = match timeseries.fields.get("chassis_serial") + { Some(FieldValue::String(s)) => s.borrow(), Some(x) => panic!( - "`chassis_serial` field should be a string, but got: {x:?}" + "{metric_name} `chassis_serial` field should be a string, but got: {x:?}" ), None => { - panic!("timeseries should have a `chassis_serial` field") + panic!("{metric_name} timeseries should have a `chassis_serial` field") } }; - match serial_str { - "SimGimlet00" => sim_gimlet_00 += 1, - "SimGimlet01" => sim_gimlet_01 += 1, - // if someone adds sensor readings to the fake sidecar later, - // that's okay... - _ => eprintln!("bonus simulated chassis serial {serial_str:?}"), + if let Some(count) = found.get_mut(serial_str) { + *count += 1; + } else { + panic!( + "{metric_name} timeseries had an unexpected chassis serial \ + number {serial_str:?} (not in the config file)", + ); } } - assert!( - sim_gimlet_00 > 0, - "expected at least one timeseries from SimGimlet00 in {table:#?}" - ); - assert!( - sim_gimlet_01 > 0, - "expected at least one timeseries from SimGimlet01 in {table:#?}" + eprintln!("-> {metric_name}: found timeseries: {found:#?}"); + assert_eq!( + found, expected, + "number of {metric_name} timeseries didn't match expected in {table:#?}", ); + eprintln!("-> okay, looks good!"); } - let temp_metrics = - get_timeseries(&cptestctx, "hardware_component:temperature").await; - check_all_serials_present(temp_metrics); + // Wait until the MGS registers as a producer with Oximeter. + wait_for_producer(&cptestctx.oximeter, &mgs.gateway_id).await; - let voltage_metrics = - get_timeseries(&cptestctx, "hardware_component:voltage").await; - check_all_serials_present(voltage_metrics); + // ...and collect its samples. + cptestctx.oximeter.force_collect().await; - let current_metrics = - get_timeseries(&cptestctx, "hardware_component:current").await; - check_all_serials_present(current_metrics); + check_all_timeseries_present(&cptestctx, "temperature", temp_sensors).await; + check_all_timeseries_present(&cptestctx, "voltage", voltage_sensors).await; + check_all_timeseries_present(&cptestctx, "current", current_sensors).await; + check_all_timeseries_present(&cptestctx, "power", power_sensors).await; + check_all_timeseries_present( + &cptestctx, + "input_voltage", + input_voltage_sensors, + ) + .await; + check_all_timeseries_present( + &cptestctx, + "input_current", + input_current_sensors, + ) + .await; + check_all_timeseries_present(&cptestctx, "fan_speed", fan_speed_sensors) + .await; // Because the `ControlPlaneTestContext` isn't managing the MGS we made for // this test, we are responsible for removing its logs.