Skip to content

Commit

Permalink
Retry MGS metrics checks if timeseries are missing (#6458)
Browse files Browse the repository at this point in the history
The test `integration_tests::metrics::test_mgs_metrics` has failed
flakily a few times on `main` and PRs (see #6454).

This is due to a race condition where we attempt to test that the
expected timeseries exist when they haven't yet been collected. The test
waits for the Oximeter producer endpoint to be *registered*, and then
immediately triggers the metrics collection:
https://github.com/oxidecomputer/omicron/blob/a0cdce7395cec5813fc697172b8a28a69d27093d/nexus/tests/integration_tests/metrics.rs#L741-L745

However, if we look at the logs, we see that when we trigger Oximeter to
collect the metrics, the SP pollers haven't produced any samples yet. We
can see that Oximeter does collect metrics from the MGS instance, it
just doesn't have anything yet:


https://buildomat.eng.oxide.computer/wg/0/artefact/01J6AC5R3TN7HA7J5XR46XM9JM/nxMhUQ2DddPGyS7EJUzyESwgw1GGhoLMU8bPmZEIGVtDZ9q6/01J6AC6HPHGKFQN3CY1YJ15JEC/01J6AF1QQQNVH0P76YGCVTX5JX/test_all-33bc5def8da77203-test_mgs_metrics.130753.4.log?format=x-bunyan#L106

This commit fixes this issue by wrapping the code that checks whether
the expected timeseries exist in a `poll::wait_for_condition`, allowing
them to be retried if some (or all) timeseries are not yet present. We
force Oximeter to collect any new samples on each retry. If we encounter
an unexpected timeseries, the test panics immediately, but if some
expected timeseries aren't present, we retry for up to 30 seconds. If
they're still not there, *then* the test fails.

Fixes #6454
  • Loading branch information
hawkw authored Aug 27, 2024
1 parent c9b3f9e commit 9506722
Showing 1 changed file with 84 additions and 38 deletions.
122 changes: 84 additions & 38 deletions nexus/tests/integration_tests/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -690,60 +690,106 @@ async fn test_mgs_metrics(
return;
}

let table = timeseries_query(&cptestctx, &format!("get {metric_name}"))
.await
.into_iter()
.find(|t| t.name() == metric_name);
let table = match table {
Some(table) => table,
None => panic!("missing table for {metric_name}"),
let query = format!("get {metric_name}");

// MGS polls SP sensor data once every second. It's possible that, when
// we triggered Oximeter to collect samples from MGS, it may not have
// run a poll yet, so retry this a few times to avoid a flaky failure if
// no simulated SPs have been polled yet.
//
// We really don't need to wait that long to know that the sensor
// metrics will never be present. This could probably be shorter
// than 30 seconds, but I want to be fairly generous to make sure
// there are no flaky failures even when things take way longer than
// expected...
const MAX_RETRY_DURATION: Duration = Duration::from_secs(30);
let result = wait_for_condition(
|| async {
match check_inner(cptestctx, &metric_name, &query, &expected).await {
Ok(_) => Ok(()),
Err(e) => {
eprintln!("{e}; will try again to ensure all samples are collected");
Err(CondCheckError::<()>::NotYet)
}
}
},
&Duration::from_secs(1),
&MAX_RETRY_DURATION,
)
.await;
if result.is_err() {
panic!(
"failed to find expected timeseries when running OxQL query \
{query:?} within {MAX_RETRY_DURATION:?}"
)
};

let mut found = expected
.keys()
.map(|serial| (serial.clone(), 0))
.collect::<HashMap<_, usize>>();
for timeseries in table.timeseries() {
let fields = &timeseries.fields;
let n_points = timeseries.points.len();
assert!(
n_points > 0,
"{metric_name} timeseries {fields:?} should have points"
);
let serial_str: &str = match timeseries.fields.get("chassis_serial")
// Note that *some* of these checks panic if they fail, but others call
// `anyhow::ensure!`. This is because, if we don't see all the expected
// timeseries, it's possible that this is because some sensor polls
// haven't completed yet, so we'll retry those checks a few times. On
// the other hand, if we see malformed timeseries, or timeseries that we
// don't expect to exist, that means something has gone wrong, and we
// will fail the test immediately.
async fn check_inner(
cptestctx: &ControlPlaneTestContext<omicron_nexus::Server>,
name: &str,
query: &str,
expected: &HashMap<String, usize>,
) -> anyhow::Result<()> {
cptestctx.oximeter.force_collect().await;
let table = timeseries_query(&cptestctx, &query)
.await
.into_iter()
.find(|t| t.name() == name)
.ok_or_else(|| {
anyhow::anyhow!("failed to find table for {query}")
})?;

let mut found = expected
.keys()
.map(|serial| (serial.clone(), 0))
.collect::<HashMap<_, usize>>();
for timeseries in table.timeseries() {
let fields = &timeseries.fields;
let n_points = timeseries.points.len();
anyhow::ensure!(
n_points > 0,
"{name} timeseries {fields:?} should have points"
);
let serial_str: &str = match timeseries.fields.get("chassis_serial")
{
Some(FieldValue::String(s)) => s.borrow(),
Some(x) => panic!(
"{metric_name} `chassis_serial` field should be a string, but got: {x:?}"
"{name} `chassis_serial` field should be a string, but got: {x:?}"
),
None => {
panic!("{metric_name} timeseries should have a `chassis_serial` field")
panic!("{name} timeseries should have a `chassis_serial` field")
}
};
if let Some(count) = found.get_mut(serial_str) {
*count += 1;
} else {
panic!(
"{metric_name} timeseries had an unexpected chassis serial \
number {serial_str:?} (not in the config file)",
);
if let Some(count) = found.get_mut(serial_str) {
*count += 1;
} else {
panic!(
"{name} timeseries had an unexpected chassis serial \
number {serial_str:?} (not in the config file)",
);
}
}
}

eprintln!("-> {metric_name}: found timeseries: {found:#?}");
assert_eq!(
found, expected,
"number of {metric_name} timeseries didn't match expected in {table:#?}",
);
eprintln!("-> okay, looks good!");
eprintln!("-> {name}: found timeseries: {found:#?}");
anyhow::ensure!(
&found == expected,
"number of {name} timeseries didn't match expected in {table:#?}",
);
eprintln!("-> okay, looks good!");
Ok(())
}
}

// Wait until the MGS registers as a producer with Oximeter.
wait_for_producer(&cptestctx.oximeter, &mgs.gateway_id).await;

// ...and collect its samples.
cptestctx.oximeter.force_collect().await;

check_all_timeseries_present(&cptestctx, "temperature", temp_sensors).await;
check_all_timeseries_present(&cptestctx, "voltage", voltage_sensors).await;
check_all_timeseries_present(&cptestctx, "current", current_sensors).await;
Expand Down

0 comments on commit 9506722

Please sign in to comment.