Skip to content

Commit

Permalink
[wicket] attempt to fix test_inventory (#6456)
Browse files Browse the repository at this point in the history
I haven't been able to reproduce this locally, but this is my best guess
as to what's going wrong here: MGS/wicketd learns about SPs but due to a
race/load on the system, misses out on populating their state and
instead leaves it empty. That causes the SPs to be filtered out here:

https://github.com/oxidecomputer/omicron/blob/7a6f45c5504bb092ce738d165cc88736ba4a9092/wicketd/src/rss_config.rs#L129

This theory is buttressed by the fact that in failing logs, the returned
inventory is a lot smaller than what I'm seeing locally.

For example, in the logs for [this failing
test](https://buildomat.eng.oxide.computer/wg/0/details/01J69AR918WAQNFKSBS85EAQPV/kkFMDYhAM3Vxb5ujRHlyAO9thmIAc7mHjHuicct0gS2bL8xu/01J69ARHYXXSKXKG8J49SRZVTA)
I see [a 1430 byte
response](https://buildomat.eng.oxide.computer/wg/0/artefact/01J69AR918WAQNFKSBS85EAQPV/kkFMDYhAM3Vxb5ujRHlyAO9thmIAc7mHjHuicct0gS2bL8xu/01J69ARHYXXSKXKG8J49SRZVTA/01J69ENP3EF3A212GVAGEMBDVQ/mod-ff551cc639cd8d16-test_inventory.21679.0.log?format=x-bunyan#L640):

```
test_inventory (wicketd test client): client response
    result = Ok(Response { url: Url { scheme: "http", cannot_be_a_base: false, username: "", password: None, host: Some(Ipv6(::1)), port: Some(45364), path: "/inventory", query: None, fragment: None }, status: 200, headers: {"content-type": "application/json", "x-request-id": "e68141e2-4c4f-46ec-a49b-9f8aa11a3410", "content-length": "1430", "date": "Tue, 27 Aug 2024 08:13:01 GMT"} })
```

But in passing runs locally, I see a much larger 8654 byte response
([full
logs](https://gist.github.com/sunshowers/b9c1868ba4c8c4bd3eec49cc4b56516d)):

```
19:32:43.847Z DEBG test_inventory (wicketd test client): client response
    result = Ok(Response { url: Url { scheme: "http", cannot_be_a_base: false, username: "", password: None, host: Some(Ipv6(::1)), port: Some(44183), path: "/inventory", query: None, fragment: None }, status: 200, headers: {"content-type": "application/json", "x-request-id": "8b48dae0-025d-426a-82f0-1dd8323670d5", "content-length": "8654", "date": "Tue, 27 Aug 2024 19:32:43 GMT"} })
```

Based on this theory, this PR changes the exit condition for the poll
loop to also consider all of the SP states being present.

In case there's something else going on, the PR also adds a bunch of
additional logging.

Fixes #6300.
  • Loading branch information
sunshowers authored Aug 27, 2024
1 parent a24fa8c commit e3ec364
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 11 deletions.
2 changes: 2 additions & 0 deletions wicketd/src/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ impl WicketdApi for WicketdApiImpl {
config.update_with_inventory_and_bootstrap_peers(
&inventory,
&ctx.bootstrap_peers,
&ctx.log,
);

Ok(HttpResponseOk((&*config).into()))
Expand All @@ -101,6 +102,7 @@ impl WicketdApi for WicketdApiImpl {
config.update_with_inventory_and_bootstrap_peers(
&inventory,
&ctx.bootstrap_peers,
&ctx.log,
);
config
.update(body.into_inner(), ctx.baseboard.as_ref())
Expand Down
12 changes: 11 additions & 1 deletion wicketd/src/rss_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ use omicron_common::api::external::AllowedSourceIps;
use omicron_common::api::external::SwitchLocation;
use once_cell::sync::Lazy;
use sled_hardware_types::Baseboard;
use slog::debug;
use slog::warn;
use std::collections::btree_map;
use std::collections::BTreeMap;
Expand Down Expand Up @@ -115,6 +116,7 @@ impl CurrentRssConfig {
&mut self,
inventory: &RackV1Inventory,
bootstrap_peers: &BootstrapPeers,
log: &slog::Logger,
) {
let bootstrap_sleds = bootstrap_peers.sleds();

Expand All @@ -126,7 +128,15 @@ impl CurrentRssConfig {
return None;
}

let state = sp.state.as_ref()?;
let Some(state) = sp.state.as_ref() else {
debug!(
log,
"in update_with_inventory_and_bootstrap_peers, \
filtering out SP with no state";
"sp" => ?sp,
);
return None;
};
let baseboard = Baseboard::new_gimlet(
state.serial_number.clone(),
state.model.clone(),
Expand Down
39 changes: 29 additions & 10 deletions wicketd/tests/integration_tests/inventory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use super::setup::WicketdTestContext;
use gateway_messages::SpPort;
use gateway_test_utils::setup as gateway_setup;
use sled_hardware_types::Baseboard;
use slog::{info, warn};
use wicket::OutputKind;
use wicket_common::inventory::{SpIdentifier, SpType};
use wicket_common::rack_setup::BootstrapSledDescription;
Expand All @@ -32,20 +33,38 @@ async fn test_inventory() {
.into_inner();
match response {
GetInventoryResponse::Response { inventory, .. } => {
break inventory
}
GetInventoryResponse::Unavailable => {
// Keep polling wicketd until it receives its first results from MGS.
tokio::time::sleep(Duration::from_millis(100)).await;
// Ensure that the SP state is populated -- if it's not,
// then the `configured-bootstrap-sleds` command below
// might return an empty list.
let sp_state_none: Vec<_> = inventory
.sps
.iter()
.filter(|sp| sp.state.is_none())
.collect();
if sp_state_none.is_empty() {
break inventory;
}

warn!(
wicketd_testctx.log(),
"SP state not yet populated for some SPs, retrying";
"sps" => ?sp_state_none
)
}
GetInventoryResponse::Unavailable => {}
}

// Keep polling wicketd until it receives its first results from MGS.
tokio::time::sleep(Duration::from_millis(100)).await;
}
};
let inventory =
tokio::time::timeout(Duration::from_secs(10), inventory_fut)
.await
.expect("get_inventory completed within 10 seconds");

info!(wicketd_testctx.log(), "inventory returned"; "inventory" => ?inventory);

// 4 SPs attached to the inventory.
assert_eq!(inventory.sps.len(), 4);

Expand All @@ -70,17 +89,17 @@ async fn test_inventory() {
serde_json::from_slice(&stdout).expect("stdout is valid JSON");

// This only tests the case that we get sleds back with no current
// bootstrap IP. This does provide svalue: it check that the command
// exists, accesses data within wicket, and returns it in the schema we
// expect. But it does not test the case where a sled does have a
// bootstrap IP.
// bootstrap IP. This does provide some value: it checks that the
// command exists, accesses data within wicket, and returns it in the
// schema we expect. But it does not test the case where a sled does
// have a bootstrap IP.
//
// Unfortunately, that's a difficult thing to test today. Wicket gets
// that information by enumerating the IPs on the bootstrap network and
// reaching out to the bootstrap_agent on them directly to ask them who
// they are. Our testing setup does not have a way to provide such an
// IP, or run a bootstrap_agent on an IP to respond. We should update
// this test when we do have that capabilitiy.
// this test when we do have that capability.
assert_eq!(
response,
vec![
Expand Down

0 comments on commit e3ec364

Please sign in to comment.