[wicket] attempt to fix test_inventory (#6456)

I haven't been able to reproduce this locally, but this is my best guess as to what's going wrong here: MGS/wicketd learns about SPs but due to a race/load on the system, misses out on populating their state and instead leaves it empty. That causes the SPs to be filtered out here: https://github.com/oxidecomputer/omicron/blob/7a6f45c5504bb092ce738d165cc88736ba4a9092/wicketd/src/rss_config.rs#L129 This theory is buttressed by the fact that in failing logs, the returned inventory is a lot smaller than what I'm seeing locally. For example, in the logs for [this failing test](https://buildomat.eng.oxide.computer/wg/0/details/01J69AR918WAQNFKSBS85EAQPV/kkFMDYhAM3Vxb5ujRHlyAO9thmIAc7mHjHuicct0gS2bL8xu/01J69ARHYXXSKXKG8J49SRZVTA) I see [a 1430 byte response](https://buildomat.eng.oxide.computer/wg/0/artefact/01J69AR918WAQNFKSBS85EAQPV/kkFMDYhAM3Vxb5ujRHlyAO9thmIAc7mHjHuicct0gS2bL8xu/01J69ARHYXXSKXKG8J49SRZVTA/01J69ENP3EF3A212GVAGEMBDVQ/mod-ff551cc639cd8d16-test_inventory.21679.0.log?format=x-bunyan#L640): ``` test_inventory (wicketd test client): client response result = Ok(Response { url: Url { scheme: "http", cannot_be_a_base: false, username: "", password: None, host: Some(Ipv6(::1)), port: Some(45364), path: "/inventory", query: None, fragment: None }, status: 200, headers: {"content-type": "application/json", "x-request-id": "e68141e2-4c4f-46ec-a49b-9f8aa11a3410", "content-length": "1430", "date": "Tue, 27 Aug 2024 08:13:01 GMT"} }) ``` But in passing runs locally, I see a much larger 8654 byte response ([full logs](https://gist.github.com/sunshowers/b9c1868ba4c8c4bd3eec49cc4b56516d)): ``` 19:32:43.847Z DEBG test_inventory (wicketd test client): client response result = Ok(Response { url: Url { scheme: "http", cannot_be_a_base: false, username: "", password: None, host: Some(Ipv6(::1)), port: Some(44183), path: "/inventory", query: None, fragment: None }, status: 200, headers: {"content-type": "application/json", "x-request-id": "8b48dae0-025d-426a-82f0-1dd8323670d5", "content-length": "8654", "date": "Tue, 27 Aug 2024 19:32:43 GMT"} }) ``` Based on this theory, this PR changes the exit condition for the poll loop to also consider all of the SP states being present. In case there's something else going on, the PR also adds a bunch of additional logging. Fixes #6300.
oxidecomputer · Aug 27, 2024 · e3ec364 · e3ec364
1 parent a24fa8c
commit e3ec364
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 11 deletions.
diff --git a/wicketd/src/http_entrypoints.rs b/wicketd/src/http_entrypoints.rs
@@ -82,6 +82,7 @@ impl WicketdApi for WicketdApiImpl {
         config.update_with_inventory_and_bootstrap_peers(
             &inventory,
             &ctx.bootstrap_peers,
+            &ctx.log,
         );
 
         Ok(HttpResponseOk((&*config).into()))
@@ -101,6 +102,7 @@ impl WicketdApi for WicketdApiImpl {
         config.update_with_inventory_and_bootstrap_peers(
             &inventory,
             &ctx.bootstrap_peers,
+            &ctx.log,
         );
         config
             .update(body.into_inner(), ctx.baseboard.as_ref())

diff --git a/wicketd/src/rss_config.rs b/wicketd/src/rss_config.rs
@@ -26,6 +26,7 @@ use omicron_common::api::external::AllowedSourceIps;
 use omicron_common::api::external::SwitchLocation;
 use once_cell::sync::Lazy;
 use sled_hardware_types::Baseboard;
+use slog::debug;
 use slog::warn;
 use std::collections::btree_map;
 use std::collections::BTreeMap;
@@ -115,6 +116,7 @@ impl CurrentRssConfig {
         &mut self,
         inventory: &RackV1Inventory,
         bootstrap_peers: &BootstrapPeers,
+        log: &slog::Logger,
     ) {
         let bootstrap_sleds = bootstrap_peers.sleds();
 
@@ -126,7 +128,15 @@ impl CurrentRssConfig {
                     return None;
                 }
 
-                let state = sp.state.as_ref()?;
+                let Some(state) = sp.state.as_ref() else {
+                    debug!(
+                        log,
+                        "in update_with_inventory_and_bootstrap_peers, \
+                         filtering out SP with no state";
+                        "sp" => ?sp,
+                    );
+                    return None;
+                };
                 let baseboard = Baseboard::new_gimlet(
                     state.serial_number.clone(),
                     state.model.clone(),

diff --git a/wicketd/tests/integration_tests/inventory.rs b/wicketd/tests/integration_tests/inventory.rs
@@ -10,6 +10,7 @@ use super::setup::WicketdTestContext;
 use gateway_messages::SpPort;
 use gateway_test_utils::setup as gateway_setup;
 use sled_hardware_types::Baseboard;
+use slog::{info, warn};
 use wicket::OutputKind;
 use wicket_common::inventory::{SpIdentifier, SpType};
 use wicket_common::rack_setup::BootstrapSledDescription;
@@ -32,20 +33,38 @@ async fn test_inventory() {
                 .into_inner();
             match response {
                 GetInventoryResponse::Response { inventory, .. } => {
-                    break inventory
-                }
-                GetInventoryResponse::Unavailable => {
-                    // Keep polling wicketd until it receives its first results from MGS.
-                    tokio::time::sleep(Duration::from_millis(100)).await;
+                    // Ensure that the SP state is populated -- if it's not,
+                    // then the `configured-bootstrap-sleds` command below
+                    // might return an empty list.
+                    let sp_state_none: Vec<_> = inventory
+                        .sps
+                        .iter()
+                        .filter(|sp| sp.state.is_none())
+                        .collect();
+                    if sp_state_none.is_empty() {
+                        break inventory;
+                    }
+
+                    warn!(
+                        wicketd_testctx.log(),
+                        "SP state not yet populated for some SPs, retrying";
+                        "sps" => ?sp_state_none
+                    )
                 }
+                GetInventoryResponse::Unavailable => {}
             }
+
+            // Keep polling wicketd until it receives its first results from MGS.
+            tokio::time::sleep(Duration::from_millis(100)).await;
         }
     };
     let inventory =
         tokio::time::timeout(Duration::from_secs(10), inventory_fut)
             .await
             .expect("get_inventory completed within 10 seconds");
 
+    info!(wicketd_testctx.log(), "inventory returned"; "inventory" => ?inventory);
+
     // 4 SPs attached to the inventory.
     assert_eq!(inventory.sps.len(), 4);
 
@@ -70,17 +89,17 @@ async fn test_inventory() {
             serde_json::from_slice(&stdout).expect("stdout is valid JSON");
 
         // This only tests the case that we get sleds back with no current
-        // bootstrap IP. This does provide svalue: it check that the command
-        // exists, accesses data within wicket, and returns it in the schema we
-        // expect. But it does not test the case where a sled does have a
-        // bootstrap IP.
+        // bootstrap IP. This does provide some value: it checks that the
+        // command exists, accesses data within wicket, and returns it in the
+        // schema we expect. But it does not test the case where a sled does
+        // have a bootstrap IP.
         //
         // Unfortunately, that's a difficult thing to test today. Wicket gets
         // that information by enumerating the IPs on the bootstrap network and
         // reaching out to the bootstrap_agent on them directly to ask them who
         // they are. Our testing setup does not have a way to provide such an
         // IP, or run a bootstrap_agent on an IP to respond. We should update
-        // this test when we do have that capabilitiy.
+        // this test when we do have that capability.
         assert_eq!(
             response,
             vec![