diff --git a/Cargo.lock b/Cargo.lock index e5130b6b33..83795604d7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4405,6 +4405,21 @@ dependencies = [ "serde_json", ] +[[package]] +name = "nexus-inventory" +version = "0.1.0" +dependencies = [ + "anyhow", + "chrono", + "futures", + "gateway-client", + "gateway-messages", + "nexus-db-model", + "nexus-db-queries", + "strum", + "uuid", +] + [[package]] name = "nexus-test-interface" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 9498157b28..b59ae40160 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,6 +36,7 @@ members = [ "nexus/db-model", "nexus/db-queries", "nexus/defaults", + "nexus/inventory", "nexus/test-interface", "nexus/test-utils-macros", "nexus/test-utils", @@ -100,6 +101,7 @@ default-members = [ "nexus/db-model", "nexus/db-queries", "nexus/defaults", + "nexus/inventory", "nexus/types", "oxide-client", "oximeter-client", @@ -226,6 +228,7 @@ nexus-client = { path = "nexus-client" } nexus-db-model = { path = "nexus/db-model" } nexus-db-queries = { path = "nexus/db-queries" } nexus-defaults = { path = "nexus/defaults" } +nexus-inventory = { path = "nexus/inventory" } omicron-certificates = { path = "certificates" } omicron-passwords = { path = "passwords" } nexus-test-interface = { path = "nexus/test-interface" } @@ -365,8 +368,8 @@ tufaceous = { path = "tufaceous" } tufaceous-lib = { path = "tufaceous-lib" } unicode-width = "0.1.10" update-engine = { path = "update-engine" } -uuid = { version = "1.4.1", features = ["serde", "v4"] } usdt = "0.3" +uuid = { version = "1.4.1", features = ["serde", "v4"] } walkdir = "2.4" wicket = { path = "wicket" } wicket-common = { path = "wicket-common" } diff --git a/nexus/inventory/Cargo.toml b/nexus/inventory/Cargo.toml new file mode 100644 index 0000000000..6a16f45b36 --- /dev/null +++ b/nexus/inventory/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "nexus-inventory" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[dependencies] +anyhow.workspace = true 
+chrono.workspace = true +futures.workspace = true +gateway-client.workspace = true +gateway-messages.workspace = true +nexus-db-model.workspace = true +nexus-db-queries.workspace = true +strum.workspace = true +uuid.workspace = true diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs new file mode 100644 index 0000000000..adf345a0f5 --- /dev/null +++ b/nexus/inventory/src/builder.rs @@ -0,0 +1,230 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Interface for building [`Collection`] dynamically + +use crate::BaseboardId; +use crate::Caboose; +use crate::Collection; +use crate::RotState; +use crate::ServiceProcessor; +use anyhow::anyhow; +use chrono::DateTime; +use chrono::Utc; +use gateway_client::types::SpState; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::sync::Arc; +use strum::EnumIter; + +// XXX-dap add rack id + +#[derive(Clone, Copy, Debug, EnumIter)] +pub enum CabooseWhich { + SpSlot0, + SpSlot1, + RotSlotA, + RotSlotB, +} + +#[derive(Debug)] +pub struct CollectionBuilder { + errors: Vec, + time_started: DateTime, + creator: String, + comment: String, + baseboards: BTreeSet>, + cabooses: BTreeSet>, + sps: BTreeMap, ServiceProcessor>, + // ignition_found: Vec, + // ignition_powered_off: Vec, + // ignition_missing: Vec, +} + +impl CollectionBuilder { + pub fn new(creator: &str, comment: &str) -> Self { + CollectionBuilder { + errors: vec![], + time_started: Utc::now(), + creator: creator.to_owned(), + comment: comment.to_owned(), + baseboards: BTreeSet::new(), + cabooses: BTreeSet::new(), + sps: BTreeMap::new(), + // ignition_found: vec![], + // ignition_powered_off: vec![], + // ignition_missing: vec![], + } + } + + pub fn build(self) -> Collection { + Collection { + errors: self.errors, + time_started: self.time_started, + time_done: 
Utc::now(), + creator: self.creator, + comment: self.comment, + baseboards: self.baseboards, + cabooses: self.cabooses, + sps: self.sps, + } + } + + // XXX-dap I think this just belongs in the caller. + // pub fn found_ignition( + // &mut self, + // sp_id: SpIdentifier, + // ignition: SpIgnition, + // ) { + // let mut vec = match ignition { + // SpIgnition::Yes { power: true, .. } => &mut self.ignition_found, + // SpIgnition::Yes { power: false, .. } => { + // &mut self.ignition_powered_off + // } + // SpIgnition::No => &mut self.ignition_missing, + // }; + // + // vec.push(sp_id); + // } + + // XXX-dap this model here, where we invoke enum_ignition() and then + // powered_on_sps() and expect to do enum_sp() after that...it's promising, + // but I think it currently assumes that SpIdentifiers will be consistent + // across MGS instances. It would be better to avoid this if we can. And + // we should be able to. + // + // I think it's really more like: + // + // - found_mgs_client(mgs_client) + // - enum_mgs_clients(): for each: + // - found_ignition(client, ignition) + // - next_wanted() returns an (mgs_client, sp_id, SpInfoWanted) + // - SpInfoWanted = State | Caboose1 | Caboose2 | ... + // - this tells the caller exactly what to do next + // - am I putting too much control flow into this struct? wasn't the + // idea to make this simple and passive and _not_ include all that + // logic? That's a nice idea but the problem is that intrinsically one + // needs the state about what information we have about what SPs and + // which MGS's we've tried in order to decide what to do next + // - but this approach is not concurrency-friendly, and I don't really + // see a way to make it concurrency-friendly without baking the whole + // thing into this struct, which, again, defeats the point. + // + // That brings me back to: let's make this thing really simple. Caller + // reports what it's found. 
Caller is responsible for control flow and + // figuring out what's next and in what order. + // + // Next step: rethink this struct, then go back to collector.rs and write a + // driver. + //pub fn powered_on_sps(&self) -> impl Iterator { + // self.ignition_found.iter().cloned(); + //} + + pub fn found_sp_state( + &mut self, + source: &str, + sp_state: SpState, + ) -> Arc { + let baseboard = Self::enum_item( + &mut self.baseboards, + BaseboardId { + serial_number: sp_state.serial_number, + part_number: sp_state.model, + }, + ); + + let rot = RotState::try_from(sp_state.rot).ok(); + let _ = self.sps.entry(baseboard.clone()).or_insert_with(|| { + ServiceProcessor { + baseboard: baseboard.clone(), + time_collected: Utc::now(), + source: source.to_owned(), + hubris_archive: sp_state.hubris_archive_id, + power_state: sp_state.power_state, + rot, + sp_slot0_caboose: None, + sp_slot1_caboose: None, + rot_slot_a_caboose: None, + rot_slot_b_caboose: None, + } + }); + + baseboard + } + + pub fn sp_found_caboose_already( + &self, + baseboard: &BaseboardId, + which: CabooseWhich, + ) -> bool { + self.sps + .get(baseboard) + .map(|sp| { + let ptr = match which { + CabooseWhich::SpSlot0 => &sp.sp_slot0_caboose, + CabooseWhich::SpSlot1 => &sp.sp_slot1_caboose, + CabooseWhich::RotSlotA => &sp.rot_slot_a_caboose, + CabooseWhich::RotSlotB => &sp.rot_slot_b_caboose, + }; + ptr.is_some() + }) + .unwrap_or(false) + } + + pub fn found_sp_caboose( + &mut self, + baseboard: &BaseboardId, + which: CabooseWhich, + caboose: Caboose, + ) -> Result<(), anyhow::Error> { + let caboose = Self::enum_item(&mut self.cabooses, caboose); + let sp = self.sps.get_mut(baseboard).ok_or_else(|| { + anyhow!("reporting caboose for unknown baseboard: {:?}", baseboard) + })?; + let ptr = match which { + CabooseWhich::SpSlot0 => &mut sp.sp_slot0_caboose, + CabooseWhich::SpSlot1 => &mut sp.sp_slot1_caboose, + CabooseWhich::RotSlotA => &mut sp.rot_slot_a_caboose, + CabooseWhich::RotSlotB => &mut 
sp.rot_slot_b_caboose, + }; + + if let Some(already) = ptr { + let error = if *already == caboose { + anyhow!("reported multiple times (same value)",) + } else { + anyhow!( + "reported caboose multiple times (previously {:?}, \ + now {:?}, keeping only the first one)", + already, + caboose + ) + }; + Err(error.context(format!( + "baseboard {:?} caboose {:?}", + baseboard, which + ))) + } else { + *ptr = Some(caboose); + Ok(()) + } + } + + fn enum_item( + items: &mut BTreeSet>, + item: T, + ) -> Arc { + match items.get(&item) { + Some(found_item) => found_item.clone(), + None => { + let new_item = Arc::new(item); + items.insert(new_item.clone()); + new_item + } + } + } + + pub fn found_error(&mut self, error: anyhow::Error) { + self.errors.push(error); + } +} diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs new file mode 100644 index 0000000000..d8e0db048f --- /dev/null +++ b/nexus/inventory/src/collector.rs @@ -0,0 +1,155 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Collection of inventory from Omicron components + +use crate::builder::CabooseWhich; +use crate::builder::CollectionBuilder; +use crate::Caboose; +use crate::Collection; +use anyhow::Context; +use std::sync::Arc; +use strum::IntoEnumIterator; + +// XXX-dap rename to Enumerator? +pub struct Collector { + mgs_clients: Vec>, + in_progress: CollectionBuilder, +} + +impl Collector { + pub fn new( + creator: &str, + comment: &str, + mgs_clients: &[Arc], + ) -> Self { + Collector { + mgs_clients: mgs_clients.to_vec(), + in_progress: CollectionBuilder::new(creator, comment), + } + } + + // XXX-dap TODO-doc, especially errors + pub async fn enumerate(mut self) -> Result { + // We're about to do a bunch of asynchronous operations. 
With a + // combination of async, futures, and some cleverness, we could do much + // of this in parallel. But this code path is not remotely + // latency-sensitive. And there's real risk of overloading our + // downstream services. So we just do one step at a time. This also + // keeps the code pretty simple. + + let clients = self.mgs_clients.clone(); + for client in clients { + self.enumerate_client(&client).await?; + } + + Ok(self.in_progress.build()) + } + + pub async fn enumerate_client( + &mut self, + client: &gateway_client::Client, + ) -> Result<(), anyhow::Error> { + // First, see which SPs MGS can see via Ignition. + let ignition_result = client.ignition_list().await.with_context(|| { + format!("MGS {:?}: listing ignition targets", client.baseurl()) + }); + + // Select only the SPs that appear powered on. + let sps = match ignition_result { + Err(error) => { + self.in_progress.found_error(error); + return Ok(()); + } + + Ok(targets) => { + targets.into_inner().into_iter().filter_map(|sp_ignition| { + match sp_ignition.details { + gateway_client::types::SpIgnition::No => None, + gateway_client::types::SpIgnition::Yes { + power: false, + .. + } => None, + gateway_client::types::SpIgnition::Yes { + power: true, + .. + } => Some(sp_ignition.id), + } + }) + } + }; + + // Fetch the state and caboose information for each SP. + for sp in sps { + // First, fetch the state of the SP. + let result = + client.sp_get(sp.type_, sp.slot).await.with_context(|| { + format!( + "MGS {:?}: fetching state of SP {:?}", + client.baseurl(), + sp + ) + }); + + let sp_state = match result { + Err(error) => { + self.in_progress.found_error(error); + continue; + } + Ok(response) => response.into_inner(), + }; + + let baseboard_id = + self.in_progress.found_sp_state(client.baseurl(), sp_state); + + // For each caboose that we care about, if it hasn't been fetched + // already, fetch it. Generally, we'd only get here for the first + // MGS client. 
Assuming that one succeeds, the others will skip + // all these iterations. + for which in CabooseWhich::iter() { + if self + .in_progress + .sp_found_caboose_already(&baseboard_id, which) + { + continue; + } + + let (component, slot) = match which { + CabooseWhich::SpSlot0 => ("sp", 0), + CabooseWhich::SpSlot1 => ("sp", 1), + CabooseWhich::RotSlotA => ("rot", 0), + CabooseWhich::RotSlotB => ("rot", 1), + }; + + let result = client + .sp_component_caboose_get( + sp.type_, sp.slot, component, slot, + ) + .await + .with_context(|| { + format!( + "MGS {:?}: SP {:?}: caboose {:?}", + client.baseurl(), + sp, + which + ) + }); + let caboose = match result { + Err(error) => { + self.in_progress.found_error(error); + continue; + } + Ok(response) => Caboose::from(response.into_inner()), + }; + self.in_progress.found_sp_caboose( + &baseboard_id, + which, + caboose, + )?; + } + } + + Ok(()) + } +} diff --git a/nexus/inventory/src/lib.rs b/nexus/inventory/src/lib.rs new file mode 100644 index 0000000000..e61e18bcb0 --- /dev/null +++ b/nexus/inventory/src/lib.rs @@ -0,0 +1,125 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Omicron component inventory +//! XXX-dap TODO-doc +//! +//! This is currently inside Nexus, but it's expected to have few dependencies +//! on parts of Nexus (beyond the database crates) and could conceivably be put +//! into other components. 
+ +pub use collector::Collector; + +use anyhow::anyhow; +use chrono::DateTime; +use chrono::Utc; +use gateway_client::types::PowerState; +use gateway_client::types::RotSlot; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::sync::Arc; + +mod builder; +mod collector; + +/// Results of collecting inventory from various Omicron components +#[derive(Debug)] +pub struct Collection { + /// errors encountered during collection + pub errors: Vec, + /// time the collection started + pub time_started: DateTime, + /// time the collection ended + pub time_done: DateTime, + /// name of the agent doing the collecting (generally, this Nexus's uuid) + pub creator: String, + /// reason for triggering this collection + pub comment: String, + + pub baseboards: BTreeSet>, + pub cabooses: BTreeSet>, + pub sps: BTreeMap, ServiceProcessor>, +} + +#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)] +pub struct BaseboardId { + pub serial_number: String, + pub part_number: String, +} + +#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)] +pub struct Caboose { + pub board: String, + pub git_commit: String, + pub name: String, + pub version: String, +} + +impl From for Caboose { + fn from(c: gateway_client::types::SpComponentCaboose) -> Self { + Caboose { + board: c.board, + git_commit: c.git_commit, + name: c.name, + // The MGS API uses an `Option` here because old SP versions did not + // supply it. But modern SP versions do. So we should never hit + // this `unwrap_or()`. 
+ version: c.version.unwrap_or(String::from("")), + } + } +} + +#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)] +pub struct ServiceProcessor { + pub baseboard: Arc, + pub time_collected: DateTime, + pub source: String, + + pub hubris_archive: String, + pub power_state: PowerState, + pub rot: Option, + + pub sp_slot0_caboose: Option>, + pub sp_slot1_caboose: Option>, + pub rot_slot_a_caboose: Option>, + pub rot_slot_b_caboose: Option>, +} + +#[derive(Clone, Debug, Ord, Eq, PartialOrd, PartialEq)] +pub struct RotState { + pub active_slot: RotSlot, + pub persistent_boot_preference: RotSlot, + pub pending_persistent_boot_preference: Option, + pub transient_boot_preference: Option, + pub slot_a_sha3_256_digest: Option, + pub slot_b_sha3_256_digest: Option, +} + +impl TryFrom for RotState { + type Error = anyhow::Error; + fn try_from( + value: gateway_client::types::RotState, + ) -> Result { + match value { + gateway_client::types::RotState::Enabled { + active, + pending_persistent_boot_preference, + persistent_boot_preference, + slot_a_sha3_256_digest, + slot_b_sha3_256_digest, + transient_boot_preference, + } => Ok(RotState { + active_slot: active, + persistent_boot_preference, + pending_persistent_boot_preference, + transient_boot_preference, + slot_a_sha3_256_digest, + slot_b_sha3_256_digest, + }), + gateway_client::types::RotState::CommunicationFailed { + message, + } => Err(anyhow!("communication with SP failed: {}", message)), + } + } +}